P5_CS6262

project5.zip

Home >Computer Science homework help >P5_CS6262

project 5/answers.txt

Project-Quiz:- Answer 1: Answer 2: Answer 3: Answer 4: Answer 5:

project 5/PAYL/analysis.py

'''
Trains Model and Tests samples.
Dependency : distance_and_clustering.py
'''
import sys
import os
import string
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import distance_and_clustering as dc
import read_pcap as dpr

# Message used every time a payload fails the is_ascii() check.
_NON_ASCII_ERROR = 'Error: File contains non-ascii characters! Exiting'


def is_ascii(string1):
    '''
    Return 1 when every character of string1 has a code point below 256,
    otherwise print the offending character with its code point and return 0.

    NOTE: despite the name, code points 128-255 are accepted; the limit matches
    the 256-entry frequency arrays used by the model.
    '''
    for c in string1:
        if ord(c) >= 256:
            print('"'+c+'" '+str(ord(c)))
            return 0
    return 1


def get_freq_from_ascii_string(ascii_string):
    '''
    Return a 256-element list whose entry i is the number of occurrences of
    the character with code point i in ascii_string.
    '''
    freq_array1 = [0]*256
    for c in ascii_string:
        freq_array1[ord(c)] += 1
    return freq_array1


def _is_verbose(verbose):
    '''Accept both the legacy string flag "True" and the boolean True.'''
    return verbose is True or verbose == "True"


def _percentage(count, total):
    '''Return count/total as a percentage; 0.0 when total is 0 (no samples).'''
    if total == 0:
        return 0.0
    return (count/float(total))*100.0


def _score_payload(payload, training_length_dict, feature_vector, min_length, smoothing_factor):
    '''
    Return the distance of one payload from the model trained for its length.

    Exits the program when the payload fails is_ascii(). Returns sys.maxsize
    when no model exists for the payload length.
    '''
    if is_ascii(payload) == 0:
        sys.exit(_NON_ASCII_ERROR)
    if len(payload) not in training_length_dict:
        return sys.maxsize
    new_frequency_distribution = get_freq_from_ascii_string(payload)
    # Models are stored at index (min_length - length): 0 for the shortest
    # length and a negative index, counted from the end of the list, for every
    # longer one. The mapping is unique because the list has exactly
    # max_length - min_length + 1 slots.
    averaged_feature_vector = feature_vector[min_length-len(payload)]
    return dc.give_mahalanobis_distance(averaged_feature_vector, new_frequency_distribution, smoothing_factor)


def get_mahabs_distance(pool, training_length_dict, averaged_feature_vector,
                        new_frequency_distribution, smoothing_factor,
                        feature_vector=None, min_length=None):
    '''
    Return the distance computed for the LAST payload in pool.

    pool                       : iterable of payload strings
    training_length_dict       : dict keyed by the payload lengths seen in training
    averaged_feature_vector    : 256*2 model used when no per-length models are given
    new_frequency_distribution : unused on input; kept for interface compatibility
    smoothing_factor           : scalar forwarded to dc.give_mahalanobis_distance
    feature_vector, min_length : optional per-length model list and the smallest
                                 trained length, in the layout train_and_test builds.
                                 When both are given the model matching each
                                 payload length is looked up.

    The previous implementation read feature_vector and min_length as globals
    that do not exist, so it raised NameError for every known length; they are
    now optional parameters. sys.maxsize is returned for an empty pool or when
    the last payload has a length that was never trained.
    '''
    mahabs_distance = sys.maxsize
    for str2 in pool:
        if is_ascii(str2) == 0:
            sys.exit(_NON_ASCII_ERROR)
        if len(str2) not in training_length_dict:
            mahabs_distance = sys.maxsize
            continue
        new_frequency_distribution = get_freq_from_ascii_string(str2)
        if feature_vector is not None and min_length is not None:
            averaged_feature_vector = feature_vector[min_length-len(str2)]
        mahabs_distance = dc.give_mahalanobis_distance(averaged_feature_vector, new_frequency_distribution, smoothing_factor)
    return mahabs_distance


# if verbose is set to True (or "True"), the graphs are generated
def train_and_test(training, test, attack_file, smoothing_factor, threshold_for_classification, verbose = "False"):
    '''
    Train one <mean, stddev> byte-frequency model per payload length, score the
    test payloads against it and, when attack_file is given, score the attack
    payloads as well. Results are printed; nothing is returned.

    training, test               : lists of payload strings
    attack_file                  : path of a .pcap file or of a blended-attack
                                   output file, or None to skip the attack phase
    smoothing_factor             : scalar forwarded to the distance computation
    threshold_for_classification : payloads with distance <= threshold fit the
                                   model (the attack report formats it with
                                   {:d}, so it must be an int)
    verbose                      : "True"/True to plot the models and print the
                                   test distances

    Exits the program when a payload fails is_ascii(), when training is empty,
    or when the first attack payload has a length that was never trained.
    '''
    show_details = _is_verbose(verbose)
    all_ascii = [str(i) for i in range(256)]
    all_ascii_int = list(range(256))
    matplotlib.rcParams.update({'font.size': 10})

    if not training:
        sys.exit('Error: No training payloads provided! Exiting')

    # group the training payloads by length
    training_length_dict = {}
    for payload in training:
        if is_ascii(payload) == 0:
            sys.exit(_NON_ASCII_ERROR)
        training_length_dict.setdefault(len(payload), []).append(payload)
    min_length = min(training_length_dict)
    max_length = max(training_length_dict)

    # one zero-filled 256*2 placeholder per length between min and max
    feature_vector = [np.vstack(([0] * 256, [0] * 256)).T
                      for _ in range(max_length - min_length + 1)]

    print('Training the Model')
    # store the models by the length of the payloads
    for key in sorted(training_length_dict):
        # frequency array for each payload of this length group
        freq_array_per_length = [get_freq_from_ascii_string(argu)
                                 for argu in training_length_dict[key]]
        stddev_array_per_length = np.std(freq_array_per_length, axis=0)
        mean_array_per_length = np.mean(freq_array_per_length, axis=0)
        feature_vector[min_length-key] = np.vstack((mean_array_per_length, stddev_array_per_length)).T

        if show_details:
            # plotting the mean array
            plt.xticks(all_ascii_int, all_ascii)
            plt.bar(all_ascii_int, mean_array_per_length)
            plt.title('Mean frequency of each of the acsii characters for length '+str(key))
            plt.show()
            # plotting the std dev array
            plt.xticks(all_ascii_int, all_ascii)
            plt.bar(all_ascii_int, stddev_array_per_length)
            plt.title('Std Dev of freq of each of the ascii characters for Length '+str(key))
            plt.show()

    print('Testing the Model')
    true_positive = 0
    false_negative = 0
    for str2 in test:
        mahabs_distance = _score_payload(str2, training_length_dict, feature_vector, min_length, smoothing_factor)
        if mahabs_distance <= threshold_for_classification:
            true_positive = true_positive + 1
        else:
            false_negative = false_negative + 1
        if show_details:
            print(str(mahabs_distance))

    print('Total Number of testing samples: '+str(len(test)))
    print('Percentage of True positives: '+str(_percentage(true_positive, len(test))))

    # Attack data loading
    if attack_file is None:
        print('\nExiting now')
        return
    if attack_file.lower().endswith('.pcap'):
        attack = dpr.readPcap(attack_file, "testing")
    else:
        attack = dpr.read_attack_data(attack_file)
    print("Attack Data:\n")
    print(attack)

    if not attack:
        # Nothing could be read from the attack file; attack[0] below would fail.
        print('No attack payloads could be read. Exiting.')
        return

    print('--------------------------------------')
    print('Analysing attack data, of length '+str(len(attack[0])))

    if len(attack[0]) not in training_length_dict:
        print("\n\n\nCannot compare the test/attack paylaod against the model.\nThe test/attack length payload is not found in training length payloads.\nExiting.")
        sys.exit()

    false_positive = 0
    true_negative = 0
    for str2 in attack:
        mahabs_distance = _score_payload(str2, training_length_dict, feature_vector, min_length, smoothing_factor)
        if mahabs_distance <= threshold_for_classification:
            false_positive = false_positive + 1
            print("Calculated distance of {:.2f} is lesser than the threshold of {:d}. It fits the model."
                  .format(mahabs_distance, threshold_for_classification))
        else:
            true_negative = true_negative + 1
            print("Calculated distance of {:.2f} is greater than the threshold of {:d}. It doesn\'t fit the model."
                  .format(mahabs_distance, threshold_for_classification))

    # NOTE: the labels say "Total number" but the values are percentages.
    print('Total number of True Negatives: '+ str(_percentage(true_negative, len(attack))))
    print('Total number of False Positives: '+ str(_percentage(false_positive, len(attack)))+'\n')

project 5/PAYL/data/dns.pcap

project 5/PAYL/data/friday_dns.pcap

project 5/PAYL/data/HTTPtext_V1.pcap

project 5/PAYL/data/HTTPtext_V2.pcap

project 5/PAYL/data/modified_new3_simple_http.pcap

project 5/PAYL/data/modified_new4_simple_http.pcap

project 5/PAYL/data/modified_new5_simple_http.pcap

project 5/PAYL/data/modified_new6_simple_http.pcap

project 5/PAYL/data/modified_new_simple_http.pcap

project 5/PAYL/data/thursday_dns.pcap

project 5/PAYL/distance_and_clustering.py

'''
Code for calculating various distances and clustering the models
'''
import numpy as np
import scipy.spatial.distance as dist
import math


def give_mahalanobis_distance(averaged_feature_vector, new_frequency_distribution, smoothing_factor):
    '''
    Return the simplified Mahalanobis distance
        sum over the 256 bytes of |mean - observed| / (stddev + smoothing_factor)

    averaged_feature_vector    : 256*2 array of <mean, stddev> pairs, one per byte value
    new_frequency_distribution : 256*1 array with the observed frequency of each byte value
    smoothing_factor           : single scalar value, must not be zero

    Raises ValueError (a subclass of Exception) when smoothing_factor is zero.
    '''
    if smoothing_factor == 0:
        raise ValueError("Smoothing factor cannot be zero")
    distance = 0
    for n in range(256):
        xi = averaged_feature_vector[n][0]
        yi = new_frequency_distribution[n]
        sigi = averaged_feature_vector[n][1]
        if sigi < 0:
            # a standard deviation should never be negative; surface it
            print(sigi)
        distance = distance + (abs(xi - yi) / (sigi + smoothing_factor))
    return distance


def manhattan_distance(model_i, model_k):
    '''
    Return the Manhattan (city block) distance between two 1-D sequences.

    model_i : 256*1 array with the average frequency of each byte value
    model_k : 256*1 array with the average frequency of each byte value
    '''
    x = np.array(model_i)
    y = np.array(model_k)
    return dist.cityblock(x, y)


def weighted_variance(variance1, variance2, mean1, mean2, size1, size2):
    '''
    Return the variance of the union of two sample groups described by their
    variances, means and sizes (all single scalar values).
    '''
    term1 = size1 * (variance1 + (mean1 * mean1))
    term2 = size2 * (variance2 + (mean2 * mean2))
    size = size1 + size2
    weighted_mean = ((size1 * mean1) + (size2 * mean2)) / float(size)
    return ((term1 + term2) / float(size)) - (weighted_mean * weighted_mean)


def weighted_sd(sd1, sd2, mean1, mean2, size1, size2):
    '''
    Return the standard deviation of the union of two sample groups described
    by their standard deviations, means and sizes (all single scalar values).
    '''
    variance = weighted_variance(sd1 * sd1, sd2 * sd2, mean1, mean2, size1, size2)
    # Floating point cancellation can leave a tiny negative residue when the
    # true variance is 0; clamp it so math.sqrt does not raise ValueError.
    return math.sqrt(max(variance, 0.0))


def merge_update(model_i, model_k, ni, nk):
    '''
    Merge two models in place and return them.

    There can be multiple implementations of this. We merge and update based on
    the weighted average frequency, the weight being the number of samples each
    model has. After the call both models hold the same merged values.

    model_i : 256*2 feature array where each row is <freq, stddev> for length i
    model_k : 256*2 feature array where each row is <freq, stddev> for length k
    ni      : number of samples for length i
    nk      : number of samples for length k

    When both sample counts are 0 the models are returned untouched.
    '''
    if ni == 0 and nk == 0:
        return model_i, model_k
    n = ni + nk
    # All 256 byte values are merged (the old loop stopped at 254).
    for j in range(256):
        avg_frequency_i = model_i[j][0]
        avg_frequency_k = model_k[j][0]
        avg_cumulative_mean = ((avg_frequency_i * ni) + (avg_frequency_k * nk)) / n
        avg_cumulative_stddev = weighted_sd(model_i[j][1], model_k[j][1],
                                            avg_frequency_i, avg_frequency_k, ni, nk)
        model_i[j][0] = avg_cumulative_mean
        model_k[j][0] = avg_cumulative_mean
        model_i[j][1] = avg_cumulative_stddev
        model_k[j][1] = avg_cumulative_stddev
    return model_i, model_k


def _mean_frequencies(model):
    '''Return the 256 mean frequencies (first column) of a 256*2 model.'''
    return [row[0] for row in model]


def cluster(threshold, models, lengthwise_sample_numbers):
    '''
    Merge neighbouring models whose mean byte frequencies are closer than
    threshold (Manhattan distance) and return the updated list.

    threshold                 : single scalar value
    models                    : (range of payload length)*256*2, sorted by payload length
    lengthwise_sample_numbers : (range of payload length)*1, samples recorded per length

    Each anchor model i is merged with its successors for as long as they stay
    within the threshold. The first successor that is too far away becomes the
    next anchor; the old code skipped it. The distance is taken over the 256
    means of each model; the old code compared only the <mean, stddev> pair of
    byte 0.
    '''
    i = 0
    while i < len(models):
        k = i + 1
        while (k < len(models) and
               manhattan_distance(_mean_frequencies(models[i]),
                                  _mean_frequencies(models[k])) < threshold):
            ni = lengthwise_sample_numbers[i]
            nk = lengthwise_sample_numbers[k]
            models[i], models[k] = merge_update(models[i], models[k], ni, nk)
            k = k + 1
        # k is always > i, so the loop terminates
        i = k
    return models

project 5/PAYL/dns_artificial_profile.pcap

project 5/PAYL/http_artificial_profile.pcap

project 5/PAYL/parameters.txt

project 5/PAYL/read_pcap.py

import dpkt
import dpkt.dns
import socket
import sys
import ast

# DNS record type number -> mnemonic used when rendering payload strings.
_DNS_TYPE_TABLE = {
    1: "A",        # IP v4 address, RFC 1035
    2: "NS",       # Authoritative name server, RFC 1035
    5: "CNAME",    # Canonical name for an alias, RFC 1035
    6: "SOA",      # Marks the start of a zone of authority, RFC 1035
    12: "PTR",     # Domain name pointer, RFC 1035
    13: "HINFO",   # Host information, RFC 1035
    15: "MX",      # Mail exchange, RFC 1035
    28: "AAAA",    # IP v6 address, RFC 3596
    16: "TXT",     #
    33: "SRV",     # RFC 2782
    255: "ANY",    # all cached reco
}

# Training captures read for each protocol by getPayloadStrings().
_TRAINING_FILES = {
    "DNS": ['data/dns.pcap'],
    "HTTP": ['data/HTTPtext_V1.pcap',
             'data/HTTPtext_V2.pcap',
             'data/modified_new3_simple_http.pcap',
             'data/modified_new4_simple_http.pcap',
             'data/modified_new5_simple_http.pcap',
             'data/modified_new6_simple_http.pcap',
             'data/modified_new_simple_http.pcap'],
}


def hexify(x):
    '''Return the two-digit lowercase hex code of every character of x, concatenated.'''
    # In case the strings from DNS resolver contain non-ASCII characters
    return "".join([hex(ord(c))[2:].zfill(2) for c in x])


def concatBytesToStr(bytes_str):
    '''Map every byte value of bytes_str to the character with that code point.'''
    return "".join(map(chr, bytes_str))


def decode_dns_response(rr, response_type):
    '''
    Render one DNS resource record as "\\<TYPE>\\<data>".

    rr            : resource record parsed by dpkt
    response_type : section label ("AN", "NS" or "AR"); currently unused

    Raises KeyError for record types missing from the type table; readPcap
    treats any exception as "skip this packet".

    source: https://github.com/jeffsilverm/dpkt_doc/blob/master/decode_dns.py
    '''
    r_type = rr.type
    r_data = rr.rdata

    # Raw record data that is not valid UTF-8 is emitted byte-for-byte.
    if sys.version_info[0] >= 3:
        try:
            r_data.decode('utf-8')
        except UnicodeDecodeError:
            return "\\" + _DNS_TYPE_TABLE[r_type] + "\\" + concatBytesToStr(r_data)
    else:
        try:
            r_data.encode('utf-8')
        except UnicodeDecodeError:
            return "\\" + _DNS_TYPE_TABLE[r_type] + "\\" + r_data

    if r_type == dpkt.dns.DNS_CNAME:
        rr_string = rr.cname
    elif r_type == dpkt.dns.DNS_A:
        # `socket` used to be missing from the imports, so this branch raised
        # NameError and the whole packet was silently dropped by readPcap.
        rr_string = socket.inet_ntoa(r_data)
    elif r_type == dpkt.dns.DNS_NS:
        rr_string = rr.nsname
    elif r_type == dpkt.dns.DNS_AAAA:
        rr_string = socket.inet_ntop(socket.AF_INET6, r_data)
    elif r_type == dpkt.dns.DNS_PTR:
        rr_string = rr.ptrname
    elif r_type == dpkt.dns.DNS_SOA:
        # numeric fields are converted explicitly; str + int raises TypeError
        rr_string = ",".join([rr.mname, rr.rname, str(rr.serial), str(rr.refresh),
                              str(rr.retry), str(rr.expire), str(rr.minimum)])
    elif r_type == dpkt.dns.DNS_MX:
        rr_string = rr.mxname + "," + str(rr.preference)
    elif r_type == dpkt.dns.DNS_HINFO:
        # NOTE(review): dpkt appears to expose TXT/HINFO data as `rr.text`, not
        # `rr.rtext` - confirm against the pinned dpkt version before changing.
        rr_string = rr.rtext
    elif r_type == dpkt.dns.DNS_TXT:
        rr_string = rr.rtext  # NOTE(review): see the HINFO branch
    elif r_type == dpkt.dns.DNS_SRV:
        rr_string = ",".join([rr.srvname, str(rr.port), str(rr.priority), str(rr.weight)])
    else:
        rr_string = "Unknown"

    return "\\" + _DNS_TYPE_TABLE[r_type] + "\\" + rr_string


def _decode_section(records, section, report_unknown=False):
    '''Return the rendered strings of every record of one DNS message section.'''
    decoded = []
    for rr in records:
        rr_string = decode_dns_response(rr, section)
        # NOTE(review): decode_dns_response always prefixes "\\<TYPE>\\", so this
        # comparison never matches; kept to preserve the original control flow.
        if rr_string == "Unknown":
            if report_unknown:
                print("DNS data unknown")
            continue
        decoded.append(str(rr_string))
    return decoded


def _dns_header_string(dns_payload):
    '''Return id\\qr\\opcode\\rcode\\#answers\\#authority\\#additional for a DNS message.'''
    return "\\".join([str(dns_payload.id), str(dns_payload.qr), str(dns_payload.opcode),
                      str(dns_payload.rcode), str(len(dns_payload.an)),
                      str(len(dns_payload.ns)), str(len(dns_payload.ar))])


def _raw_payload_string(data):
    '''Return transport payload bytes as a string (one character per byte on Python 3).'''
    if sys.version_info[0] >= 3:
        data = concatBytesToStr(data)
    return str(data)


def readPcap(fileName, mode):
    '''
    Read a pcap file and return the list of payload strings found in it.

    fileName : path of the capture
    mode     : "training" or "testing"; in "testing" mode packets that match
               none of the known ports are kept as raw payloads

    Per packet:
      * source port 53      -> rendered DNS response (skipped when no record was rendered)
      * destination port 53 -> rendered DNS query
      * ports 1924 -> 1957  -> raw payload
      * port 80 either side -> raw payload
    Packets that cannot be parsed are skipped (best effort).
    '''
    payload_list = []
    with open(fileName, "rb") as f:
        pcap = dpkt.pcap.Reader(f)
        for ts, buf in pcap:
            try:
                eth = dpkt.ethernet.Ethernet(buf)
                ip = eth.data
                proto_data = ip.data

                if proto_data.sport == 53:
                    dns_payload = dpkt.dns.DNS(proto_data.data)
                    records = (_decode_section(dns_payload.an, "AN", report_unknown=True)
                               + _decode_section(dns_payload.ns, "NS")
                               + _decode_section(dns_payload.ar, "AR"))
                    if records:
                        payload_list.append(_dns_header_string(dns_payload) + "\\" + ",".join(records))
                elif proto_data.dport == 53:
                    dns_payload = dpkt.dns.DNS(proto_data.data)
                    question = dns_payload.qd[0]
                    payload_list.append(_dns_header_string(dns_payload) + "\\" + question.name
                                        + "\\" + str(question.type)
                                        + "\\" + _DNS_TYPE_TABLE[question.type])
                elif proto_data.sport == 1924 and proto_data.dport == 1957:
                    payload_list.append(_raw_payload_string(proto_data.data))
                elif proto_data.sport == 80 or proto_data.dport == 80:
                    payload_list.append(_raw_payload_string(proto_data.data))
                elif mode == "testing":
                    # The original read an undefined name (`payload_str`) here, so
                    # the NameError made this branch a silent no-op. The transport
                    # payload is what the sibling branches use.
                    payload_list.append(_raw_payload_string(proto_data.data))
            except Exception:
                # Best effort: unparsable or unsupported packets are skipped.
                # (Exception instead of a bare except so Ctrl-C still works.)
                continue
    return payload_list


def getPayloadStrings(training_protocol):
    '''
    Return the payload strings of every training capture for the protocol.

    training_protocol : "DNS" or "HTTP"; any other value yields an empty list.
    '''
    payload_list = []
    for capture in _TRAINING_FILES.get(training_protocol, []):
        payload_list.extend(readPcap(capture, "training"))
    return payload_list


def read_attack_data(filename):
    '''
    Read the output of the polymorphic blend code (a file that does not end in
    .pcap) and return it as a one-element list holding the UTF-8 decoded text.
    '''
    with open(filename, 'rb') as attack_file:
        return [attack_file.read().decode("utf8")]

project 5/PAYL/requirements.txt

numpy==1.19.0
scipy==1.5.0
dpkt==1.9.2
matplotlib==3.2.2

project 5/PAYL/SETUP.txt

PREREQUISITES
-------------
The commands in the following instructions are for Linux only. If you use another OS, please look up the equivalent commands online.

- Python version 3
  - You need Python 3 for this project. We have tested it on 3.6.9, but it should work on other Python 3 versions too.
  - How to check your Python 3 version? `python3 --version`
- Python modules required (refer to `requirements.txt`)
  You can first create a virtual environment by using the following commands:
  - `python3 -m venv /path/to/new/virtual/environment`
    e.g. to create a virtual environment in the working directory, use `python3 -m venv new_env`
  - Start the virtual environment: `source new_env/bin/activate`
  - Exit the virtual environment: `deactivate`
  - Resource: <https://docs.python.org/3/tutorial/venv.html>
  - How to install a module? `pip install <module name>`
  - To install all of them in one go: `pip install -r requirements.txt`
  - How to check a module version? `pip freeze | grep <module name>`

HOW TO RUN THE CODE
-------------------
- The code is in the PAYL/ directory.
- To run the code as given: `python3 wrapper.py`
- Once you run it, it will take some time to generate the model. Based on the values passed, it will compute the false negative and true positive rates. Before modifying the code, run the original code to see how it works and to check that your setup is correct.
- Now read the project description to understand what the project task is, then modify and run the code.
- *Optional*: Set `verbose = True` (in wrapper.py) to get the graphs for each model. This will not be graded but can help you understand the generated models better.

CODE COMPONENTS
---------------
- `wrapper.py`
  The entry point for the project code. It calls all other functions and modules of the project.
- `analysis.py`
  Trains the model and tests it.
- `distance_and_clustering.py`
  Computes the different distances and clusters the models.
- `read_pcap.py`
  Parses the pcap data and prepares it for the wrapper to use.

DATA
----
We have 10 pcap files in the PAYL/data directory. We have already written a script that reads the files and gives the data to the model.

All the best! Contact us or post on Piazza if you run into issues.

project 5/PAYL/wrapper.py

'''
Wrapper script for CS 6262 Project 5
run as : python3 wrapper.py for training mode
'''
import read_pcap as dpr
import random as rn
import sys
import analysis

#Choose the protocol:
training_protocol = "HTTP"
#OR
#training_protocol = "DNS"

split_ratio = 0.75

# Payload lengths that the HTTP training set must contain (smallest and
# largest packet sizes).
_REQUIRED_HTTP_LENGTHS = (0, 1460)
# Payload length that is moved from the test set into the training set.
_PREFERRED_TRAINING_LENGTH = 705


def partition(payloads):
    '''
    Shuffle payloads IN PLACE and split them into (training, test).

    The first int(len(payloads) * split_ratio) + 1 payloads become the
    training set, the remainder the test set.
    '''
    # shuffle the data to randomly pick samples
    rn.shuffle(payloads)
    split_index = int(len(payloads) * split_ratio)
    training = payloads[0 : split_index + 1]
    test = payloads[split_index + 1 :len(payloads)]
    return training, test


def _has_required_lengths(samples):
    '''Return True when samples holds a payload of every required HTTP length.'''
    lengths = {len(x) for x in samples}
    return all(required in lengths for required in _REQUIRED_HTTP_LENGTHS)


def _move_preferred_length_to_training(training, test):
    '''
    Swap every test payload of the preferred length with a training payload
    whose length is neither required nor preferred. Both lists are edited in
    place; a test payload stays where it is when no swap partner exists.
    '''
    protected = _REQUIRED_HTTP_LENGTHS + (_PREFERRED_TRAINING_LENGTH,)
    for j in range(0, len(test)):
        if len(test[j]) != _PREFERRED_TRAINING_LENGTH:
            continue
        for i in range(0, len(training)):
            if len(training[i]) not in protected:
                training[i], test[j] = test[j], training[i]
                break


if __name__=='__main__':
    print(("Working with protocol: " + training_protocol + " : in training data."))
    attack_file = None

    # check which mode the program is being run in
    len_of_args = len(sys.argv)
    if(len_of_args == 1):
        print('\n\tAttack data not provided, training and testing model based on pcap files in \'data/\' folder alone.')
        print('\tTo provide attack data, run the code as: python wrapper.py <attack-data-file-name>')
    else:
        print(('\n\tAttack data provided, as command line argument \''+sys.argv[1]+'\''))
        attack_file = sys.argv[1]
    print('---------------------------------------------')

    payloads = dpr.getPayloadStrings(training_protocol)
    training, test = partition(payloads)

    # For HTTP training data, may need to ensure it includes large packet e.g.,
    # In the PAYL paper, 418, 730, 1460 are chosen
    # We need both the min and the max length samples in the training data set for HTTP
    if training_protocol == "HTTP":
        if _has_required_lengths(payloads):
            # Re-draw until the partition that is actually USED contains the
            # required lengths. The previous loop validated one partition and
            # then unconditionally replaced it with a fresh, unchecked one.
            while not _has_required_lengths(training):
                training, test = partition(payloads)
        else:
            # The old loop would never terminate in this situation.
            print('\tWarning: the HTTP payloads do not contain samples of length 0 and 1460; using the partition as is.')
        _move_preferred_length_to_training(training, test)

    # Simple sanity check
    if len(payloads) != len(test)+len(training) or split_ratio >= 1.0:
        sys.exit()

    # To better understand the behaviour of the model with different
    # parameters, we typically let the parameters iterate over a range.
    # The mahalanobis threshold iterates over
    #   range(threshold_for_mahalanobis_lower, threshold_for_mahalanobis_upper+1, 50).
    # The smoothing factor is configured in TENTHS: the loop variable iterates
    # over range(smoothing_factor_lower, smoothing_factor_upper+1) and is
    # divided by 10.0, i.e. 0.3, 0.4, ..., 1.0 with the values below.
    # For each such combination of mahalanobis threshold and smoothing factor,
    # the model is generated with these parameters.

    # Configure the parameters.
    smoothing_factor_lower = 3
    smoothing_factor_upper = 10
    threshold_for_mahalanobis_lower = 20
    threshold_for_mahalanobis_upper = 9000

    for smoothing_factor in range(smoothing_factor_lower, smoothing_factor_upper+1):
        for mahabs in range(threshold_for_mahalanobis_lower, threshold_for_mahalanobis_upper+1, 50):
            print(('Smoothing Factor: '+str(smoothing_factor/10.0)))
            print(('Threshold for Mahalanobis Distance: '+str(mahabs)))
            analysis.train_and_test(training, test, attack_file, smoothing_factor/10.0, mahabs, verbose = "False")
            print('---------------------------------------------')

project 5/Polymorphic_blend/frequency.py

#!/usr/bin/env python3
import struct
from collections import Counter


def sorting(dictFrequency):
    """Return the (symbol, frequency) pairs ordered from most to least frequent.

    Pairs with equal frequency keep the order they have in dictFrequency.
    """
    pairs = list(dictFrequency.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


def frequency(payload):
    """Return a dict mapping every symbol of payload to its relative frequency.

    Each frequency is the symbol count divided by the payload length, rounded
    to three decimals. An empty payload yields an empty dict.
    """
    counts = Counter(payload)
    total = float(sum(counts.values()))
    return {symbol: round(occurrences / total, 3)
            for symbol, occurrences in counts.items()}

project 5/Polymorphic_blend/http_artificial_profile.pcap

project 5/Polymorphic_blend/Makefile

# Link the decryptor stub and the embedded payload into a 32-bit executable.
a.out: shellcode.o payload.o
	gcc -g3 -m32 shellcode.o payload.o -o a.out

# Assemble the decryptor stub as 32-bit code with debug information.
shellcode.o: shellcode.S
	gcc -g3 -c shellcode.S -m32 -o shellcode.o

# Wrap the raw payload.bin in an i386 ELF object so it can be linked;
# shellcode.S refers to it through the _binary_payload_bin_start symbol.
payload.o: payload.bin
	objcopy -I binary -O elf32-i386 -B i386 payload.bin payload.o

project 5/Polymorphic_blend/padding.py

#!/usr/bin/env python3 import struct import math import random from frequency import * from collections import Counter def padding(artificial_payload, raw_payload): padding = "" # Get frequency of raw_payload and artificial profile payload artificial_frequency = frequency(artificial_payload) raw_payload_frequency = frequency(raw_payload) # To simplify padding, you only need to find the maximum frequency difference for each # byte in raw_payload and artificial_payload, and pad that byte at the end of the # raw_payload. # Note: only consider the differences when artificial profile has higher frequency. # Depending upon the difference, call raw_payload.append # Your code here ...

project 5/Polymorphic_blend/shellcode.bin

project 5/Polymorphic_blend/shellcode.S

/*
 * Decryptor stub: XORs 0x20 32-bit words of the embedded payload with the
 * 0x20 words that follow them at offset 0x80, stores the result on the
 * stack, prints the 0x80 decoded bytes and exits.
 */
.extern _binary_payload_bin_start
.globl main
main:
    jmp call
start:
    /*popl %ebx get address of substituted attack body and xor table */
    movl $_binary_payload_bin_start, %ebx
    /* copy the address to ecx */
    movl %ebx, %ecx
    addl $0x80, %ecx            /* ecx = payload start + 0x80 */
    movl $0x0,%edx              /* edx = word counter */
    movl $0x0,%eax
    subl $0x80,%esp             /* reserve 0x80 bytes of stack for the result */
loop:
    cmpl $0x20,%edx
    jge run                     /* if (edx >= 0x20) goto run: all words done */
    movl (%ecx),%eax            /* load the word at ecx */
    xor (%ebx),%eax             /* XOR it with the word at ebx */
    movl %eax,(%esp)            /* store the decoded word */
    add $0x4,%ebx               /* move pointer to adjusted attack body */
    add $0x4,%ecx               /* move pointer to xor table */
    add $0x4,%esp               /* move pointer to result */
    add $0x1,%edx
    jmp loop
run:
    /* print out decrypted code */
    leal -0x80(%esp),%ecx       /* ecx = start of the 0x80 decoded bytes */
    movl $0x80,%edx
    movl $0x1,%ebx
    xorl %eax,%eax
    movl $0x4,%eax
    int $0x80                   /* presumably Linux i386 write(1, ecx, 0x80) - TODO confirm */
    /*quit*/
    xorl %eax,%eax
    inc %al
    int $0x80                   /* presumably Linux i386 exit; ebx still holds 1 - TODO confirm */
call:
    call start
    /* NOTE(review): the directive below has no operand in this listing; it may have been truncated */
    .string

project 5/Polymorphic_blend/substitution.py

#!/usr/bin/env python3
import struct
import math
import dpkt
import socket
from collections import Counter
from frequency import *


def substitute(attack_payload, substitution_table):
    """Encrypt attack_payload with substitution_table (assignment stub).

    Returns (xor_table, result); both lists are empty until the stub is
    implemented.
    """
    # Using the substitution table you generated to encrypt attack payload
    # Note that you also need to generate a xor_table which will be used to decrypt
    # the attack_payload
    # i.e. (encrypted attack payload) XOR (xor_table) = (original attack payload)
    b_attack_payload = bytearray(attack_payload, "utf8")
    result = []
    xor_table = []
    # Based on your implementattion of substitution table, please prepare result
    # and xor_table as output

    return (xor_table, result)


def getSubstitutionTable(artificial_payload, attack_payload):
    """Build the substitution table for the attack body (assignment stub).

    NOTE: `substitution_table` is not defined until the stub is implemented,
    so calling this function as shipped raises NameError.
    """
    # You will need to generate a substitution table which can be used to encrypt the attack
    # body by replacing the most frequent byte in attack body by the most frequent byte in
    # artificial profile one by one

    # Note that the frequency for each byte is provided below in dictionay format.
    # Please check frequency.py for more details
    artificial_frequency = frequency(artificial_payload)
    attack_frequency = frequency(attack_payload)

    sorted_artificial_frequency = sorting(artificial_frequency)
    sorted_attack_frequency = sorting(attack_frequency)

    # Your code here ...

    # Make sure your substitution table can be used in
    # substitute(attack_payload, subsitution_table)
    print(substitution_table)
    return substitution_table


def getAttackBodyPayload(path):
    """Return the first non-empty payload sent to 192.150.11.111 in the pcap.

    Trailing whitespace bytes are stripped from the payload. Returns None when
    the capture holds no such packet.
    """
    with open(path, 'rb') as f:
        pcap = dpkt.pcap.Reader(f)
        for ts, buf in pcap:
            eth = dpkt.ethernet.Ethernet(buf)
            ip = eth.data
            if not isinstance(ip, dpkt.ip.IP):
                # ARP, IPv6, ...: no IPv4 destination to compare against
                continue
            if socket.inet_ntoa(ip.dst) == "192.150.11.111":
                tcp = ip.data
                # The payload is bytes, so the old comparison with "" was always
                # False and empty segments (e.g. a bare SYN/ACK) were returned.
                if not tcp.data:
                    continue
                return tcp.data.rstrip()
    return None


def getArtificialPayload(path):
    """Return the payload of the first non-empty packet with source port 80.

    Returns None when the capture holds no such packet.
    """
    with open(path, 'rb') as f:
        pcap = dpkt.pcap.Reader(f)
        for ts, buf in pcap:
            eth = dpkt.ethernet.Ethernet(buf)
            ip = eth.data
            if not isinstance(ip, dpkt.ip.IP):
                continue
            tcp = ip.data
            # getattr: transport payloads without ports (e.g. ICMP) are skipped
            if getattr(tcp, 'sport', None) == 80 and len(tcp.data) > 0:
                return tcp.data
    return None

project 5/Polymorphic_blend/task1.py

#!/usr/bin/env python3 import struct from collections import Counter from substitution import * from padding import * ARTIFICIAL_PATH = "http_artificial_profile.pcap" ATTACKBODY_PATH = "YOUR_GTUSERNAME.pcap" # replace the file name by the one you downloaded if __name__ == '__main__': attack_payload_bytes = getAttackBodyPayload(ATTACKBODY_PATH) artificial_payload_bytes = getArtificialPayload(ARTIFICIAL_PATH) artificial_payload = artificial_payload_bytes.decode("utf8") attack_payload = attack_payload_bytes.decode("utf8") # Generate substitution table based on byte frequency in file substitution_table = getSubstitutionTable(artificial_payload, attack_payload) # Substitution table will be used to encrypt attack body and generate corresponding # xor_table which will be used to decrypt the attack body (xor_table, adjusted_attack_body) = substitute(attack_payload, substitution_table) # For xor operation, should be a multiple of 4 while len(xor_table) < 128: # CHECK: 128 can be some other number (greater than and multiple of 4) # per your attack trace length xor_table.append(chr(0)) # For xor operation, should be a multiple of 4 while len(adjusted_attack_body) < 128: # CHECK: 128 can be some other number (greater than and multiple of 4) per # your attack trace length adjusted_attack_body.append(chr(0)) # Read in decryptor binary to append at the start of payload # Prepare byte list for payload with open("shellcode.bin", mode='rb') as file: shellcode_content = file.read() b_list = [] for b in shellcode_content: b_list.append(chr(b)) # Raw payload will be constructed by encrypted attack body and xor_table raw_payload = b_list + adjusted_attack_body + xor_table while len(raw_payload) < len(artificial_payload): padding(artificial_payload, raw_payload) # Write prepared payload to Output file with open("output", "wb") as result_file: result_file.write(bytearray("".join(raw_payload), "utf8")) # Write code here to generate payload.bin! 
# with open("payload.bin", "wb") as payload_file: # payload_file.write(bytearray("".join(adjusted_attack_body + xor_table), "utf8"))