import logging
import datetime

# logfile = 'analysis_' + datetime.datetime.now().strftime('%Y-%m-%d_%H:%M') + '.log'
# https://stackoverflow.com/questions/1943747/python-logging-before-you-run-logging-basicconfig
# logging.basicConfig(filename=logfile, filemode='w')
# important to set basicConfig only once for all modules
logging.basicConfig()

import gzip
import csv
import time

import numpy as np
import graphviz
import progressbar
from sklearn.datasets import load_iris
from sklearn import tree

import db_redis
import domain
import ip
import ttl
import csv_tools
# import db_sql as db  # MySQL backend behind the db.* calls; needed by get_logs_from_db(), prepare_features_mysql() and cleanup()

logger = logging.getLogger('train')
logger.setLevel(logging.DEBUG)

db_format_time = '%Y-%m-%d %H:%M:%S'
train_start = datetime.date(2017, 5, 1)
train_end = datetime.date(2017, 5, 4)
id_upto = 379283817
# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))

# record types that should be analysed (e.g. only A)
record_types = ['A']


def train():
    for day in range(csv_tools.analysis_days_amount):
        log_files_hour = csv_tools.get_log_files_for_hours_of_day(csv_tools.analysis_days[day])
        progress_bar = progressbar.ProgressBar()
        for hour in progress_bar(range(24)):
            for hour_file in log_files_hour[hour]:
                with gzip.open(hour_file, 'rt', newline='') as file:
                    reader = csv.reader(file)
                    for row in reader:
                        if row[2] in record_types:
                            entity = {'timestamp': row[0], 'domain': row[1], 'type': row[2],
                                      'record': row[3], 'ttl': row[4]}
                            try:
                                prepare_features_redis(entity)
                            except Exception as e:
                                logger.error(e)
                                logger.error('Exception occurred processing entity: ' + str(entity))


def get_logs_from_db():
    # requires the MySQL backend ('import db_sql as db' above)
    results = db.mariadb_get_logs(id_upto)
    row = results.fetch_row(how=1)
    logger.debug("# entity: " + row[0]['domain'])
    features = prepare_features_redis(row[0])
    logger.debug(str(features))
    # while row:
    #     logger.debug("# entity: " + row[0]['domain'])
    #     features = prepare_features(row[0])
    #     logger.debug(str(features))
    #     row = results.fetch_row(how=1)


def prepare_features_redis(entity):
    checkpoint = time.time()
    domain_stats = db_redis.get_stats_for_domain(entity['domain'])
    ip_stats = db_redis.get_stats_for_ip(entity['record'])
    logger.debug('redis took ' + str(time.time() - checkpoint) + ' s')
    logger.debug(domain_stats)
    if len(domain_stats) != 1:
        logger.debug('no stats in redis for entity: ' + str(entity))
    # an empty result raises IndexError here; the caller in train() catches and logs it
    domain_stats = domain_stats[0]

    # TODO fill from domain_stats
    ips = []

    # feature 5: Number of distinct IP addresses
    distinct_ips = len(set(ips))
    # feature 6: Number of distinct countries
    distinct_countries = len(set(ip.get_country_by_ip(ip_str) for ip_str in ips))
    # feature 7: Number of (distinct) domains sharing the IP
    distinct_domains_with_same_ip = len(ip_stats)

    # feature 8: Reverse DNS query results, 5 atomic features
    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
    ratio_ips_nx = 0
    # atomic 2: ratio of IPs that are used for DSL lines
    ratio_ips_dsl = 0
    # atomic 3: ratio of IPs that belong to hosting services
    ratio_ips_hoster = 0
    # atomic 4: ratio of IPs that belong to known ISPs
    ratio_ips_isp = 0
    # atomic 5: ratio of IPs that can be matched with a valid domain name
    ratio_ips_valid = 0
    # TODO add atomics to 'all_features'
    reverse_dns_result = 0

    # feature 9: Average TTL
    average_ttl = sum(domain_stats['ttls']) / len(domain_stats['ttls'])
    # feature 10: Standard Deviation of TTL
    standard_deviation = ttl.standard_deviation(domain_stats['ttls'])
    # TODO distinct ttls for std deviation?
    # feature 11: Number of distinct TTL values
    distinct_ttl = len(set(domain_stats['ttls']))
    # feature 12: Number of TTL changes
    ttl_changes = ttl.changes(domain_stats['ttls'])
    # feature 13: Percentage usage of specific TTL ranges
    # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
    # TODO check if 5 individual features make a difference
    specific_ttl_ranges = ttl.specific_range(entity['ttl'])
    # feature 14: % of numerical characters
    numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])
    # feature 15: % of the length of the LMS (longest meaningful substring)
    lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])

    all_features = np.array([
        distinct_ips,
        distinct_countries,
        distinct_domains_with_same_ip,
        reverse_dns_result,
        average_ttl,
        standard_deviation,
        distinct_ttl,
        ttl_changes,
        specific_ttl_ranges,
        numerical_characters_percent,
        lms_percent
    ])
    logger.debug(all_features)
    return all_features
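
# The reverse-DNS atomics for feature 8 are still stubbed to 0 above. Below is
# a minimal sketch of atomic 1 only, assuming "NX" simply means the PTR lookup
# fails; the helper name is hypothetical (not part of the ip/db_redis modules),
# and the DSL/hoster/ISP atomics would additionally need an external data source.
def ratio_ips_without_ptr(ips):
    import socket
    nx = 0
    for ip_str in ips:
        try:
            socket.gethostbyaddr(ip_str)  # raises socket.herror if no PTR record exists
        except (socket.herror, socket.gaierror):
            nx += 1
    return nx / len(ips) if ips else 0.0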

def prepare_features_mysql(entity):
    # requires the MySQL backend ('import db_sql as db' above)
    checkpoint = time.time()
    logger.debug('get logs for domain start')
    # get all logs for the same domain
    # BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
    logger.info('get logs for domain done ' + str(time.time() - checkpoint) + ' s')
    # TODO do this efficiently
    ttls = [log['ttl'] for log in logs_for_domain]
    logger.debug('ttls ' + str(ttls))
    ips = [log['record'] for log in logs_for_domain]  # TODO check if valid ip address
    logger.debug(ips)
    response_timestamps = [log['timestamp'] for log in logs_for_domain]
    logger.debug(response_timestamps)

    # get all logs for the same ip, if the record is a valid ip
    domains_with_same_ip = []
    if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
        checkpoint = time.time()
        logger.info('get logs for ip start')
        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], id_upto)
        logger.info('get logs for ip done ' + str(time.time() - checkpoint) + ' s')
        domains_with_same_ip = [log['domain'] for log in logs_for_ip]

    # feature 1: Short Life, 2 atomic features (TODO)
    short_life = 0
    # feature 2: Daily Similarity (TODO)
    daily_similarity = 0
    # feature 3: Repeating Patterns, 2 atomic features (TODO)
    repeating_patterns = 0
    # feature 4: Access ratio, 2 atomic features (TODO)
    access_ratio = 0
    # feature 5: Number of distinct IP addresses
    distinct_ips = len(set(ips))
    # feature 6: Number of distinct countries
    distinct_countries = len(set(ip.get_country_by_ip(ip_str) for ip_str in set(ips)))
    # feature 7: Number of (distinct) domains sharing the IP
    distinct_domains_with_same_ip = len(set(domains_with_same_ip))
    # feature 8: Reverse DNS query results, 5 atomic features
    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
    # atomic 2: ratio of IPs that are used for DSL lines
    # atomic 3: ratio of IPs that belong to hosting services
    # atomic 4: ratio of IPs that belong to known ISPs
    # atomic 5: ratio of IPs that can be matched with a valid domain name
    # TODO add atomics to 'all_features'
    reverse_dns_result = 0
    # feature 9: Average TTL
    average_ttl = sum(ttls) / len(ttls)
    # feature 10: Standard Deviation of TTL
    standard_deviation = ttl.standard_deviation(ttls)
    # TODO distinct ttls for std deviation?
    # feature 11: Number of distinct TTL values
    distinct_ttl = len(set(ttls))
    # feature 12: Number of TTL changes
    ttl_changes = ttl.changes(ttls)
    # feature 13: Percentage usage of specific TTL ranges
    # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
    # TODO check if 5 individual features make a difference
    specific_ttl_ranges = ttl.specific_range(entity['ttl'])
    # feature 14: % of numerical characters
    numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])
    # feature 15: % of the length of the LMS (longest meaningful substring)
    lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])

    all_features = np.array([
        short_life,
        daily_similarity,
        repeating_patterns,
        access_ratio,
        distinct_ips,
        distinct_countries,
        distinct_domains_with_same_ip,
        reverse_dns_result,
        average_ttl,
        standard_deviation,
        distinct_ttl,
        ttl_changes,
        specific_ttl_ranges,
        numerical_characters_percent,
        lms_percent
    ])
    return all_features
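
# Both feature branches leave "TODO distinct ttls for std deviation?" open. The
# two candidates can be compared directly with numpy (np.std computes the
# population standard deviation; whether ttl.standard_deviation matches it
# exactly is an assumption, and this helper is illustrative only):
def _compare_ttl_std(ttls):
    raw = np.std(ttls)                  # repeated TTL values weight the result
    distinct = np.std(list(set(ttls)))  # every distinct TTL counted once
    logger.debug('std raw=%s distinct=%s', raw, distinct)
    return raw, distinct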

def test():
    start = time.time()
    logger.info('starting training ' + str(start))
    train()
    logger.info('total duration: ' + str(time.time() - start) + ' s')
    cleanup()
    # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))


def flow():
    iris = load_iris()
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(iris.data, iris.target)  # training set, manual classification
    # predict single or multiple sets with clf.predict([[...]])
    # visualize the decision tree classifier
    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render('test', view=True)


def cleanup():
    # db.close() belongs to the MySQL backend; skip it while 'import db_sql as db' stays disabled
    if 'db' in globals():
        db.close()


if __name__ == "__main__":
    test()
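
# flow() above trains the DecisionTreeClassifier on the iris toy data only. A
# sketch of wiring the real feature vectors in, assuming labelled entities are
# available from somewhere (nothing in this module provides labels yet):
#
# def train_classifier(entities, labels):
#     X = np.array([prepare_features_redis(e) for e in entities])
#     clf = tree.DecisionTreeClassifier()
#     return clf.fit(X, labels)  # clf.predict(X_new) then classifies unseen entities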