From 45a9e74c146c6fe444a34b1c6b0e0d65291c5f83 Mon Sep 17 00:00:00 2001 From: Felix Steghofer Date: Tue, 12 Dec 2017 18:18:38 +0100 Subject: [PATCH] performance, continue analysis where left off (on file basis) --- src/DoresA/classify.py | 31 ++++++++- src/DoresA/config.py | 14 +++- src/DoresA/csv_tools.py | 10 ++- src/DoresA/db_redis.py | 67 ++++++++++++------ src/DoresA/ip.py | 33 +++++---- src/DoresA/train.py | 148 +++++++++++++++++++++++++++++----------- 6 files changed, 221 insertions(+), 82 deletions(-) diff --git a/src/DoresA/classify.py b/src/DoresA/classify.py index ecbaf18..16e3dca 100644 --- a/src/DoresA/classify.py +++ b/src/DoresA/classify.py @@ -1,15 +1,25 @@ +import pickle +import os.path -def load_whitelist(): +def generate_whitelist(): filename = 'res/benign_domains.txt' whitelist = [] for item in open(filename).read().splitlines(): if item not in whitelist: whitelist.append(item) + whitelist_pkl = open('whitelist.pkl', 'wb') + pickle.dump(whitelist, whitelist_pkl) + whitelist_pkl.close() + + +def load_whitelist(): + whitelist_pkl = open('whitelist.pkl', 'rb') + whitelist = pickle.load(whitelist_pkl) return whitelist -def load_blacklist(): +def generate_blacklist(): filename = 'res/malicious_domains.txt' blacklist = [] for item in open(filename).read().splitlines(): @@ -17,6 +27,14 @@ def load_blacklist(): # do not add to black (as EXPOSURE is handling) if item not in blacklist and item not in whitelist: blacklist.append(item) + blacklist_pkl = open('blacklist.pkl', 'wb') + pickle.dump(blacklist, blacklist_pkl) + blacklist_pkl.close() + + +def load_blacklist(): + blacklist_pkl = open('blacklist.pkl', 'rb') + blacklist = pickle.load(blacklist_pkl) return blacklist @@ -25,7 +43,7 @@ def is_malicious(domain): def test(): - print('blacklist length: ' + str(len(blacklist))) + # print('blacklist length: ' + str(len(blacklist))) # dupes = [x for n, x in enumerate(whitelist) if x in whitelist[:n]] # print(dupes) @@ -37,7 +55,14 @@ def test(): pass +if not os.path.isfile('whitelist.pkl'): + generate_whitelist() + whitelist = load_whitelist() + + +if not os.path.isfile('blacklist.pkl'): + generate_blacklist() blacklist = load_blacklist() if __name__ == "__main__": diff --git a/src/DoresA/config.py b/src/DoresA/config.py index 450bde7..ccf4d3f 100644 --- a/src/DoresA/config.py +++ b/src/DoresA/config.py @@ -7,9 +7,19 @@ train_end = datetime.date(2017, 9, 7) analysis_start_date = datetime.date(2017, 9, 1) analysis_days_amount = 7 -#pdns_logs_path = '/home/felix/pdns/' -pdns_logs_path = '/mnt/old/2017' # e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09'] analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime(format_date) for x in range(analysis_days_amount)] + +serialized_path = 'serialized/' +#pdns_logs_path = '/home/felix/pdns/' +pdns_logs_path = '/mnt/old/2017/' + +# 32 cors on janus +num_cores = 32 + +gz = False +multiprocessed = True + + diff --git a/src/DoresA/csv_tools.py b/src/DoresA/csv_tools.py index 6d76a90..31eefe7 100644 --- a/src/DoresA/csv_tools.py +++ b/src/DoresA/csv_tools.py @@ -44,8 +44,6 @@ def serialize_logs_to_db(): # for log_file in ['data/pdns_capture.pc -# TODOap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']: - for day in range(analysis_days_amount): log_files_hour = get_log_files_for_hours_of_day(analysis_days[day]) # everything[day] = {} @@ -55,7 +53,7 @@ def serialize_logs_to_db(): for hour in progress_bar(range(24)): progress_bar.next() for hour_files in log_files_hour[hour]: - with gzip.open(hour_files, 'rt', newline='') as file: + with open(hour_files, 'rt') as file: reader = csv.reader(file) all_rows = list(reader) @@ -91,7 +89,7 @@ def batch(iterable, n=1): # raise Exception('Log files inconsistency') -def get_log_files_for_range_of_day(date, minutes_range, gz=True): +def get_log_files_for_range_of_day(date, minutes_range, gz=False): slot_files = {} slots_amount = int(1440 / minutes_range) @@ -103,7 +101,7 @@ def get_log_files_for_range_of_day(date, minutes_range, gz=True): slot_files[slot] = 'data/*' + date + '_' + time_range + '*.csv' + ('.gz' if gz else '') -def get_log_files_for_hours_of_day(date, gz=True): +def get_log_files_for_hours_of_day(date, gz=False): slot_files = {} slots_amount = 24 @@ -113,7 +111,7 @@ def get_log_files_for_hours_of_day(date, gz=True): return slot_files -def get_log_files_for_day(date, gz=True): +def get_log_files_for_day(date, gz=False): log_files = 'data/*' + date + '*.csv.gz' + ('.gz' if gz else '') return glob.glob(log_files) diff --git a/src/DoresA/db_redis.py b/src/DoresA/db_redis.py index 1c52543..030995b 100644 --- a/src/DoresA/db_redis.py +++ b/src/DoresA/db_redis.py @@ -11,14 +11,34 @@ logger.setLevel(logging.INFO) logger.debug('connecting redis') redis_host = 'localhost' -# TODO name ports properly -redis_start_port_first_seen = 2337 -redis_start_port_last_seen = 2340 -redis_port_reverse = 2343 -redis_port_4 = 2344 -redis_port_ttl = 2345 bucket_mod = 1048576 # = 2 ** 20 +# redis_start_port_first_seen = 2337 +# redis_start_port_last_seen = 2340 +# redis_port_reverse = 2343 +# redis_port_4 = 2344 +# redis_port_ttl = 2345 +# redis_f = Redis(unix_socket_path=base + 'redis_local_f.sock') +# redis_f1 = Redis(redis_host, port=2338) +# redis_f2 = Redis(redis_host, port=2339) +# redis_l = Redis(redis_host, port=2340) +# redis_l1 = Redis(redis_host, port=2341) +# redis_l2 = Redis(redis_host, port=2342) +# redis_r = Redis(redis_host, port=2343) +# redis_v = Redis(redis_host, port=2344) +# redis_t = Redis(redis_host, port=2345) + +base = '/home/tek/felix/redis/redis/' +redis_f = Redis(unix_socket_path=base + 'redis_local_f.sock') +redis_f1 = Redis(unix_socket_path=base + 'redis_local_f2.sock') +redis_f2 = Redis(unix_socket_path=base + 'redis_local_f3.sock') +redis_l = Redis(unix_socket_path=base + 'redis_local_l.sock') +redis_l1 = Redis(unix_socket_path=base + 'redis_local_l2.sock') +redis_l2 = Redis(unix_socket_path=base + 'redis_local_l3.sock') +redis_r = Redis(unix_socket_path=base + 'redis_local_r.sock') +redis_v = Redis(unix_socket_path=base + 'redis_local_v.sock') +redis_t = Redis(unix_socket_path=base + 'redis_local_t.sock') + def _get_redis_shard(rrname): bucket = crc32(rrname.encode('utf-8')) % bucket_mod # convert string to byte array @@ -29,11 +49,15 @@ def _get_redis_shard(rrname): def get_stats_for_domain(rrname, rrtype='A'): bucket, shard = _get_redis_shard(rrname) + local_redis_f = redis_f + local_redis_l = redis_l - redis_f = Redis(redis_host, port=redis_start_port_first_seen + shard) - redis_l = Redis(redis_host, port=redis_start_port_last_seen + shard) - redis_r = Redis(redis_host, port=redis_port_reverse) - redis_t = Redis(redis_host, port=redis_port_ttl) + if shard == 1: + local_redis_f = redis_f1 + local_redis_l = redis_l1 + elif shard == 2: + local_redis_f = redis_f2 + local_redis_l = redis_l2 ttls_b = redis_t.lrange('t:{}:{}'.format(rrname, rrtype), 0, -1) @@ -55,8 +79,8 @@ def get_stats_for_domain(rrname, rrtype='A'): logger.debug('res: ' + str(result)) logger.debug('id: ' + str('f' + str(bucket))) - t_f = float(unpack('