diff --git a/literature/EXPOSURE_Finding Malicious Domains Using Passive DNS Analysis.pdf b/literature/EXPOSURE_Finding Malicious Domains Using Passive DNS Analysis_2011.pdf
similarity index 100%
rename from literature/EXPOSURE_Finding Malicious Domains Using Passive DNS Analysis.pdf
rename to literature/EXPOSURE_Finding Malicious Domains Using Passive DNS Analysis_2011.pdf
diff --git a/literature/EXPOSURE_Finding Malicious Domains Using Passive DNS Analysis_2014.pdf b/literature/EXPOSURE_Finding Malicious Domains Using Passive DNS Analysis_2014.pdf
new file mode 100644
index 0000000..5766028
Binary files /dev/null and b/literature/EXPOSURE_Finding Malicious Domains Using Passive DNS Analysis_2014.pdf differ
diff --git a/src/DoresA/classify.py b/src/DoresA/classify.py
new file mode 100644
index 0000000..407b3d0
--- /dev/null
+++ b/src/DoresA/classify.py
@@ -0,0 +1,25 @@
+
+
+def load_whitelist():
+    filename = 'res/benign_domains.txt'
+    with open(filename) as whitelist_file:
+        return whitelist_file.read().splitlines()
+
+
+def load_blacklist():
+    filename = 'res/malicious_domains.txt'
+    with open(filename) as blacklist_file:
+        return blacklist_file.read().splitlines()
+
+
+# TODO check if white and blacklists have to be stripped to tld (subdomains)
+# TODO also check if subdomains are located in db
+def test():
+    filter_list = load_whitelist()
+
+    for item in filter_list:
+        print(item.count('.'))
+
+
+if __name__ == "__main__":
+    test()
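A minimal sketch of the first TODO in classify.py above — reducing list entries to their registered domain so that subdomains in the db can be matched. It assumes the tldextract package (not in requirements.txt); strip_to_registered_domain is a suggested helper, not part of this commit.

import tldextract


def strip_to_registered_domain(domains):
    # public-suffix aware: 'mail.example.co.uk' -> 'example.co.uk',
    # more robust than counting '.' as test() does above
    return {tldextract.extract(d).registered_domain for d in domains}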
diff --git a/src/DoresA/db.py b/src/DoresA/db.py
index aa0878a..799d778 100644
--- a/src/DoresA/db.py
+++ b/src/DoresA/db.py
@@ -1,6 +1,7 @@
 import MySQLdb as mariadb
 import time
 import os
+import logging
 
 from pymongo import MongoClient
 
@@ -18,6 +19,10 @@
 sql_pw = '3qfACEZzbXY4b'
 sql_table_name = 'pdns_logs_test'
 
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger('db')
+
+
 if 'MYSQL_HOST' in os.environ:
     sql_host = os.environ['MYSQL_HOST']
 
@@ -63,7 +68,7 @@ def mariadb_insert_logs(csv_entries):
                       csv_entry[2], csv_entry[3], csv_entry[4]]
             inserts_sql += '(%s, %s, %s, %s, %s), '
         except IndexError:
-            print('index error for csv entry: ' + str(csv_entry))
+            logger.error('index error for csv entry: ' + str(csv_entry))
             pass
 
     # remove last comma for sql compatibility
@@ -72,15 +77,15 @@
     try:
        sql_cursor.execute(inserts_sql, values)
     except Exception:
-        print('could not insert entry: ' + str(values))  # TODO proper error handling
+        logger.error('could not insert entry: ' + str(values))  # TODO proper error handling
         pass
 
     sql_connection.commit()
 
 
-def mariadb_get_logs(from_time, to_time):
+def mariadb_get_logs(id_upto, from_time=None, to_time=None):
     # get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE timestamp BETWEEN \'{}\' and \'{}\';'.format(from_time, to_time)
-    get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE id < 379283817;'
+    get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE id < {};'.format(id_upto)
 
     sql_connection.query(get_logs_from_to)
     return sql_connection.use_result()
@@ -94,7 +99,7 @@
 # return sql_connection.use_result()
 
 
-def mariadb_get_logs_for_domain(domain, from_time, to_time):
+def mariadb_get_logs_for_domain(domain, id_upto, from_time=None, to_time=None):
     # we need a second connection for this query as this usually (always) run in parallel to the first query
     sql_connection_tmp = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port)
 
@@ -103,7 +108,7 @@
     # ' WHERE timestamp BETWEEN \'{}\' and \'{}\' '.format(from_time, to_time) + \
     # 'AND domain=\'' + domain + '\';'
     get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
-                       ' WHERE id < 379283817 ' + \
+                       ' WHERE id < {} '.format(id_upto) + \
                        'AND domain=\'' + domain + '\';'
     sql_connection_tmp.query(get_distinct_ttl)
     result = sql_connection_tmp.use_result()
@@ -114,7 +119,7 @@
     return logs_for_domain
 
 
-def mariadb_get_logs_for_ip(ip, from_time, to_time):
+def mariadb_get_logs_for_ip(ip, id_upto, from_time=None, to_time=None):
     # we need a second connection for this query as this usually (always) run in parallel to the first query
     sql_connection_tmp = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port)
     sql_cursor_tmp = sql_connection_tmp.cursor()
@@ -122,7 +127,7 @@
     # ' WHERE timestamp BETWEEN \'{}\' and \'{}\' '.format(from_time, to_time) + \
     # 'AND domain=\'' + str(ip) + '\';'
     get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
-                       ' WHERE id < 379283817 ' + \
+                       ' WHERE id < {} '.format(id_upto) + \
                        'AND domain=\'' + str(ip) + '\';'
 
     sql_connection_tmp.query(get_distinct_ttl)
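The queries above still splice domain and id_upto directly into the SQL string. A minimal sketch of a parameterized variant using the MySQLdb cursor — get_logs_for_domain_safe is a suggested name, not part of this commit; table names cannot be bound as parameters, so sql_table_name stays concatenated.

def get_logs_for_domain_safe(cursor, domain, id_upto):
    # %s placeholders let the driver escape both values, unlike the
    # string concatenation used above
    query = 'SELECT * FROM ' + sql_table_name + ' WHERE id < %s AND domain = %s;'
    cursor.execute(query, (id_upto, domain))
    return cursor.fetchall()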
diff --git a/src/DoresA/dns.py b/src/DoresA/dns.py
index 95deebf..bbea6e7 100644
--- a/src/DoresA/dns.py
+++ b/src/DoresA/dns.py
@@ -3,3 +3,11 @@ import socket
 
 def reverse(ip):
     return socket.gethostbyaddr(ip)
+
+
+def resolve(domain):
+    return socket.gethostbyname(domain)
+
+
+if __name__ == "__main__":
+    exit()
diff --git a/src/DoresA/ip.py b/src/DoresA/ip.py
index 8eaff44..3d1dcb8 100644
--- a/src/DoresA/ip.py
+++ b/src/DoresA/ip.py
@@ -1,4 +1,31 @@
 import re
+import logging
+from geoip2 import database, errors
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger('ip')
+
+
+def get_country_by_ip(ip):
+    with database.Reader('res/GeoLite2-Country_20170905/GeoLite2-Country.mmdb') as reader:
+        try:
+            result = reader.country(ip)
+            return result.country.names['en']
+        except errors.AddressNotFoundError:
+            logger.debug('address not in country database')
+
+
+def get_isp_by_ip(ip):
+    with database.Reader('res/GeoLite2-ASN_20171107/GeoLite2-ASN.mmdb') as reader:
+        try:
+            result = reader.asn(ip)
+            return result.autonomous_system_number
+        except errors.AddressNotFoundError:
+            logger.debug('address not in isp database')
+
+
+def test():
+    print(get_isp_by_ip('178.27.82.37'))
 
 
 # proudly taken from https://stackoverflow.com/questions/319279/how-to-validate-ip-address-in-python
@@ -72,3 +99,7 @@
 $
 """, re.VERBOSE | re.IGNORECASE | re.DOTALL)
     return pattern.match(ip) is not None
+
+
+if __name__ == "__main__":
+    test()
diff --git a/src/DoresA/location.py b/src/DoresA/location.py
deleted file mode 100644
index 7709cab..0000000
--- a/src/DoresA/location.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from geolite2 import geolite2
-
-
-def get_country_by_ip(ip):
-    with geolite2 as gl2:
-        reader = gl2.reader()
-        result = reader.get(ip)
-
-        if result:
-            return result['country']['names']['en']
-
-
-if __name__ == "__main__":
-    exit()
diff --git a/src/DoresA/logs/delete_not_A_or_AAAA.txt b/src/DoresA/logs/delete_not_A_or_AAAA.txt
new file mode 100644
index 0000000..6443b48
--- /dev/null
+++ b/src/DoresA/logs/delete_not_A_or_AAAA.txt
@@ -0,0 +1,9 @@
+
+delete from pdns_logs_test where type != 'A' AND type != 'AAAA';
+ERROR 2006 (HY000): MySQL server has gone away
+No connection. Trying to reconnect...
+Connection id: 3
+Current database: doresa
+
+Query OK, 101235298 rows affected (47 min 38.87 sec)
+
diff --git a/src/DoresA/pip-selfcheck.json b/src/DoresA/pip-selfcheck.json
index dadc5f4..34448d6 100644
--- a/src/DoresA/pip-selfcheck.json
+++ b/src/DoresA/pip-selfcheck.json
@@ -1 +1 @@
-{"last_check":"2017-09-27T12:13:16Z","pypi_version":"9.0.1"}
\ No newline at end of file
+{"last_check":"2017-11-07T13:21:55Z","pypi_version":"9.0.1"}
\ No newline at end of file
diff --git a/src/DoresA/requirements.txt b/src/DoresA/requirements.txt
index 45138c6..5009db5 100644
--- a/src/DoresA/requirements.txt
+++ b/src/DoresA/requirements.txt
@@ -1,11 +1,27 @@
+certifi==2017.11.5
+chardet==3.0.4
+cycler==0.10.0
+geoip2==2.6.0
+graphviz==0.8
+idna==2.6
+javabridge==1.0.15
+matplotlib==2.0.2
 maxminddb==1.3.0
 maxminddb-geolite2==2017.803
 mysqlclient==1.3.12
+nltk==3.2.5
 numpy==1.13.1
+pandas==0.20.3
 progress==1.3
 pyenchant==1.6.11
 pymongo==3.5.1
+pyparsing==2.2.0
 python-dateutil==2.6.1
 python-geoip==1.2
+python-weka-wrapper3==0.1.3
 pytz==2017.2
+requests==2.18.4
+scikit-learn==0.19.0
+scipy==0.19.1
 six==1.10.0
+urllib3==1.22
diff --git a/src/DoresA/res/GeoLite2-ASN_20171107/GeoLite2-ASN.mmdb b/src/DoresA/res/GeoLite2-ASN_20171107/GeoLite2-ASN.mmdb
new file mode 100644
index 0000000..c40cfba
Binary files /dev/null and b/src/DoresA/res/GeoLite2-ASN_20171107/GeoLite2-ASN.mmdb differ
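Since train.py calls ip.get_country_by_ip() once per address and every call reopens the .mmdb file, a shared module-level reader may be worth considering when iterating over many IPs. A sketch under that assumption — get_country_by_ip_cached is a suggested variant, not part of this commit:

from geoip2 import database, errors

_country_reader = database.Reader('res/GeoLite2-Country_20170905/GeoLite2-Country.mmdb')


def get_country_by_ip_cached(ip):
    # one shared Reader instead of one per lookup; close it on shutdown
    try:
        return _country_reader.country(ip).country.names['en']
    except errors.AddressNotFoundError:
        return None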
diff --git a/src/DoresA/time.py b/src/DoresA/time.py
index 4cae2da..68899d4 100644
--- a/src/DoresA/time.py
+++ b/src/DoresA/time.py
@@ -1,5 +1,8 @@
 from detect_cusum import detect_cusum
 import numpy as np
+import datetime
+import math
+# TODO this module shadows the stdlib 'time' module for scripts started in this directory
 
 
 def cusum():
@@ -18,11 +20,40 @@
     return np.var(a)
 
 
+# 'slot_length': amount of seconds for one slot
+# returns a dictionary mapping each slot to the list of timestamps that fall into it
+def split_to_fixed_slots(all_response_timestamps, analysis_start, slot_length):
+    fixed_slots = {}
+
+    for response_timestamp in all_response_timestamps:
+        # .total_seconds() instead of .seconds, which wraps around after one day
+        slot = math.floor((response_timestamp - analysis_start).total_seconds() / slot_length)
+        fixed_slots.setdefault(slot, []).append(response_timestamp)
+    return fixed_slots
+
+
+def is_short_lived(all_response_timestamps, analysis_start, analysis_end):
+    fixed_slots = split_to_fixed_slots(all_response_timestamps, analysis_start, 3600)  # timeslot of length 3600 seconds
+    print(fixed_slots)
+
+
 def test():
     # a = np.array((1, 2, 3))
     # b = np.array((0, 1, 2))
     # print(variance(a))
-    cusum()
+
+    test_timestamps = [
+        datetime.datetime(2017, 5, 1, 2, 5, 5), datetime.datetime(2017, 5, 1, 2, 44, 21), datetime.datetime(2017, 5, 1, 4, 48, 19), datetime.datetime(2017, 5, 1, 4, 9, 40), datetime.datetime(2017, 5, 1, 5, 14, 11), datetime.datetime(2017, 5, 1, 6, 50, 54), datetime.datetime(2017, 5, 1, 8, 52, 52), datetime.datetime(2017, 5, 1, 9, 24, 31), datetime.datetime(2017, 5, 1, 13, 29, 12), datetime.datetime(2017, 5, 1, 12, 56, 56), datetime.datetime(2017, 5, 1, 15, 3, 15), datetime.datetime(2017, 5, 1, 15, 57, 6), datetime.datetime(2017, 5, 1, 17, 58, 44), datetime.datetime(2017, 5, 1, 20, 35, 52), datetime.datetime(2017, 5, 1, 23, 6, 16),
+        datetime.datetime(2017, 5, 2, 0, 26, 11), datetime.datetime(2017, 5, 2, 2, 41, 52), datetime.datetime(2017, 5, 2, 3, 14, 5), datetime.datetime(2017, 5, 2, 4, 43, 52), datetime.datetime(2017, 5, 2, 5, 12, 2), datetime.datetime(2017, 5, 2, 9, 32, 59), datetime.datetime(2017, 5, 2, 11, 35, 18), datetime.datetime(2017, 5, 2, 13, 30, 43), datetime.datetime(2017, 5, 2, 17, 41), datetime.datetime(2017, 5, 2, 16, 58, 55), datetime.datetime(2017, 5, 2, 22, 9, 53), datetime.datetime(2017, 5, 2, 22, 18, 15),
+        datetime.datetime(2017, 5, 3, 2, 22, 1), datetime.datetime(2017, 5, 3, 2, 21, 8), datetime.datetime(2017, 5, 3, 4, 24, 20), datetime.datetime(2017, 5, 3, 6, 53, 43), datetime.datetime(2017, 5, 3, 6, 56, 54), datetime.datetime(2017, 5, 3, 8, 27, 58), datetime.datetime(2017, 5, 3, 11, 0), datetime.datetime(2017, 5, 3, 13, 32, 53), datetime.datetime(2017, 5, 3, 13, 2), datetime.datetime(2017, 5, 3, 13, 3, 44), datetime.datetime(2017, 5, 3, 17, 6), datetime.datetime(2017, 5, 3, 20, 12, 22), datetime.datetime(2017, 5, 3, 20, 39, 54), datetime.datetime(2017, 5, 3, 20, 9, 1), datetime.datetime(2017, 5, 3, 22, 42, 2),
+        datetime.datetime(2017, 5, 4, 0, 21, 32), datetime.datetime(2017, 5, 4, 0, 13, 15), datetime.datetime(2017, 5, 4, 1, 14, 3), datetime.datetime(2017, 5, 4, 1, 24, 33), datetime.datetime(2017, 5, 4, 5, 35, 41), datetime.datetime(2017, 5, 4, 7, 41, 13), datetime.datetime(2017, 5, 4, 9, 44), datetime.datetime(2017, 5, 4, 13, 49, 34), datetime.datetime(2017, 5, 4, 18, 1), datetime.datetime(2017, 5, 4, 19, 57, 25), datetime.datetime(2017, 5, 4, 20, 2, 56), datetime.datetime(2017, 5, 4, 23, 22, 25),
+        datetime.datetime(2017, 5, 5, 0, 42, 21), datetime.datetime(2017, 5, 5, 1, 47, 24), datetime.datetime(2017, 5, 5, 5, 47, 6), datetime.datetime(2017, 5, 5, 9, 51, 1), datetime.datetime(2017, 5, 5, 11, 53, 1), datetime.datetime(2017, 5, 5, 12, 9, 22), datetime.datetime(2017, 5, 5, 13, 2, 33), datetime.datetime(2017, 5, 5, 13, 55, 1), datetime.datetime(2017, 5, 5, 15, 57, 1), datetime.datetime(2017, 5, 5, 16, 44, 29), datetime.datetime(2017, 5, 5, 18, 24, 55), datetime.datetime(2017, 5, 5, 23, 29, 55),
+        datetime.datetime(2017, 5, 6, 1, 6, 2), datetime.datetime(2017, 5, 6, 2, 7, 1), datetime.datetime(2017, 5, 6, 3, 8, 1), datetime.datetime(2017, 5, 6, 5, 44, 4), datetime.datetime(2017, 5, 6, 7, 12, 4), datetime.datetime(2017, 5, 6, 8, 13, 2), datetime.datetime(2017, 5, 6, 8, 13, 9), datetime.datetime(2017, 5, 6, 9, 40, 54), datetime.datetime(2017, 5, 6, 10, 24, 15), datetime.datetime(2017, 5, 6, 12, 43, 56), datetime.datetime(2017, 5, 6, 17, 11, 2), datetime.datetime(2017, 5, 6, 17, 22, 1), datetime.datetime(2017, 5, 6, 19, 50, 54), datetime.datetime(2017, 5, 6, 19, 14, 29), datetime.datetime(2017, 5, 6, 21, 15, 34), datetime.datetime(2017, 5, 6, 23, 19, 34),
+        datetime.datetime(2017, 5, 7, 1, 56, 54), datetime.datetime(2017, 5, 7, 2, 31, 23), datetime.datetime(2017, 5, 7, 3, 58, 54), datetime.datetime(2017, 5, 7, 6, 31, 22), datetime.datetime(2017, 5, 7, 7, 36, 1), datetime.datetime(2017, 5, 7, 8, 33, 51), datetime.datetime(2017, 5, 7, 9, 3, 56), datetime.datetime(2017, 5, 7, 10, 35, 36), datetime.datetime(2017, 5, 7, 12, 41, 1), datetime.datetime(2017, 5, 7, 17, 17, 37), datetime.datetime(2017, 5, 7, 18, 47, 19), datetime.datetime(2017, 5, 7, 19, 20, 37), datetime.datetime(2017, 5, 7, 21, 53, 30), datetime.datetime(2017, 5, 7, 23, 52, 20)]
+
+    train_start = datetime.datetime(2017, 5, 1)
+    train_end = datetime.datetime(2017, 5, 8)
+    is_short_lived(test_timestamps, train_start, train_end)
 
 
 if __name__ == "__main__":
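One possible way is_short_lived() above could use the slots once it returns a value instead of printing — treat a domain as short-lived when its responses fall into only a small fraction of the analysis window. The 0.1 threshold and the helper name are assumptions, not taken from the EXPOSURE paper:

def is_short_lived_sketch(all_response_timestamps, analysis_start, analysis_end, slot_length=3600):
    total_slots = max(1, int((analysis_end - analysis_start).total_seconds() // slot_length))
    occupied_slots = split_to_fixed_slots(all_response_timestamps, analysis_start, slot_length)
    # short-lived = active in less than 10% of the available slots (assumed cutoff)
    return len(occupied_slots) / total_slots < 0.1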
diff --git a/src/DoresA/train.py b/src/DoresA/train.py
index 23cdc32..2e39a9c 100644
--- a/src/DoresA/train.py
+++ b/src/DoresA/train.py
@@ -4,54 +4,80 @@
 from sklearn import tree
 import numpy as np
 import graphviz
 import datetime
+import logging
 import time
 
 import db
 import domain
 import ip
-import location
+import ttl
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('train')
 
 db_format_time = '%Y-%m-%d %H:%M:%S'
 
 train_start = datetime.date(2017, 5, 1)
-train_end = datetime.date(2017, 5, 2)
+train_end = datetime.date(2017, 5, 8)
+
+id_upto = 379283817
+
+# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
 
 
 def get_logs_from_db():
-    results = db.mariadb_get_logs(train_start.strftime(db_format_time), train_end.strftime(db_format_time))
+    results = db.mariadb_get_logs(id_upto)
 
     row = results.fetch_row(how=1)
-    print("# entity: " + row[0]['domain'])
+    logger.debug("# entity: " + row[0]['domain'])
 
     features = prepare_features(row[0])
-    print(str(features))
+    logger.info(str(features))
 
     # while row:
-    #     print("# entity: " + row[0]['domain'])
+    #     logger.debug("# entity: " + row[0]['domain'])
     #
     #     features = prepare_features(row[0])
     #
-    #     print(str(features))
+    #     logger.info(str(features))
     #
     #     row = results.fetch_row(how=1)
 
 
 def prepare_features(entity):
     # get all logs for the same domain
-    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], train_start.strftime(db_format_time),
-                                                     train_end.strftime(db_format_time))
+
+    checkpoint = time.time()
+    logger.debug('get logs for domain start')
+    # BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
+    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
+    logger.debug('get logs for domain done ' + str(time.time() - checkpoint) + ' s')
+
+    # TODO do this efficiently
     ttls = [log['ttl'] for log in logs_for_domain]
+    logger.info('ttls ' + str(ttls))
     ips = [log['record'] for log in logs_for_domain]  # TODO check if valid ip address
+    logger.info(ips)
+    response_timestamps = [log['timestamp'] for log in logs_for_domain]
+    logger.info(response_timestamps)
 
     domains_with_same_ip = []
 
     # get all logs for the same ip if valid ip
     if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
-        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], train_start.strftime(db_format_time),
-                                                 train_end.strftime(db_format_time))
+        checkpoint = time.time()
+        logger.debug('get logs for ip start')
+        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], id_upto)
+        logger.debug('get logs for ip done ' + str(time.time() - checkpoint) + ' s')
 
         domains_with_same_ip = [log['domain'] for log in logs_for_ip]
 
     # feature 1: Short Life
 
+    # 2 atomic features
+
+    # atomic 1:
+
+    # atomic 2:
+
     short_life = 0
 
     # feature 2: Daily Similarity
@@ -60,10 +86,22 @@
 
     # feature 3: Repeating Patterns
 
+    # 2 atomic features
+
+    # atomic 1:
+
+    # atomic 2:
+
     repeating_patterns = 0
 
     # feature 4: Access ratio
 
+    # 2 atomic features
+
+    # atomic 1:
+
+    # atomic 2:
+
     access_ratio = 0
 
     # feature 5: Number of distinct IP addresses
@@ -72,7 +110,7 @@
 
     # feature 6: Number of distinct countries
 
-    distinct_countries = len(list(set([location.get_country_by_ip(ip) for ip in list(set(ips))])))
+    distinct_countries = len(list(set([ip.get_country_by_ip(ip_str) for ip_str in list(set(ips))])))
 
     # feature 7: Number of (distinct) domains share the IP with
 
@@ -80,6 +118,20 @@
 
     # feature 8: Reverse DNS query results
 
+    # 5 atomic features
+
+    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
+
+    # atomic 2: ips that are used for DSL lines
+
+    # atomic 3: ips that belong to hosting services
+
+    # atomic 4: ips that belong to known ISPs
+
+    # atomic 5: ips that can be matched with a valid domain name
+
+    # TODO add atomics to 'all_features'
+
     reverse_dns_result = 0
 
     # feature 9: Average TTL
@@ -88,7 +140,7 @@ def prepare_features(entity):
 
     # feature 10: Standard Deviation of TTL
 
-    standard_deviation = 0
+    standard_deviation = ttl.standard_deviation(ttls)  # TODO distinct ttls for std deviation?
 
     # feature 11: Number of distinct TTL values
 
@@ -96,23 +148,12 @@
 
     # feature 12: Number of TTL change
 
-    ttl_changes = 0
+    ttl_changes = ttl.changes(ttls)
 
     # feature 13: Percentage usage of specific TTL ranges
 
     # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
-    # TODO decide if 5 individual features make a difference
-
-    ttl = entity['ttl']
-    specific_ttl_ranges = 4  # default is [900, inf]
-
-    if 0 < ttl <= 1:
-        specific_ttl_ranges = 0
-    elif 1 < ttl <= 100:
-        specific_ttl_ranges = 1
-    elif 100 < ttl <= 300:
-        specific_ttl_ranges = 2
-    elif 300 < ttl <= 900:
-        specific_ttl_ranges = 3
+    # TODO check if 5 individual features make a difference
+    specific_ttl_ranges = ttl.specific_range(entity['ttl'])
 
     # feature 14: % of numerical characters
 
@@ -133,11 +174,11 @@
 def test():
     start = time.time()
-    print('starting training ' + str(start))
+    logger.debug('starting training ' + str(start))
 
     get_logs_from_db()
 
-    print('total duration: ' + str(time.time() - start) + 's')
+    logger.debug('total duration: ' + str(time.time() - start) + 's')
 
     db.close()
 
     # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
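For feature 8, only the first atomic is computable from what is already in the repo (dns.reverse); the DSL/hosting/ISP atomics need external data. A sketch of that first atomic — nx_ratio is a suggested helper, not part of this commit:

import socket

import dns


def nx_ratio(ips):
    # atomic 1: ratio of IPs that cannot be matched with a domain name
    if not ips:
        return 0.0
    unresolved = 0
    for address in ips:
        try:
            dns.reverse(address)
        except (socket.herror, socket.gaierror):
            unresolved += 1
    return unresolved / len(ips)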
diff --git a/src/DoresA/ttl.py b/src/DoresA/ttl.py
index 7843634..51f16fa 100644
--- a/src/DoresA/ttl.py
+++ b/src/DoresA/ttl.py
@@ -5,5 +5,34 @@
     return np.std(array)
 
 
+def changes(array):
+    if not array:
+        return 0
+
+    current = array[0]
+    change_count = 0  # named so it does not shadow the function itself
+
+    for item in array[1:]:
+        if item != current:
+            change_count += 1
+            current = item
+    return change_count
+
+
+# specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
+def specific_range(ttl):
+    specific_ttl_ranges = 4  # default is [900, inf]
+
+    if 0 < ttl <= 1:
+        specific_ttl_ranges = 0
+    elif 1 < ttl <= 100:
+        specific_ttl_ranges = 1
+    elif 100 < ttl <= 300:
+        specific_ttl_ranges = 2
+    elif 300 < ttl <= 900:
+        specific_ttl_ranges = 3
+    return specific_ttl_ranges
+
+
 if __name__ == "__main__":
     exit()
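How the pieces could fit together once prepare_features() fills in all values — a sketch only: train_classifier and the 0/1 label convention are assumptions, with labels derived from classify.py's lists and the tree module train.py already imports.

from sklearn import tree

import classify


def train_classifier(feature_vectors, domains):
    whitelist = set(classify.load_whitelist())
    blacklist = set(classify.load_blacklist())
    # 0 = benign, 1 = malicious (assumed convention); unlabeled domains are skipped
    labeled = [(vector, 1 if d in blacklist else 0)
               for vector, d in zip(feature_vectors, domains)
               if d in whitelist or d in blacklist]
    vectors = [vector for vector, label in labeled]
    labels = [label for vector, label in labeled]
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(vectors, labels)
    return classifier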