Added logging; use an index (row id) for SQL queries (the id is hard-coded for now); added the newer version of the EXPOSURE paper.

This commit is contained in:
2017-11-15 13:35:58 +01:00
parent f31f645323
commit 9888f178f8
14 changed files with 218 additions and 53 deletions

View File

@@ -4,54 +4,80 @@ from sklearn import tree
import numpy as np
import graphviz
import datetime
import logging
import time
import db
import domain
import ip
import location
import ttl
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('train')

# Timestamp format used by the MariaDB log tables.
db_format_time = '%Y-%m-%d %H:%M:%S'

# Training window (inclusive start, exclusive end — TODO confirm against queries).
train_start = datetime.date(2017, 5, 1)
train_end = datetime.date(2017, 5, 8)

# Hard-coded upper row id corresponding to train_end; queries filter by primary
# key instead of by timestamp for speed.
# TODO: derive this dynamically instead of hard-coding, e.g.:
# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
id_upto = 379283817
def get_logs_from_db():
    """Fetch log rows up to ``id_upto`` and build features for the first row.

    NOTE(review): only the first fetched row is processed; the commented-out
    loop below suggests iterating over all rows is the eventual intent —
    confirm before relying on this.
    """
    results = db.mariadb_get_logs(id_upto)
    # how=1 asks the MySQLdb result for dict-style rows keyed by column name.
    row = results.fetch_row(how=1)
    # Lazy %-style args avoid building the message unless the level is enabled.
    logger.debug("# entity: %s", row[0]['domain'])
    features = prepare_features(row[0])
    logger.info("%s", features)
    # while row:
    #     logger.debug("# entity: %s", row[0]['domain'])
    #     features = prepare_features(row[0])
    #     logger.info("%s", features)
    #     row = results.fetch_row(how=1)
def prepare_features(entity):
# get all logs for the same domain
logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], train_start.strftime(db_format_time),
train_end.strftime(db_format_time))
checkpoint = time.time()
logger.debug('get logs for domain start')
# BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
logger.debug('get logs for domain done' + str(time.time() - checkpoint) + ' s')
# TODO do this efficient
ttls = [log['ttl'] for log in logs_for_domain]
logger.info('ttls ' + str(ttls))
ips = [log['record'] for log in logs_for_domain] # TODO check if valid ip address
logger.info(ips)
response_timestamps = [log['timestamp'] for log in logs_for_domain]
logger.info(response_timestamps)
domains_with_same_ip = []
# get all logs for the same ip if valid ip
if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], train_start.strftime(db_format_time),
train_end.strftime(db_format_time))
checkpoint = time.time()
logger.debug('get logs for ip start')
logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], id_upto)
logger.debug('get logs for ip done' + str(time.time() - checkpoint) + ' s')
domains_with_same_ip = [log['domain'] for log in logs_for_ip]
# feature 1: Short Life
# 2 atomic features
# atomic 1:
# atomic 2:
short_life = 0
# feature 2: Daily Similarity
@@ -60,10 +86,22 @@ def prepare_features(entity):
# feature 3: Repeating Patterns
# 2 atomic features
# atomic 1:
# atomic 2:
repeating_patterns = 0
# feature 4: Access ratio
# 2 atomic features
# atomic 1:
# atomic 2:
access_ratio = 0
# feature 5: Number of distinct IP addresses
@@ -72,7 +110,7 @@ def prepare_features(entity):
# feature 6: Number of distinct countries
distinct_countries = len(list(set([location.get_country_by_ip(ip) for ip in list(set(ips))])))
distinct_countries = len(list(set([ip.get_country_by_ip(ip_str) for ip_str in list(set(ips))])))
# feature 7: Number of (distinct) domains share the IP with
@@ -80,6 +118,20 @@ def prepare_features(entity):
# feature 8: Reverse DNS query results
# 5 atomic feature
# atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
# atomic 2: ips that are used for DSL lines
# atomic 3: ips that belong to hosting services
# atomic 4: ips that belong to known ISPs
# atomic 5: ips that can be matched with a valid domain name
# TODO add atomics to 'all_features'
reverse_dns_result = 0
# feature 9: Average TTL
@@ -88,7 +140,7 @@ def prepare_features(entity):
# feature 10: Standard Deviation of TTL
standard_deviation = 0
standard_deviation = ttl.standard_deviation(ttls) # TODO distinct ttls for std deviation?
# feature 11: Number of distinct TTL values
@@ -96,23 +148,12 @@ def prepare_features(entity):
# feature 12: Number of TTL change
ttl_changes = 0
ttl_changes = ttl.changes(ttls)
# feature 13: Percentage usage of specific TTL ranges
# specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
# TODO decide if 5 individual features make a difference
ttl = entity['ttl']
specific_ttl_ranges = 4 # default is [900, inf]
if 0 < ttl <= 1:
specific_ttl_ranges = 0
elif 1 < ttl <= 100:
specific_ttl_ranges = 1
elif 100 < ttl <= 300:
specific_ttl_ranges = 2
elif 300 < ttl <= 900:
specific_ttl_ranges = 3
# TODO check if 5 individual features make a difference
specific_ttl_ranges = ttl.specific_range(entity['ttl'])
# feature 14: % of numerical characters
@@ -133,11 +174,11 @@ def prepare_features(entity):
def test():
    """Run one timed training pass: fetch logs, build features, close the DB."""
    start = time.time()
    # Lazy %-style args keep log formatting off the hot path.
    logger.debug('starting training %s', start)
    get_logs_from_db()
    logger.debug('total duration: %ss', time.time() - start)
    db.close()
# db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))