first features ready for training

commit f31f645323 (parent 89c6490019)
2017-11-06 21:29:55 +01:00
12 changed files with 1861 additions and 21 deletions

.gitignore

@@ -6,3 +6,4 @@
/include/
/lib/
/__pycache__/
*.pyc

src/DoresA/db.py

@@ -79,11 +79,70 @@ def mariadb_insert_logs(csv_entries):
def mariadb_get_logs(from_time, to_time):
# timestamp comparison is super slow; query by the precomputed id bound instead
# get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE timestamp BETWEEN \'{}\' and \'{}\';'.format(from_time, to_time)
get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE id < 379283817;'
sql_connection.query(get_logs_from_to)
return sql_connection.use_result()
# TODO not used
# def mariadb_get_distinct_ttl(domain, from_time, to_time):
# get_distinct_ttl = 'SELECT DISTINCT ttl FROM ' + sql_table_name + \
# ' WHERE timestamp BETWEEN \'{}\' and \'{}\' '.format(from_time, to_time) + \
# 'AND domain=\'' + domain + '\';'
# sql_connection.query(get_distinct_ttl)
# return sql_connection.use_result()
def mariadb_get_logs_for_domain(domain, from_time, to_time):
# we need a second connection for this query as it usually (always) runs in parallel to the first query
sql_connection_tmp = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port)
# timestamp comparison is super slow; check whether an index makes it faster
# get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
# ' WHERE timestamp BETWEEN \'{}\' and \'{}\' '.format(from_time, to_time) + \
# 'AND domain=\'' + domain + '\';'
get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
' WHERE id < 379283817 ' + \
'AND domain=\'' + domain + '\';'
sql_connection_tmp.query(get_distinct_ttl)
result = sql_connection_tmp.use_result()
logs_for_domain = result.fetch_row(maxrows=0, how=1) # TODO this can consume a lot of memory, think of alternatives
sql_connection_tmp.close()
return logs_for_domain
def mariadb_get_logs_for_ip(ip, from_time, to_time):
# we need a second connection for this query as it usually (always) runs in parallel to the first query
sql_connection_tmp = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port)
# get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
# ' WHERE timestamp BETWEEN \'{}\' and \'{}\' '.format(from_time, to_time) + \
# 'AND domain=\'' + str(ip) + '\';'
get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
' WHERE id < 379283817 ' + \
'AND domain=\'' + str(ip) + '\';'
sql_connection_tmp.query(get_distinct_ttl)
result = sql_connection_tmp.use_result()
logs_for_ip = result.fetch_row(maxrows=0, how=1) # TODO this can consume a lot of memory, think of alternatives
sql_connection_tmp.close()
return logs_for_ip
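# Note on the memory TODOs above: fetch_row(maxrows=0) materializes the whole
# result set at once. Since use_result() already streams rows from the server,
# a generator could consume them in bounded chunks instead. A minimal sketch,
# assuming the same MySQLdb-style result object (chunk size is arbitrary):
def iter_result_rows(result, chunk_size=10000):
    while True:
        rows = result.fetch_row(maxrows=chunk_size, how=1)
        if not rows:
            break
        for row in rows:
            yield row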
def mariadb_get_nearest_id(timestamp):
get_nearest_id = 'SELECT id FROM ' + sql_table_name + ' WHERE timestamp > \'{}\' LIMIT 1;'.format(timestamp)
sql_connection.query(get_nearest_id)
result = sql_connection.use_result()
entities = result.fetch_row(maxrows=0, how=1)
return entities[0]['id']  # how=1 returns dict rows, so index by column name
def mariadb_create_table():
create_table = 'CREATE TABLE IF NOT EXISTS ' + sql_table_name + """ (
id INTEGER AUTO_INCREMENT PRIMARY KEY,
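A note on the queries in this file: every statement is assembled by string concatenation, so a domain value containing a quote character breaks the query (and is an injection risk). The cursor API escapes bound parameters itself. Below is a minimal sketch of a hypothetical variant of mariadb_get_logs_for_domain using placeholders (table names cannot be bound, so sql_table_name stays interpolated; the id bound is the same literal used above):

def mariadb_get_logs_for_domain_bound(domain, id_bound=379283817):
    conn = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw,
                           db=sql_db_name, port=sql_port)
    cursor = conn.cursor()
    # %s placeholders are escaped by the driver, unlike concatenated strings
    cursor.execute('SELECT * FROM ' + sql_table_name +
                   ' WHERE id < %s AND domain = %s;', (id_bound, domain))
    rows = cursor.fetchall()
    conn.close()
    return rows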

src/DoresA/ip.py Normal file (74 lines)

@@ -0,0 +1,74 @@
import re
# proudly taken from https://stackoverflow.com/questions/319279/how-to-validate-ip-address-in-python
def is_valid_ipv4(ip):
"""Validates IPv4 addresses.
"""
pattern = re.compile(r"""
^
(?:
# Dotted variants:
(?:
# Decimal 1-255 (no leading 0's)
[3-9]\d?|2(?:5[0-5]|[0-4]?\d)?|1\d{0,2}
|
0x0*[0-9a-f]{1,2} # Hexadecimal 0x0 - 0xFF (possible leading 0's)
|
0+[1-3]?[0-7]{0,2} # Octal 0 - 0377 (possible leading 0's)
)
(?: # Repeat 0-3 times, separated by a dot
\.
(?:
[3-9]\d?|2(?:5[0-5]|[0-4]?\d)?|1\d{0,2}
|
0x0*[0-9a-f]{1,2}
|
0+[1-3]?[0-7]{0,2}
)
){0,3}
|
0x0*[0-9a-f]{1,8} # Hexadecimal notation, 0x0 - 0xffffffff
|
0+[0-3]?[0-7]{0,10} # Octal notation, 0 - 037777777777
|
# Decimal notation, 1-4294967295:
429496729[0-5]|42949672[0-8]\d|4294967[01]\d\d|429496[0-6]\d{3}|
42949[0-5]\d{4}|4294[0-8]\d{5}|429[0-3]\d{6}|42[0-8]\d{7}|
4[01]\d{8}|[1-3]\d{0,9}|[4-9]\d{0,8}
)
$
""", re.VERBOSE | re.IGNORECASE)
return pattern.match(ip) is not None
def is_valid_ipv6(ip):
"""Validates IPv6 addresses.
"""
pattern = re.compile(r"""
^
\s* # Leading whitespace
(?!.*::.*::) # Only a single wildcard allowed
(?:(?!:)|:(?=:)) # Colon iff it would be part of a wildcard
(?: # Repeat 6 times:
[0-9a-f]{0,4} # A group of at most four hexadecimal digits
(?:(?<=::)|(?<!::):) # Colon unless preceded by wildcard
){6} #
(?: # Either
[0-9a-f]{0,4} # Another group
(?:(?<=::)|(?<!::):) # Colon unless preceded by wildcard
[0-9a-f]{0,4} # Last group
(?: (?<=::) # Colon iff preceded by exactly one colon
| (?<!:) #
| (?<=:) (?<!::) : #
) # OR
| # A v4 address with NO leading zeros
(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)
(?: \.
(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)
){3}
)
\s* # Trailing whitespace
$
""", re.VERBOSE | re.IGNORECASE | re.DOTALL)
return pattern.match(ip) is not None
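A quick sanity check of the two validators; these cases follow directly from the commented patterns above:

if __name__ == '__main__':
    assert is_valid_ipv4('192.168.0.1')
    assert is_valid_ipv4('0x7f.0.0.1')        # hexadecimal dotted variant
    assert not is_valid_ipv4('256.1.1.1')     # octet out of range
    assert is_valid_ipv6('::1')
    assert is_valid_ipv6('2001:db8::8a2e:370:7334')
    assert not is_valid_ipv6('2001::db8::1')  # more than one wildcard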


@@ -0,0 +1,2 @@
starting analysis 1509926518.1677592
total duration: 24594.95610165596s


@@ -0,0 +1,8 @@
starting training 1509988006.1670337
# entity: 99-183-224-60.lightspeed.livnmi.sbcglobal.net
[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
6.78306250e+03 0.00000000e+00 3.00000000e+00 0.00000000e+00
4.00000000e+00 6.07142857e-01 1.33333333e-01]
total duration: 84.75222444534302s


@@ -0,0 +1,8 @@
starting training 1509985884.1062775
# entity: 99-183-224-60.lightspeed.livnmi.sbcglobal.net
[ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
1.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
6.75526667e+03 0.00000000e+00 3.00000000e+00 0.00000000e+00
4.00000000e+00 6.07142857e-01 1.33333333e-01]
total duration: 573.4299128055573s
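The 15 values printed per entity in these training logs correspond one-to-one to features 1-15 assembled in prepare_features (src/DoresA/train.py below): position 9 (~6.78e+03) is the average TTL, position 13 (4.0) is the TTL-range bucket, and the last two (0.607, 0.133) are the numerical-character and LMS ratios.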

src/DoresA/res/all-tld.txt Normal file (1544 lines)

File diff suppressed because it is too large


@@ -0,0 +1 @@
SELECT id FROM pdns_logs_test where timestamp > '2017-05-08 00:00:00' LIMIT 1;
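This lookup is presumably how the hard-coded bound id < 379283817 in db.py was obtained: the first id past a cutoff timestamp stands in for the slow BETWEEN comparison. Note that without an ORDER BY id, the row that LIMIT 1 returns is not guaranteed to be the smallest matching id. The bound could also be derived at runtime instead of hard-coded, e.g. via the helper already in db.py:

id_bound = db.mariadb_get_nearest_id('2017-05-08 00:00:00')
query = 'SELECT * FROM ' + sql_table_name + ' WHERE id < {};'.format(id_bound)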


@@ -8,11 +8,10 @@ from progress.bar import Bar
import db
# TODO move these settings to environment variables
analysis_start_date = datetime.date(2017, 5, 1)
# analysis_days_amount = 31
analysis_days_amount = 7
# pdns_logs_path = 'data/'
# pdns_logs_path = '/data/'
pdns_logs_path = '/run/media/felix/ext/2017.05/'
# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
@@ -29,7 +28,7 @@ def main():
# everything = {}
# for log_file in ['data/pdns_capture.pcap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']:
for day in range(analysis_days_amount):
log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
# everything[day] = {}


@@ -18,22 +18,6 @@ def variance(a):
return np.var(a)
def test_decision_tree():
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target) # training set, manual classification
# predict single or multiple sets with clf.predict([[]])
# visualize decision tree classifier
import graphviz
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render('iris', view=True)
def test():
# a = np.array((1, 2, 3))
# b = np.array((0, 1, 2))

src/DoresA/train.py Normal file (160 lines)

@@ -0,0 +1,160 @@
from sklearn.datasets import load_iris
from sklearn import tree
import numpy as np
import graphviz
import datetime
import time
import db
import domain
import ip
import location
db_format_time = '%Y-%m-%d %H:%M:%S'
train_start = datetime.date(2017, 5, 1)
train_end = datetime.date(2017, 5, 2)
def get_logs_from_db():
results = db.mariadb_get_logs(train_start.strftime(db_format_time), train_end.strftime(db_format_time))
row = results.fetch_row(how=1)
print("# entity: " + row[0]['domain'])
features = prepare_features(row[0])
print(str(features))
# while row:
# print("# entity: " + row[0]['domain'])
#
# features = prepare_features(row[0])
#
# print(str(features))
#
# row = results.fetch_row(how=1)
def prepare_features(entity):
# get all logs for the same domain
logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], train_start.strftime(db_format_time),
train_end.strftime(db_format_time))
ttls = [log['ttl'] for log in logs_for_domain]
ips = [log['record'] for log in logs_for_domain] # TODO check if valid ip address
domains_with_same_ip = []
# get all logs for the same ip if valid ip
if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], train_start.strftime(db_format_time),
train_end.strftime(db_format_time))
domains_with_same_ip = [log['domain'] for log in logs_for_ip]
# feature 1: Short Life
short_life = 0
# feature 2: Daily Similarity
daily_similarity = 0
# feature 3: Repeating Patterns
repeating_patterns = 0
# feature 4: Access ratio
access_ratio = 0
# feature 5: Number of distinct IP addresses
distinct_ips = len(list(set(ips)))
# feature 6: Number of distinct countries
distinct_countries = len(set(location.get_country_by_ip(addr) for addr in set(ips)))  # 'addr' avoids shadowing the ip module
# feature 7: Number of (distinct) domains share the IP with
distinct_domains_with_same_ip = len(list(set(domains_with_same_ip)))
# feature 8: Reverse DNS query results
reverse_dns_result = 0
# feature 9: Average TTL
average_ttl = sum(ttls) / len(ttls)
# feature 10: Standard Deviation of TTL
standard_deviation = 0
# feature 11: Number of distinct TTL values
distinct_ttl = len(list(set(ttls)))
# feature 12: Number of TTL change
ttl_changes = 0
# feature 13: Percentage usage of specific TTL ranges
# specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
# TODO decide if 5 individual features make a difference
ttl = entity['ttl']
specific_ttl_ranges = 4 # default is [900, inf]
if 0 < ttl <= 1:
specific_ttl_ranges = 0
elif 1 < ttl <= 100:
specific_ttl_ranges = 1
elif 100 < ttl <= 300:
specific_ttl_ranges = 2
elif 300 < ttl <= 900:
specific_ttl_ranges = 3
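# equivalent table-driven sketch of the bucketing above (assuming numeric ttl):
#   import bisect
#   specific_ttl_ranges = bisect.bisect_left([1, 100, 300, 900], ttl) if ttl > 0 else 4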
# feature 14: % of numerical characters
numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])
# feature 15: % of the length of the LMS
lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])
all_features = np.array([
short_life, daily_similarity, repeating_patterns, access_ratio, distinct_ips, distinct_countries,
distinct_domains_with_same_ip, reverse_dns_result, average_ttl, standard_deviation, distinct_ttl, ttl_changes,
specific_ttl_ranges, numerical_characters_percent, lms_percent
])
return all_features
def test():
start = time.time()
print('starting training ' + str(start))
get_logs_from_db()
print('total duration: ' + str(time.time() - start) + 's')
db.close()
# db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
def flow():
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target) # training set, manual classification
# predict single or multiple sets with clf.predict([[]])
# visualize decision tree classifier
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render('test', view=True)
if __name__ == "__main__":
test()
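prepare_features currently produces one 15-dimensional vector per entity, while flow() only exercises the iris example. To train the DecisionTreeClassifier on these features, the vectors have to be stacked into a matrix with one label per row. A minimal sketch, assuming a hypothetical source of benign/malicious labels (nothing in this commit assigns them yet):

def train_on_entities(rows, labels):
    # rows: dict rows as returned by db.mariadb_get_logs (how=1)
    # labels: one 0/1 class per row; hypothetical, not part of this commit
    X = np.array([prepare_features(row) for row in rows])
    y = np.array(labels)
    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y)  # same fit/predict API as the iris example in flow()
    return clf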