From 7dc393ae92d45842ea32df8ba646663a10cc3d36 Mon Sep 17 00:00:00 2001 From: Felix Steghofer Date: Mon, 2 Oct 2017 12:16:05 +0200 Subject: [PATCH] iterate logs from db --- src/DoresA/db.py | 27 ++++++++++++--- src/DoresA/domain.py | 2 +- src/DoresA/scripts/get_all_tld.sh | 8 +++++ src/DoresA/scripts/preprocess_benign.sh | 2 ++ src/DoresA/scripts/preprocess_malicious.sh | 2 ++ src/DoresA/serialize_logs_to_db.py | 38 +++++++++------------- src/DoresA/time.py | 5 ++- 7 files changed, 54 insertions(+), 30 deletions(-) create mode 100755 src/DoresA/scripts/get_all_tld.sh diff --git a/src/DoresA/db.py b/src/DoresA/db.py index e1e76cf..fd4dfc9 100644 --- a/src/DoresA/db.py +++ b/src/DoresA/db.py @@ -1,10 +1,11 @@ import MySQLdb as mariadb +import time from pymongo import MongoClient mongo_client = MongoClient('localhost', 27017) -db = mongo_client.doresa -pdns_logs_mongo = db.pdns_logs +mongo_db = mongo_client.doresa +pdns_logs_mongo = mongo_db.pdns_logs sql_connection = mariadb.connect(user='doresa', passwd='3qfACEZzbXY4b', db='doresa') @@ -14,7 +15,8 @@ sql_cursor = sql_connection.cursor() def mariadb_insert_log(csv_entry): insert_sql = 'INSERT INTO pdns_logs (timestamp, domain, type, record, ttl) VALUES (%s, %s, %s, %s, %s)' - values = (csv_entry[0], csv_entry[1], csv_entry[2], csv_entry[3], csv_entry[4]) + values = (convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1], + csv_entry[2], csv_entry[3], csv_entry[4]) sql_cursor.execute(insert_sql, values) sql_connection.commit() @@ -29,17 +31,24 @@ def mariadb_insert_logs(csv_entries): values = [] for csv_entry in csv_entries: - values += [csv_entry[0], csv_entry[1], csv_entry[2], csv_entry[3], csv_entry[4]] + values += [convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1], + csv_entry[2], csv_entry[3], csv_entry[4]] sql_cursor.execute(inserts_sql, values) sql_connection.commit() +def mariadb_get_logs(from_time, to_time): + get_logs_from_to = 'SELECT * FROM pdns_logs WHERE timestamp BETWEEN \'{}\' and \'{}\';'.format(from_time, to_time) + sql_connection.query(get_logs_from_to) + return sql_connection.use_result() + + def mariadb_create_table(): create_table = """ CREATE TABLE pdns_logs ( id INTEGER AUTO_INCREMENT PRIMARY KEY, - timestamp VARCHAR(50), + timestamp DATETIME, domain VARCHAR(255), type VARCHAR(50), record VARCHAR(255), @@ -63,9 +72,17 @@ def mongodb_insert_logs(log_entries): pdns_logs_mongo.insert_many(db_entries) +def convert_timestamp_to_sql_datetime(timestamp): + return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp)) + + def close(): # mariadb sql_cursor.close() sql_connection.close() # mongodb mongo_client.close() + + +if __name__ == "__main__": + exit() diff --git a/src/DoresA/domain.py b/src/DoresA/domain.py index a77a652..4983ebf 100644 --- a/src/DoresA/domain.py +++ b/src/DoresA/domain.py @@ -25,7 +25,7 @@ def find_longest_meaningful_substring(string): return match -# TODO strip of protocol and TLD (if needed) +# TODO strip of protocol and TLD (if needed) [only 1 domain with http: and 1 with https: for 3 days] def ratio_lms_to_fqdn(string): lms = find_longest_meaningful_substring(string) return len(lms) / len(string) diff --git a/src/DoresA/scripts/get_all_tld.sh b/src/DoresA/scripts/get_all_tld.sh new file mode 100755 index 0000000..fa4e5e7 --- /dev/null +++ b/src/DoresA/scripts/get_all_tld.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# run from root + +cd res; +rm all-tld.txt; + +curl -o all-tld.txt http://data.iana.org/TLD/tlds-alpha-by-domain.txt; diff --git a/src/DoresA/scripts/preprocess_benign.sh b/src/DoresA/scripts/preprocess_benign.sh index ca8aa71..2ed4739 100755 --- a/src/DoresA/scripts/preprocess_benign.sh +++ b/src/DoresA/scripts/preprocess_benign.sh @@ -1,5 +1,7 @@ #!/bin/bash +# run from root + # how much to take COUNT=1000; diff --git a/src/DoresA/scripts/preprocess_malicious.sh b/src/DoresA/scripts/preprocess_malicious.sh index 58ba88b..f136ca8 100755 --- a/src/DoresA/scripts/preprocess_malicious.sh +++ b/src/DoresA/scripts/preprocess_malicious.sh @@ -1,5 +1,7 @@ #!/bin/bash +# run from root + # cleanup cd res; echo "" > malicious_domains.txt; diff --git a/src/DoresA/serialize_logs_to_db.py b/src/DoresA/serialize_logs_to_db.py index 1af82d8..26cb22f 100644 --- a/src/DoresA/serialize_logs_to_db.py +++ b/src/DoresA/serialize_logs_to_db.py @@ -12,7 +12,9 @@ analysis_start_date = datetime.date(2017, 4, 7) analysis_days_amount = 3 # e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09'] -analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(analysis_days_amount)] +analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in + range(analysis_days_amount)] + # mongodb @@ -38,6 +40,10 @@ def main(): progress_bar.next() # everything[day][hour] = {} for hour_files in log_files_hour[hour]: + + # a bit faster + # df = pandas.read_csv(log_file, compression='gzip', header=None) + # print(df.iloc[0]) with gzip.open(hour_files, 'rt', newline='') as file: reader = csv.reader(file) all_rows = list(reader) @@ -45,37 +51,23 @@ def main(): # batch mode (batches of 1000 entries) for log_entries in batch(all_rows, 1000): db.mariadb_insert_logs(log_entries) - db.mongodb_insert_logs(log_entries) + # db.mongodb_insert_logs(log_entries) - # single mode - # for log_entry in reader: - # db.mariadb_insert_log(log_entry) - # # db.mongodb_insert_log(log_entry) + # single mode + # for log_entry in reader: + # db.mariadb_insert_log(log_entry) + # # db.mongodb_insert_log(log_entry) progress_bar.finish() - # log_entry[4] == TTL - # if log_entry[4] in distinct_ttl_count: - # distinct_ttl_count[log_entry[4]] += 1 - # else: - # distinct_ttl_count[log_entry[4]] = 1 - # - # everything[day][hour]['ttl'] = distinct_ttl_count - - # a bit faster - # df = pandas.read_csv(log_file, compression='gzip', header=None) - # print(df.iloc[0]) - - # print('distinct TTLs: ' + str(len(everything[0][0]['ttl'].keys()))) - print('total duration: ' + str(time.time() - start) + 's') db.close() def batch(iterable, n=1): - l = len(iterable) - for ndx in range(0, l, n): - yield iterable[ndx:min(ndx + n, l)] + length = len(iterable) + for ndx in range(0, length, n): + yield iterable[ndx:min(ndx + n, length)] def check_duplicates(): diff --git a/src/DoresA/time.py b/src/DoresA/time.py index 5938cd2..03ef5c5 100644 --- a/src/DoresA/time.py +++ b/src/DoresA/time.py @@ -23,8 +23,11 @@ def test_decision_tree(): from sklearn import tree iris = load_iris() clf = tree.DecisionTreeClassifier() - clf = clf.fit(iris.data, iris.target) + clf = clf.fit(iris.data, iris.target) # training set, manual classification + # predict single or multiple sets with clf.predict([[]]) + + # visualize decision tree classifier import graphviz dot_data = tree.export_graphviz(clf, out_file=None) graph = graphviz.Source(dot_data)