iterate logs from db

2017-10-02 12:16:05 +02:00
parent e655faaf62
commit 7dc393ae92
7 changed files with 54 additions and 30 deletions

View File

@@ -1,10 +1,11 @@
 import MySQLdb as mariadb
+import time
 from pymongo import MongoClient

 mongo_client = MongoClient('localhost', 27017)
-db = mongo_client.doresa
-pdns_logs_mongo = db.pdns_logs
+mongo_db = mongo_client.doresa
+pdns_logs_mongo = mongo_db.pdns_logs

 sql_connection = mariadb.connect(user='doresa', passwd='3qfACEZzbXY4b', db='doresa')
@@ -14,7 +15,8 @@ sql_cursor = sql_connection.cursor()
 def mariadb_insert_log(csv_entry):
     insert_sql = 'INSERT INTO pdns_logs (timestamp, domain, type, record, ttl) VALUES (%s, %s, %s, %s, %s)'
-    values = (csv_entry[0], csv_entry[1], csv_entry[2], csv_entry[3], csv_entry[4])
+    values = (convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1],
+              csv_entry[2], csv_entry[3], csv_entry[4])
     sql_cursor.execute(insert_sql, values)
     sql_connection.commit()
@@ -29,17 +31,24 @@ def mariadb_insert_logs(csv_entries):
     values = []
     for csv_entry in csv_entries:
-        values += [csv_entry[0], csv_entry[1], csv_entry[2], csv_entry[3], csv_entry[4]]
+        values += [convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1],
+                   csv_entry[2], csv_entry[3], csv_entry[4]]
     sql_cursor.execute(inserts_sql, values)
     sql_connection.commit()


+def mariadb_get_logs(from_time, to_time):
+    get_logs_from_to = 'SELECT * FROM pdns_logs WHERE timestamp BETWEEN \'{}\' and \'{}\';'.format(from_time, to_time)
+    sql_connection.query(get_logs_from_to)
+    return sql_connection.use_result()


 def mariadb_create_table():
     create_table = """
     CREATE TABLE pdns_logs (
         id INTEGER AUTO_INCREMENT PRIMARY KEY,
-        timestamp VARCHAR(50),
+        timestamp DATETIME,
         domain VARCHAR(255),
         type VARCHAR(50),
         record VARCHAR(255),
@@ -63,9 +72,17 @@ def mongodb_insert_logs(log_entries):
     pdns_logs_mongo.insert_many(db_entries)


+def convert_timestamp_to_sql_datetime(timestamp):
+    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))


 def close():
     # mariadb
     sql_cursor.close()
     sql_connection.close()
     # mongodb
     mongo_client.close()


+if __name__ == "__main__":
+    exit()
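How the new pieces fit together, as a hedged usage sketch (the module name `db` and the sample row are assumptions, since the file name is not shown; note that `mariadb_get_logs` interpolates the bounds straight into the SQL string, so it should only be fed trusted datetimes):

import db

db.mariadb_create_table()

# timestamps arrive as epoch seconds in the CSV and are converted to DATETIME on insert
db.mariadb_insert_log(['1491559200.0', 'example.com', 'A', '93.184.216.34', '300'])

# use_result() hands back a server-side result set; fetch_row(maxrows=0) drains it
result = db.mariadb_get_logs('2017-04-07 00:00:00', '2017-04-07 23:59:59')
print(result.fetch_row(maxrows=0))

db.close()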

View File

@@ -25,7 +25,7 @@ def find_longest_meaningful_substring(string):
     return match


-# TODO strip of protocol and TLD (if needed)
+# TODO strip of protocol and TLD (if needed) [only 1 domain with http: and 1 with https: for 3 days]
 def ratio_lms_to_fqdn(string):
     lms = find_longest_meaningful_substring(string)
     return len(lms) / len(string)
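For context, the LMS-to-FQDN ratio is a common lexical feature for spotting algorithmically generated domains: dictionary-like names score near 1.0, random labels near 0.0. A minimal self-contained sketch of the idea with a toy word set; the repo's actual find_longest_meaningful_substring is not shown in this diff and may differ:

WORDS = {'mail', 'google', 'update', 'secure'}  # hypothetical toy dictionary

def find_longest_meaningful_substring(string):
    # brute-force scan for the longest substring that is a known word
    match = ''
    for i in range(len(string)):
        for j in range(i + 1, len(string) + 1):
            if string[i:j] in WORDS and j - i > len(match):
                match = string[i:j]
    return match

def ratio_lms_to_fqdn(string):
    lms = find_longest_meaningful_substring(string)
    return len(lms) / len(string)

print(ratio_lms_to_fqdn('googleupdate'))  # 0.5 with the toy dictionary above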

View File

@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# run from root
+
+cd res;
+rm all-tld.txt;
+curl -o all-tld.txt http://data.iana.org/TLD/tlds-alpha-by-domain.txt;
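The fetched IANA list is one upper-case TLD per line behind a `# Version ...` header comment. A sketch of loading it for the TLD-stripping TODO above; the helper name and path handling are hypothetical, not part of this commit:

# hypothetical loader for the list fetched by the script above
def load_tlds(path='res/all-tld.txt'):
    with open(path) as f:
        # skip the leading '# Version ...' comment and blank lines
        return {line.strip().lower() for line in f
                if line.strip() and not line.startswith('#')}

tlds = load_tlds()
print('com' in tlds)  # True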

View File

@@ -1,5 +1,7 @@
 #!/bin/bash
+
+# run from root

 # how much to take
 COUNT=1000;

View File

@@ -1,5 +1,7 @@
 #!/bin/bash
+
+# run from root

 # cleanup
 cd res;
 echo "" > malicious_domains.txt;

View File

@@ -12,7 +12,9 @@ analysis_start_date = datetime.date(2017, 4, 7)
 analysis_days_amount = 3
 # e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
-analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(analysis_days_amount)]
+analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
+                 range(analysis_days_amount)]

 # mongodb
@@ -38,6 +40,10 @@ def main():
             progress_bar.next()

+            # everything[day][hour] = {}
             for hour_files in log_files_hour[hour]:
+                # a bit faster
+                # df = pandas.read_csv(log_file, compression='gzip', header=None)
+                # print(df.iloc[0])
                 with gzip.open(hour_files, 'rt', newline='') as file:
                     reader = csv.reader(file)
                     all_rows = list(reader)
@@ -45,37 +51,23 @@ def main():
                     # batch mode (batches of 1000 entries)
                     for log_entries in batch(all_rows, 1000):
                         db.mariadb_insert_logs(log_entries)
-                        db.mongodb_insert_logs(log_entries)
+                        # db.mongodb_insert_logs(log_entries)

                     # single mode
                     # for log_entry in reader:
                     #     db.mariadb_insert_log(log_entry)
                     #     # db.mongodb_insert_log(log_entry)

     progress_bar.finish()

-    # log_entry[4] == TTL
-    # if log_entry[4] in distinct_ttl_count:
-    #     distinct_ttl_count[log_entry[4]] += 1
-    # else:
-    #     distinct_ttl_count[log_entry[4]] = 1
-    #
-    # everything[day][hour]['ttl'] = distinct_ttl_count
-
-    # a bit faster
-    # df = pandas.read_csv(log_file, compression='gzip', header=None)
-    # print(df.iloc[0])
-
-    # print('distinct TTLs: ' + str(len(everything[0][0]['ttl'].keys())))

     print('total duration: ' + str(time.time() - start) + 's')
     db.close()


 def batch(iterable, n=1):
-    l = len(iterable)
-    for ndx in range(0, l, n):
-        yield iterable[ndx:min(ndx + n, l)]
+    length = len(iterable)
+    for ndx in range(0, length, n):
+        yield iterable[ndx:min(ndx + n, length)]


 def check_duplicates():
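For reference, the renamed batch() helper simply yields fixed-size slices of a sequence, with the final slice holding whatever remains. A quick standalone check:

def batch(iterable, n=1):
    length = len(iterable)
    for ndx in range(0, length, n):
        yield iterable[ndx:min(ndx + n, length)]

print(list(batch([1, 2, 3, 4, 5], 2)))  # [[1, 2], [3, 4], [5]]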

View File

@@ -23,8 +23,11 @@ def test_decision_tree():
     from sklearn import tree

     iris = load_iris()
     clf = tree.DecisionTreeClassifier()
-    clf = clf.fit(iris.data, iris.target)
+    clf = clf.fit(iris.data, iris.target)  # training set, manual classification
+    # predict single or multiple sets with clf.predict([[]])

+    # visualize decision tree classifier
     import graphviz
     dot_data = tree.export_graphviz(clf, out_file=None)
     graph = graphviz.Source(dot_data)
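What the new comments point at, as a runnable sketch (the render file name is an assumption; graphviz must be installed system-wide for render to succeed):

from sklearn.datasets import load_iris
from sklearn import tree
import graphviz

iris = load_iris()
clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)

# clf.predict() takes a 2-D array: one row of the four iris features per sample
print(clf.predict([[5.1, 3.5, 1.4, 0.2]]))  # e.g. array([0]) == setosa

dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render('iris_tree')  # hypothetical output name; writes iris_tree.pdf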