iterate logs from db
@@ -1,10 +1,11 @@
 import MySQLdb as mariadb
+import time
 from pymongo import MongoClient
 
 
 mongo_client = MongoClient('localhost', 27017)
-db = mongo_client.doresa
-pdns_logs_mongo = db.pdns_logs
+mongo_db = mongo_client.doresa
+pdns_logs_mongo = mongo_db.pdns_logs
 
 
 sql_connection = mariadb.connect(user='doresa', passwd='3qfACEZzbXY4b', db='doresa')
@@ -14,7 +15,8 @@ sql_cursor = sql_connection.cursor()
 def mariadb_insert_log(csv_entry):
     insert_sql = 'INSERT INTO pdns_logs (timestamp, domain, type, record, ttl) VALUES (%s, %s, %s, %s, %s)'
 
-    values = (csv_entry[0], csv_entry[1], csv_entry[2], csv_entry[3], csv_entry[4])
+    values = (convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1],
+              csv_entry[2], csv_entry[3], csv_entry[4])
     sql_cursor.execute(insert_sql, values)
     sql_connection.commit()
 
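Note: mariadb_insert_log takes one raw CSV row whose first field is a Unix epoch string; the change above converts it before binding. A minimal usage sketch (the row values are made up for illustration):

    # hypothetical pdns log row: [epoch, domain, type, record, ttl]
    row = ['1491559200.0', 'example.com', 'A', '93.184.216.34', '3600']
    mariadb_insert_log(row)  # epoch becomes a DATETIME string via convert_timestamp_to_sql_datetime
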
@@ -29,17 +31,24 @@ def mariadb_insert_logs(csv_entries):
     values = []
 
     for csv_entry in csv_entries:
-        values += [csv_entry[0], csv_entry[1], csv_entry[2], csv_entry[3], csv_entry[4]]
+        values += [convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1],
+                   csv_entry[2], csv_entry[3], csv_entry[4]]
 
     sql_cursor.execute(inserts_sql, values)
     sql_connection.commit()
 
 
+def mariadb_get_logs(from_time, to_time):
+    get_logs_from_to = 'SELECT * FROM pdns_logs WHERE timestamp BETWEEN \'{}\' and \'{}\';'.format(from_time, to_time)
+    sql_connection.query(get_logs_from_to)
+    return sql_connection.use_result()
+
+
 def mariadb_create_table():
     create_table = """
         CREATE TABLE pdns_logs (
             id INTEGER AUTO_INCREMENT PRIMARY KEY,
-            timestamp VARCHAR(50),
+            timestamp DATETIME,
             domain VARCHAR(255),
             type VARCHAR(50),
             record VARCHAR(255),
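Note: unlike the INSERT statements, the new mariadb_get_logs interpolates its bounds with str.format(). That is fine for trusted inputs, but a parameterized variant is safer; a minimal sketch against the same table (the function name here is hypothetical):

    def mariadb_get_logs_param(from_time, to_time):
        # let the driver quote and escape the bounds instead of str.format()
        sql_cursor.execute('SELECT * FROM pdns_logs WHERE timestamp BETWEEN %s AND %s',
                           (from_time, to_time))
        return sql_cursor.fetchall()
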
@@ -63,9 +72,17 @@ def mongodb_insert_logs(log_entries):
     pdns_logs_mongo.insert_many(db_entries)
 
 
+def convert_timestamp_to_sql_datetime(timestamp):
+    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))
+
+
 def close():
     # mariadb
     sql_cursor.close()
     sql_connection.close()
     # mongodb
     mongo_client.close()
 
 
+if __name__ == "__main__":
+    exit()
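Note: convert_timestamp_to_sql_datetime produces the 'YYYY-MM-DD HH:MM:SS' text that the new DATETIME column accepts; since it uses time.localtime, the result is timezone-dependent. A quick sketch:

    import time
    epoch = 1491559200.0  # 2017-04-07 10:00:00 UTC
    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))
    # -> '2017-04-07 12:00:00' on a UTC+2 host
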
@@ -25,7 +25,7 @@ def find_longest_meaningful_substring(string):
     return match
 
 
-# TODO strip of protocol and TLD (if needed)
+# TODO strip of protocol and TLD (if needed) [only 1 domain with http: and 1 with https: for 3 days]
 def ratio_lms_to_fqdn(string):
     lms = find_longest_meaningful_substring(string)
     return len(lms) / len(string)
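Note: ratio_lms_to_fqdn measures how much of a domain string its longest meaningful substring covers. A worked sketch, assuming find_longest_meaningful_substring returns 'paypal' for this made-up name:

    ratio_lms_to_fqdn('paypal-xj3k9.com')
    # len('paypal') / len('paypal-xj3k9.com') = 6 / 16 = 0.375

Algorithmically generated names tend to score lower here than benign ones, which is what makes the ratio usable as a feature.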
src/DoresA/scripts/get_all_tld.sh (new executable file, +8)
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# run from root
+
+cd res;
+rm all-tld.txt;
+
+curl -o all-tld.txt http://data.iana.org/TLD/tlds-alpha-by-domain.txt;
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+# run from root
+
 # how much to take
 COUNT=1000;
 
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+# run from root
+
 # cleanup
 cd res;
 echo "" > malicious_domains.txt;
@@ -12,7 +12,9 @@ analysis_start_date = datetime.date(2017, 4, 7)
 analysis_days_amount = 3
 
 # e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
-analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(analysis_days_amount)]
+analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
+                 range(analysis_days_amount)]
 
 
 # mongodb
 
@@ -38,6 +40,10 @@ def main():
             progress_bar.next()
             # everything[day][hour] = {}
             for hour_files in log_files_hour[hour]:
+
+                # a bit faster
+                # df = pandas.read_csv(log_file, compression='gzip', header=None)
+                # print(df.iloc[0])
                 with gzip.open(hour_files, 'rt', newline='') as file:
                     reader = csv.reader(file)
                     all_rows = list(reader)
@@ -45,37 +51,23 @@ def main():
                     # batch mode (batches of 1000 entries)
                     for log_entries in batch(all_rows, 1000):
                         db.mariadb_insert_logs(log_entries)
-                        db.mongodb_insert_logs(log_entries)
+                        # db.mongodb_insert_logs(log_entries)
 
                     # single mode
                     # for log_entry in reader:
                     #     db.mariadb_insert_log(log_entry)
                     #     # db.mongodb_insert_log(log_entry)
 
         progress_bar.finish()
 
-        # log_entry[4] == TTL
-        # if log_entry[4] in distinct_ttl_count:
-        #     distinct_ttl_count[log_entry[4]] += 1
-        # else:
-        #     distinct_ttl_count[log_entry[4]] = 1
-        #
-        # everything[day][hour]['ttl'] = distinct_ttl_count
-
-        # a bit faster
-        # df = pandas.read_csv(log_file, compression='gzip', header=None)
-        # print(df.iloc[0])
-
-    # print('distinct TTLs: ' + str(len(everything[0][0]['ttl'].keys())))
-
     print('total duration: ' + str(time.time() - start) + 's')
     db.close()
 
 
 def batch(iterable, n=1):
-    l = len(iterable)
-    for ndx in range(0, l, n):
-        yield iterable[ndx:min(ndx + n, l)]
+    length = len(iterable)
+    for ndx in range(0, length, n):
+        yield iterable[ndx:min(ndx + n, length)]
 
 
 def check_duplicates():
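Note: batch() is a plain slicing generator over an indexable sequence, so the final chunk may be shorter than n. A quick usage sketch:

    list(batch([1, 2, 3, 4, 5], n=2))  # -> [[1, 2], [3, 4], [5]]
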
@@ -23,8 +23,11 @@ def test_decision_tree():
     from sklearn import tree
     iris = load_iris()
     clf = tree.DecisionTreeClassifier()
-    clf = clf.fit(iris.data, iris.target)
+    clf = clf.fit(iris.data, iris.target)  # training set, manual classification
 
+    # predict single or multiple sets with clf.predict([[]])
+
+    # visualize decision tree classifier
     import graphviz
     dot_data = tree.export_graphviz(clf, out_file=None)
     graph = graphviz.Source(dot_data)
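Note: as the new comment says, the fitted classifier is queried through clf.predict. A minimal sketch using the first iris sample (sepal length/width and petal length/width in cm):

    sample = [[5.1, 3.5, 1.4, 0.2]]
    clf.predict(sample)        # -> array([0]), i.e. setosa
    clf.predict_proba(sample)  # per-class probabilities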