added logging; use index (id) for SQL queries (id hardcoded at the moment); added newer version of the EXPOSURE paper
Binary file changed (not shown; presumably the updated EXPOSURE paper).
src/DoresA/classify.py (new file, +23)
@@ -0,0 +1,23 @@
+def load_whitelist():
+    filename = 'res/benign_domains.txt'
+    return open(filename).read().splitlines()
+
+
+def load_blacklist():
+    filename = 'res/malicious_domains.txt'
+    return open(filename).read().splitlines()
+
+
+# TODO check if white and blacklists have to be stripped to tld (subdomains)
+# TODO also check if subdomains are located in db
+def test():
+    filter_list = load_whitelist()
+
+    for item in filter_list:
+        print(item.count('.'))
+
+
+if __name__ == "__main__":
+    test()
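The two TODOs above concern matching subdomains against lists that hold registered domains. A minimal sketch of suffix-based matching follows; the helper name and the naive suffix heuristic are assumptions, not part of this commit, and a robust version would consult the Public Suffix List (e.g. via the tldextract package).

# Hypothetical helper for the TODOs above, not part of this commit.
# Matches a (sub)domain against a list of registered domains by checking
# every parent suffix; assumes list entries look like 'example.com'.
def matches_list(domain, filter_list):
    entries = set(filter_list)
    labels = domain.rstrip('.').split('.')
    # candidates for 'a.b.example.com': a.b.example.com, b.example.com, example.com
    candidates = {'.'.join(labels[i:]) for i in range(len(labels) - 1)}
    return not candidates.isdisjoint(entries)

For example, matches_list('www.example.com', load_whitelist()) would be True whenever 'example.com' is whitelisted.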
src/DoresA/db.py
@@ -1,6 +1,7 @@
 import MySQLdb as mariadb
 import time
 import os
+import logging
 from pymongo import MongoClient
 
 
@@ -18,6 +19,10 @@ sql_pw = '3qfACEZzbXY4b'
 sql_table_name = 'pdns_logs_test'
 
 
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger('db')
+
+
 if 'MYSQL_HOST' in os.environ:
     sql_host = os.environ['MYSQL_HOST']
 
@@ -63,7 +68,7 @@ def mariadb_insert_logs(csv_entries):
                       csv_entry[2], csv_entry[3], csv_entry[4]]
             inserts_sql += '(%s, %s, %s, %s, %s), '
         except IndexError:
-            print('index error for csv entry: ' + str(csv_entry))
+            logger.error('index error for csv entry: ' + str(csv_entry))
             pass
 
     # remove last comma for sql compatibility
@@ -72,15 +77,15 @@ def mariadb_insert_logs(csv_entries):
     try:
        sql_cursor.execute(inserts_sql, values)
     except Exception:
-        print('could not insert entry: ' + str(values))  # TODO proper error handling
+        logger.error('could not insert entry: ' + str(values))  # TODO proper error handling
         pass
 
     sql_connection.commit()
 
 
-def mariadb_get_logs(from_time, to_time):
+def mariadb_get_logs(id_upto, from_time=None, to_time=None):
     # get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE timestamp BETWEEN \'{}\' and \'{}\';'.format(from_time, to_time)
-    get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE id < 379283817;'
+    get_logs_from_to = 'SELECT * FROM ' + sql_table_name + ' WHERE id < {};'.format(id_upto)
     sql_connection.query(get_logs_from_to)
     return sql_connection.use_result()
@@ -94,7 +99,7 @@ def mariadb_get_logs(from_time, to_time):
     # return sql_connection.use_result()
 
 
-def mariadb_get_logs_for_domain(domain, from_time, to_time):
+def mariadb_get_logs_for_domain(domain, id_upto, from_time=None, to_time=None):
     # we need a second connection for this query as this usually (always) run in parallel to the first query
     sql_connection_tmp = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port)
@@ -103,7 +108,7 @@ def mariadb_get_logs_for_domain(domain, from_time, to_time):
     # ' WHERE timestamp BETWEEN \'{}\' and \'{}\' '.format(from_time, to_time) + \
     # 'AND domain=\'' + domain + '\';'
     get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
-                       ' WHERE id < 379283817 ' + \
+                       ' WHERE id < {} '.format(id_upto) + \
                        'AND domain=\'' + domain + '\';'
     sql_connection_tmp.query(get_distinct_ttl)
     result = sql_connection_tmp.use_result()
@@ -114,7 +119,7 @@ def mariadb_get_logs_for_domain(domain, from_time, to_time):
     return logs_for_domain
 
 
-def mariadb_get_logs_for_ip(ip, from_time, to_time):
+def mariadb_get_logs_for_ip(ip, id_upto, from_time=None, to_time=None):
     # we need a second connection for this query as this usually (always) run in parallel to the first query
     sql_connection_tmp = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port)
     sql_cursor_tmp = sql_connection_tmp.cursor()
@@ -122,7 +127,7 @@ def mariadb_get_logs_for_ip(ip, from_time, to_time):
     # ' WHERE timestamp BETWEEN \'{}\' and \'{}\' '.format(from_time, to_time) + \
     # 'AND domain=\'' + str(ip) + '\';'
     get_distinct_ttl = 'SELECT * FROM ' + sql_table_name + \
-                       ' WHERE id < 379283817 ' + \
+                       ' WHERE id < {} '.format(id_upto) + \
                        'AND domain=\'' + str(ip) + '\';'
     sql_connection_tmp.query(get_distinct_ttl)
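The rewritten queries still interpolate id_upto and domain into the SQL string; domain in particular comes from log data, so quoting stays fragile. MySQLdb can bind both values the same way mariadb_insert_logs already binds its values list. A hedged sketch, with an illustrative function name that is not part of the commit:

# Illustrative variant, not part of this commit: parameter binding instead of
# string formatting. A table name cannot be bound, so the trusted constant
# sql_table_name is still concatenated.
def mariadb_get_logs_for_domain_bound(domain, id_upto):
    connection = mariadb.connect(host=sql_host, user=sql_user_name,
                                 passwd=sql_pw, db=sql_db_name, port=sql_port)
    cursor = connection.cursor()
    cursor.execute('SELECT * FROM ' + sql_table_name +
                   ' WHERE id < %s AND domain = %s;', (id_upto, domain))
    return cursor.fetchall()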
src/DoresA/domain.py
@@ -3,3 +3,11 @@ import socket
 
 def reverse(ip):
     return socket.gethostbyaddr(ip)
+
+
+def resolve(domain):
+    return socket.gethostbyname(domain)
+
+
+if __name__ == "__main__":
+    exit()
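Both socket calls raise on failure: socket.gethostbyaddr raises socket.herror when an address has no PTR record, and socket.gethostbyname raises socket.gaierror for unresolvable names. The reverse-DNS feature in train.py will need to treat these failures as data, since its first atomic feature counts exactly the unresolvable IPs. A hedged wrapper sketch with an illustrative name, not part of the commit:

# Illustrative wrapper, not part of this commit: return None instead of
# raising, so callers can count IPs that cannot be matched with a domain.
def reverse_or_none(ip):
    try:
        return socket.gethostbyaddr(ip)[0]  # primary hostname only
    except (socket.herror, socket.gaierror):
        return None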
src/DoresA/ip.py
@@ -1,4 +1,28 @@
 import re
+import logging
+
+from geoip2 import database, errors
+
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger('ip')
+
+
+def get_country_by_ip(ip):
+    with database.Reader('res/GeoLite2-Country_20170905/GeoLite2-Country.mmdb') as reader:
+        result = reader.country(ip)
+        return result.country.names['en']
+
+
+def get_isp_by_ip(ip):
+    with database.Reader('res/GeoLite2-ASN_20171107/GeoLite2-ASN.mmdb') as reader:
+        try:
+            result = reader.asn(ip)
+            return result.autonomous_system_number
+        except errors.AddressNotFoundError:
+            logger.debug('address not in isp database')
+
+
+def test():
+    print(get_isp_by_ip('178.27.82.37'))
+
+
 # proudly taken from https://stackoverflow.com/questions/319279/how-to-validate-ip-address-in-python
@@ -72,3 +96,7 @@ def is_valid_ipv6(ip):
         $
         """, re.VERBOSE | re.IGNORECASE | re.DOTALL)
     return pattern.match(ip) is not None
+
+
+if __name__ == "__main__":
+    test()
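Both helpers open a database.Reader per call, which re-reads the .mmdb file every time; once these run per log row in prepare_features, opening each reader once and reusing it is much cheaper. A hedged sketch with a module-level reader and an illustrative name, not part of the commit:

# Illustrative variant, not part of this commit: one module-level reader
# reused across calls; a geoip2 reader supports many lookups per instance
# and only needs close() at shutdown.
_country_reader = database.Reader('res/GeoLite2-Country_20170905/GeoLite2-Country.mmdb')


def get_country_by_ip_cached(ip):
    try:
        return _country_reader.country(ip).country.names['en']
    except errors.AddressNotFoundError:
        logger.debug('address not in country database')
        return None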
src/DoresA/location.py (deleted, -14)
@@ -1,14 +0,0 @@
-from geolite2 import geolite2
-
-
-def get_country_by_ip(ip):
-    with geolite2 as gl2:
-        reader = gl2.reader()
-        result = reader.get(ip)
-
-        if result:
-            return result['country']['names']['en']
-
-
-if __name__ == "__main__":
-    exit()
src/DoresA/logs/delete_not_A_or_AAAA.txt (new file, +9)
@@ -0,0 +1,9 @@
+
+delete from pdns_logs_test where type != 'A' AND type != 'AAAA';
+ERROR 2006 (HY000): MySQL server has gone away
+No connection. Trying to reconnect...
+Connection id: 3
+Current database: doresa
+
+Query OK, 101235298 rows affected (47 min 38.87 sec)
+
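The captured session shows ERROR 2006 during a DELETE that ran for almost 48 minutes: the server dropped the idle client session (typically wait_timeout) and the mysql CLI reconnected on its own. The Python side can opt into similar behaviour; a hedged sketch using MySQLdb's ping with reconnect enabled, reusing the connection settings from db.py (not part of the commit):

# Hedged sketch, not part of this commit: ask MySQLdb to transparently
# reconnect when the server has closed the session (error 2006).
import MySQLdb as mariadb

connection = mariadb.connect(host=sql_host, user=sql_user_name,
                             passwd=sql_pw, db=sql_db_name, port=sql_port)
connection.ping(True)  # True enables reconnect-on-demand for this connection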
pip selfcheck.json (path not shown)
@@ -1 +1 @@
-{"last_check":"2017-09-27T12:13:16Z","pypi_version":"9.0.1"}
+{"last_check":"2017-11-07T13:21:55Z","pypi_version":"9.0.1"}
requirements.txt
@@ -1,11 +1,27 @@
+certifi==2017.11.5
+chardet==3.0.4
+cycler==0.10.0
+geoip2==2.6.0
+graphviz==0.8
+idna==2.6
+javabridge==1.0.15
+matplotlib==2.0.2
 maxminddb==1.3.0
 maxminddb-geolite2==2017.803
 mysqlclient==1.3.12
+nltk==3.2.5
 numpy==1.13.1
+pandas==0.20.3
 progress==1.3
 pyenchant==1.6.11
 pymongo==3.5.1
+pyparsing==2.2.0
 python-dateutil==2.6.1
 python-geoip==1.2
+python-weka-wrapper3==0.1.3
 pytz==2017.2
+requests==2.18.4
+scikit-learn==0.19.0
+scipy==0.19.1
 six==1.10.0
+urllib3==1.22
src/DoresA/res/GeoLite2-ASN_20171107/GeoLite2-ASN.mmdb (new binary file, not shown)
(modified file; name not captured)
@@ -1,5 +1,7 @@
 from detect_cusum import detect_cusum
 import numpy as np
+import datetime
+import math
 
 
 def cusum():
@@ -18,11 +20,32 @@ def variance(a):
     return np.var(a)
 
 
+# 'slot_length': amount of seconds for one slot
+# returns a dictionary with slots as keys and timestamps as values
+def split_to_fixed_slots(all_response_timestamps, analysis_start, slot_length):
+    fixed_slots = {}
+
+    for response_timestamp in all_response_timestamps:
+        slot = math.floor((response_timestamp - analysis_start).seconds / slot_length)
+        fixed_slots[slot] = response_timestamp
+    return fixed_slots
+
+
+def is_short_lived(all_response_timestamps, analysis_start, analysis_end):
+    fixed_slots = split_to_fixed_slots(all_response_timestamps, analysis_start, 3600)  # timeslot of length 3600 seconds
+    print(fixed_slots)
+
+
 def test():
     # a = np.array((1, 2, 3))
     # b = np.array((0, 1, 2))
     # print(variance(a))
-    cusum()
+    test_timestamps = [datetime.datetime(2017, 5, 1, 2, 5, 5), datetime.datetime(2017, 5, 1, 2, 44, 21), datetime.datetime(2017, 5, 1, 4, 48, 19), datetime.datetime(2017, 5, 1, 4, 9, 40), datetime.datetime(2017, 5, 1, 5, 14, 11), datetime.datetime(2017, 5, 1, 6, 50, 54), datetime.datetime(2017, 5, 1, 8, 52, 52), datetime.datetime(2017, 5, 1, 9, 24, 31), datetime.datetime(2017, 5, 1, 13, 29, 12), datetime.datetime(2017, 5, 1, 12, 56, 56), datetime.datetime(2017, 5, 1, 15, 3, 15), datetime.datetime(2017, 5, 1, 15, 57, 6), datetime.datetime(2017, 5, 1, 17, 58, 44), datetime.datetime(2017, 5, 1, 20, 35, 52), datetime.datetime(2017, 5, 1, 23, 6, 16), datetime.datetime(2017, 5, 2, 0, 26, 11), datetime.datetime(2017, 5, 2, 2, 41, 52), datetime.datetime(2017, 5, 2, 3, 14, 5), datetime.datetime(2017, 5, 2, 4, 43, 52), datetime.datetime(2017, 5, 2, 5, 12, 2), datetime.datetime(2017, 5, 2, 9, 32, 59), datetime.datetime(2017, 5, 2, 11, 35, 18), datetime.datetime(2017, 5, 2, 13, 30, 43), datetime.datetime(2017, 5, 2, 17, 41), datetime.datetime(2017, 5, 2, 16, 58, 55), datetime.datetime(2017, 5, 2, 22, 9, 53), datetime.datetime(2017, 5, 2, 22, 18, 15), datetime.datetime(2017, 5, 3, 2, 22, 1), datetime.datetime(2017, 5, 3, 2, 21, 8), datetime.datetime(2017, 5, 3, 4, 24, 20), datetime.datetime(2017, 5, 3, 6, 53, 43), datetime.datetime(2017, 5, 3, 6, 56, 54), datetime.datetime(2017, 5, 3, 8, 27, 58), datetime.datetime(2017, 5, 3, 11, 0), datetime.datetime(2017, 5, 3, 13, 32, 53), datetime.datetime(2017, 5, 3, 13, 2), datetime.datetime(2017, 5, 3, 13, 3, 44), datetime.datetime(2017, 5, 3, 17, 6), datetime.datetime(2017, 5, 3, 20, 12, 22), datetime.datetime(2017, 5, 3, 20, 39, 54), datetime.datetime(2017, 5, 3, 20, 9, 1), datetime.datetime(2017, 5, 3, 22, 42, 2), datetime.datetime(2017, 5, 4, 0, 21, 32), datetime.datetime(2017, 5, 4, 0, 13, 15), datetime.datetime(2017, 5, 4, 1, 14, 3), datetime.datetime(2017, 5, 4, 1, 24, 33), datetime.datetime(2017, 5, 4, 5, 35, 41), datetime.datetime(2017, 5, 4, 7, 41, 13), datetime.datetime(2017, 5, 4, 9, 44), datetime.datetime(2017, 5, 4, 13, 49, 34), datetime.datetime(2017, 5, 4, 18, 1), datetime.datetime(2017, 5, 4, 19, 57, 25), datetime.datetime(2017, 5, 4, 20, 2, 56), datetime.datetime(2017, 5, 4, 23, 22, 25), datetime.datetime(2017, 5, 5, 0, 42, 21), datetime.datetime(2017, 5, 5, 1, 47, 24), datetime.datetime(2017, 5, 5, 5, 47, 6), datetime.datetime(2017, 5, 5, 9, 51, 1), datetime.datetime(2017, 5, 5, 11, 53, 1), datetime.datetime(2017, 5, 5, 12, 9, 22), datetime.datetime(2017, 5, 5, 13, 2, 33), datetime.datetime(2017, 5, 5, 13, 55, 1), datetime.datetime(2017, 5, 5, 15, 57, 1), datetime.datetime(2017, 5, 5, 16, 44, 29), datetime.datetime(2017, 5, 5, 18, 24, 55), datetime.datetime(2017, 5, 5, 23, 29, 55), datetime.datetime(2017, 5, 6, 1, 6, 2), datetime.datetime(2017, 5, 6, 2, 7, 1), datetime.datetime(2017, 5, 6, 3, 8, 1), datetime.datetime(2017, 5, 6, 5, 44, 4), datetime.datetime(2017, 5, 6, 7, 12, 4), datetime.datetime(2017, 5, 6, 8, 13, 2), datetime.datetime(2017, 5, 6, 8, 13, 9), datetime.datetime(2017, 5, 6, 9, 40, 54), datetime.datetime(2017, 5, 6, 10, 24, 15), datetime.datetime(2017, 5, 6, 12, 43, 56), datetime.datetime(2017, 5, 6, 17, 11, 2), datetime.datetime(2017, 5, 6, 17, 22, 1), datetime.datetime(2017, 5, 6, 19, 50, 54), datetime.datetime(2017, 5, 6, 19, 14, 29), datetime.datetime(2017, 5, 6, 21, 15, 34), datetime.datetime(2017, 5, 6, 23, 19, 34), datetime.datetime(2017, 5, 7, 1, 56, 54), datetime.datetime(2017, 5, 7, 2, 31, 23), datetime.datetime(2017, 5, 7, 3, 58, 54), datetime.datetime(2017, 5, 7, 6, 31, 22), datetime.datetime(2017, 5, 7, 7, 36, 1), datetime.datetime(2017, 5, 7, 8, 33, 51), datetime.datetime(2017, 5, 7, 9, 3, 56), datetime.datetime(2017, 5, 7, 10, 35, 36), datetime.datetime(2017, 5, 7, 12, 41, 1), datetime.datetime(2017, 5, 7, 17, 17, 37), datetime.datetime(2017, 5, 7, 18, 47, 19), datetime.datetime(2017, 5, 7, 19, 20, 37), datetime.datetime(2017, 5, 7, 21, 53, 30), datetime.datetime(2017, 5, 7, 23, 52, 20)]
+
+    train_start = datetime.datetime(2017, 5, 1)
+    train_end = datetime.datetime(2017, 5, 8)
+    is_short_lived(test_timestamps, train_start, train_end)
 
 
 if __name__ == "__main__":
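One caveat in split_to_fixed_slots: timedelta.seconds only holds the sub-day remainder, so over the week-long test window timestamps from different days fold into the same 24 hourly slots, and the dict keeps only the last timestamp per slot. If globally increasing slot indices over the whole window are intended, total_seconds() gives them. A hedged sketch, not part of the commit:

import math

# Hedged variant, not part of this commit: total_seconds() does not wrap at
# 24 hours, so each hour of the whole analysis window gets its own slot.
def slot_index(response_timestamp, analysis_start, slot_length):
    delta = response_timestamp - analysis_start
    return math.floor(delta.total_seconds() / slot_length)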
src/DoresA/train.py
@@ -4,54 +4,80 @@ from sklearn import tree
 import numpy as np
 import graphviz
 import datetime
+import logging
 import time
 import db
 import domain
 import ip
-import location
+import ttl
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('train')
+
 db_format_time = '%Y-%m-%d %H:%M:%S'
 
 train_start = datetime.date(2017, 5, 1)
-train_end = datetime.date(2017, 5, 2)
+train_end = datetime.date(2017, 5, 8)
+
+id_upto = 379283817
+
+# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
 
 
 def get_logs_from_db():
-    results = db.mariadb_get_logs(train_start.strftime(db_format_time), train_end.strftime(db_format_time))
+    results = db.mariadb_get_logs(id_upto)
 
     row = results.fetch_row(how=1)
 
-    print("# entity: " + row[0]['domain'])
+    logger.debug("# entity: " + row[0]['domain'])
 
     features = prepare_features(row[0])
 
-    print(str(features))
+    logger.info(str(features))
     # while row:
-    #     print("# entity: " + row[0]['domain'])
+    #     logger.debug("# entity: " + row[0]['domain'])
     #
     #     features = prepare_features(row[0])
     #
-    #     print(str(features))
+    #     logger.info(str(features))
     #
     #     row = results.fetch_row(how=1)
 
 
 def prepare_features(entity):
     # get all logs for the same domain
-    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], train_start.strftime(db_format_time),
-                                                     train_end.strftime(db_format_time))
+    checkpoint = time.time()
+    logger.debug('get logs for domain start')
+    # BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
+    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
+    logger.debug('get logs for domain done' + str(time.time() - checkpoint) + ' s')
 
+    # TODO do this efficient
     ttls = [log['ttl'] for log in logs_for_domain]
+    logger.info('ttls ' + str(ttls))
     ips = [log['record'] for log in logs_for_domain]  # TODO check if valid ip address
+    logger.info(ips)
+    response_timestamps = [log['timestamp'] for log in logs_for_domain]
+    logger.info(response_timestamps)
 
     domains_with_same_ip = []
     # get all logs for the same ip if valid ip
     if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
-        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], train_start.strftime(db_format_time),
-                                                 train_end.strftime(db_format_time))
+        checkpoint = time.time()
+        logger.debug('get logs for ip start')
+        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], id_upto)
+        logger.debug('get logs for ip done' + str(time.time() - checkpoint) + ' s')
         domains_with_same_ip = [log['domain'] for log in logs_for_ip]
 
     # feature 1: Short Life
+
+    # 2 atomic features
+
+    # atomic 1:
+
+    # atomic 2:
     short_life = 0
 
     # feature 2: Daily Similarity
@@ -60,10 +86,22 @@ def prepare_features(entity):
 
     # feature 3: Repeating Patterns
+
+    # 2 atomic features
+
+    # atomic 1:
+
+    # atomic 2:
     repeating_patterns = 0
 
     # feature 4: Access ratio
+
+    # 2 atomic features
+
+    # atomic 1:
+
+    # atomic 2:
     access_ratio = 0
 
     # feature 5: Number of distinct IP addresses
@@ -72,7 +110,7 @@ def prepare_features(entity):
 
     # feature 6: Number of distinct countries
 
-    distinct_countries = len(list(set([location.get_country_by_ip(ip) for ip in list(set(ips))])))
+    distinct_countries = len(list(set([ip.get_country_by_ip(ip_str) for ip_str in list(set(ips))])))
 
     # feature 7: Number of (distinct) domains share the IP with
@@ -80,6 +118,20 @@ def prepare_features(entity):
 
     # feature 8: Reverse DNS query results
+
+    # 5 atomic feature
+
+    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
+
+    # atomic 2: ips that are used for DSL lines
+
+    # atomic 3: ips that belong to hosting services
+
+    # atomic 4: ips that belong to known ISPs
+
+    # atomic 5: ips that can be matched with a valid domain name
+
+    # TODO add atomics to 'all_features'
     reverse_dns_result = 0
 
     # feature 9: Average TTL
@@ -88,7 +140,7 @@ def prepare_features(entity):
 
     # feature 10: Standard Deviation of TTL
 
-    standard_deviation = 0
+    standard_deviation = ttl.standard_deviation(ttls)  # TODO distinct ttls for std deviation?
 
     # feature 11: Number of distinct TTL values
@@ -96,23 +148,12 @@ def prepare_features(entity):
 
     # feature 12: Number of TTL change
 
-    ttl_changes = 0
+    ttl_changes = ttl.changes(ttls)
 
     # feature 13: Percentage usage of specific TTL ranges
     # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
-    # TODO decide if 5 individual features make a difference
-    ttl = entity['ttl']
-    specific_ttl_ranges = 4  # default is [900, inf]
-
-    if 0 < ttl <= 1:
-        specific_ttl_ranges = 0
-    elif 1 < ttl <= 100:
-        specific_ttl_ranges = 1
-    elif 100 < ttl <= 300:
-        specific_ttl_ranges = 2
-    elif 300 < ttl <= 900:
-        specific_ttl_ranges = 3
+    # TODO check if 5 individual features make a difference
+    specific_ttl_ranges = ttl.specific_range(entity['ttl'])
 
     # feature 14: % of numerical characters
@@ -133,11 +174,11 @@ def prepare_features(entity):
 
 def test():
     start = time.time()
-    print('starting training ' + str(start))
+    logger.debug('starting training ' + str(start))
 
     get_logs_from_db()
 
-    print('total duration: ' + str(time.time() - start) + 's')
+    logger.debug('total duration: ' + str(time.time() - start) + 's')
     db.close()
 
     # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
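id_upto stays hardcoded, as the commit message says; the commented-out line above sketches the intended replacement, a db.mariadb_get_nearest_id helper that does not exist yet. One plausible shape for it, assuming the auto-increment id grows with timestamp (hypothetical code, not in the commit):

# Hypothetical helper matching the commented-out call above, not part of this
# commit. Assumes ids are assigned in timestamp order, so the smallest id at
# or after the cut-off time bounds the training window.
def mariadb_get_nearest_id(cutoff_time):
    cursor = sql_connection.cursor()
    cursor.execute('SELECT MIN(id) FROM ' + sql_table_name +
                   ' WHERE timestamp >= %s;', (cutoff_time,))
    (nearest_id,) = cursor.fetchone()
    return nearest_id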
src/DoresA/ttl.py
@@ -5,5 +5,31 @@ def standard_deviation(array):
     return np.std(array)
 
 
+def changes(array):
+    current = array[0]
+    changes = 0
+
+    for item in array:
+        if item != current:
+            changes += 1
+            current = item
+    return changes
+
+
+# specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
+def specific_range(ttl):
+    specific_ttl_ranges = 4  # default is [900, inf]
+
+    if 0 < ttl <= 1:
+        specific_ttl_ranges = 0
+    elif 1 < ttl <= 100:
+        specific_ttl_ranges = 1
+    elif 100 < ttl <= 300:
+        specific_ttl_ranges = 2
+    elif 300 < ttl <= 900:
+        specific_ttl_ranges = 3
+    return specific_ttl_ranges
+
+
 if __name__ == "__main__":
     exit()
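For reference, how the two new helpers behave on small inputs (illustrative values, not from the commit):

# Illustrative usage of the new ttl helpers.
print(changes([300, 300, 60, 60, 300]))  # 2 (300 -> 60, then 60 -> 300)
print(specific_range(60))                # 1, the (1, 100] bucket
print(specific_range(3600))              # 4, the (900, inf) bucket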