split the db into separate files

added redis
train using csv files instead of sql
This commit is contained in:
2017-11-30 15:51:46 +01:00
parent 42fce4f17c
commit cf0536483b
12 changed files with 22775 additions and 73 deletions

View File

@@ -1,29 +1,69 @@
from sklearn.datasets import load_iris
from sklearn import tree
import logging
import datetime
# logfile = 'analysis_' + datetime.datetime.now().strftime('%Y-%m-%d_%H:%M') + '.log' # https://stackoverflow.com/questions/1943747/python-logging-before-you-run-logging-basicconfig
# logging.basicConfig(filename=logfile, filemode='w') # important to set basicConfig only once for all modules
logging.basicConfig()
import logging
import datetime
import gzip
import csv
import numpy as np
import graphviz
import datetime
import logging
import time
import db
import db_redis
import domain
import ip
import ttl
import csv_tools
import progressbar
# import db_sql
from sklearn.datasets import load_iris
from sklearn import tree
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('train')
logger.setLevel(logging.DEBUG)  # module logger is more verbose than the root config above
# Timestamp format used by the db layer's string-formatted timestamps.
db_format_time = '%Y-%m-%d %H:%M:%S'
# Training window. NOTE(review): diff remnant — train_end is assigned twice;
# the second (newer) value wins at runtime.
train_start = datetime.date(2017, 5, 1)
train_end = datetime.date(2017, 5, 8)
train_end = datetime.date(2017, 5, 4)
# Highest log row id included in training — presumably precomputed from
# train_end via the commented-out lookup below; TODO confirm provenance.
id_upto = 379283817
# record types that should be analysed (e.g. only A)
record_types = ['A']
# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
def train():
    """Train over the configured analysis days by streaming gzipped CSV logs.

    For every day in ``csv_tools.analysis_days`` iterates the 24 hourly file
    groups, reads each gzip-compressed CSV file, and feeds every row whose
    record type is in ``record_types`` into ``prepare_features_redis``.
    Malformed rows and per-entity failures are logged and skipped.
    """
    start = time.time()
    for day in range(csv_tools.analysis_days_amount):
        log_files_hour = csv_tools.get_log_files_for_hours_of_day(csv_tools.analysis_days[day])
        progress_bar = progressbar.ProgressBar()
        for hour in progress_bar(range(24)):
            for hour_file in log_files_hour[hour]:
                # 'rt' + newline='' so the csv module handles line endings itself
                with gzip.open(hour_file, 'rt', newline='') as file:
                    reader = csv.reader(file)
                    for row in reader:
                        # row layout: timestamp, domain, type, record, ttl
                        # (guard against short/malformed rows before indexing)
                        if len(row) >= 5 and row[2] in record_types:
                            entity = {'timestamp': row[0], 'domain': row[1], 'type': row[2],
                                      'record': row[3], 'ttl': row[4]}
                            try:
                                prepare_features_redis(entity)
                            except Exception:
                                # keep going on bad entities, but record the
                                # full traceback (logger.error(e) dropped it)
                                logger.exception('Exception occurred processing entity: %s', entity)
    logger.debug('train() finished in %s s', time.time() - start)
def get_logs_from_db():
    # NOTE(review): this span is a diff-extraction artifact. The hunk header
    # below marks lines elided from view, and both the old and new variant of
    # several lines are present (prepare_features vs prepare_features_redis,
    # logger.info vs logger.debug). Not runnable as shown — confirm against
    # the full file before editing.
    results = db.mariadb_get_logs(id_upto)
@@ -31,43 +71,133 @@ def get_logs_from_db():
    logger.debug("# entity: " + row[0]['domain'])
    features = prepare_features(row[0])
    features = prepare_features_redis(row[0])
    logger.info(str(features))
    logger.debug(str(features))
    # while row:
    #     logger.debug("# entity: " + row[0]['domain'])
    #
    #     features = prepare_features(row[0])
    #
    #     logger.info(str(features))
    #     logger.debug(str(features))
    #
    #     row = results.fetch_row(how=1)
def prepare_features_redis(entity):
    """Build the feature vector for one DNS log entity from redis statistics.

    entity: dict with keys 'timestamp', 'domain', 'type', 'record', 'ttl'
    (one parsed CSV log row — see train()).

    Returns a numpy array with the features numbered in the inline comments.
    Raises ValueError when redis holds no stats for the entity's domain.
    """
    checkpoint = time.time()
    domain_stats = db_redis.get_stats_for_domain(entity['domain'])
    ip_stats = db_redis.get_stats_for_ip(entity['record'])
    logger.debug('redis took %s s', time.time() - checkpoint)
    logger.debug(domain_stats)
    if not domain_stats:
        # was: logger.debug('no stats...' + entity) — concatenating the dict
        # raised TypeError, and execution fell through to domain_stats[0]
        # which would raise IndexError anyway. Fail loudly instead.
        raise ValueError('no stats in redis for entity: ' + str(entity))
    if len(domain_stats) != 1:
        logger.debug('expected exactly one stats entry, got %d for entity: %s',
                     len(domain_stats), entity)
    domain_stats = domain_stats[0]
    # TODO fill the IP list from the redis stats
    ips = []
    # feature 5: Number of distinct IP addresses
    distinct_ips = len(ips)
    # feature 6: Number of distinct countries
    # (was len() of a plain list, which always equalled len(ips); a set
    # actually deduplicates the countries)
    distinct_countries = len({ip.get_country_by_ip(ip_str) for ip_str in ips})
    # feature 7: Number of (distinct) domains share the IP with
    distinct_domains_with_same_ip = len(ip_stats)
    # feature 8: Reverse DNS query results — 5 atomic features, all TODO
    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
    ratio_ips_nx = 0
    # atomic 2: ratio of ips that are used for DSL lines
    ratio_ips_dsl = 0
    # atomic 3: ratio of ips that belong to hosting services
    ratio_ips_hoster = 0
    # atomic 4: ratio of ips that belong to known ISPs
    ratio_ips_isp = 0
    # atomic 5: ips that can be matched with a valid domain name
    ratio_ips_valid = 0
    # TODO add atomics to 'all_features'
    reverse_dns_result = 0
    # feature 9: Average TTL (assumes domain_stats['ttls'] is non-empty when
    # stats exist — TODO confirm against the redis writer)
    average_ttl = sum(domain_stats['ttls']) / len(domain_stats['ttls'])
    # feature 10: Standard Deviation of TTL
    standard_deviation = ttl.standard_deviation(domain_stats['ttls'])  # TODO distinct ttls for std deviation?
    # feature 11: Number of distinct TTL values
    distinct_ttl = len(set(domain_stats['ttls']))
    # feature 12: Number of TTL change
    ttl_changes = ttl.changes(domain_stats['ttls'])
    # feature 13: Percentage usage of specific TTL ranges
    # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
    # TODO check if 5 individual features make a difference
    specific_ttl_ranges = ttl.specific_range(entity['ttl'])
    # feature 14: % of numerical characters
    numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])
    # feature 15: % of the length of the LMS
    lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])
    all_features = np.array([
        distinct_ips, distinct_countries,
        distinct_domains_with_same_ip, reverse_dns_result, average_ttl,
        standard_deviation, distinct_ttl, ttl_changes,
        specific_ttl_ranges, numerical_characters_percent, lms_percent,
    ])
    logger.debug(all_features)
    # NOTE: a stray debug exit() here used to terminate the interpreter before
    # the return — removed so callers actually receive the feature vector.
    return all_features
def prepare_features_mysql(entity):
    # NOTE(review): diff-extraction artifact — several logger calls below
    # appear in both their old (debug) and new (info) variant, and the
    # function continues past an elided hunk; this view is incomplete.
    checkpoint = time.time()
    logger.debug('get logs for domain start')
    # get all logs for the same domain
    # BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
    logger.debug('get logs for domain done' + str(time.time() - checkpoint) + ' s')
    logger.info('get logs for domain done' + str(time.time() - checkpoint) + ' s')
    # TODO do this efficient
    ttls = [log['ttl'] for log in logs_for_domain]
    logger.info('ttls ' + str(ttls))
    logger.debug('ttls ' + str(ttls))
    ips = [log['record'] for log in logs_for_domain]  # TODO check if valid ip address
    logger.info(ips)
    logger.debug(ips)
    response_timestamps = [log['timestamp'] for log in logs_for_domain]
    logger.info(response_timestamps)
    logger.debug(response_timestamps)
    domains_with_same_ip = []
    # get all logs for the same ip if valid ip
    if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
        checkpoint = time.time()
        logger.debug('get logs for ip start')
        logger.info('get logs for ip start')
        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], id_upto)
        logger.debug('get logs for ip done' + str(time.time() - checkpoint) + ' s')
        logger.info('get logs for ip done' + str(time.time() - checkpoint) + ' s')
        domains_with_same_ip = [log['domain'] for log in logs_for_ip]
    # feature 1: Short Life
@@ -174,12 +304,12 @@ def prepare_features(entity):
def test():
    """Run the CSV-based training end to end and report the total duration."""
    start = time.time()
    logger.info('starting training ' + str(start))
    # NOTE(review): the diff remnants showed both the old pipeline
    # (get_logs_from_db / db.close) and the new one (train / cleanup);
    # this keeps only the new CSV+redis pipeline.
    train()
    logger.info('total duration: ' + str(time.time() - start) + 's')
    cleanup()
    # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
@@ -197,5 +327,9 @@ def flow():
graph.render('test', view=True)
def cleanup():
    """Release external resources; currently just closes the db connection."""
    db.close()
# Script entry point: run the full training flow.
if __name__ == "__main__":
    test()