# NOTE(review): removed a pasted file-listing header ("336 lines / 9.4 KiB / Python")
# that was copy/paste residue, not code.
import logging
|
|
import datetime
|
|
# logfile = 'analysis_' + datetime.datetime.now().strftime('%Y-%m-%d_%H:%M') + '.log' # https://stackoverflow.com/questions/1943747/python-logging-before-you-run-logging-basicconfig
|
|
# logging.basicConfig(filename=logfile, filemode='w') # important to set basicConfig only once for all modules
|
|
logging.basicConfig()
|
|
|
|
import logging
|
|
import datetime
|
|
import gzip
|
|
import csv
|
|
import numpy as np
|
|
import graphviz
|
|
import time
|
|
import db_redis
|
|
import domain
|
|
import ip
|
|
import ttl
|
|
import csv_tools
|
|
import progressbar
|
|
# import db_sql
|
|
|
|
from sklearn.datasets import load_iris
|
|
from sklearn import tree
|
|
|
|
# Module-level logger; DEBUG here overrides the root level configured by
# logging.basicConfig() at the top of the file.
logger = logging.getLogger('train')
logger.setLevel(logging.DEBUG)

# strftime/strptime format matching the database's DATETIME string representation
db_format_time = '%Y-%m-%d %H:%M:%S'

# Training window — presumably [train_start, train_end); confirm against the
# queries that consume these dates.
train_start = datetime.date(2017, 5, 1)
train_end = datetime.date(2017, 5, 4)

# Highest log-row id to consider; a hard-coded snapshot of the commented-out
# lookup below — TODO recompute when the underlying data changes.
id_upto = 379283817

# record types that should be analysed (e.g. only A)
record_types = ['A']

# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
|
|
|
|
|
|
def train():
    """Feed every matching DNS log record into prepare_features_redis().

    Iterates over all configured analysis days, opens each hour's gzip'd CSV
    log files, and processes every row whose record type is in the
    module-level `record_types` whitelist.

    Expected positional CSV row layout: timestamp, domain, type, record, ttl.
    Exceptions from feature preparation are logged (with traceback) and
    processing continues with the next row.
    """
    for day in range(csv_tools.analysis_days_amount):
        log_files_hour = csv_tools.get_log_files_for_hours_of_day(csv_tools.analysis_days[day])

        progress_bar = progressbar.ProgressBar()

        for hour in progress_bar(range(24)):
            for hour_file in log_files_hour[hour]:
                # 'rt' + newline='' is the csv-module-recommended way to read text CSVs
                with gzip.open(hour_file, 'rt', newline='') as file:
                    reader = csv.reader(file)

                    for row in reader:
                        # guard clause: skip record types we don't analyse
                        if row[2] not in record_types:
                            continue
                        entity = {'timestamp': row[0], 'domain': row[1], 'type': row[2],
                                  'record': row[3], 'ttl': row[4]}
                        try:
                            prepare_features_redis(entity)
                        except Exception:
                            # logger.exception records the full traceback, not just str(e);
                            # lazy %-args avoid building the message when the level is off
                            logger.exception('Exception occurred processing entity: %s', entity)
|
|
|
|
|
|
def get_logs_from_db():
    """Fetch one log row from MariaDB and run Redis-based feature extraction on it.

    NOTE(review): the name 'db' is never imported in this file (only 'db_redis'
    is; 'import db_sql' is commented out at the top), so calling this function
    raises NameError — confirm which module is supposed to provide 'db'.
    """
    results = db.mariadb_get_logs(id_upto)

    # how=1 presumably returns rows as dicts (MySQLdb-style result API) — TODO confirm
    row = results.fetch_row(how=1)

    logger.debug("# entity: " + row[0]['domain'])

    features = prepare_features_redis(row[0])

    logger.debug(str(features))
    # while row:
    # logger.debug("# entity: " + row[0]['domain'])
    #
    # features = prepare_features(row[0])
    #
    # logger.debug(str(features))
    #
    # row = results.fetch_row(how=1)
|
|
|
|
|
|
def prepare_features_redis(entity):
    """Build the feature vector for one DNS log entry from Redis statistics.

    :param entity: dict with keys 'timestamp', 'domain', 'type', 'record', 'ttl'
    :return: numpy array of 11 features (features 1-4 of the full 15-feature
             set are only computed in prepare_features_mysql)
    :raises ValueError: if Redis has no stats for the entity's domain
    """
    checkpoint = time.time()
    domain_stats = db_redis.get_stats_for_domain(entity['domain'])
    ip_stats = db_redis.get_stats_for_ip(entity['record'])
    logger.debug('redis took %s s', time.time() - checkpoint)

    logger.debug(domain_stats)

    if not domain_stats:
        # was: logger.debug('... ' + entity) -> TypeError (str + dict), then
        # fell through to domain_stats[0] and crashed with IndexError
        raise ValueError('no stats in redis for entity: ' + str(entity))
    if len(domain_stats) != 1:
        logger.warning('expected exactly one stats entry, got %d for entity: %s',
                       len(domain_stats), entity)

    domain_stats = domain_stats[0]

    # TODO populate from domain_stats / ip_stats; features 5 and 6 are
    # currently always 0 because of this placeholder
    ips = []

    # feature 5: Number of distinct IP addresses
    distinct_ips = len(ips)

    # feature 6: Number of distinct countries
    # TODO should deduplicate (len(set(...))) like the mysql variant does
    distinct_countries = len([ip.get_country_by_ip(ip_str) for ip_str in ips])

    # feature 7: Number of (distinct) domains share the IP with
    distinct_domains_with_same_ip = len(ip_stats)

    # feature 8: Reverse DNS query results — 5 atomic features, all TODO:
    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
    ratio_ips_nx = 0
    # atomic 2: ratio of ips that are used for DSL lines
    ratio_ips_dsl = 0
    # atomic 3: ratio of ips that belong to hosting services
    ratio_ips_hoster = 0
    # atomic 4: ratio of ips that belong to known ISPs
    ratio_ips_isp = 0
    # atomic 5: ips that can be matched with a valid domain name
    ratio_ips_valid = 0
    # TODO add atomics to 'all_features'
    reverse_dns_result = 0

    ttls = domain_stats['ttls']

    # feature 9: Average TTL (guard against an empty ttl list -> ZeroDivisionError)
    average_ttl = sum(ttls) / len(ttls) if ttls else 0

    # feature 10: Standard Deviation of TTL
    standard_deviation = ttl.standard_deviation(ttls)  # TODO distinct ttls for std deviation?

    # feature 11: Number of distinct TTL values
    distinct_ttl = len(set(ttls))

    # feature 12: Number of TTL change
    ttl_changes = ttl.changes(ttls)

    # feature 13: Percentage usage of specific TTL ranges
    # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
    # TODO check if 5 individual features make a difference
    specific_ttl_ranges = ttl.specific_range(entity['ttl'])

    # feature 14: % of numerical characters
    numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])

    # feature 15: % of the length of the LMS
    lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])

    all_features = np.array([
        distinct_ips, distinct_countries,
        distinct_domains_with_same_ip, reverse_dns_result, average_ttl, standard_deviation, distinct_ttl, ttl_changes,
        specific_ttl_ranges, numerical_characters_percent, lms_percent
    ])
    logger.debug(all_features)
    # removed debugging leftover: a bare exit() here terminated the process
    # after the first entity, so train() never processed more than one row
    # and the return below was unreachable
    return all_features
|
|
|
|
|
|
def prepare_features_mysql(entity):
    """Build the full 15-feature vector for one DNS log entry from MariaDB.

    NOTE(review): relies on a module-level name 'db' that is never imported in
    this file (only 'db_redis'; 'import db_sql' is commented out), so calling
    this currently raises NameError — confirm the intended DB access module.

    :param entity: dict with keys 'timestamp', 'domain', 'type', 'record', 'ttl'
    :return: numpy array of 15 features (features 1-4 and 8 are placeholder zeros)
    """
    checkpoint = time.time()
    logger.debug('get logs for domain start')
    # get all logs for the same domain
    # BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
    logger.info('get logs for domain done %s s', time.time() - checkpoint)

    # TODO do this efficient (single pass instead of three comprehensions)
    ttls = [log['ttl'] for log in logs_for_domain]
    logger.debug('ttls %s', ttls)
    ips = [log['record'] for log in logs_for_domain]  # TODO check if valid ip address
    logger.debug(ips)
    response_timestamps = [log['timestamp'] for log in logs_for_domain]
    logger.debug(response_timestamps)

    domains_with_same_ip = []
    # get all logs for the same ip if valid ip
    if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
        checkpoint = time.time()
        logger.info('get logs for ip start')
        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], id_upto)
        logger.info('get logs for ip done %s s', time.time() - checkpoint)
        domains_with_same_ip = [log['domain'] for log in logs_for_ip]

    # feature 1: Short Life (2 atomic features, TODO)
    short_life = 0

    # feature 2: Daily Similarity (TODO)
    daily_similarity = 0

    # feature 3: Repeating Patterns (2 atomic features, TODO)
    repeating_patterns = 0

    # feature 4: Access ratio (2 atomic features, TODO)
    access_ratio = 0

    # feature 5: Number of distinct IP addresses
    distinct_ips = len(set(ips))

    # feature 6: Number of distinct countries
    distinct_countries = len({ip.get_country_by_ip(ip_str) for ip_str in set(ips)})

    # feature 7: Number of (distinct) domains share the IP with
    distinct_domains_with_same_ip = len(set(domains_with_same_ip))

    # feature 8: Reverse DNS query results — 5 atomic features, all TODO:
    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
    # atomic 2: ips that are used for DSL lines
    # atomic 3: ips that belong to hosting services
    # atomic 4: ips that belong to known ISPs
    # atomic 5: ips that can be matched with a valid domain name
    # TODO add atomics to 'all_features'
    reverse_dns_result = 0

    # feature 9: Average TTL (guard: no logs for the domain -> empty ttls
    # previously raised ZeroDivisionError)
    average_ttl = sum(ttls) / len(ttls) if ttls else 0

    # feature 10: Standard Deviation of TTL
    standard_deviation = ttl.standard_deviation(ttls)  # TODO distinct ttls for std deviation?

    # feature 11: Number of distinct TTL values
    distinct_ttl = len(set(ttls))

    # feature 12: Number of TTL change
    ttl_changes = ttl.changes(ttls)

    # feature 13: Percentage usage of specific TTL ranges
    # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
    # TODO check if 5 individual features make a difference
    specific_ttl_ranges = ttl.specific_range(entity['ttl'])

    # feature 14: % of numerical characters
    numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])

    # feature 15: % of the length of the LMS
    lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])

    all_features = np.array([
        short_life, daily_similarity, repeating_patterns, access_ratio, distinct_ips, distinct_countries,
        distinct_domains_with_same_ip, reverse_dns_result, average_ttl, standard_deviation, distinct_ttl, ttl_changes,
        specific_ttl_ranges, numerical_characters_percent, lms_percent
    ])

    return all_features
|
|
|
|
|
|
def test():
    """Run the training pipeline end to end, log the total duration, clean up.

    Despite the name this is the script's main entry point, not a unit test.
    """
    start = time.time()
    # lazy %-args avoid building the message string when the level is disabled
    logger.info('starting training %s', start)

    train()

    logger.info('total duration: %s s', time.time() - start)
    cleanup()

    # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
|
|
|
|
|
|
def flow():
    """Fit a decision-tree classifier on the iris dataset and render the tree.

    Demonstration of the sklearn + graphviz pipeline: trains on the full iris
    data/target arrays, then writes and opens a rendered tree named 'test'.
    """
    iris = load_iris()
    classifier = tree.DecisionTreeClassifier()
    classifier = classifier.fit(iris.data, iris.target)  # training set, manual classification

    # predict single or multiple sets with classifier.predict([[]])

    # visualize decision tree classifier
    dot_source = tree.export_graphviz(classifier, out_file=None)
    graphviz.Source(dot_source).render('test', view=True)
|
|
|
|
|
|
def cleanup():
    """Close the database connection.

    NOTE(review): 'db' is never imported in this file (only 'db_redis';
    'import db_sql' is commented out), so this raises NameError when called —
    confirm the intended DB module.
    """
    db.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: runs the full training pipeline (see test()).
    test()
|