added logging, use index (id) for sql queries (id hardcoded atm), added newer version of EXPOSURE paper
This commit is contained in:
@@ -4,54 +4,80 @@ from sklearn import tree
|
||||
import numpy as np
|
||||
import graphviz
|
||||
import datetime
|
||||
import logging
|
||||
import time
|
||||
import db
|
||||
import domain
|
||||
import ip
|
||||
import location
|
||||
import ttl
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger('train')
|
||||
|
||||
db_format_time = '%Y-%m-%d %H:%M:%S'
|
||||
|
||||
train_start = datetime.date(2017, 5, 1)
|
||||
train_end = datetime.date(2017, 5, 2)
|
||||
train_end = datetime.date(2017, 5, 8)
|
||||
|
||||
id_upto = 379283817
|
||||
|
||||
# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
|
||||
|
||||
|
||||
def get_logs_from_db():
|
||||
results = db.mariadb_get_logs(train_start.strftime(db_format_time), train_end.strftime(db_format_time))
|
||||
results = db.mariadb_get_logs(id_upto)
|
||||
|
||||
row = results.fetch_row(how=1)
|
||||
|
||||
print("# entity: " + row[0]['domain'])
|
||||
logger.debug("# entity: " + row[0]['domain'])
|
||||
|
||||
features = prepare_features(row[0])
|
||||
|
||||
print(str(features))
|
||||
logger.info(str(features))
|
||||
# while row:
|
||||
# print("# entity: " + row[0]['domain'])
|
||||
# logger.debug("# entity: " + row[0]['domain'])
|
||||
#
|
||||
# features = prepare_features(row[0])
|
||||
#
|
||||
# print(str(features))
|
||||
# logger.info(str(features))
|
||||
#
|
||||
# row = results.fetch_row(how=1)
|
||||
|
||||
|
||||
def prepare_features(entity):
|
||||
# get all logs for the same domain
|
||||
logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], train_start.strftime(db_format_time),
|
||||
train_end.strftime(db_format_time))
|
||||
|
||||
checkpoint = time.time()
|
||||
logger.debug('get logs for domain start')
|
||||
# BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
|
||||
logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
|
||||
logger.debug('get logs for domain done' + str(time.time() - checkpoint) + ' s')
|
||||
|
||||
# TODO do this efficiently
|
||||
ttls = [log['ttl'] for log in logs_for_domain]
|
||||
logger.info('ttls ' + str(ttls))
|
||||
ips = [log['record'] for log in logs_for_domain] # TODO check if valid ip address
|
||||
logger.info(ips)
|
||||
response_timestamps = [log['timestamp'] for log in logs_for_domain]
|
||||
logger.info(response_timestamps)
|
||||
|
||||
domains_with_same_ip = []
|
||||
# get all logs for the same ip if valid ip
|
||||
if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
|
||||
logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], train_start.strftime(db_format_time),
|
||||
train_end.strftime(db_format_time))
|
||||
checkpoint = time.time()
|
||||
logger.debug('get logs for ip start')
|
||||
logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], id_upto)
|
||||
logger.debug('get logs for ip done' + str(time.time() - checkpoint) + ' s')
|
||||
domains_with_same_ip = [log['domain'] for log in logs_for_ip]
|
||||
|
||||
# feature 1: Short Life
|
||||
|
||||
# 2 atomic features
|
||||
|
||||
# atomic 1:
|
||||
|
||||
# atomic 2:
|
||||
|
||||
short_life = 0
|
||||
|
||||
# feature 2: Daily Similarity
|
||||
@@ -60,10 +86,22 @@ def prepare_features(entity):
|
||||
|
||||
# feature 3: Repeating Patterns
|
||||
|
||||
# 2 atomic features
|
||||
|
||||
# atomic 1:
|
||||
|
||||
# atomic 2:
|
||||
|
||||
repeating_patterns = 0
|
||||
|
||||
# feature 4: Access ratio
|
||||
|
||||
# 2 atomic features
|
||||
|
||||
# atomic 1:
|
||||
|
||||
# atomic 2:
|
||||
|
||||
access_ratio = 0
|
||||
|
||||
# feature 5: Number of distinct IP addresses
|
||||
@@ -72,7 +110,7 @@ def prepare_features(entity):
|
||||
|
||||
# feature 6: Number of distinct countries
|
||||
|
||||
distinct_countries = len(list(set([location.get_country_by_ip(ip) for ip in list(set(ips))])))
|
||||
distinct_countries = len(list(set([ip.get_country_by_ip(ip_str) for ip_str in list(set(ips))])))
|
||||
|
||||
# feature 7: Number of (distinct) domains share the IP with
|
||||
|
||||
@@ -80,6 +118,20 @@ def prepare_features(entity):
|
||||
|
||||
# feature 8: Reverse DNS query results
|
||||
|
||||
# 5 atomic features
|
||||
|
||||
# atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
|
||||
|
||||
# atomic 2: ips that are used for DSL lines
|
||||
|
||||
# atomic 3: ips that belong to hosting services
|
||||
|
||||
# atomic 4: ips that belong to known ISPs
|
||||
|
||||
# atomic 5: ips that can be matched with a valid domain name
|
||||
|
||||
# TODO add atomics to 'all_features'
|
||||
|
||||
reverse_dns_result = 0
|
||||
|
||||
# feature 9: Average TTL
|
||||
@@ -88,7 +140,7 @@ def prepare_features(entity):
|
||||
|
||||
# feature 10: Standard Deviation of TTL
|
||||
|
||||
standard_deviation = 0
|
||||
standard_deviation = ttl.standard_deviation(ttls) # TODO distinct ttls for std deviation?
|
||||
|
||||
# feature 11: Number of distinct TTL values
|
||||
|
||||
@@ -96,23 +148,12 @@ def prepare_features(entity):
|
||||
|
||||
# feature 12: Number of TTL change
|
||||
|
||||
ttl_changes = 0
|
||||
ttl_changes = ttl.changes(ttls)
|
||||
|
||||
# feature 13: Percentage usage of specific TTL ranges
|
||||
# specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
|
||||
# TODO decide if 5 individual features make a difference
|
||||
|
||||
ttl = entity['ttl']
|
||||
specific_ttl_ranges = 4 # default is [900, inf]
|
||||
|
||||
if 0 < ttl <= 1:
|
||||
specific_ttl_ranges = 0
|
||||
elif 1 < ttl <= 100:
|
||||
specific_ttl_ranges = 1
|
||||
elif 100 < ttl <= 300:
|
||||
specific_ttl_ranges = 2
|
||||
elif 300 < ttl <= 900:
|
||||
specific_ttl_ranges = 3
|
||||
# TODO check if 5 individual features make a difference
|
||||
specific_ttl_ranges = ttl.specific_range(entity['ttl'])
|
||||
|
||||
# feature 14: % of numerical characters
|
||||
|
||||
@@ -133,11 +174,11 @@ def prepare_features(entity):
|
||||
|
||||
def test():
|
||||
start = time.time()
|
||||
print('starting training ' + str(start))
|
||||
logger.debug('starting training ' + str(start))
|
||||
|
||||
get_logs_from_db()
|
||||
|
||||
print('total duration: ' + str(time.time() - start) + 's')
|
||||
logger.debug('total duration: ' + str(time.time() - start) + 's')
|
||||
db.close()
|
||||
|
||||
# db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
|
||||
|
||||
Reference in New Issue
Block a user