first features ready for training

This commit is contained in:
2017-11-06 21:29:55 +01:00
parent 89c6490019
commit f31f645323
12 changed files with 1861 additions and 21 deletions

160
src/DoresA/train.py Normal file
View File

@@ -0,0 +1,160 @@
from sklearn.datasets import load_iris
from sklearn import tree
import numpy as np
import graphviz
import datetime
import time
import db
import domain
import ip
import location
# strftime pattern used to render the window bounds before they are handed to the db helpers
db_format_time = '%Y-%m-%d %H:%M:%S'
# bounds of the log window that the feature extraction below queries
# NOTE(review): whether train_end is inclusive depends on the queries inside db - confirm
train_start = datetime.date(2017, 5, 1)
train_end = datetime.date(2017, 5, 2)
def get_logs_from_db():
    """Fetch the logs of the training window and print the features of the first one.

    Queries ``db.mariadb_get_logs`` for the window ``train_start`` .. ``train_end``,
    fetches the first row as a dict (``how=1``), builds its feature vector with
    ``prepare_features`` and prints the domain and the vector.

    Nothing is returned. When the query yields no rows a short notice is printed
    instead of failing on the missing first row.
    """
    results = db.mariadb_get_logs(train_start.strftime(db_format_time), train_end.strftime(db_format_time))

    row = results.fetch_row(how=1)

    # presumably a MySQLdb result object: fetch_row() yields an empty sequence
    # when no rows are left, so indexing row[0] blindly would raise IndexError
    if not row:
        print("# no logs found in training window")
        return

    print("# entity: " + row[0]['domain'])

    features = prepare_features(row[0])

    print(str(features))

    # TODO process every row instead of only the first one, e.g.:
    # while row:
    #     print("# entity: " + row[0]['domain'])
    #     features = prepare_features(row[0])
    #     print(str(features))
    #     row = results.fetch_row(how=1)
def _ttl_range_index(ttl):
    """Return the index of the specific TTL range that contains ``ttl``.

    Ranges: 0 -> [0, 1], 1 -> (1, 100], 2 -> (100, 300], 3 -> (300, 900],
    4 -> (900, inf). A TTL of 0 belongs to the first range (it used to fall
    through to the last one). Values below 0 are not valid TTLs and also end up
    in the first range.
    """
    for index, upper_bound in enumerate((1, 100, 300, 900)):
        if ttl <= upper_bound:
            return index
    return 4  # everything above 900


def prepare_features(entity):
    """Build the feature vector for a single log entry.

    :param entity: one log row as a dict; the keys 'domain', 'record' and 'ttl' are read
    :return: numpy array holding the 15 features in the order they are numbered below;
             features that are not implemented yet are reported as 0
    """
    window_start = train_start.strftime(db_format_time)
    window_end = train_end.strftime(db_format_time)

    # get all logs for the same domain
    logs_for_domain = db.mariadb_get_logs_for_domain(entity['domain'], window_start, window_end)
    ttls = [log['ttl'] for log in logs_for_domain]
    ips = [log['record'] for log in logs_for_domain]  # TODO check if valid ip address
    unique_ips = set(ips)

    # get all logs for the same ip if valid ip
    domains_with_same_ip = []
    if ip.is_valid_ipv4(entity['record']) or ip.is_valid_ipv6(entity['record']):
        logs_for_ip = db.mariadb_get_logs_for_ip(entity['record'], window_start, window_end)
        domains_with_same_ip = [log['domain'] for log in logs_for_ip]

    # feature 1: Short Life
    short_life = 0

    # feature 2: Daily Similarity
    daily_similarity = 0

    # feature 3: Repeating Patterns
    repeating_patterns = 0

    # feature 4: Access ratio
    access_ratio = 0

    # feature 5: Number of distinct IP addresses
    distinct_ips = len(unique_ips)

    # feature 6: Number of distinct countries
    # the loop variable is deliberately not called `ip`, that name is the imported module
    distinct_countries = len({location.get_country_by_ip(address) for address in unique_ips})

    # feature 7: Number of (distinct) domains share the IP with
    distinct_domains_with_same_ip = len(set(domains_with_same_ip))

    # feature 8: Reverse DNS query results
    reverse_dns_result = 0

    # feature 9: Average TTL
    # guard against an empty log list, which would otherwise divide by zero
    average_ttl = sum(ttls) / len(ttls) if ttls else 0

    # feature 10: Standard Deviation of TTL
    standard_deviation = 0

    # feature 11: Number of distinct TTL values
    distinct_ttl = len(set(ttls))

    # feature 12: Number of TTL change
    ttl_changes = 0

    # feature 13: Percentage usage of specific TTL ranges
    # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
    # TODO decide if 5 individual features make a difference
    specific_ttl_ranges = _ttl_range_index(entity['ttl'])

    # feature 14: % of numerical characters
    numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])

    # feature 15: % of the length of the LMS
    lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])

    all_features = np.array([
        short_life, daily_similarity, repeating_patterns, access_ratio, distinct_ips, distinct_countries,
        distinct_domains_with_same_ip, reverse_dns_result, average_ttl, standard_deviation, distinct_ttl, ttl_changes,
        specific_ttl_ranges, numerical_characters_percent, lms_percent
    ])

    return all_features
def test():
    """Run ``get_logs_from_db`` once, print timing information and call ``db.close()``.

    The start timestamp is printed before the run and the total duration after
    it. ``db.close()`` is executed in a ``finally`` clause so that it also runs
    when the feature extraction raises; the exception itself is not swallowed.
    """
    start = time.time()
    print('starting training ' + str(start))

    try:
        get_logs_from_db()

        print('total duration: ' + str(time.time() - start) + 's')
    finally:
        # always executed, otherwise an error above would skip the close call
        db.close()

    # manual check kept for reference:
    # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
def flow():
    """Fit a decision tree on the iris dataset and render it with graphviz.

    The rendered graph is written under the name 'test' and opened in a viewer
    (``view=True``). Nothing is returned.
    """
    dataset = load_iris()

    # training set, manual classification; fit() hands back the fitted estimator
    classifier = tree.DecisionTreeClassifier().fit(dataset.data, dataset.target)

    # predict single or multiple sets with classifier.predict([[]])

    # visualize decision tree classifier
    dot_source = tree.export_graphviz(classifier, out_file=None)
    graphviz.Source(dot_source).render('test', view=True)
# call test() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    test()