# master_thesis/src/DoresA/train.py

import logging
import datetime
# Configure logging here, before the remaining modules are imported below.
# The two commented-out lines are the variant that writes to a timestamped
# log file instead of using the default configuration.
# logfile = 'analysis_' + datetime.datetime.now().strftime('%Y-%m-%d_%H:%M') + '.log'  # https://stackoverflow.com/questions/1943747/python-logging-before-you-run-logging-basicconfig
# logging.basicConfig(filename=logfile, filemode='w')  # important to set basicConfig only once for all modules
logging.basicConfig()

import logging
import datetime
import gzip
import csv
import numpy as np
import graphviz
import time
import db_redis
import domain
import ip
import ttl
import csv_tools
import progressbar
# import db_sql

from sklearn.datasets import load_iris
from sklearn import tree

# Module logger; emits everything from DEBUG upwards.
logger = logging.getLogger('train')
logger.setLevel(logging.DEBUG)

# strftime pattern for database timestamps. In this file it is only
# referenced from commented-out database calls.
db_format_time = '%Y-%m-%d %H:%M:%S'

# Training window. In this file it is only referenced from commented-out code.
train_start = datetime.date(2017, 5, 1)
train_end = datetime.date(2017, 5, 4)

# Hard-coded id handed to the db.mariadb_get_logs* query helpers.
# NOTE(review): presumably the highest log id to include - confirm.
id_upto = 379283817

# record types that should be analysed (e.g. only A); train() compares the
# third CSV column (index 2) of every row against this list
record_types = ['A']


# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))


def train():
    """Read all hourly log files of the analysis days and process each record.

    For every day index in ``range(csv_tools.analysis_days_amount)`` the
    gzip-compressed CSV files of each of the 24 hours are read row by row.
    Rows whose type column (index 2) is listed in ``record_types`` are turned
    into an entity dict and passed to ``prepare_features_redis``. The returned
    features are not stored here. A failure for a single entity is logged
    (with traceback) and does not stop the run.
    """
    for day in range(csv_tools.analysis_days_amount):
        log_files_hour = csv_tools.get_log_files_for_hours_of_day(csv_tools.analysis_days[day])

        progress_bar = progressbar.ProgressBar()

        for hour in progress_bar(range(24)):
            for hour_file in log_files_hour[hour]:
                with gzip.open(hour_file, 'rt', newline='') as file:
                    for row in csv.reader(file):
                        # csv.reader yields [] for blank lines and short lists
                        # for truncated ones; indexing them below would raise
                        # IndexError outside the try and abort the whole run.
                        if len(row) < 5:
                            logger.warning('skipping malformed row: %s', row)
                            continue

                        if row[2] not in record_types:
                            continue

                        entity = {'timestamp': row[0], 'domain': row[1], 'type': row[2],
                                  'record': row[3], 'ttl': row[4]}
                        try:
                            prepare_features_redis(entity)
                        except Exception:
                            # best effort: keep going, but keep the traceback
                            logger.exception('Exception occured processing entity: %s', entity)


def get_logs_from_db():
    """Fetch the first log row for ``id_upto`` and log its feature vector.

    Only the first fetched row is processed; iterating over the remaining
    rows is still disabled (see the commented-out loop at the end).

    NOTE(review): no import binds ``db`` in the visible source (there is only
    a commented-out ``import db_sql``), so calling this raises ``NameError``
    until the intended module is imported - confirm which one that is.
    """
    results = db.mariadb_get_logs(id_upto)

    row = results.fetch_row(how=1)

    # An empty fetch result cannot be indexed; the disabled ``while row:`` loop
    # below relies on the same emptiness check.
    if not row:
        logger.debug('no log rows returned from db')
        return

    logger.debug("# entity: " + row[0]['domain'])

    features = prepare_features_redis(row[0])

    logger.debug(str(features))
    # while row:
    #     logger.debug("# entity: " + row[0]['domain'])
    #
    #     features = prepare_features(row[0])
    #
    #     logger.debug(str(features))
    #
    #     row = results.fetch_row(how=1)


def prepare_features_redis(entity):
    """Build the feature vector for one log entity from the stats kept in redis.

    :param entity: dict with at least the keys ``'domain'``, ``'record'`` and
        ``'ttl'``.
    :return: ``numpy`` array holding, in order: distinct IPs, distinct
        countries, distinct domains sharing the IP, reverse DNS result,
        average TTL, TTL standard deviation, distinct TTL values, TTL changes,
        specific TTL ranges, ratio of numerical characters, LMS ratio.
    :raises LookupError: if redis returns no stats for the entity's domain.
    """
    checkpoint = time.time()
    domain_stats = db_redis.get_stats_for_domain(entity['domain'])
    ip_stats = db_redis.get_stats_for_ip(entity['record'])
    logger.debug('redis took' + str(time.time() - checkpoint))

    logger.debug(domain_stats)

    if len(domain_stats) != 1:
        # entity is a dict: it must be converted before concatenation,
        # otherwise this debug line itself raises TypeError
        logger.debug('no stats in redis for entity: ' + str(entity))

    if len(domain_stats) == 0:
        # fail with a clear message instead of an IndexError on [0] below
        raise LookupError('no stats in redis for domain: ' + str(entity['domain']))

    domain_stats = domain_stats[0]
    ttls = domain_stats['ttls']

    # TODO: the IP list is not yet filled from the redis stats
    ips = []

    # feature 5: Number of distinct IP addresses

    distinct_ips = len(ips)

    # feature 6: Number of distinct countries
    # (set comprehension so that countries are counted once, matching
    # prepare_features_mysql)

    distinct_countries = len({ip.get_country_by_ip(ip_str) for ip_str in ips})

    # feature 7: Number of (distinct) domains share the IP with

    distinct_domains_with_same_ip = len(ip_stats)

    # feature 8: Reverse DNS query results

    # 5 atomic features, none implemented yet:
    # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
    # atomic 2: ratio of ips that are used for DSL lines
    # atomic 3: ratio of ips that belong to hosting services
    # atomic 4: ratio of ips that belong to known ISPs
    # atomic 5: ips that can be matched with a valid domain name

    # TODO add atomics to 'all_features'

    reverse_dns_result = 0

    # feature 9: Average TTL

    average_ttl = sum(ttls) / len(ttls)

    # feature 10: Standard Deviation of TTL

    standard_deviation = ttl.standard_deviation(ttls)  # TODO distinct ttls for std deviation?

    # feature 11: Number of distinct TTL values

    distinct_ttl = len(set(ttls))

    # feature 12: Number of TTL change

    ttl_changes = ttl.changes(ttls)

    # feature 13: Percentage usage of specific TTL ranges
    # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
    # TODO check if 5 individual features make a difference
    specific_ttl_ranges = ttl.specific_range(entity['ttl'])

    # feature 14: % of numerical characters

    numerical_characters_percent = domain.ratio_numerical_to_alpha(entity['domain'])

    # feature 15: % of the length of the LMS

    lms_percent = domain.ratio_lms_to_fqdn(entity['domain'])

    all_features = np.array([
        distinct_ips, distinct_countries,
        distinct_domains_with_same_ip, reverse_dns_result, average_ttl, standard_deviation, distinct_ttl, ttl_changes,
        specific_ttl_ranges, numerical_characters_percent, lms_percent
    ])
    logger.debug(all_features)
    # The former debugging exit() call was removed here: it terminated the
    # interpreter after the first entity and made the return unreachable.
    return all_features


def prepare_features_mysql(entity):
    """Compute the feature vector for one log entity from the MariaDB logs.

    Fetches all logs for the entity's domain and, if the entity's record is a
    valid IPv4 or IPv6 address, all logs for that address; ``id_upto`` is
    passed to both queries. Features 1-4 and 8 are placeholders fixed at 0.

    :param entity: mapping with the keys ``'domain'``, ``'record'`` and ``'ttl'``
    :return: ``numpy`` array with the 15 feature slots in feature order
    """
    query_started = time.time()
    logger.debug('get logs for domain start')
    # BIG TODO check if we need the ip addresses of a specific response (not of all [different] responses) somewhere
    domain_logs = db.mariadb_get_logs_for_domain(entity['domain'], id_upto)
    logger.info('get logs for domain done' + str(time.time() - query_started) + ' s')

    # TODO do this efficient
    ttl_values = [entry['ttl'] for entry in domain_logs]
    logger.debug('ttls ' + str(ttl_values))
    resolved_ips = [entry['record'] for entry in domain_logs]  # TODO check if valid ip address
    logger.debug(resolved_ips)
    seen_at = [entry['timestamp'] for entry in domain_logs]
    logger.debug(seen_at)

    # domains are only looked up when the record is a valid ip address
    shared_ip_domains = []
    record = entity['record']
    if ip.is_valid_ipv4(record) or ip.is_valid_ipv6(record):
        query_started = time.time()
        logger.info('get logs for ip start')
        ip_logs = db.mariadb_get_logs_for_ip(record, id_upto)
        logger.info('get logs for ip done' + str(time.time() - query_started) + ' s')
        shared_ip_domains = [entry['domain'] for entry in ip_logs]

    unique_ips = set(resolved_ips)

    return np.array([
        # feature 1: Short Life (2 atomic features, not implemented)
        0,
        # feature 2: Daily Similarity (not implemented)
        0,
        # feature 3: Repeating Patterns (2 atomic features, not implemented)
        0,
        # feature 4: Access ratio (2 atomic features, not implemented)
        0,
        # feature 5: Number of distinct IP addresses
        len(unique_ips),
        # feature 6: Number of distinct countries
        len({ip.get_country_by_ip(address) for address in unique_ips}),
        # feature 7: Number of (distinct) domains share the IP with
        len(set(shared_ip_domains)),
        # feature 8: Reverse DNS query results (5 atomic features, not implemented):
        #   1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
        #   2: ips that are used for DSL lines
        #   3: ips that belong to hosting services
        #   4: ips that belong to known ISPs
        #   5: ips that can be matched with a valid domain name
        # TODO add atomics to the feature vector
        0,
        # feature 9: Average TTL
        sum(ttl_values) / len(ttl_values),
        # feature 10: Standard Deviation of TTL
        ttl.standard_deviation(ttl_values),  # TODO distinct ttls for std deviation?
        # feature 11: Number of distinct TTL values
        len(set(ttl_values)),
        # feature 12: Number of TTL change
        ttl.changes(ttl_values),
        # feature 13: Percentage usage of specific TTL ranges
        # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
        # TODO check if 5 individual features make a difference
        ttl.specific_range(entity['ttl']),
        # feature 14: % of numerical characters
        domain.ratio_numerical_to_alpha(entity['domain']),
        # feature 15: % of the length of the LMS
        domain.ratio_lms_to_fqdn(entity['domain']),
    ])


def test():
    """Run ``train`` once, logging the start time and the total duration.

    ``cleanup`` is called after the duration has been logged.
    """
    started_at = time.time()
    logger.info('starting training ' + str(started_at))

    train()

    elapsed = time.time() - started_at
    logger.info('total duration: ' + str(elapsed) + 's')
    cleanup()

    # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))


def flow():
    """Fit a decision tree on the iris sample data and render it with graphviz.

    The rendered graph is written under the name ``'test'`` and opened in a
    viewer (``view=True``).
    """
    samples = load_iris()
    # training set, manual classification
    classifier = tree.DecisionTreeClassifier().fit(samples.data, samples.target)

    # predict single or multiple sets with classifier.predict([[]])

    # visualize decision tree classifier
    dot_source = tree.export_graphviz(classifier, out_file=None)
    graphviz.Source(dot_source).render('test', view=True)


def cleanup():
    """Release the database handle by calling ``db.close()``.

    NOTE(review): no import binds ``db`` in the visible source (there is only
    a commented-out ``import db_sql``), so this raises ``NameError`` as the
    file stands - confirm which module is meant.
    """
    db.close()


# Script entry point: run the timed training pass defined in test().
if __name__ == "__main__":
    test()