complete flow possible?!
This commit is contained in:
@@ -2,22 +2,43 @@
|
||||
|
||||
def load_whitelist():
|
||||
filename = 'res/benign_domains.txt'
|
||||
return open(filename).read().splitlines()
|
||||
whitelist = []
|
||||
for item in open(filename).read().splitlines():
|
||||
if item not in whitelist:
|
||||
whitelist.append(item)
|
||||
return whitelist
|
||||
|
||||
|
||||
def load_blacklist():
|
||||
filename = 'res/malicious_domains.txt'
|
||||
return open(filename).read().splitlines()
|
||||
blacklist = []
|
||||
for item in open(filename).read().splitlines():
|
||||
# eliminate duplicates. If domain both in white and black,
|
||||
# do not add to black (as EXPOSURE is handling)
|
||||
if item not in blacklist and item not in whitelist:
|
||||
blacklist.append(item)
|
||||
return blacklist
|
||||
|
||||
|
||||
def is_malicious(domain):
|
||||
return 1.0 if domain in blacklist else .0
|
||||
|
||||
|
||||
# TODO check if white and blacklists have to be stripped to tld (subdomains)
|
||||
# TODO also check if subdomains are located in db
|
||||
def test():
|
||||
filter_list = load_whitelist()
|
||||
print('blacklist length: ' + str(len(blacklist)))
|
||||
|
||||
for item in filter_list:
|
||||
print(item.count('.'))
|
||||
# dupes = [x for n, x in enumerate(whitelist) if x in whitelist[:n]]
|
||||
# print(dupes)
|
||||
|
||||
# domain contained in both benign and malicious
|
||||
# for domain in blacklist:
|
||||
# if domain in whitelist:
|
||||
# print(domain + ' found both in black and white')
|
||||
pass
|
||||
|
||||
|
||||
whitelist = load_whitelist()
|
||||
blacklist = load_blacklist()
|
||||
|
||||
if __name__ == "__main__":
|
||||
test()
|
||||
|
||||
15
src/DoresA/config.py
Normal file
15
src/DoresA/config.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import datetime
|
||||
|
||||
db_format_time = '%Y-%m-%d %H:%M:%S'
|
||||
format_date = '%Y-%m-%d'
|
||||
train_start = datetime.date(2017, 9, 1)
|
||||
train_end = datetime.date(2017, 9, 7)
|
||||
|
||||
analysis_start_date = datetime.date(2017, 9, 1)
|
||||
analysis_days_amount = 7
|
||||
#pdns_logs_path = '/home/felix/pdns/'
|
||||
pdns_logs_path = '/mnt/old/2017'
|
||||
|
||||
# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
|
||||
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime(format_date) for x in
|
||||
range(analysis_days_amount)]
|
||||
@@ -2,26 +2,25 @@ import csv
|
||||
import gzip
|
||||
import glob
|
||||
import time
|
||||
import datetime
|
||||
import os
|
||||
import config
|
||||
import logging
|
||||
import progressbar
|
||||
# import db_sql
|
||||
|
||||
logger = logging.getLogger('csv')
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
analysis_start_date = datetime.date(2017, 9, 1)
|
||||
analysis_days_amount = 7
|
||||
pdns_logs_path = '/home/felix/pdns/'
|
||||
analysis_start_date = config.analysis_start_date
|
||||
analysis_days_amount = config.analysis_days_amount
|
||||
pdns_logs_path = config.pdns_logs_path
|
||||
|
||||
# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
|
||||
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
|
||||
range(analysis_days_amount)]
|
||||
analysis_days = config.analysis_days
|
||||
|
||||
|
||||
def iterate_logs():
|
||||
start = time.time()
|
||||
logger.info('iterate logs: ' + str(start))
|
||||
|
||||
for day in range(analysis_days_amount):
|
||||
log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
|
||||
@@ -34,33 +33,28 @@ def iterate_logs():
|
||||
reader = csv.reader(file)
|
||||
|
||||
for row in reader:
|
||||
logger.info('loaded row: ' + str(row))
|
||||
logger.debug('loaded row: ' + str(row))
|
||||
logger.info('iterate logs duration: ' + str(time.time() - start) + 's')
|
||||
|
||||
|
||||
def serialize_logs_to_db():
|
||||
# check_duplicates() TODO readd
|
||||
start = time.time()
|
||||
|
||||
print('starting analysis ' + str(start))
|
||||
|
||||
distinct_ttl_count = {}
|
||||
# everything = {}
|
||||
# for log_file in ['data/pdns_capture.pc
|
||||
|
||||
# for log_file in ['data/pdns_capture.pcap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']:
|
||||
# TODOap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']:
|
||||
|
||||
for day in range(analysis_days_amount):
|
||||
log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
|
||||
# everything[day] = {}
|
||||
|
||||
progress_bar = Bar(analysis_days[day], max=24)
|
||||
progress_bar = progressbar.ProgressBar()
|
||||
|
||||
for hour in range(24):
|
||||
for hour in progress_bar(range(24)):
|
||||
progress_bar.next()
|
||||
# everything[day][hour] = {}
|
||||
for hour_files in log_files_hour[hour]:
|
||||
# a bit faster, 10-15% (but pandas overhead)
|
||||
# df = pandas.read_csv(log_file, compression='gzip', header=None)
|
||||
# print(df.iloc[0])
|
||||
with gzip.open(hour_files, 'rt', newline='') as file:
|
||||
reader = csv.reader(file)
|
||||
all_rows = list(reader)
|
||||
@@ -75,8 +69,6 @@ def serialize_logs_to_db():
|
||||
# db_sql.mariadb_insert_log(log_entry)
|
||||
# # db_mongo.mongodb_insert_log(log_entry)
|
||||
|
||||
progress_bar.finish()
|
||||
|
||||
print('total duration: ' + str(time.time() - start) + 's')
|
||||
db_sql.close()
|
||||
|
||||
@@ -87,20 +79,19 @@ def batch(iterable, n=1):
|
||||
yield iterable[ndx:min(ndx + n, length)]
|
||||
|
||||
|
||||
def check_duplicates():
|
||||
days_cumulated = 0
|
||||
|
||||
for day in analysis_days:
|
||||
days_cumulated += len(get_log_files_for_day(day))
|
||||
|
||||
all_logs = len(get_log_files_for_day(''))
|
||||
|
||||
if days_cumulated != all_logs:
|
||||
raise Exception('Log files inconsistency')
|
||||
# def check_duplicates():
|
||||
# days_cumulated = 0
|
||||
#
|
||||
# for day in analysis_days:
|
||||
# days_cumulated += len(get_log_files_for_day(day))
|
||||
#
|
||||
# all_logs = len(get_log_files_for_day(''))
|
||||
#
|
||||
# if days_cumulated != all_logs:
|
||||
# raise Exception('Log files inconsistency')
|
||||
|
||||
|
||||
# TODO
|
||||
def get_log_files_for_range_of_day(date, minutes_range):
|
||||
def get_log_files_for_range_of_day(date, minutes_range, gz=True):
|
||||
slot_files = {}
|
||||
slots_amount = int(1440 / minutes_range)
|
||||
|
||||
@@ -109,21 +100,21 @@ def get_log_files_for_range_of_day(date, minutes_range):
|
||||
hours, minutes = divmod(total_mins, 60)
|
||||
|
||||
time_range = '%02d-%02d' % (hours, minutes)
|
||||
slot_files[slot] = 'data/*' + date + '_' + time_range + '*.csv.gz'
|
||||
slot_files[slot] = 'data/*' + date + '_' + time_range + '*.csv' + ('.gz' if gz else '')
|
||||
|
||||
|
||||
def get_log_files_for_hours_of_day(date):
|
||||
def get_log_files_for_hours_of_day(date, gz=True):
|
||||
slot_files = {}
|
||||
slots_amount = 24
|
||||
|
||||
for slot in range(slots_amount):
|
||||
slot_files[slot] = glob.glob(pdns_logs_path + '*' + date + '_' + ('%02d' % slot) + '*.csv.gz')
|
||||
slot_files[slot] = glob.glob(pdns_logs_path + '*' + date + '_' + ('%02d' % slot) + '*.csv' + ('.gz' if gz else ''))
|
||||
|
||||
return slot_files
|
||||
|
||||
|
||||
def get_log_files_for_day(date):
|
||||
log_files = 'data/*' + date + '*.csv.gz'
|
||||
def get_log_files_for_day(date, gz=True):
|
||||
log_files = 'data/*' + date + '*.csv.gz' + ('.gz' if gz else '')
|
||||
|
||||
return glob.glob(log_files)
|
||||
|
||||
|
||||
@@ -74,6 +74,25 @@ def get_stats_for_domain(rrname, rrtype='A'):
|
||||
logger.error(e)
|
||||
|
||||
|
||||
def get_all_ips_for_domain(rrname):
|
||||
redis_r = Redis(redis_host, port=redis_port_reverse)
|
||||
|
||||
# remove trailing slash
|
||||
rrname = rrname.rstrip('/')
|
||||
rrtype = 'A'
|
||||
|
||||
try:
|
||||
all_ips = []
|
||||
for res in redis_r.smembers('r:{}:{}'.format(rrname, rrtype)):
|
||||
ip = unpack('>L', res)[0]
|
||||
ip = '.'.join([str(tuple) for tuple in ((ip & (0xff << 8 * i)) >> 8 * i for i in range(4))])
|
||||
all_ips.append(ip)
|
||||
|
||||
return all_ips
|
||||
except RedisError as e:
|
||||
logger.error(e)
|
||||
|
||||
|
||||
def get_stats_for_ip(rdata):
|
||||
redis_v = Redis(redis_host, port=redis_port_4)
|
||||
|
||||
@@ -104,8 +123,10 @@ def get_stats_for_ip(rdata):
|
||||
|
||||
|
||||
def test():
|
||||
asd = get_stats_for_domain('ZN015105.ppp.dion.ne.jp')
|
||||
# asd = get_stats_for_ip('172.217.27.14')
|
||||
# asd = get_stats_for_domain('ZN015105.ppp.dion.ne.jp')
|
||||
# asd = get_all_ips_for_domain('google.de')
|
||||
asd = get_stats_for_ip('172.217.27.14')
|
||||
print(len(asd))
|
||||
logger.info(asd)
|
||||
|
||||
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
import enchant
|
||||
# import enchant
|
||||
import numpy as np
|
||||
import spell
|
||||
|
||||
# check if dictionary is installed: $aspell dicts (or enchant.list_languages() in python)
|
||||
# check if dictionary is installed: $ aspell dicts (or enchant.list_languages() in python)
|
||||
# if not, check http://pythonhosted.org/pyenchant/tutorial.html
|
||||
dictionary = enchant.Dict('en_US')
|
||||
# dictionary = enchant.Dict('en_US')
|
||||
|
||||
# TODO readd enchant?!
|
||||
|
||||
|
||||
def check_if_english_word(string):
|
||||
return dictionary.check(string)
|
||||
# return dictionary.check(string)
|
||||
return spell.check(string)
|
||||
|
||||
|
||||
# TODO strip of protocol and TLD (if needed)
|
||||
@@ -19,7 +23,7 @@ def find_longest_meaningful_substring(string):
|
||||
for j in range(i+1, len(string)):
|
||||
if min_length <= (j + 1 - i) <= max_length:
|
||||
substring = string[i:j+1]
|
||||
if dictionary.check(substring):
|
||||
if check_if_english_word(substring):
|
||||
if len(match) < len(substring):
|
||||
match = substring
|
||||
return match
|
||||
@@ -32,7 +36,7 @@ def ratio_lms_to_fqdn(string):
|
||||
|
||||
|
||||
def test():
|
||||
print(ratio_lms_to_fqdn('www.google.de'))
|
||||
print(ratio_lms_to_fqdn('www.hallo.de'))
|
||||
exit()
|
||||
|
||||
|
||||
|
||||
@@ -1,15 +1,29 @@
|
||||
import re
|
||||
import logging
|
||||
import json
|
||||
from geoip2 import database, errors
|
||||
|
||||
logger = logging.getLogger('ip')
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
top_100_hosters = json.load(open('res/asns.json'))
|
||||
top_100_hosters_asns = []
|
||||
for hoster in top_100_hosters.values():
|
||||
top_100_hosters_asns.extend(hoster['asns'])
|
||||
|
||||
|
||||
def is_hoster_ip(ip):
|
||||
return str(get_isp_by_ip(ip)) in top_100_hosters_asns
|
||||
|
||||
|
||||
# if specific country not available in database take continent instead
|
||||
def get_country_by_ip(ip):
|
||||
with database.Reader('res/GeoLite2-Country_20170905/GeoLite2-Country.mmdb') as reader:
|
||||
result = reader.country(ip)
|
||||
return result.country.names['en']
|
||||
if not result.country:
|
||||
return result.continent.geoname_id
|
||||
else:
|
||||
return result.country.geoname_id
|
||||
|
||||
|
||||
def get_isp_by_ip(ip):
|
||||
@@ -21,8 +35,35 @@ def get_isp_by_ip(ip):
|
||||
logger.debug('address not in isp database')
|
||||
|
||||
|
||||
def ratio_ips_hoster(ips):
|
||||
hosted = []
|
||||
not_hosted = []
|
||||
for ip in ips:
|
||||
if is_hoster_ip(ip):
|
||||
hosted.append(ip)
|
||||
else:
|
||||
not_hosted.append(ip)
|
||||
return float(len(hosted)) / (len(hosted) + len(not_hosted))
|
||||
|
||||
|
||||
def ratio_ips_isp(ips):
|
||||
known = []
|
||||
unknown = []
|
||||
for ip in ips:
|
||||
if get_isp_by_ip(ip):
|
||||
known.append(ip)
|
||||
else:
|
||||
unknown.append(ip)
|
||||
return float(len(known)) / (len(known) + len(unknown))
|
||||
|
||||
|
||||
def test():
|
||||
print(get_isp_by_ip('178.27.82.37'))
|
||||
#print(is_hoster_ip('208.97.151.195'))
|
||||
print(get_country_by_ip('80.146.228.87'))
|
||||
# print(is_hoster_ip('8.8.8.8'))
|
||||
# print(get_isp_by_ip('65.254.244.120'))
|
||||
# print(ratio_ips_isp(['205.204.101.47', '198.11.132.53', '1.1.1.1']))
|
||||
pass
|
||||
|
||||
|
||||
# proudly taken from https://stackoverflow.com/questions/319279/how-to-validate-ip-address-in-python
|
||||
@@ -86,7 +127,7 @@ def is_valid_ipv6(ip):
|
||||
| (?<!:) #
|
||||
| (?<=:) (?<!::) : #
|
||||
) # OR
|
||||
| # A v4 address with NO leading zeros
|
||||
| # A v4 address with NO leading zeros
|
||||
(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)
|
||||
(?: \.
|
||||
(?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)
|
||||
|
||||
@@ -5,22 +5,21 @@ cycler==0.10.0
|
||||
geoip2==2.6.0
|
||||
graphviz==0.8
|
||||
idna==2.6
|
||||
javabridge==1.0.15
|
||||
#javabridge==1.0.15
|
||||
lxml==4.1.1
|
||||
matplotlib==2.0.2
|
||||
maxminddb==1.3.0
|
||||
maxminddb-geolite2==2017.803
|
||||
mysqlclient==1.3.12
|
||||
#mysqlclient==1.3.12
|
||||
nltk==3.2.5
|
||||
numpy==1.13.1
|
||||
pandas==0.20.3
|
||||
progressbar2==3.34.3
|
||||
pyenchant==1.6.11
|
||||
pymongo==3.5.1
|
||||
#pymongo==3.5.1
|
||||
pyparsing==2.2.0
|
||||
python-dateutil==2.6.1
|
||||
python-geoip==1.2
|
||||
python-utils==2.2.0
|
||||
python-weka-wrapper3==0.1.3
|
||||
#python-weka-wrapper3==0.1.3
|
||||
pytz==2017.2
|
||||
redis==2.10.6
|
||||
requests==2.18.4
|
||||
|
||||
100
src/DoresA/res/20170222_report.csv
Normal file
100
src/DoresA/res/20170222_report.csv
Normal file
@@ -0,0 +1,100 @@
|
||||
1,Godaddy.com LLC,USA,www.godaddy.com,390.672,14455 N Hayden Road Suite 226 Scottsdale AZ 85260 US, +1-480-505-8809,https://myip.ms/view/web_hosting/2433,
|
||||
2,Cloudflare Inc,USA,www.cloudflare.com,305.278,665 Third Street #207 San Francisco CA 94107 US, +1-650-319-8930,https://myip.ms/view/web_hosting/4638,
|
||||
3,Amazon.com Inc,USA,www.amazonaws.com,285.105,1200 12Th Avenue South Seattle WA 98144 US, +1-206-266-4064,https://myip.ms/view/web_hosting/615,
|
||||
4,Hostgator.com Llc,USA,www.hostgator.com,204.779,11251 Northwest Freeway Suite 400 Houston TX 77092 US, +1-866-964-2867 +1-832-443-1710,https://myip.ms/view/web_hosting/26757,
|
||||
5,Hetzner Online Ag,Germany,www.hetzner.de,174.990,Hetzner Online Ag Stuttgarter Str 1 D-91710 Gunzenhausen Germany, +49 9831 61 00 61,https://myip.ms/view/web_hosting/45081,
|
||||
6,Ovh Sas,France,www.ovh.com,168.907,140 Quai Du Sartel 59100 Roubaix France, +33 9 7453 1323 +33 3 2020 0957,https://myip.ms/view/web_hosting/7593,
|
||||
7,Bluehost Inc,USA,https://www.bluehost.com,130.372,1958 South 950 East Provo UT 84606 US, +1-801-765-9400,https://myip.ms/view/web_hosting/3886,
|
||||
8,Rackspace Hosting,USA,www.rackspace.com,102.826,5000 Walzem Road San Antonio TX 78218 US, +1-210-892-4000,https://myip.ms/view/web_hosting/486,
|
||||
9,Google Inc,USA,sites.google.com,96.064,1600 Amphitheatre Parkway Mountain View CA 94043 US, +1-650-253-0000,https://myip.ms/view/web_hosting/617,
|
||||
10,Aliyun Computing Co. Ltd,China,www.aliyun.com,82.460,No.391 Wen'er Road Hangzhou Zhejiang 310099 China, +86-0571-85022600 +86-0571-85022088,https://myip.ms/view/web_hosting/179248,
|
||||
11,Trellian Pty. Limited,Australia,www.trellian.com,70.588,8 East Concourse Beaumaris Victoria 3193 Australia, +61 395897946,https://myip.ms/view/web_hosting/419061,
|
||||
12,Confluence Networks Inc,Virgin Islands British,www.confluence-networks.com,68.897,3Rd Floor Omar Hodge Building Wickhams Cay I P.o. Box 362 Road Town Tortola Vg1110 VG, +1-415-462-7734,https://myip.ms/view/web_hosting/14729,
|
||||
13,Digital Ocean Inc,USA,www.digitalocean.com,64.479,270 Lafayette St Suite 1206 New York NY 10012 US, +1-347-985-0306,https://myip.ms/view/web_hosting/38903,
|
||||
14,Sakura Internet Inc,Japan,www.sakura.ne.jp,60.301,1-8-14 Minami Honmachi Chuo-ku Osaka 541-0054 Japan, +81-3-5297-2311,https://myip.ms/view/web_hosting/5654,
|
||||
15,Team Internet Ag,Germany,www.teaminternet.de,54.803,Team Internet Ag Lindwurmstr. 25 80337 / Muenchen Germany, +49-89-416146-010,https://myip.ms/view/web_hosting/91136,
|
||||
16,Liquid Web Inc,USA,www.liquidweb.com,54.523,4210 Creyts Rd. Lansing MI 48917 US, +1-800-580-4985,https://myip.ms/view/web_hosting/6574,
|
||||
17,1&1 Internet Ag,Germany,www.1und1.de,54.439,1&1 Internet Ag Axel Fischer Brauerstr. 48 76135 Karlsruhe Germany, +49 721 91374 0,https://myip.ms/view/web_hosting/18944,
|
||||
18,Rook Media GmbH,Switzerland,www.rookmedia.net,51.888,Alte Landstrasse 131 8800 Thalwil Zurich Switzerland, +41 788905326,https://myip.ms/view/web_hosting/21083,
|
||||
19,Linode,USA,www.linode.com,48.337,329 E. Jimmie Leeds Road Suite A Galloway NJ 08205 US, +1-609-593-7103,https://myip.ms/view/web_hosting/3220,
|
||||
20,Ovh Hosting Inc,Canada,www.ovh.ca,48.138,625 Avenue Du President Kennedy Bureau 310 Montreal QC H3A 1K2 Canada, +1-855-684-5463,https://myip.ms/view/web_hosting/1232,
|
||||
21,New Dream Network LLC,USA,www.dreamhost.com,47.665,417 Associated Rd. Pmb #257 Brea CA 92821 US, +1-424-646-4949,https://myip.ms/view/web_hosting/13529,
|
||||
22,Akamai Technologies Inc,USA,www.akamai.com,40.582,8 Cambridge Center 02142 Cambridge MA United States, +1 617 4444768,https://myip.ms/view/web_hosting/115822,
|
||||
23,The Endurance International Group Inc,USA,www.maileig.com,40.255,70 Blanchard Road Burlington MA 01803 US, +1-781-852-3254,https://myip.ms/view/web_hosting/6777,
|
||||
24,Xserver Inc,Japan,https://www.xserver.co.jp,36.678,Karasuma-Building6F Mikura-Cho 85-1 Karasumanishiiru Sanjodori Nakagyo-Ku Kyoto-City JP, +81-3-5297-2311,https://myip.ms/view/web_hosting/20749,
|
||||
25,Aruba S.p.a,Italy,www.aruba.it,36.661,Aruba S.p.A Loc. Palazzetto 4 52011 Bibbiena Stazione - Arezzo Italy, +39 0575 0505,https://myip.ms/view/web_hosting/58626,
|
||||
26,Korea Telecom,South Korea,www.kt.com/eng/main.jsp,35.569,206 Jungja-Dong Bundang-Gu Sungnam-Ci 463-711, +82 2-500-6630,https://myip.ms/view/web_hosting/3284,
|
||||
27,Peer 1 Network (USA) Inc,USA,www.peer1.net,34.401,75 Broad Street 2Nd Floor New York NY 10004 US, +1-604-683-7747 +1-604-484-2588,https://myip.ms/view/web_hosting/198987,
|
||||
28,Softlayer Technologies Inc,USA,www.softlayer.com,32.270,4849 Alpha Rd. Dallas TX 75244 US, +1-214-442-0601,https://myip.ms/view/web_hosting/2165,
|
||||
29,Web - Hosting.com,USA,www.web-hosting.com,32.108,11400 W. Olympic Blvd Suite 200 Los Angeles CA 90064 US, ,https://myip.ms/view/web_hosting/86913,
|
||||
30,Media Temple Inc,USA,www.mediatemple.net,31.889,8520 National Blvd. Building B Culver City CA 90232 US, +1-877-578-4000,https://myip.ms/view/web_hosting/6922,
|
||||
31,Reg.ru Ltd,Russia,www.reg.ru,31.503,Vassily Petushkova St. House 3 Office 326 Moscow 125476 Russia, +7 495 580-11-11,https://myip.ms/view/web_hosting/90869,
|
||||
32,Strato Ag,Germany,www.strato.de,29.782,Strato Ag Pascalstr. 10 D-10587 Berlin Germany, +49 30 39802-0,https://myip.ms/view/web_hosting/82506,
|
||||
33,Shopify Inc,Canada,www.shopify.com,28.480,111 W. Jackson Blvd. Suite 1600 150 Elgin Street 8Th Floor Ottawa ON K2P 1L4 CA, +1-888-746-7439,https://myip.ms/view/web_hosting/376714,
|
||||
34,Online Sas,France,www.online.net,27.612,18 Rue De Londres 75441 Paris France, +33 1 78 56 90 00,https://myip.ms/view/web_hosting/19052,
|
||||
35,1&1 Internet Inc,USA,www.1and1.com,27.313,701 Lee Rd Suite 300 Chesterbrook PA 19087 US, +1-610-560-1617,https://myip.ms/view/web_hosting/4927,
|
||||
36,Gmo Pepabo Inc,Japan,https://pepabo.com,27.207,asd, +81 3 5456 2622,https://myip.ms/view/web_hosting/639855,
|
||||
37,Host Europe Gmbh,Germany,www.hosteurope.de,27.117,Host Europe Gmbh Welserstrasse 14 51149 Cologne Germany, +49 2203 1045 0,https://myip.ms/view/web_hosting/2727,
|
||||
38,Neue Medien Muennich Gmbh,Germany,www.all-inkl.com,26.640,Neue Medien Muennich Gmbh Hauptstrasse 68 D-02742 Friedersdorf Germany, +49 35872 353 10,https://myip.ms/view/web_hosting/15651,
|
||||
39,Enom Inc,USA,www.enom.com,26.223,5808 Lake Washington Blvd. Suite 300 Kirkland WA 98033 US, +1-310-917-6444,https://myip.ms/view/web_hosting/1186,
|
||||
40,Level 3 Communications Inc,USA,www.level3.com,24.294,1025 Eldorado Blvd. Broomfield CO 80021 US, +1-877-453-8353 +1-303-414-5000,https://myip.ms/view/web_hosting/99,
|
||||
41,Locaweb Serviços De Internet S/a,Brazil,www.locaweb.com.br,23.486,Rua Fradique Coutinho 50 – 13 anda, +55 11 3544-0555,https://myip.ms/view/web_hosting/6859,
|
||||
42,Leaseweb B.V,Netherlands,www.leaseweb.com,22.993,Ocom B.V. P.o. Box 93054 1090 Bb Amsterdam Netherlands, +31203162880 +31 20 3162880,https://myip.ms/view/web_hosting/47654,
|
||||
43,Domainfactory Gmbh,Germany,www.df.eu,22.365,Domainfactory Gmbh Oskar-Messter-Str. 33 85737 Ismaning DE, +49 89 55266 112,https://myip.ms/view/web_hosting/47315,
|
||||
44,Beget Ltd,Russia,www.beget.ru,22.002,2-Line Vasilevsky Island Building 5 Letter A Of. 11N St. Petersburg 199034 Russia, +7 812 6408088,https://myip.ms/view/web_hosting/92924,
|
||||
45,Microsoft Corporation,USA,www.microsoft.com,21.804,One Microsoft Way Redmond WA 98052 US, +1-425-882-8080 +1-425-936-7382 +1-425-706-2751 +1-352-278-8979,https://myip.ms/view/web_hosting/198615,
|
||||
46,Rook Media Usa Inc,USA,,21.375,1 Meadow Road Suite 210 Florida NY 10921 US, +1-469-269-2764,https://myip.ms/view/web_hosting/364805,
|
||||
47,TimeWeb Co. Ltd,Russia,www.timeweb.ru,20.822,22A Zastavskaya Str Saint-Petersburg 196084 Russia, +7 812 2441081,https://myip.ms/view/web_hosting/27148,
|
||||
48,PDR,USA,bh-44.webhostbox.net 1,19.631,P.d.r Solutions Llc 10 Corporate Drive Suite 300 Burlington MA 01803 US, +1-201-377-5952,https://myip.ms/view/web_hosting/523753,
|
||||
49,Squarespace Inc,USA,,19.028,225 Varick St New York NY 10014 US, +1-347-758-4644,https://myip.ms/view/web_hosting/641261,
|
||||
50,Sk Broadband Co Ltd,South Korea,www.skbroadband.com,18.676,267 Seoul Namdaemunno 5(O)-Ga Jung-Gu Sk Namsangreen Bldg 100-711, +82-2-3473-0094,https://myip.ms/view/web_hosting/1788,
|
||||
51,Gmo Internet Inc,Japan,www.gmo.jp,18.602,Cerulean Tower Sakuragicho 26-1 Shibyua-Ku 150-8512 Tokyo Japan JP, +81-3-5297-2311,https://myip.ms/view/web_hosting/11234,
|
||||
52,Athenix Inc,USA,www.athenixinc.com,18.122,523 W 6Th St. Los Angeles CA 90014 US, +1-213-536-4767,https://myip.ms/view/web_hosting/118741,
|
||||
53,Lg Dacom Corporation,South Korea,www.lguplus.co.kr,17.318,Dacom Bldg. 706-1 Yoeksam-Dong Kangnam-Ku Seoul South Korea, +82-2-6440-2925 +82-2-2089-7755,https://myip.ms/view/web_hosting/95702,
|
||||
54,Automattic Inc,USA,www.automattic.com,16.952,60 29Th Street #343 San Francisco CA 94110 US, +1 877-273-8550,https://myip.ms/view/web_hosting/3435,
|
||||
55,1&1 Internet Ag,Poland,,16.876,1&1 Internet Se, ,https://myip.ms/view/web_hosting/557035,
|
||||
56,Singlehop Inc,USA,www.singlehop.com,16.851,215 W. Ohio St. 5Th Floor Chicago IL 60654 US, +1-866-817-2811,https://myip.ms/view/web_hosting/728,
|
||||
57,Incapsula Inc,USA,www.incapsula.com,16.186,3400 Bridge Parkway Suite 200 Redwood Shores CA 94065, +1 (866) 250 7659,https://myip.ms/view/web_hosting/15021,
|
||||
58,Home.pl S.a,Poland,www.home.pl,16.067,Home.pl S.a Plac Rodla 9 70-419 Szczecin Poland, +48 801 44 55 55,https://myip.ms/view/web_hosting/93632,
|
||||
59,Hostdime.com Inc,USA,www.hostdime.com,16.048,189 South Orange Avenue Suite 1500S Orlando FL 32801 US, +1 407-756-1126,https://myip.ms/view/web_hosting/1401,
|
||||
60,Alibaba.com Llc,USA,www.alibaba-inc.com,16.045,3945 Freedom Circle Suite 600 Santa Clara CA 95054 US, +1-408-748-1200,https://myip.ms/view/web_hosting/14483,
|
||||
61,Namecheap Inc,USA,www.namecheap.com,14.526,11400 W. Olympic Blvd. Suite 200 Los Angeles CA 90064 US, +1-323-375-2822 +1-661-310-2107,https://myip.ms/view/web_hosting/17640,
|
||||
62,Hetzner (Pty) Ltd,South Africa,www.hetzner.co.za,14.295,Frazzitta Business Park Cnr Battis And Langeberd Road Durbanville Cape Town 7550 South Africa, +27 21 970 2000,https://myip.ms/view/web_hosting/21661,
|
||||
63,Chinanet Guangdong Province Network,China,www.189.cn,14.045,No.31 Jingrong Street Beijing 100032, +86-20-83877223 +86-10-58501724,https://myip.ms/view/web_hosting/883,
|
||||
64,Heart Internet Ltd,United Kingdom,www.heartinternet.co.uk,13.899,2 Castle Quay Castle Boulevard Nottingham. Ng7 1Fw, +44 845 644 7750,https://myip.ms/view/web_hosting/16055,
|
||||
65,Masterhost Cjsc,Russia,www.masterhost.ru,13.590,Masterhost Ostapovskiy Proezd 3 27/29 109316 Moscow Russian Federation, +7 495 7729720,https://myip.ms/view/web_hosting/14883,
|
||||
66,KnownHost Llc,USA,www.knownhost.com,13.348,1379 Dilworthtown Crossing Suite 214 West Chester PA 19382 USA, +1-866-332-9894,https://myip.ms/view/web_hosting/16393,
|
||||
67,Inmotion Hosting Inc,USA,advanced1969.inmotionhosting.com www.inmotionhosting.com,13.002,6100 Center Drive Suite 1190 Los Angeles CA 90045 US, +1-888-321-4678,https://myip.ms/view/web_hosting/517387,
|
||||
68,Iweb Technologies Inc,Canada,www.iweb.com,12.735,20 Place Du Commerce Montreal QC H3E-1Z6 Canada, +1-514-286-4242,https://myip.ms/view/web_hosting/6607,
|
||||
69,Chunghwa Telecom Co. Ltd,Taiwan,www.cht.com.tw,12.657,No.21-3 Sec. 1 Xinyi Rd. Zhongzheng Dist. Taipei City 100 Taiwan, +886 2 2322 3495 +886 2 2344 3007,https://myip.ms/view/web_hosting/600,
|
||||
70,Cogini Hong Kong Limited,Hong Kong,1,12.452,800-1801 Mcgill College Room 1005 Allied Kajima Bldg138 Gloucester Road Wanchai QC 00000 HK, +1-855-684-5463,https://myip.ms/view/web_hosting/570618,
|
||||
71,Chinanet Jiangsu Province Network,China,www.jsinfo.net,12.382,260 Zhongyang Road Nanjing 210037, +86-25-86588231,https://myip.ms/view/web_hosting/51427,
|
||||
72,Cdmon,Spain,vxadj-20.srv.cat,11.801,C/ Girona 81 - 83 Local 6 08380 Malgrat De Mar Spain, +34 937653268,https://myip.ms/view/web_hosting/18003,
|
||||
73,Ru - Center Jsc,Russia,www.nic.ru,11.624,"Jsc ""Ru-Center"" 123308 Moscow Russian Federation 3 Khoroshevskaya 2-1", +7 495 737 0601,https://myip.ms/view/web_hosting/92451,
|
||||
74,Verotel International,Netherlands,,11.360,Verotel International Keizersgracht 213 Nl-1016 Dt Amsterdam Netherlands, +31 20 5315726 +31 20 5315757,https://myip.ms/view/web_hosting/345493,
|
||||
75,Corespace Inc,USA,www.fetisssh.com www.corespace.com,11.319,7505 John W. Carpenter Freeway Dallas TX 75247 US, +1-800-976-2673,https://myip.ms/view/web_hosting/10357,
|
||||
76,Hostinger International Ltd,Cyprus,www.main-hosting.com,11.096,Hostinger International Ltd 61 Lordou Vyronos Lumiel Building 4Th Floor 6023 Larnaca Cyprus, +37 064503378,https://myip.ms/view/web_hosting/388419,
|
||||
77,Fasthosts Internet Ltd,United Kingdom,www.fasthosts.co.uk,10.902,Discovery House 154 Southgate Street Gloucester GL1 2EX UK, +44 1452 561874,https://myip.ms/view/web_hosting/18956,
|
||||
78,Parklogic,USA,www.gigenet.com,10.834,asd, asd,https://myip.ms/view/web_hosting/580441,
|
||||
79,Linode LLC,USA,li349-246.members.linode.com,10.697,329 E. Jimmie Leeds Road Suite A Galloway Nj 08205 USA, +16093807100 +16093807504 +1-609-380-7100,https://myip.ms/view/web_hosting/517366,
|
||||
80,Spaceweb Cjsc,Russia,www.sweb.ru,10.644,Tsvetochnaya Ul. 18 Of.137 Saint-Petersburg 196084 Russia, +7 812 3341222,https://myip.ms/view/web_hosting/149021,
|
||||
81,Gandi SAS,France,www.gandi.net,10.626,63-65 Boulevard Massena 75013 Paris France, +33 1 70 39 37 55,https://myip.ms/view/web_hosting/11091,
|
||||
82,Hawk Host Inc,Canada,www.arandomserver.com www.hawkhost.com,10.595,710 Tower St South Po Box 50081 Fergus ON N1M 2R0 CA, +1-800-859-8803 +1-408-761-1354 +1.866.398.7638,https://myip.ms/view/web_hosting/118178,
|
||||
83,Digirock Inc,Japan,www.digi-rock.com,10.570,Bakuro-Machi 4-7-5 Honmachi Ts Building 6F Chuo-Ku Osaka-Shi Osaka Japan JP, +81-3-5297-2311,https://myip.ms/view/web_hosting/17471,
|
||||
84,Ovh Hispano,Spain,www.ovh.es,10.569,Calle Princesa 22 2 Dcha Madrid 28008 Spain, +34 902 106 113,https://myip.ms/view/web_hosting/46221,
|
||||
85,Enzu Inc,USA,150.135-80-192.rdns.scalabledns.com www.enzu.com,10.549,10120 S Eastern Ave Suite #248 Henderson NV 89052 US, +1-702-965-1615,https://myip.ms/view/web_hosting/572612,
|
||||
86,123 Server Inc,Japan,www.123server.jp,10.272,700-0024 Okayama Prefecture Okayama Kita-ku Motomachi Station No. 1 No. 4 Terminal Square 10F, +81-3-5297-2311,https://myip.ms/view/web_hosting/39737,
|
||||
87,Bodis LLC,USA,www.bodis.com,10.115,1133 Broadway Suite 706 New York NY 10010 US, +1 877-263-4744,https://myip.ms/view/web_hosting/90378,
|
||||
88,Chinanet Shanghai Province Network,China,www.online.sh.cn,9.985,No.31 Jingrong Street Beijing 100032, +86-21-63630562 +86-10-58501724,https://myip.ms/view/web_hosting/10332,
|
||||
89,Weebly Inc,USA,portal.editmysite.com www.weebly.com,9.972,460 Bryant St San Francisco CA 94107 US, +1-415-375-3266 +1-510-771-7036,https://myip.ms/view/web_hosting/371569,
|
||||
90,Kddi Web Communications Inc,Japan,www.kddi-webcommunications.co.jp,9.781,Sumitomo Fudousan Kojimachi Building No. 3 3-6 Kojimachi Chiyoda-ku Tokyo 102-0083 JP, +81-3-3238-5780,https://myip.ms/view/web_hosting/19217,
|
||||
91,Contabo Gmbh,Germany,www.contabo.de,9.401,Aschauer Str. 32A 81549 Muenchen Germany, +49 89 21268372,https://myip.ms/view/web_hosting/89812,
|
||||
92,Siteground Chicago,USA,www.siteground.com,9.394,Racho Petkov Kazandjiata 8 Floor 3 Siteground, +359886660270 +442071839093,https://myip.ms/view/web_hosting/384220,
|
||||
93,P.d.r Solutions Fzc,United Arab Emirates,md-in-6.webhostbox.net,9.292,P.d.r Solutions Fzc F-20 Business Center 1 Business Park Rak Free Trade Zone Ras Al Khaimah, +1 .2013775952,https://myip.ms/view/web_hosting/604868,
|
||||
94,Nazwa.pl Sp.z.o.o,Poland,ajp6.rev.netart.pl,9.191,Ul. Cystersów 20A 31-553 Krakow Poland, +48 122 978 810,https://myip.ms/view/web_hosting/545716,
|
||||
95,Webclusters For Customers,Denmark,,9.176,One.com Kalvebod Brygge 24 7 Dk-1560 Copenhagen V Denmark, ,https://myip.ms/view/web_hosting/567444,
|
||||
96,Transip B.V,Netherlands,www.transip.nl,9.130,Schipholweg 9B 2316 Xb Leiden Netherlands, +31 71 5241919,https://myip.ms/view/web_hosting/108620,
|
||||
97,Selectel Ltd,Russia,mail.eaxyz.com,9.015,Russia Saint-Petersburg Cvetochnaya St. 21, +78127188036 +78126778036,https://myip.ms/view/web_hosting/408471,
|
||||
98,Avguro Technologies Ltd,Russia,www.avguro.ru,9.005,Avguro Technologies Ltd 18 912 Yunnatov Str 127083 Moscow Russia, +74 952293031,https://myip.ms/view/web_hosting/93301,
|
||||
99,Trabia Network,Moldova,www.streamboard.tv,8.773,I.c.s. Trabia-Network S.r.l Moldova, +373 22 994-994,https://myip.ms/view/web_hosting/604588,
|
||||
100,Paperboy&co. Inc,Japan,www.paperboy.co.jp,8.669,, ,https://myip.ms/view/web_hosting/96112,
|
||||
|
Binary file not shown.
1
src/DoresA/res/asns.json
Normal file
1
src/DoresA/res/asns.json
Normal file
File diff suppressed because one or more lines are too long
128457
src/DoresA/res/big.txt
Normal file
128457
src/DoresA/res/big.txt
Normal file
File diff suppressed because it is too large
Load Diff
6
src/DoresA/res/howto_asns.txt
Normal file
6
src/DoresA/res/howto_asns.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
get first 100 hosting providers from here https://myip.ms/info/top_hosting/TOP_Best_World_Web_Hosting_Companies_Real_Time_Statistics.html
|
||||
|
||||
- parse excel file
|
||||
- open detail page (column 7 or so)
|
||||
- extract asns from html
|
||||
- where needed (null value) open source code of detail page and manually extract asns
|
||||
41
src/DoresA/scripts/get_hosting_asn.py
Normal file
41
src/DoresA/scripts/get_hosting_asn.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import requests
|
||||
import json
|
||||
from lxml import html
|
||||
|
||||
# https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html
|
||||
# download excel file and export first sheet (report for 2017) as a proper csv file (need to remove some line breaks in cells)
|
||||
# beware of api limits ;)
|
||||
|
||||
asns = {}
|
||||
with open('../res/20170222_report.csv', 'rt') as file:
|
||||
for line in file:
|
||||
row = line.split(',')
|
||||
asns[row[0]] = {
|
||||
'url': row[7],
|
||||
'hoster': row[1],
|
||||
'asns': []
|
||||
}
|
||||
url = row[7]
|
||||
response = requests.get(url)
|
||||
root = html.fromstring(response.content)
|
||||
|
||||
# xpath of asn span(s) (if more then some 10 asns, not all asns are displayed here)
|
||||
# //*[@id="web_hosting857"]/table/tbody/tr/td[2]/div/span
|
||||
|
||||
# better xpath (all asns in url)
|
||||
# //*[@id="web_hosting857"]
|
||||
asd = root.xpath('//*[@id="web_hosting857"]')
|
||||
|
||||
if len(asd) == 0:
|
||||
print('no asn for hoster: ' + row[1])
|
||||
continue
|
||||
all_asns_link = asd[0].attrib['similar']
|
||||
|
||||
all_asns = all_asns_link.replace('/browse/ip_owners/1/asn/', '').replace('/asn_A/1', '').split('%5E')
|
||||
asns[row[0]]['asns'].extend(all_asns)
|
||||
|
||||
# debug
|
||||
print(row[0])
|
||||
|
||||
with open('../res/asns_new.json', 'w') as outfile:
|
||||
json.dump(asns, outfile)
|
||||
51
src/DoresA/spell.py
Normal file
51
src/DoresA/spell.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
|
||||
# http://norvig.com/spell-correct.html
|
||||
|
||||
def words(text):
    """Return all lowercase word tokens (``\\w+`` runs) found in *text*."""
    return re.findall(r'\w+', text.lower())
|
||||
|
||||
|
||||
# Word-frequency table built once from the corpus file; the `with` block
# closes the file handle (the previous bare open() leaked it).
with open('res/big.txt') as _corpus:
    WORDS = Counter(words(_corpus.read()))
|
||||
|
||||
|
||||
def P(word, N=sum(WORDS.values())):
    """Probability of `word`.

    `N` defaults to the total corpus token count, computed once at
    definition time.
    """
    frequency = WORDS[word]
    return frequency / N
|
||||
|
||||
|
||||
def correction(word):
    """Return the most probable spelling correction for `word`."""
    options = candidates(word)
    return max(options, key=P)
|
||||
|
||||
|
||||
def candidates(word):
    """Generate possible spelling corrections for word.

    Preference order: the word itself, then one-edit neighbours, then
    two-edit neighbours; fall back to the word when nothing is known.
    """
    for pool in ([word], edits1(word), edits2(word)):
        hits = known(pool)
        if hits:
            return hits
    return [word]
|
||||
|
||||
|
||||
def known(words):
    """The subset of `words` that appear in the dictionary of WORDS."""
    return {w for w in words if w in WORDS}
|
||||
|
||||
|
||||
def edits1(word):
    """All edits that are one edit away from `word`.

    Covers the four classic single-edit operations: delete, transpose,
    replace, and insert, over the lowercase ASCII alphabet.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for i in range(len(word) + 1):
        left, right = word[:i], word[i:]
        if right:
            results.add(left + right[1:])                        # delete
            for ch in alphabet:
                results.add(left + ch + right[1:])               # replace
        if len(right) > 1:
            results.add(left + right[1] + right[0] + right[2:])  # transpose
        for ch in alphabet:
            results.add(left + ch + right)                       # insert
    return results
|
||||
|
||||
|
||||
def edits2(word):
    """All edits that are two edits away from `word` (lazy generator)."""
    for once in edits1(word):
        yield from edits1(once)
|
||||
|
||||
|
||||
def check(word):
    """Check if 'word' is an english word (i.e. present in the corpus)."""
    # membership test on the Counter directly; the .keys() view adds nothing
    return word in WORDS
|
||||
@@ -17,18 +17,21 @@ import ip
|
||||
import ttl
|
||||
import csv_tools
|
||||
import progressbar
|
||||
import pickle
|
||||
import classify
|
||||
import config
|
||||
import traceback
|
||||
# import db_sql
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn import tree
|
||||
|
||||
logger = logging.getLogger('train')
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
db_format_time = '%Y-%m-%d %H:%M:%S'
|
||||
|
||||
train_start = datetime.date(2017, 5, 1)
|
||||
train_end = datetime.date(2017, 5, 4)
|
||||
db_format_time = config.db_format_time
|
||||
train_start = config.train_start
|
||||
train_end = config.train_end
|
||||
|
||||
id_upto = 379283817
|
||||
|
||||
@@ -38,17 +41,23 @@ record_types = ['A']
|
||||
|
||||
# id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
|
||||
|
||||
|
||||
def train():
|
||||
def generate_features_and_classify():
|
||||
start = time.time()
|
||||
logger.info('feature generation start: ' + str(start))
|
||||
|
||||
for day in range(csv_tools.analysis_days_amount):
|
||||
log_files_hour = csv_tools.get_log_files_for_hours_of_day(csv_tools.analysis_days[day])
|
||||
all_features = []
|
||||
all_classifications = []
|
||||
for day in range(config.analysis_days_amount):
|
||||
# TODO dev
|
||||
# log_files_hour = csv_tools.get_log_files_for_hours_of_day(config.analysis_days[day], gz=False)
|
||||
log_files_hour = csv_tools.get_log_files_for_hours_of_day(config.analysis_days[day])
|
||||
|
||||
progress_bar = progressbar.ProgressBar()
|
||||
|
||||
for hour in progress_bar(range(24)):
|
||||
for hour_files in log_files_hour[hour]:
|
||||
# TODO dev
|
||||
# with open(hour_files, 'rt') as file:
|
||||
with gzip.open(hour_files, 'rt', newline='') as file:
|
||||
reader = csv.reader(file)
|
||||
|
||||
@@ -56,59 +65,75 @@ def train():
|
||||
if row[2] in record_types:
|
||||
entity = {'timestamp': row[0], 'domain': row[1], 'type': row[2],
|
||||
'record': row[3], 'ttl': row[4]}
|
||||
|
||||
try:
|
||||
prepare_features_redis(entity)
|
||||
# pass
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
all_features.append(prepare_features_redis(entity))
|
||||
all_classifications.append(classify.is_malicious(entity['domain']))
|
||||
except Exception:
|
||||
logger.error(traceback.format_exc())
|
||||
logger.error('Exception occured processing entity: ' + str(entity))
|
||||
# break
|
||||
# break
|
||||
# break
|
||||
# break
|
||||
# iris = load_iris()
|
||||
# return iris.data, iris.target
|
||||
|
||||
logger.info('feature generation duration: ' + str(time.time() - start) + 's')
|
||||
return np.array(all_features), np.array(all_classifications)
|
||||
|
||||
|
||||
def get_logs_from_db():
|
||||
results = db.mariadb_get_logs(id_upto)
|
||||
def train():
|
||||
start = time.time()
|
||||
logger.info('training start: ' + str(start))
|
||||
|
||||
row = results.fetch_row(how=1)
|
||||
features, classification = generate_features_and_classify()
|
||||
|
||||
logger.debug("# entity: " + row[0]['domain'])
|
||||
# TODO save serialized features and classification
|
||||
|
||||
features = prepare_features_redis(row[0])
|
||||
decision_tree_model = tree.DecisionTreeClassifier()
|
||||
decision_tree_model = decision_tree_model.fit(features, classification) # training set, manual classification
|
||||
|
||||
logger.debug(str(features))
|
||||
# while row:
|
||||
# logger.debug("# entity: " + row[0]['domain'])
|
||||
#
|
||||
# features = prepare_features(row[0])
|
||||
#
|
||||
# logger.debug(str(features))
|
||||
#
|
||||
# row = results.fetch_row(how=1)
|
||||
# predict single or multiple sets with clf.predict([[]])
|
||||
|
||||
# visualize decision tree classifier
|
||||
dot_data = tree.export_graphviz(decision_tree_model, out_file=None)
|
||||
graph = graphviz.Source(dot_data)
|
||||
graph.render('plot' + datetime.datetime.now().strftime(config.format_date))
|
||||
|
||||
# dump trained decision tree classifier to file
|
||||
decision_tree_pkl_filename = 'dtc_' + datetime.datetime.now().strftime(config.format_date) + '.pkl'
|
||||
decision_tree_model_pkl = open(decision_tree_pkl_filename, 'wb')
|
||||
pickle.dump(decision_tree_model, decision_tree_model_pkl)
|
||||
decision_tree_model_pkl.close()
|
||||
|
||||
|
||||
def prepare_features_redis(entity):
|
||||
checkpoint = time.time()
|
||||
domain_stats = db_redis.get_stats_for_domain(entity['domain'])
|
||||
ip_stats = db_redis.get_stats_for_ip(entity['record'])
|
||||
logger.debug('redis took' + str(time.time() - checkpoint))
|
||||
logger.debug('redis took ' + str(time.time() - checkpoint) + ' s')
|
||||
|
||||
logger.debug(domain_stats)
|
||||
|
||||
if len(domain_stats) != 1:
|
||||
if not domain_stats:
|
||||
logger.debug('no stats in redis for entity: ' + entity)
|
||||
|
||||
domain_stats = domain_stats[0]
|
||||
|
||||
# TODO
|
||||
ips = []
|
||||
ips = db_redis.get_all_ips_for_domain(entity['domain'])
|
||||
|
||||
# feature 5: Number of distinct IP addresses
|
||||
logger.debug('all ips seen for domain ' + str(ips))
|
||||
|
||||
# feature 5: Number of distinct IP addresses (0)
|
||||
|
||||
distinct_ips = len(ips)
|
||||
|
||||
# feature 6: Number of distinct countries
|
||||
# feature 6: Number of distinct countries (1)
|
||||
|
||||
distinct_countries = len([ip.get_country_by_ip(ip_str) for ip_str in ips])
|
||||
distinct_countries = len(set([ip.get_country_by_ip(ip_str) for ip_str in ips]))
|
||||
|
||||
# feature 7: Number of (distinct) domains share the IP with
|
||||
# feature 7: Number of (distinct) domains share the IP with (2)
|
||||
|
||||
distinct_domains_with_same_ip = len(ip_stats)
|
||||
|
||||
@@ -117,24 +142,23 @@ def prepare_features_redis(entity):
|
||||
# 5 atomic feature
|
||||
|
||||
# atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
|
||||
# TODO not possible?
|
||||
ratio_ips_nx = 0
|
||||
|
||||
# atomic 2: ratio of ips that are used for DSL lines
|
||||
# TODO maxmind?
|
||||
ratio_ips_dsl = 0
|
||||
|
||||
# atomic 3: ratio of ips that belong to hosting services
|
||||
ratio_ips_hoster = 0
|
||||
ratio_ips_hoster = ip.ratio_ips_hoster(ips)
|
||||
|
||||
# atomic 4: ratio of ips that belong to known ISPs
|
||||
ratio_ips_isp = 0
|
||||
ratio_ips_isp = ip.ratio_ips_isp(ips)
|
||||
|
||||
# atomic 5: ips that can be matched with a valid domain name
|
||||
# TODO not possible?
|
||||
ratio_ips_valid = 0
|
||||
|
||||
# TODO add atomics to 'all_features'
|
||||
|
||||
reverse_dns_result = 0
|
||||
|
||||
# feature 9: Average TTL
|
||||
|
||||
average_ttl = sum(domain_stats['ttls']) / len(domain_stats['ttls'])
|
||||
@@ -153,8 +177,7 @@ def prepare_features_redis(entity):
|
||||
|
||||
# feature 13: Percentage usage of specific TTL ranges
|
||||
# specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
|
||||
# TODO check if 5 individual features make a difference
|
||||
specific_ttl_ranges = ttl.specific_range(entity['ttl'])
|
||||
ttl_range_0, ttl_range_1, ttl_range_2, ttl_range_3, ttl_range_4 = ttl.specific_ranges(domain_stats['ttls'])
|
||||
|
||||
# feature 14: % of numerical characters
|
||||
|
||||
@@ -166,14 +189,38 @@ def prepare_features_redis(entity):
|
||||
|
||||
all_features = np.array([
|
||||
distinct_ips, distinct_countries,
|
||||
distinct_domains_with_same_ip, reverse_dns_result, average_ttl, standard_deviation, distinct_ttl, ttl_changes,
|
||||
specific_ttl_ranges, numerical_characters_percent, lms_percent
|
||||
distinct_domains_with_same_ip, ratio_ips_nx, ratio_ips_dsl, ratio_ips_hoster,
|
||||
ratio_ips_isp, ratio_ips_valid,
|
||||
average_ttl, standard_deviation, distinct_ttl, ttl_changes,
|
||||
ttl_range_0, ttl_range_1, ttl_range_2, ttl_range_3, ttl_range_4,
|
||||
numerical_characters_percent, lms_percent
|
||||
])
|
||||
logger.debug(all_features)
|
||||
exit()
|
||||
# logger.debug(all_features)
|
||||
return all_features
|
||||
|
||||
|
||||
# TODO deprecated
|
||||
def get_logs_from_db():
|
||||
results = db_sql.mariadb_get_logs(id_upto)
|
||||
|
||||
row = results.fetch_row(how=1)
|
||||
|
||||
logger.debug("# entity: " + row[0]['domain'])
|
||||
|
||||
features = prepare_features_redis(row[0])
|
||||
|
||||
logger.debug(str(features))
|
||||
# while row:
|
||||
# logger.debug("# entity: " + row[0]['domain'])
|
||||
#
|
||||
# features = prepare_features(row[0])
|
||||
#
|
||||
# logger.debug(str(features))
|
||||
#
|
||||
# row = results.fetch_row(how=1)
|
||||
|
||||
|
||||
# TODO deprecated
|
||||
def prepare_features_mysql(entity):
|
||||
|
||||
checkpoint = time.time()
|
||||
@@ -306,29 +353,40 @@ def test():
|
||||
start = time.time()
|
||||
logger.info('starting training ' + str(start))
|
||||
|
||||
# generate_features_and_classify()
|
||||
train()
|
||||
|
||||
logger.info('total duration: ' + str(time.time() - start) + 's')
|
||||
cleanup()
|
||||
# cleanup()
|
||||
|
||||
# db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
|
||||
|
||||
|
||||
def flow():
|
||||
iris = load_iris()
|
||||
clf = tree.DecisionTreeClassifier()
|
||||
clf = clf.fit(iris.data, iris.target) # training set, manual classification
|
||||
decision_tree_model = tree.DecisionTreeClassifier()
|
||||
decision_tree_model = decision_tree_model.fit(iris.data, iris.target) # training set, manual classification
|
||||
|
||||
# predict single or multiple sets with clf.predict([[]])
|
||||
|
||||
# visualize decision tree classifier
|
||||
dot_data = tree.export_graphviz(clf, out_file=None)
|
||||
dot_data = tree.export_graphviz(decision_tree_model, out_file=None)
|
||||
graph = graphviz.Source(dot_data)
|
||||
graph.render('test', view=True)
|
||||
graph.render('plot', view=True)
|
||||
|
||||
# dump trained decision tree classifier to file
|
||||
decision_tree_pkl_filename = 'dtc_' + datetime.datetime.now().strftime(config.format_date) + '.pkl'
|
||||
decision_tree_model_pkl = open(decision_tree_pkl_filename, 'wb')
|
||||
pickle.dump(decision_tree_model, decision_tree_model_pkl)
|
||||
decision_tree_model_pkl.close()
|
||||
|
||||
# load serialized model
|
||||
decision_tree_model_pkl = open(decision_tree_pkl_filename, 'rb')
|
||||
decision_tree_model = pickle.load(decision_tree_model_pkl)
|
||||
|
||||
|
||||
def cleanup():
|
||||
db.close()
|
||||
db_sql.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -21,24 +21,25 @@ def changes(array):
|
||||
|
||||
|
||||
# specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
|
||||
def specific_range(ttl):
|
||||
specific_ttl_ranges = 4 # default is [900, inf]
|
||||
def specific_ranges(ttls):
|
||||
range_0 = False
|
||||
range_1 = False
|
||||
range_2 = False
|
||||
range_3 = False
|
||||
range_4 = False
|
||||
|
||||
try:
|
||||
ttl = int(ttl)
|
||||
except ValueError:
|
||||
logger.error('ttl not a number')
|
||||
return specific_ttl_ranges
|
||||
|
||||
if 0 < ttl <= 1:
|
||||
specific_ttl_ranges = 0
|
||||
elif 1 < ttl <= 100:
|
||||
specific_ttl_ranges = 1
|
||||
elif 100 < ttl <= 300:
|
||||
specific_ttl_ranges = 2
|
||||
elif 300 < ttl <= 900:
|
||||
specific_ttl_ranges = 3
|
||||
return specific_ttl_ranges
|
||||
for ttl in ttls:
|
||||
if 0 < ttl <= 1:
|
||||
range_0 = True
|
||||
elif 1 < ttl <= 100:
|
||||
range_1 = True
|
||||
elif 100 < ttl <= 300:
|
||||
range_2 = True
|
||||
elif 300 < ttl <= 900:
|
||||
range_3 = True
|
||||
elif ttl > 900:
|
||||
range_4 = True
|
||||
return range_0, range_1, range_2, range_3, range_4
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
#cd /run/media/felix/AE7E01B77E01797B/pDNS;
|
||||
cd /home/felix/sources/MastersThesis/src/DoresA/data;
|
||||
month="04"
|
||||
cd /run/media/felix/AE7E01B77E01797B/pDNS;
|
||||
#cd /home/felix/sources/MastersThesis/src/DoresA/data;
|
||||
month="10"
|
||||
machine="sgsgpdc0n9x" # set empty for all (sgsgpdc0n9x|usmlvdc010x|demchdc902n)
|
||||
|
||||
for i in {01..31}; do
|
||||
if compgen -G *"2017-$month-$i"* > /dev/null; then
|
||||
echo -n -e "day $i \t size: ";
|
||||
echo -n -e $(du -ch *"2017-$month-$i"* | tail -1) " \t #files: ";
|
||||
ls *"2017-$month-$i"* | wc -l;
|
||||
echo "$machine"
|
||||
for i in {01..31}; do
|
||||
if compgen -G *"$machine-2017-$month-$i"* > /dev/null; then
|
||||
echo -n -e "day $i \t size: ";
|
||||
echo -n -e $(du -ch *"$machine-2017-$month-$i"* | tail -1) " \t #files: ";
|
||||
ls *"$machine-2017-$month-$i"* | wc -l;
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
24
todo.txt
Normal file
24
todo.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
übersicht bild architektur
|
||||
filter lists vergrößern?
|
||||
|
||||
pub key schicken rechner im grünen netz lauffähig machen
|
||||
fst@janus.cert.siemens.com
|
||||
|
||||
größe der files ungewöhnlich
|
||||
verlängerung gerlinde
|
||||
|
||||
dependencies:
|
||||
** make sure the development packages of libxml2 and libxslt are installed **
|
||||
SystemError: Cannot compile 'Python.h'. Perhaps you need to install python-dev|python-devel
|
||||
(opt): ImportError: The 'enchant' C library was not found. Please install it via your OS package manager, or use a pre-built binary wheel from PyPI
|
||||
|
||||
==> apt install libxml2-dev libxslt1-dev python-dev enchant
|
||||
#scipy, enchant
|
||||
==> libatlas-base-dev gfortran enchant
|
||||
|
||||
pfad zu logs: /mnt/old/2017
|
||||
|
||||
nx_domains: ips nicht in redis? -> wahrsch (aufbau nxdomain liste [timestamp, record type, domain/ip?])
|
||||
|
||||
dsl: https://www.maxmind.com/de/geoip2-connection-type-database
|
||||
|
||||
Reference in New Issue
Block a user