complete flow possible?!
@@ -2,22 +2,43 @@
 def load_whitelist():
     filename = 'res/benign_domains.txt'
-    return open(filename).read().splitlines()
+    whitelist = []
+    for item in open(filename).read().splitlines():
+        if item not in whitelist:
+            whitelist.append(item)
+    return whitelist
 
 
 def load_blacklist():
     filename = 'res/malicious_domains.txt'
-    return open(filename).read().splitlines()
+    blacklist = []
+    for item in open(filename).read().splitlines():
+        # eliminate duplicates. If domain both in white and black,
+        # do not add to black (as EXPOSURE is handling)
+        if item not in blacklist and item not in whitelist:
+            blacklist.append(item)
+    return blacklist
 
 
-# TODO check if white and blacklists have to be stripped to tld (subdomains)
-# TODO also check if subdomains are located in db
+def is_malicious(domain):
+    return 1.0 if domain in blacklist else .0
+
+
 def test():
-    filter_list = load_whitelist()
+    print('blacklist length: ' + str(len(blacklist)))
 
-    for item in filter_list:
-        print(item.count('.'))
+    # dupes = [x for n, x in enumerate(whitelist) if x in whitelist[:n]]
+    # print(dupes)
 
+    # domain contained in both benign and malicious
+    # for domain in blacklist:
+    #     if domain in whitelist:
+    #         print(domain + ' found both in black and white')
+    pass
 
 
+whitelist = load_whitelist()
+blacklist = load_blacklist()
+
 if __name__ == "__main__":
     test()
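Review note on the hunk above: the rewritten load_whitelist/load_blacklist keep first occurrences, but testing membership with `item not in list` is O(n²) over large filter lists. A minimal order-preserving alternative (a sketch, not part of the commit):

# Hypothetical O(n) variant using dict key deduplication (Python 3.7+ keeps
# insertion order); same 'res/benign_domains.txt' path as in the commit.
def load_whitelist():
    filename = 'res/benign_domains.txt'
    with open(filename) as f:
        return list(dict.fromkeys(f.read().splitlines()))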
src/DoresA/config.py (new file, +15)
@@ -0,0 +1,15 @@
+import datetime
+
+db_format_time = '%Y-%m-%d %H:%M:%S'
+format_date = '%Y-%m-%d'
+train_start = datetime.date(2017, 9, 1)
+train_end = datetime.date(2017, 9, 7)
+
+analysis_start_date = datetime.date(2017, 9, 1)
+analysis_days_amount = 7
+#pdns_logs_path = '/home/felix/pdns/'
+pdns_logs_path = '/mnt/old/2017'
+
+# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
+analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime(format_date) for x in
+                 range(analysis_days_amount)]
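For reference, the analysis_days comprehension expands the configured window into date strings; a quick check with the values above:

import datetime

start = datetime.date(2017, 9, 1)
days = [(start + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(7)]
# days == ['2017-09-01', '2017-09-02', '2017-09-03', '2017-09-04',
#          '2017-09-05', '2017-09-06', '2017-09-07']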
@@ -2,26 +2,25 @@ import csv
 import gzip
 import glob
 import time
-import datetime
-import os
+import config
 import logging
 import progressbar
 # import db_sql
 
 logger = logging.getLogger('csv')
-logger.setLevel(logging.INFO)
+logger.setLevel(logging.DEBUG)
 
-analysis_start_date = datetime.date(2017, 9, 1)
-analysis_days_amount = 7
-pdns_logs_path = '/home/felix/pdns/'
+analysis_start_date = config.analysis_start_date
+analysis_days_amount = config.analysis_days_amount
+pdns_logs_path = config.pdns_logs_path
 
 # e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
-analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
-                 range(analysis_days_amount)]
+analysis_days = config.analysis_days
 
 
 def iterate_logs():
     start = time.time()
+    logger.info('iterate logs: ' + str(start))
 
     for day in range(analysis_days_amount):
         log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
@@ -34,33 +33,28 @@ def iterate_logs():
             reader = csv.reader(file)
 
             for row in reader:
-                logger.info('loaded row: ' + str(row))
+                logger.debug('loaded row: ' + str(row))
+    logger.info('iterate logs duration: ' + str(time.time() - start) + 's')
 
 
 def serialize_logs_to_db():
-    # check_duplicates() TODO readd
     start = time.time()
 
     print('starting analysis ' + str(start))
 
-    distinct_ttl_count = {}
-    # everything = {}
+    # for log_file in ['data/pdns_capture.pc
 
-    # for log_file in ['data/pdns_capture.pcap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']:
+    # TODOap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']:
 
     for day in range(analysis_days_amount):
         log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
         # everything[day] = {}
 
-        progress_bar = Bar(analysis_days[day], max=24)
+        progress_bar = progressbar.ProgressBar()
 
-        for hour in range(24):
+        for hour in progress_bar(range(24)):
             progress_bar.next()
-            # everything[day][hour] = {}
             for hour_files in log_files_hour[hour]:
-                # a bit faster, 10-15% (but pandas overhead)
-                # df = pandas.read_csv(log_file, compression='gzip', header=None)
-                # print(df.iloc[0])
                 with gzip.open(hour_files, 'rt', newline='') as file:
                     reader = csv.reader(file)
                     all_rows = list(reader)
@@ -75,8 +69,6 @@ def serialize_logs_to_db():
                     # db_sql.mariadb_insert_log(log_entry)
                     # # db_mongo.mongodb_insert_log(log_entry)
 
-        progress_bar.finish()
-
     print('total duration: ' + str(time.time() - start) + 's')
     db_sql.close()
 
@@ -87,20 +79,19 @@ def batch(iterable, n=1):
         yield iterable[ndx:min(ndx + n, length)]
 
 
-def check_duplicates():
-    days_cumulated = 0
-
-    for day in analysis_days:
-        days_cumulated += len(get_log_files_for_day(day))
-
-    all_logs = len(get_log_files_for_day(''))
-
-    if days_cumulated != all_logs:
-        raise Exception('Log files inconsistency')
+# def check_duplicates():
+#     days_cumulated = 0
+#
+#     for day in analysis_days:
+#         days_cumulated += len(get_log_files_for_day(day))
+#
+#     all_logs = len(get_log_files_for_day(''))
+#
+#     if days_cumulated != all_logs:
+#         raise Exception('Log files inconsistency')
 
 
-# TODO
-def get_log_files_for_range_of_day(date, minutes_range):
+def get_log_files_for_range_of_day(date, minutes_range, gz=True):
     slot_files = {}
     slots_amount = int(1440 / minutes_range)
 
@@ -109,21 +100,21 @@ def get_log_files_for_range_of_day(date, minutes_range):
         hours, minutes = divmod(total_mins, 60)
 
         time_range = '%02d-%02d' % (hours, minutes)
-        slot_files[slot] = 'data/*' + date + '_' + time_range + '*.csv.gz'
+        slot_files[slot] = 'data/*' + date + '_' + time_range + '*.csv' + ('.gz' if gz else '')
 
 
-def get_log_files_for_hours_of_day(date):
+def get_log_files_for_hours_of_day(date, gz=True):
     slot_files = {}
     slots_amount = 24
 
     for slot in range(slots_amount):
-        slot_files[slot] = glob.glob(pdns_logs_path + '*' + date + '_' + ('%02d' % slot) + '*.csv.gz')
+        slot_files[slot] = glob.glob(pdns_logs_path + '*' + date + '_' + ('%02d' % slot) + '*.csv' + ('.gz' if gz else ''))
 
     return slot_files
 
 
-def get_log_files_for_day(date):
-    log_files = 'data/*' + date + '*.csv.gz'
+def get_log_files_for_day(date, gz=True):
+    log_files = 'data/*' + date + '*.csv.gz' + ('.gz' if gz else '')
 
     return glob.glob(log_files)
 
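Review note: unlike the other two helpers, the new get_log_files_for_day keeps the old '.csv.gz' literal and appends another ('.gz' if gz else ''), so the pattern ends in '.csv.gz.gz' when gz=True. The variant below mirrors get_log_files_for_hours_of_day and is presumably what was intended (an inferred fix, not in the commit):

import glob

# Hypothetical correction, not committed: single extension toggle.
def get_log_files_for_day(date, gz=True):
    log_files = 'data/*' + date + '*.csv' + ('.gz' if gz else '')
    return glob.glob(log_files)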
@@ -74,6 +74,25 @@ def get_stats_for_domain(rrname, rrtype='A'):
         logger.error(e)
 
 
+def get_all_ips_for_domain(rrname):
+    redis_r = Redis(redis_host, port=redis_port_reverse)
+
+    # remove trailing slash
+    rrname = rrname.rstrip('/')
+    rrtype = 'A'
+
+    try:
+        all_ips = []
+        for res in redis_r.smembers('r:{}:{}'.format(rrname, rrtype)):
+            ip = unpack('>L', res)[0]
+            ip = '.'.join([str(tuple) for tuple in ((ip & (0xff << 8 * i)) >> 8 * i for i in range(4))])
+            all_ips.append(ip)
+
+        return all_ips
+    except RedisError as e:
+        logger.error(e)
+
+
 def get_stats_for_ip(rdata):
     redis_v = Redis(redis_host, port=redis_port_4)
 
@@ -104,8 +123,10 @@ def get_stats_for_ip(rdata):
 
 
 def test():
-    asd = get_stats_for_domain('ZN015105.ppp.dion.ne.jp')
-    # asd = get_stats_for_ip('172.217.27.14')
+    # asd = get_stats_for_domain('ZN015105.ppp.dion.ne.jp')
+    # asd = get_all_ips_for_domain('google.de')
+    asd = get_stats_for_ip('172.217.27.14')
+    print(len(asd))
     logger.info(asd)
 
 
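Review note on get_all_ips_for_domain: the join walks bytes from least to most significant, so combined with unpack('>L', ...) the octets come out reversed. If the Redis set members are raw 4-byte addresses in network order, the standard library already covers this (a sketch under that assumption):

import socket

res = b'\x08\x08\x04\x04'   # example member, assumed network byte order
socket.inet_ntoa(res)       # '8.8.4.4'
# the committed expression yields '4.4.8.8' for the same four bytes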
@@ -1,13 +1,17 @@
-import enchant
+# import enchant
 import numpy as np
+import spell
 
-# check if dictionary is installed: $aspell dicts (or enchant.list_languages() in python)
+# check if dictionary is installed: $ aspell dicts (or enchant.list_languages() in python)
 # if not, check http://pythonhosted.org/pyenchant/tutorial.html
-dictionary = enchant.Dict('en_US')
+# dictionary = enchant.Dict('en_US')
 
+# TODO readd enchant?!
+
 
 def check_if_english_word(string):
-    return dictionary.check(string)
+    # return dictionary.check(string)
+    return spell.check(string)
 
 
 # TODO strip of protocol and TLD (if needed)
@@ -19,7 +23,7 @@ def find_longest_meaningful_substring(string):
         for j in range(i+1, len(string)):
             if min_length <= (j + 1 - i) <= max_length:
                 substring = string[i:j+1]
-                if dictionary.check(substring):
+                if check_if_english_word(substring):
                     if len(match) < len(substring):
                         match = substring
     return match
@@ -32,7 +36,7 @@ def ratio_lms_to_fqdn(string):
 
 
 def test():
-    print(ratio_lms_to_fqdn('www.google.de'))
+    print(ratio_lms_to_fqdn('www.hallo.de'))
     exit()
 
 
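Review note: WORDS in the new spell module (added further down in this commit) is built from lower-cased tokens, while spell.check compares the raw string, so mixed-case input such as 'Google' would be reported as non-English. A lower-casing wrapper (an assumption about the intent, not committed):

# Hypothetical guard, not in the commit:
def check_if_english_word(string):
    return spell.check(string.lower())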
@@ -1,15 +1,29 @@
 import re
 import logging
+import json
 from geoip2 import database, errors
 
 logger = logging.getLogger('ip')
 logger.setLevel(logging.DEBUG)
 
+top_100_hosters = json.load(open('res/asns.json'))
+top_100_hosters_asns = []
+for hoster in top_100_hosters.values():
+    top_100_hosters_asns.extend(hoster['asns'])
+
+
+def is_hoster_ip(ip):
+    return str(get_isp_by_ip(ip)) in top_100_hosters_asns
+
+
+# if specific country not available in database take continent instead
 def get_country_by_ip(ip):
     with database.Reader('res/GeoLite2-Country_20170905/GeoLite2-Country.mmdb') as reader:
         result = reader.country(ip)
-        return result.country.names['en']
+        if not result.country:
+            return result.continent.geoname_id
+        else:
+            return result.country.geoname_id
 
 
 def get_isp_by_ip(ip):
@@ -21,8 +35,35 @@ def get_isp_by_ip(ip):
         logger.debug('address not in isp database')
 
 
+def ratio_ips_hoster(ips):
+    hosted = []
+    not_hosted = []
+    for ip in ips:
+        if is_hoster_ip(ip):
+            hosted.append(ip)
+        else:
+            not_hosted.append(ip)
+    return float(len(hosted)) / (len(hosted) + len(not_hosted))
+
+
+def ratio_ips_isp(ips):
+    known = []
+    unknown = []
+    for ip in ips:
+        if get_isp_by_ip(ip):
+            known.append(ip)
+        else:
+            unknown.append(ip)
+    return float(len(known)) / (len(known) + len(unknown))
+
+
 def test():
-    print(get_isp_by_ip('178.27.82.37'))
+    #print(is_hoster_ip('208.97.151.195'))
+    print(get_country_by_ip('80.146.228.87'))
+    # print(is_hoster_ip('8.8.8.8'))
+    # print(get_isp_by_ip('65.254.244.120'))
+    # print(ratio_ips_isp(['205.204.101.47', '198.11.132.53', '1.1.1.1']))
+    pass
 
 
 # proudly taken from https://stackoverflow.com/questions/319279/how-to-validate-ip-address-in-python
@@ -86,7 +127,7 @@ def is_valid_ipv6(ip):
     | (?<!:)              #
     | (?<=:) (?<!::) :    #
     )                     # OR
     |                     # A v4 address with NO leading zeros
     (?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)
     (?: \.
         (?:25[0-4]|2[0-4]\d|1\d\d|[1-9]?\d)
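Review note: both new ratio helpers divide by len(hosted) + len(not_hosted), i.e. len(ips), so an empty IP list raises ZeroDivisionError. A guarded equivalent (the 0.0 default is an assumption, not committed):

# Hypothetical guard, not in the commit: define the ratio as 0.0 for no IPs.
def ratio_ips_hoster(ips):
    if not ips:
        return .0
    return float(sum(1 for ip in ips if is_hoster_ip(ip))) / len(ips)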
@@ -5,22 +5,21 @@ cycler==0.10.0
 geoip2==2.6.0
 graphviz==0.8
 idna==2.6
-javabridge==1.0.15
+#javabridge==1.0.15
+lxml==4.1.1
 matplotlib==2.0.2
-maxminddb==1.3.0
 maxminddb-geolite2==2017.803
-mysqlclient==1.3.12
+#mysqlclient==1.3.12
 nltk==3.2.5
 numpy==1.13.1
-pandas==0.20.3
 progressbar2==3.34.3
 pyenchant==1.6.11
-pymongo==3.5.1
+#pymongo==3.5.1
 pyparsing==2.2.0
 python-dateutil==2.6.1
 python-geoip==1.2
 python-utils==2.2.0
-python-weka-wrapper3==0.1.3
+#python-weka-wrapper3==0.1.3
 pytz==2017.2
 redis==2.10.6
 requests==2.18.4
src/DoresA/res/20170222_report.csv (new file, +100)
@@ -0,0 +1,100 @@
+1,Godaddy.com LLC,USA,www.godaddy.com,390.672,14455 N Hayden Road Suite 226 Scottsdale AZ 85260 US, +1-480-505-8809,https://myip.ms/view/web_hosting/2433,
+2,Cloudflare Inc,USA,www.cloudflare.com,305.278,665 Third Street #207 San Francisco CA 94107 US, +1-650-319-8930,https://myip.ms/view/web_hosting/4638,
+3,Amazon.com Inc,USA,www.amazonaws.com,285.105,1200 12Th Avenue South Seattle WA 98144 US, +1-206-266-4064,https://myip.ms/view/web_hosting/615,
+4,Hostgator.com Llc,USA,www.hostgator.com,204.779,11251 Northwest Freeway Suite 400 Houston TX 77092 US, +1-866-964-2867 +1-832-443-1710,https://myip.ms/view/web_hosting/26757,
+5,Hetzner Online Ag,Germany,www.hetzner.de,174.990,Hetzner Online Ag Stuttgarter Str 1 D-91710 Gunzenhausen Germany, +49 9831 61 00 61,https://myip.ms/view/web_hosting/45081,
+6,Ovh Sas,France,www.ovh.com,168.907,140 Quai Du Sartel 59100 Roubaix France, +33 9 7453 1323 +33 3 2020 0957,https://myip.ms/view/web_hosting/7593,
+7,Bluehost Inc,USA,https://www.bluehost.com,130.372,1958 South 950 East Provo UT 84606 US, +1-801-765-9400,https://myip.ms/view/web_hosting/3886,
+8,Rackspace Hosting,USA,www.rackspace.com,102.826,5000 Walzem Road San Antonio TX 78218 US, +1-210-892-4000,https://myip.ms/view/web_hosting/486,
+9,Google Inc,USA,sites.google.com,96.064,1600 Amphitheatre Parkway Mountain View CA 94043 US, +1-650-253-0000,https://myip.ms/view/web_hosting/617,
+10,Aliyun Computing Co. Ltd,China,www.aliyun.com,82.460,No.391 Wen'er Road Hangzhou Zhejiang 310099 China, +86-0571-85022600 +86-0571-85022088,https://myip.ms/view/web_hosting/179248,
+11,Trellian Pty. Limited,Australia,www.trellian.com,70.588,8 East Concourse Beaumaris Victoria 3193 Australia, +61 395897946,https://myip.ms/view/web_hosting/419061,
+12,Confluence Networks Inc,Virgin Islands British,www.confluence-networks.com,68.897,3Rd Floor Omar Hodge Building Wickhams Cay I P.o. Box 362 Road Town Tortola Vg1110 VG, +1-415-462-7734,https://myip.ms/view/web_hosting/14729,
+13,Digital Ocean Inc,USA,www.digitalocean.com,64.479,270 Lafayette St Suite 1206 New York NY 10012 US, +1-347-985-0306,https://myip.ms/view/web_hosting/38903,
+14,Sakura Internet Inc,Japan,www.sakura.ne.jp,60.301,1-8-14 Minami Honmachi Chuo-ku Osaka 541-0054 Japan, +81-3-5297-2311,https://myip.ms/view/web_hosting/5654,
+15,Team Internet Ag,Germany,www.teaminternet.de,54.803,Team Internet Ag Lindwurmstr. 25 80337 / Muenchen Germany, +49-89-416146-010,https://myip.ms/view/web_hosting/91136,
+16,Liquid Web Inc,USA,www.liquidweb.com,54.523,4210 Creyts Rd. Lansing MI 48917 US, +1-800-580-4985,https://myip.ms/view/web_hosting/6574,
+17,1&1 Internet Ag,Germany,www.1und1.de,54.439,1&1 Internet Ag Axel Fischer Brauerstr. 48 76135 Karlsruhe Germany, +49 721 91374 0,https://myip.ms/view/web_hosting/18944,
+18,Rook Media GmbH,Switzerland,www.rookmedia.net,51.888,Alte Landstrasse 131 8800 Thalwil Zurich Switzerland, +41 788905326,https://myip.ms/view/web_hosting/21083,
+19,Linode,USA,www.linode.com,48.337,329 E. Jimmie Leeds Road Suite A Galloway NJ 08205 US, +1-609-593-7103,https://myip.ms/view/web_hosting/3220,
+20,Ovh Hosting Inc,Canada,www.ovh.ca,48.138,625 Avenue Du President Kennedy Bureau 310 Montreal QC H3A 1K2 Canada, +1-855-684-5463,https://myip.ms/view/web_hosting/1232,
+21,New Dream Network LLC,USA,www.dreamhost.com,47.665,417 Associated Rd. Pmb #257 Brea CA 92821 US, +1-424-646-4949,https://myip.ms/view/web_hosting/13529,
+22,Akamai Technologies Inc,USA,www.akamai.com,40.582,8 Cambridge Center 02142 Cambridge MA United States, +1 617 4444768,https://myip.ms/view/web_hosting/115822,
+23,The Endurance International Group Inc,USA,www.maileig.com,40.255,70 Blanchard Road Burlington MA 01803 US, +1-781-852-3254,https://myip.ms/view/web_hosting/6777,
+24,Xserver Inc,Japan,https://www.xserver.co.jp,36.678,Karasuma-Building6F Mikura-Cho 85-1 Karasumanishiiru Sanjodori Nakagyo-Ku Kyoto-City JP, +81-3-5297-2311,https://myip.ms/view/web_hosting/20749,
+25,Aruba S.p.a,Italy,www.aruba.it,36.661,Aruba S.p.A Loc. Palazzetto 4 52011 Bibbiena Stazione - Arezzo Italy, +39 0575 0505,https://myip.ms/view/web_hosting/58626,
+26,Korea Telecom,South Korea,www.kt.com/eng/main.jsp,35.569,206 Jungja-Dong Bundang-Gu Sungnam-Ci 463-711, +82 2-500-6630,https://myip.ms/view/web_hosting/3284,
+27,Peer 1 Network (USA) Inc,USA,www.peer1.net,34.401,75 Broad Street 2Nd Floor New York NY 10004 US, +1-604-683-7747 +1-604-484-2588,https://myip.ms/view/web_hosting/198987,
+28,Softlayer Technologies Inc,USA,www.softlayer.com,32.270,4849 Alpha Rd. Dallas TX 75244 US, +1-214-442-0601,https://myip.ms/view/web_hosting/2165,
+29,Web - Hosting.com,USA,www.web-hosting.com,32.108,11400 W. Olympic Blvd Suite 200 Los Angeles CA 90064 US, ,https://myip.ms/view/web_hosting/86913,
+30,Media Temple Inc,USA,www.mediatemple.net,31.889,8520 National Blvd. Building B Culver City CA 90232 US, +1-877-578-4000,https://myip.ms/view/web_hosting/6922,
+31,Reg.ru Ltd,Russia,www.reg.ru,31.503,Vassily Petushkova St. House 3 Office 326 Moscow 125476 Russia, +7 495 580-11-11,https://myip.ms/view/web_hosting/90869,
+32,Strato Ag,Germany,www.strato.de,29.782,Strato Ag Pascalstr. 10 D-10587 Berlin Germany, +49 30 39802-0,https://myip.ms/view/web_hosting/82506,
+33,Shopify Inc,Canada,www.shopify.com,28.480,111 W. Jackson Blvd. Suite 1600 150 Elgin Street 8Th Floor Ottawa ON K2P 1L4 CA, +1-888-746-7439,https://myip.ms/view/web_hosting/376714,
+34,Online Sas,France,www.online.net,27.612,18 Rue De Londres 75441 Paris France, +33 1 78 56 90 00,https://myip.ms/view/web_hosting/19052,
+35,1&1 Internet Inc,USA,www.1and1.com,27.313,701 Lee Rd Suite 300 Chesterbrook PA 19087 US, +1-610-560-1617,https://myip.ms/view/web_hosting/4927,
+36,Gmo Pepabo Inc,Japan,https://pepabo.com,27.207,asd, +81 3 5456 2622,https://myip.ms/view/web_hosting/639855,
+37,Host Europe Gmbh,Germany,www.hosteurope.de,27.117,Host Europe Gmbh Welserstrasse 14 51149 Cologne Germany, +49 2203 1045 0,https://myip.ms/view/web_hosting/2727,
+38,Neue Medien Muennich Gmbh,Germany,www.all-inkl.com,26.640,Neue Medien Muennich Gmbh Hauptstrasse 68 D-02742 Friedersdorf Germany, +49 35872 353 10,https://myip.ms/view/web_hosting/15651,
+39,Enom Inc,USA,www.enom.com,26.223,5808 Lake Washington Blvd. Suite 300 Kirkland WA 98033 US, +1-310-917-6444,https://myip.ms/view/web_hosting/1186,
+40,Level 3 Communications Inc,USA,www.level3.com,24.294,1025 Eldorado Blvd. Broomfield CO 80021 US, +1-877-453-8353 +1-303-414-5000,https://myip.ms/view/web_hosting/99,
+41,Locaweb Serviços De Internet S/a,Brazil,www.locaweb.com.br,23.486,Rua Fradique Coutinho 50 – 13 anda, +55 11 3544-0555,https://myip.ms/view/web_hosting/6859,
+42,Leaseweb B.V,Netherlands,www.leaseweb.com,22.993,Ocom B.V. P.o. Box 93054 1090 Bb Amsterdam Netherlands, +31203162880 +31 20 3162880,https://myip.ms/view/web_hosting/47654,
+43,Domainfactory Gmbh,Germany,www.df.eu,22.365,Domainfactory Gmbh Oskar-Messter-Str. 33 85737 Ismaning DE, +49 89 55266 112,https://myip.ms/view/web_hosting/47315,
+44,Beget Ltd,Russia,www.beget.ru,22.002,2-Line Vasilevsky Island Building 5 Letter A Of. 11N St. Petersburg 199034 Russia, +7 812 6408088,https://myip.ms/view/web_hosting/92924,
+45,Microsoft Corporation,USA,www.microsoft.com,21.804,One Microsoft Way Redmond WA 98052 US, +1-425-882-8080 +1-425-936-7382 +1-425-706-2751 +1-352-278-8979,https://myip.ms/view/web_hosting/198615,
+46,Rook Media Usa Inc,USA,,21.375,1 Meadow Road Suite 210 Florida NY 10921 US, +1-469-269-2764,https://myip.ms/view/web_hosting/364805,
+47,TimeWeb Co. Ltd,Russia,www.timeweb.ru,20.822,22A Zastavskaya Str Saint-Petersburg 196084 Russia, +7 812 2441081,https://myip.ms/view/web_hosting/27148,
+48,PDR,USA,bh-44.webhostbox.net 1,19.631,P.d.r Solutions Llc 10 Corporate Drive Suite 300 Burlington MA 01803 US, +1-201-377-5952,https://myip.ms/view/web_hosting/523753,
+49,Squarespace Inc,USA,,19.028,225 Varick St New York NY 10014 US, +1-347-758-4644,https://myip.ms/view/web_hosting/641261,
+50,Sk Broadband Co Ltd,South Korea,www.skbroadband.com,18.676,267 Seoul Namdaemunno 5(O)-Ga Jung-Gu Sk Namsangreen Bldg 100-711, +82-2-3473-0094,https://myip.ms/view/web_hosting/1788,
+51,Gmo Internet Inc,Japan,www.gmo.jp,18.602,Cerulean Tower Sakuragicho 26-1 Shibyua-Ku 150-8512 Tokyo Japan JP, +81-3-5297-2311,https://myip.ms/view/web_hosting/11234,
+52,Athenix Inc,USA,www.athenixinc.com,18.122,523 W 6Th St. Los Angeles CA 90014 US, +1-213-536-4767,https://myip.ms/view/web_hosting/118741,
+53,Lg Dacom Corporation,South Korea,www.lguplus.co.kr,17.318,Dacom Bldg. 706-1 Yoeksam-Dong Kangnam-Ku Seoul South Korea, +82-2-6440-2925 +82-2-2089-7755,https://myip.ms/view/web_hosting/95702,
+54,Automattic Inc,USA,www.automattic.com,16.952,60 29Th Street #343 San Francisco CA 94110 US, +1 877-273-8550,https://myip.ms/view/web_hosting/3435,
+55,1&1 Internet Ag,Poland,,16.876,1&1 Internet Se, ,https://myip.ms/view/web_hosting/557035,
+56,Singlehop Inc,USA,www.singlehop.com,16.851,215 W. Ohio St. 5Th Floor Chicago IL 60654 US, +1-866-817-2811,https://myip.ms/view/web_hosting/728,
+57,Incapsula Inc,USA,www.incapsula.com,16.186,3400 Bridge Parkway Suite 200 Redwood Shores CA 94065, +1 (866) 250 7659,https://myip.ms/view/web_hosting/15021,
+58,Home.pl S.a,Poland,www.home.pl,16.067,Home.pl S.a Plac Rodla 9 70-419 Szczecin Poland, +48 801 44 55 55,https://myip.ms/view/web_hosting/93632,
+59,Hostdime.com Inc,USA,www.hostdime.com,16.048,189 South Orange Avenue Suite 1500S Orlando FL 32801 US, +1 407-756-1126,https://myip.ms/view/web_hosting/1401,
+60,Alibaba.com Llc,USA,www.alibaba-inc.com,16.045,3945 Freedom Circle Suite 600 Santa Clara CA 95054 US, +1-408-748-1200,https://myip.ms/view/web_hosting/14483,
+61,Namecheap Inc,USA,www.namecheap.com,14.526,11400 W. Olympic Blvd. Suite 200 Los Angeles CA 90064 US, +1-323-375-2822 +1-661-310-2107,https://myip.ms/view/web_hosting/17640,
+62,Hetzner (Pty) Ltd,South Africa,www.hetzner.co.za,14.295,Frazzitta Business Park Cnr Battis And Langeberd Road Durbanville Cape Town 7550 South Africa, +27 21 970 2000,https://myip.ms/view/web_hosting/21661,
+63,Chinanet Guangdong Province Network,China,www.189.cn,14.045,No.31 Jingrong Street Beijing 100032, +86-20-83877223 +86-10-58501724,https://myip.ms/view/web_hosting/883,
+64,Heart Internet Ltd,United Kingdom,www.heartinternet.co.uk,13.899,2 Castle Quay Castle Boulevard Nottingham. Ng7 1Fw, +44 845 644 7750,https://myip.ms/view/web_hosting/16055,
+65,Masterhost Cjsc,Russia,www.masterhost.ru,13.590,Masterhost Ostapovskiy Proezd 3 27/29 109316 Moscow Russian Federation, +7 495 7729720,https://myip.ms/view/web_hosting/14883,
+66,KnownHost Llc,USA,www.knownhost.com,13.348,1379 Dilworthtown Crossing Suite 214 West Chester PA 19382 USA, +1-866-332-9894,https://myip.ms/view/web_hosting/16393,
+67,Inmotion Hosting Inc,USA,advanced1969.inmotionhosting.com www.inmotionhosting.com,13.002,6100 Center Drive Suite 1190 Los Angeles CA 90045 US, +1-888-321-4678,https://myip.ms/view/web_hosting/517387,
+68,Iweb Technologies Inc,Canada,www.iweb.com,12.735,20 Place Du Commerce Montreal QC H3E-1Z6 Canada, +1-514-286-4242,https://myip.ms/view/web_hosting/6607,
+69,Chunghwa Telecom Co. Ltd,Taiwan,www.cht.com.tw,12.657,No.21-3 Sec. 1 Xinyi Rd. Zhongzheng Dist. Taipei City 100 Taiwan, +886 2 2322 3495 +886 2 2344 3007,https://myip.ms/view/web_hosting/600,
+70,Cogini Hong Kong Limited,Hong Kong,1,12.452,800-1801 Mcgill College Room 1005 Allied Kajima Bldg138 Gloucester Road Wanchai QC 00000 HK, +1-855-684-5463,https://myip.ms/view/web_hosting/570618,
+71,Chinanet Jiangsu Province Network,China,www.jsinfo.net,12.382,260 Zhongyang Road Nanjing 210037, +86-25-86588231,https://myip.ms/view/web_hosting/51427,
+72,Cdmon,Spain,vxadj-20.srv.cat,11.801,C/ Girona 81 - 83 Local 6 08380 Malgrat De Mar Spain, +34 937653268,https://myip.ms/view/web_hosting/18003,
+73,Ru - Center Jsc,Russia,www.nic.ru,11.624,"Jsc ""Ru-Center"" 123308 Moscow Russian Federation 3 Khoroshevskaya 2-1", +7 495 737 0601,https://myip.ms/view/web_hosting/92451,
+74,Verotel International,Netherlands,,11.360,Verotel International Keizersgracht 213 Nl-1016 Dt Amsterdam Netherlands, +31 20 5315726 +31 20 5315757,https://myip.ms/view/web_hosting/345493,
+75,Corespace Inc,USA,www.fetisssh.com www.corespace.com,11.319,7505 John W. Carpenter Freeway Dallas TX 75247 US, +1-800-976-2673,https://myip.ms/view/web_hosting/10357,
+76,Hostinger International Ltd,Cyprus,www.main-hosting.com,11.096,Hostinger International Ltd 61 Lordou Vyronos Lumiel Building 4Th Floor 6023 Larnaca Cyprus, +37 064503378,https://myip.ms/view/web_hosting/388419,
+77,Fasthosts Internet Ltd,United Kingdom,www.fasthosts.co.uk,10.902,Discovery House 154 Southgate Street Gloucester GL1 2EX UK, +44 1452 561874,https://myip.ms/view/web_hosting/18956,
+78,Parklogic,USA,www.gigenet.com,10.834,asd, asd,https://myip.ms/view/web_hosting/580441,
+79,Linode LLC,USA,li349-246.members.linode.com,10.697,329 E. Jimmie Leeds Road Suite A Galloway Nj 08205 USA, +16093807100 +16093807504 +1-609-380-7100,https://myip.ms/view/web_hosting/517366,
+80,Spaceweb Cjsc,Russia,www.sweb.ru,10.644,Tsvetochnaya Ul. 18 Of.137 Saint-Petersburg 196084 Russia, +7 812 3341222,https://myip.ms/view/web_hosting/149021,
+81,Gandi SAS,France,www.gandi.net,10.626,63-65 Boulevard Massena 75013 Paris France, +33 1 70 39 37 55,https://myip.ms/view/web_hosting/11091,
+82,Hawk Host Inc,Canada,www.arandomserver.com www.hawkhost.com,10.595,710 Tower St South Po Box 50081 Fergus ON N1M 2R0 CA, +1-800-859-8803 +1-408-761-1354 +1.866.398.7638,https://myip.ms/view/web_hosting/118178,
+83,Digirock Inc,Japan,www.digi-rock.com,10.570,Bakuro-Machi 4-7-5 Honmachi Ts Building 6F Chuo-Ku Osaka-Shi Osaka Japan JP, +81-3-5297-2311,https://myip.ms/view/web_hosting/17471,
+84,Ovh Hispano,Spain,www.ovh.es,10.569,Calle Princesa 22 2 Dcha Madrid 28008 Spain, +34 902 106 113,https://myip.ms/view/web_hosting/46221,
+85,Enzu Inc,USA,150.135-80-192.rdns.scalabledns.com www.enzu.com,10.549,10120 S Eastern Ave Suite #248 Henderson NV 89052 US, +1-702-965-1615,https://myip.ms/view/web_hosting/572612,
+86,123 Server Inc,Japan,www.123server.jp,10.272,700-0024 Okayama Prefecture Okayama Kita-ku Motomachi Station No. 1 No. 4 Terminal Square 10F, +81-3-5297-2311,https://myip.ms/view/web_hosting/39737,
+87,Bodis LLC,USA,www.bodis.com,10.115,1133 Broadway Suite 706 New York NY 10010 US, +1 877-263-4744,https://myip.ms/view/web_hosting/90378,
+88,Chinanet Shanghai Province Network,China,www.online.sh.cn,9.985,No.31 Jingrong Street Beijing 100032, +86-21-63630562 +86-10-58501724,https://myip.ms/view/web_hosting/10332,
+89,Weebly Inc,USA,portal.editmysite.com www.weebly.com,9.972,460 Bryant St San Francisco CA 94107 US, +1-415-375-3266 +1-510-771-7036,https://myip.ms/view/web_hosting/371569,
+90,Kddi Web Communications Inc,Japan,www.kddi-webcommunications.co.jp,9.781,Sumitomo Fudousan Kojimachi Building No. 3 3-6 Kojimachi Chiyoda-ku Tokyo 102-0083 JP, +81-3-3238-5780,https://myip.ms/view/web_hosting/19217,
+91,Contabo Gmbh,Germany,www.contabo.de,9.401,Aschauer Str. 32A 81549 Muenchen Germany, +49 89 21268372,https://myip.ms/view/web_hosting/89812,
+92,Siteground Chicago,USA,www.siteground.com,9.394,Racho Petkov Kazandjiata 8 Floor 3 Siteground, +359886660270 +442071839093,https://myip.ms/view/web_hosting/384220,
+93,P.d.r Solutions Fzc,United Arab Emirates,md-in-6.webhostbox.net,9.292,P.d.r Solutions Fzc F-20 Business Center 1 Business Park Rak Free Trade Zone Ras Al Khaimah, +1 .2013775952,https://myip.ms/view/web_hosting/604868,
+94,Nazwa.pl Sp.z.o.o,Poland,ajp6.rev.netart.pl,9.191,Ul. Cystersów 20A 31-553 Krakow Poland, +48 122 978 810,https://myip.ms/view/web_hosting/545716,
+95,Webclusters For Customers,Denmark,,9.176,One.com Kalvebod Brygge 24 7 Dk-1560 Copenhagen V Denmark, ,https://myip.ms/view/web_hosting/567444,
+96,Transip B.V,Netherlands,www.transip.nl,9.130,Schipholweg 9B 2316 Xb Leiden Netherlands, +31 71 5241919,https://myip.ms/view/web_hosting/108620,
+97,Selectel Ltd,Russia,mail.eaxyz.com,9.015,Russia Saint-Petersburg Cvetochnaya St. 21, +78127188036 +78126778036,https://myip.ms/view/web_hosting/408471,
+98,Avguro Technologies Ltd,Russia,www.avguro.ru,9.005,Avguro Technologies Ltd 18 912 Yunnatov Str 127083 Moscow Russia, +74 952293031,https://myip.ms/view/web_hosting/93301,
+99,Trabia Network,Moldova,www.streamboard.tv,8.773,I.c.s. Trabia-Network S.r.l Moldova, +373 22 994-994,https://myip.ms/view/web_hosting/604588,
+100,Paperboy&co. Inc,Japan,www.paperboy.co.jp,8.669,, ,https://myip.ms/view/web_hosting/96112,

Binary file not shown.
src/DoresA/res/asns.json (new file, +1)
File diff suppressed because one or more lines are too long

src/DoresA/res/big.txt (new file, +128457)
File diff suppressed because it is too large

src/DoresA/res/howto_asns.txt (new file, +6)
@@ -0,0 +1,6 @@
+get first 100 hosting providers from here https://myip.ms/info/top_hosting/TOP_Best_World_Web_Hosting_Companies_Real_Time_Statistics.html
+
+- parse excel file
+- open detail page (column 7 or so)
+- extract asns from html
+- where needed (null value) open source code of detail page and manually extract asns
src/DoresA/scripts/get_hosting_asn.py (new file, +41)
@@ -0,0 +1,41 @@
+import requests
+import json
+from lxml import html
+
+# https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html
+# download excel file and export first sheet (report for 2017) as a proper csv file (need to remove some line breaks in cells)
+# beware of api limits ;)
+
+asns = {}
+with open('../res/20170222_report.csv', 'rt') as file:
+    for line in file:
+        row = line.split(',')
+        asns[row[0]] = {
+            'url': row[7],
+            'hoster': row[1],
+            'asns': []
+        }
+        url = row[7]
+        response = requests.get(url)
+        root = html.fromstring(response.content)
+
+        # xpath of asn span(s) (if more then some 10 asns, not all asns are displayed here)
+        # //*[@id="web_hosting857"]/table/tbody/tr/td[2]/div/span
+
+        # better xpath (all asns in url)
+        # //*[@id="web_hosting857"]
+        asd = root.xpath('//*[@id="web_hosting857"]')
+
+        if len(asd) == 0:
+            print('no asn for hoster: ' + row[1])
+            continue
+        all_asns_link = asd[0].attrib['similar']
+
+        all_asns = all_asns_link.replace('/browse/ip_owners/1/asn/', '').replace('/asn_A/1', '').split('%5E')
+        asns[row[0]]['asns'].extend(all_asns)
+
+        # debug
+        print(row[0])
+
+with open('../res/asns_new.json', 'w') as outfile:
+    json.dump(asns, outfile)
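Review note: the report CSV contains quoted fields with embedded commas and doubled quotes (see the Ru-Center row), which the script's line.split(',') mis-parses; csv.reader handles the quoting. A sketch of the safer loop (a suggested variant, not what the script does):

import csv

with open('../res/20170222_report.csv', 'rt') as file:
    for row in csv.reader(file):  # respects quoted fields like "Jsc ""Ru-Center"" ..."
        rank, hoster, detail_url = row[0], row[1], row[7]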
src/DoresA/spell.py (new file, +51)
@@ -0,0 +1,51 @@
+import re
+from collections import Counter
+
+
+# http://norvig.com/spell-correct.html
+
+def words(text): return re.findall(r'\w+', text.lower())
+
+
+WORDS = Counter(words(open('res/big.txt').read()))
+
+
+def P(word, N=sum(WORDS.values())):
+    """Probability of `word`."""
+    return WORDS[word] / N
+
+
+def correction(word):
+    """Most probable spelling correction for word."""
+    return max(candidates(word), key=P)
+
+
+def candidates(word):
+    """Generate possible spelling corrections for word."""
+    return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]
+
+
+def known(words):
+    """The subset of `words` that appear in the dictionary of WORDS."""
+    return set(w for w in words if w in WORDS)
+
+
+def edits1(word):
+    """All edits that are one edit away from `word`."""
+    letters = 'abcdefghijklmnopqrstuvwxyz'
+    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
+    deletes = [L + R[1:] for L, R in splits if R]
+    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
+    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
+    inserts = [L + c + R for L, R in splits for c in letters]
+    return set(deletes + transposes + replaces + inserts)
+
+
+def edits2(word):
+    """All edits that are two edits away from `word`."""
+    return (e2 for e1 in edits1(word) for e2 in edits1(e1))
+
+
+def check(word):
+    """Check if 'word' is an english word"""
+    return word in WORDS.keys()
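For orientation, the lexical features above call into this module roughly as follows (illustrative values; results depend on the word frequencies in res/big.txt):

import spell

spell.check('google')        # True only if 'google' occurs in res/big.txt
spell.correction('speling')  # 'spelling' with Norvig's original corpus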
@@ -17,18 +17,21 @@ import ip
 import ttl
 import csv_tools
 import progressbar
+import pickle
+import classify
+import config
+import traceback
 # import db_sql
 
 from sklearn.datasets import load_iris
 from sklearn import tree
 
 logger = logging.getLogger('train')
-logger.setLevel(logging.DEBUG)
+logger.setLevel(logging.INFO)
 
-db_format_time = '%Y-%m-%d %H:%M:%S'
-train_start = datetime.date(2017, 5, 1)
-train_end = datetime.date(2017, 5, 4)
+db_format_time = config.db_format_time
+train_start = config.train_start
+train_end = config.train_end
 
 id_upto = 379283817
 
@@ -38,17 +41,23 @@ record_types = ['A']
 
 # id_upto = db.mariadb_get_nearest_id(train_end.strftime(db_format_time))
 
-def train():
+def generate_features_and_classify():
     start = time.time()
+    logger.info('feature generation start: ' + str(start))
 
-    for day in range(csv_tools.analysis_days_amount):
-        log_files_hour = csv_tools.get_log_files_for_hours_of_day(csv_tools.analysis_days[day])
+    all_features = []
+    all_classifications = []
+    for day in range(config.analysis_days_amount):
+        # TODO dev
+        # log_files_hour = csv_tools.get_log_files_for_hours_of_day(config.analysis_days[day], gz=False)
+        log_files_hour = csv_tools.get_log_files_for_hours_of_day(config.analysis_days[day])
 
         progress_bar = progressbar.ProgressBar()
 
         for hour in progress_bar(range(24)):
             for hour_files in log_files_hour[hour]:
+                # TODO dev
+                # with open(hour_files, 'rt') as file:
                 with gzip.open(hour_files, 'rt', newline='') as file:
                     reader = csv.reader(file)
 
@@ -56,59 +65,75 @@ def train():
                         if row[2] in record_types:
                             entity = {'timestamp': row[0], 'domain': row[1], 'type': row[2],
                                       'record': row[3], 'ttl': row[4]}
 
                             try:
-                                prepare_features_redis(entity)
-                                # pass
-                            except Exception as e:
-                                logger.error(e)
+                                all_features.append(prepare_features_redis(entity))
+                                all_classifications.append(classify.is_malicious(entity['domain']))
+                            except Exception:
+                                logger.error(traceback.format_exc())
                                 logger.error('Exception occured processing entity: ' + str(entity))
+                            # break
+                        # break
+                # break
+        # break
+    # iris = load_iris()
+    # return iris.data, iris.target
 
+    logger.info('feature generation duration: ' + str(time.time() - start) + 's')
+    return np.array(all_features), np.array(all_classifications)
 
-def get_logs_from_db():
-    results = db.mariadb_get_logs(id_upto)
 
-    row = results.fetch_row(how=1)
-
-    logger.debug("# entity: " + row[0]['domain'])
-
-    features = prepare_features_redis(row[0])
-
-    logger.debug(str(features))
-    # while row:
-    #     logger.debug("# entity: " + row[0]['domain'])
-    #
-    #     features = prepare_features(row[0])
-    #
-    #     logger.debug(str(features))
-    #
-    #     row = results.fetch_row(how=1)
+def train():
+    start = time.time()
+    logger.info('training start: ' + str(start))
+
+    features, classification = generate_features_and_classify()
+
+    # TODO save serialized features and classification
+
+    decision_tree_model = tree.DecisionTreeClassifier()
+    decision_tree_model = decision_tree_model.fit(features, classification)  # training set, manual classification
+
+    # predict single or multiple sets with clf.predict([[]])
+
+    # visualize decision tree classifier
+    dot_data = tree.export_graphviz(decision_tree_model, out_file=None)
+    graph = graphviz.Source(dot_data)
+    graph.render('plot' + datetime.datetime.now().strftime(config.format_date))
+
+    # dump trained decision tree classifier to file
+    decision_tree_pkl_filename = 'dtc_' + datetime.datetime.now().strftime(config.format_date) + '.pkl'
+    decision_tree_model_pkl = open(decision_tree_pkl_filename, 'wb')
+    pickle.dump(decision_tree_model, decision_tree_model_pkl)
+    decision_tree_model_pkl.close()
 
 
 def prepare_features_redis(entity):
     checkpoint = time.time()
     domain_stats = db_redis.get_stats_for_domain(entity['domain'])
     ip_stats = db_redis.get_stats_for_ip(entity['record'])
-    logger.debug('redis took' + str(time.time() - checkpoint))
+    logger.debug('redis took ' + str(time.time() - checkpoint) + ' s')
 
     logger.debug(domain_stats)
 
-    if len(domain_stats) != 1:
+    if not domain_stats:
         logger.debug('no stats in redis for entity: ' + entity)
 
     domain_stats = domain_stats[0]
 
-    # TODO
-    ips = []
+    ips = db_redis.get_all_ips_for_domain(entity['domain'])
 
-    # feature 5: Number of distinct IP addresses
+    logger.debug('all ips seen for domain ' + str(ips))
+
+    # feature 5: Number of distinct IP addresses (0)
     distinct_ips = len(ips)
 
-    # feature 6: Number of distinct countries
-    distinct_countries = len([ip.get_country_by_ip(ip_str) for ip_str in ips])
+    # feature 6: Number of distinct countries (1)
+    distinct_countries = len(set([ip.get_country_by_ip(ip_str) for ip_str in ips]))
 
-    # feature 7: Number of (distinct) domains share the IP with
+    # feature 7: Number of (distinct) domains share the IP with (2)
     distinct_domains_with_same_ip = len(ip_stats)
 
@@ -117,24 +142,23 @@ def prepare_features_redis(entity):
     # 5 atomic feature
 
     # atomic 1: ratio of IP addresses that cannot be matched with a domain name (NX domains)
+    # TODO not possible?
     ratio_ips_nx = 0
 
     # atomic 2: ratio of ips that are used for DSL lines
+    # TODO maxmind?
    ratio_ips_dsl = 0
 
     # atomic 3: ratio of ips that belong to hosting services
-    ratio_ips_hoster = 0
+    ratio_ips_hoster = ip.ratio_ips_hoster(ips)
 
     # atomic 4: ratio of ips that belong to known ISPs
-    ratio_ips_isp = 0
+    ratio_ips_isp = ip.ratio_ips_isp(ips)
 
     # atomic 5: ips that can be matched with a valid domain name
+    # TODO not possible?
     ratio_ips_valid = 0
 
-    # TODO add atomics to 'all_features'
-
-    reverse_dns_result = 0
-
     # feature 9: Average TTL
 
     average_ttl = sum(domain_stats['ttls']) / len(domain_stats['ttls'])
@@ -153,8 +177,7 @@ def prepare_features_redis(entity):
 
     # feature 13: Percentage usage of specific TTL ranges
     # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
-    # TODO check if 5 individual features make a difference
-    specific_ttl_ranges = ttl.specific_range(entity['ttl'])
+    ttl_range_0, ttl_range_1, ttl_range_2, ttl_range_3, ttl_range_4 = ttl.specific_ranges(domain_stats['ttls'])
 
     # feature 14: % of numerical characters
 
@@ -166,14 +189,38 @@ def prepare_features_redis(entity):
 
     all_features = np.array([
         distinct_ips, distinct_countries,
-        distinct_domains_with_same_ip, reverse_dns_result, average_ttl, standard_deviation, distinct_ttl, ttl_changes,
-        specific_ttl_ranges, numerical_characters_percent, lms_percent
+        distinct_domains_with_same_ip, ratio_ips_nx, ratio_ips_dsl, ratio_ips_hoster,
+        ratio_ips_isp, ratio_ips_valid,
+        average_ttl, standard_deviation, distinct_ttl, ttl_changes,
+        ttl_range_0, ttl_range_1, ttl_range_2, ttl_range_3, ttl_range_4,
+        numerical_characters_percent, lms_percent
     ])
-    logger.debug(all_features)
-    exit()
+    # logger.debug(all_features)
 
     return all_features
 
 
+# TODO depreated
+def get_logs_from_db():
+    results = db_sql.mariadb_get_logs(id_upto)
+
+    row = results.fetch_row(how=1)
+
+    logger.debug("# entity: " + row[0]['domain'])
+
+    features = prepare_features_redis(row[0])
+
+    logger.debug(str(features))
+    # while row:
+    #     logger.debug("# entity: " + row[0]['domain'])
+    #
+    #     features = prepare_features(row[0])
+    #
+    #     logger.debug(str(features))
+    #
+    #     row = results.fetch_row(how=1)
+
+
+# TODO depreated
 def prepare_features_mysql(entity):
 
     checkpoint = time.time()
@@ -306,29 +353,40 @@ def test():
     start = time.time()
     logger.info('starting training ' + str(start))
 
+    # generate_features_and_classify()
     train()
 
     logger.info('total duration: ' + str(time.time() - start) + 's')
-    cleanup()
+    # cleanup()
 
     # db.mariadb_get_distinct_ttl('d2s45lswxaswrw.cloudfront.net', train_start.strftime(db_format_time), train_end.strftime(db_format_time))
 
 
 def flow():
     iris = load_iris()
-    clf = tree.DecisionTreeClassifier()
-    clf = clf.fit(iris.data, iris.target)  # training set, manual classification
+    decision_tree_model = tree.DecisionTreeClassifier()
+    decision_tree_model = decision_tree_model.fit(iris.data, iris.target)  # training set, manual classification
 
     # predict single or multiple sets with clf.predict([[]])
 
     # visualize decision tree classifier
-    dot_data = tree.export_graphviz(clf, out_file=None)
+    dot_data = tree.export_graphviz(decision_tree_model, out_file=None)
     graph = graphviz.Source(dot_data)
-    graph.render('test', view=True)
+    graph.render('plot', view=True)
+
+    # dump trained decision tree classifier to file
+    decision_tree_pkl_filename = 'dtc_' + datetime.datetime.now().strftime(config.format_date) + '.pkl'
+    decision_tree_model_pkl = open(decision_tree_pkl_filename, 'wb')
+    pickle.dump(decision_tree_model, decision_tree_model_pkl)
+    decision_tree_model_pkl.close()
+
+    # load serialized model
+    decision_tree_model_pkl = open(decision_tree_pkl_filename, 'rb')
+    decision_tree_model = pickle.load(decision_tree_model_pkl)
 
 
 def cleanup():
-    db.close()
+    db_sql.close()
 
 
 if __name__ == "__main__":
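Review note: the pickle round trips in train() and flow() use bare open()/close(), so the file handle leaks if pickle.dump raises. The same steps with context managers (an editorial suggestion, not the committed code):

import pickle

# Hypothetical tidier variant of the dump/load above:
with open(decision_tree_pkl_filename, 'wb') as f:
    pickle.dump(decision_tree_model, f)

with open(decision_tree_pkl_filename, 'rb') as f:
    decision_tree_model = pickle.load(f)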
@@ -21,24 +21,25 @@ def changes(array):
 
 
 # specific ranges: [0, 1], [1, 100], [100, 300], [300, 900], [900, inf]
-def specific_range(ttl):
-    specific_ttl_ranges = 4  # default is [900, inf]
-
-    try:
-        ttl = int(ttl)
-    except ValueError:
-        logger.error('ttl not a number')
-        return specific_ttl_ranges
-
-    if 0 < ttl <= 1:
-        specific_ttl_ranges = 0
-    elif 1 < ttl <= 100:
-        specific_ttl_ranges = 1
-    elif 100 < ttl <= 300:
-        specific_ttl_ranges = 2
-    elif 300 < ttl <= 900:
-        specific_ttl_ranges = 3
-    return specific_ttl_ranges
+def specific_ranges(ttls):
+    range_0 = False
+    range_1 = False
+    range_2 = False
+    range_3 = False
+    range_4 = False
+
+    for ttl in ttls:
+        if 0 < ttl <= 1:
+            range_0 = True
+        elif 1 < ttl <= 100:
+            range_1 = True
+        elif 100 < ttl <= 300:
+            range_2 = True
+        elif 300 < ttl <= 900:
+            range_3 = True
+        elif ttl > 900:
+            range_4 = True
+    return range_0, range_1, range_2, range_3, range_4
 
 
 if __name__ == "__main__":
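A quick worked example of the new boolean-tuple behavior (values chosen for illustration):

specific_ranges([1, 60, 86400])
# -> (True, True, False, False, True): the [0, 1], [1, 100] and [900, inf] ranges were hit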
@@ -1,14 +1,16 @@
 #!/bin/bash
 
-#cd /run/media/felix/AE7E01B77E01797B/pDNS;
-cd /home/felix/sources/MastersThesis/src/DoresA/data;
-month="04"
+cd /run/media/felix/AE7E01B77E01797B/pDNS;
+#cd /home/felix/sources/MastersThesis/src/DoresA/data;
+month="10"
+machine="sgsgpdc0n9x" # set empty for all (sgsgpdc0n9x|usmlvdc010x|demchdc902n)
 
+echo "$machine"
 for i in {01..31}; do
-    if compgen -G *"2017-$month-$i"* > /dev/null; then
+    if compgen -G *"$machine-2017-$month-$i"* > /dev/null; then
         echo -n -e "day $i \t size: ";
-        echo -n -e $(du -ch *"2017-$month-$i"* | tail -1) " \t #files: ";
-        ls *"2017-$month-$i"* | wc -l;
+        echo -n -e $(du -ch *"$machine-2017-$month-$i"* | tail -1) " \t #files: ";
+        ls *"$machine-2017-$month-$i"* | wc -l;
     fi
 done
 
todo.txt (new file, +24)
@@ -0,0 +1,24 @@
+overview diagram of the architecture
+enlarge the filter lists?
+
+send pub key, get a machine in the green network up and running
+fst@janus.cert.siemens.com
+
+size of the files unusual
+extension Gerlinde
+
+dependencies:
+** make sure the development packages of libxml2 and libxslt are installed **
+SystemError: Cannot compile 'Python.h'. Perhaps you need to install python-dev|python-devel
+(opt): ImportError: The 'enchant' C library was not found. Please install it via your OS package manager, or use a pre-built binary wheel from PyPI
+
+==> apt install libxml2-dev libxslt1-dev python-dev enchant
+#scipy, enchant
+==> libatlas-base-dev gfortran enchant
+
+path to logs: /mnt/old/2017
+
+nx_domains: IPs not in redis? -> probably (structure of nxdomain list [timestamp, record type, domain/ip?])
+
+dsl: https://www.maxmind.com/de/geoip2-connection-type-database
+