Files
master_thesis/src/DoresA/serialize_logs_to_db.py
2017-10-27 14:22:09 +02:00

113 lines
3.1 KiB
Python

import csv
import gzip
import glob
import time
import datetime
import os
from progress.bar import Bar
import db
# Analysis window configuration.
# TODO environment this
analysis_start_date = datetime.date(2017, 5, 1)
# Number of consecutive days (starting at analysis_start_date) to process.
analysis_days_amount = 31
# Directory containing the gzipped pDNS capture CSVs.
# pdns_logs_path = 'data/'
pdns_logs_path = '/data/'
# ISO-formatted date strings for every day in the analysis window.
# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
                 range(analysis_days_amount)]
def main():
    """Serialize every gzipped pDNS CSV log of the analysis window into the DB.

    For each day in ``analysis_days`` and each of its 24 hours, reads all
    matching ``*.csv.gz`` files and bulk-inserts their rows into MariaDB in
    batches of 1000. Prints the total wall-clock duration and closes the DB
    connection at the end.

    Fixes over the previous version: removed the unused local
    ``distinct_ttl_count`` and renamed the per-file loop variable from the
    misleading ``hour_files`` (it holds a single file path) to ``log_file``.
    """
    # check_duplicates() TODO readd
    start = time.time()
    for day in range(analysis_days_amount):
        log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
        progress_bar = Bar(analysis_days[day], max=24)
        for hour in range(24):
            progress_bar.next()
            for log_file in log_files_hour[hour]:
                # 'rt' + newline='' lets csv.reader handle line endings itself.
                with gzip.open(log_file, 'rt', newline='') as file:
                    reader = csv.reader(file)
                    all_rows = list(reader)
                # Batch inserts (1000 rows each) to limit per-statement overhead.
                for log_entries in batch(all_rows, 1000):
                    db.mariadb_insert_logs(log_entries)
                    # db.mongodb_insert_logs(log_entries)  # alternative backend
        progress_bar.finish()
    print('total duration: ' + str(time.time() - start) + 's')
    db.close()
def batch(iterable, n=1):
    """Yield successive slices of *iterable* containing at most *n* items each.

    The final slice may be shorter than *n*; an empty input yields nothing.
    """
    # Slicing past the end clamps automatically, so no explicit min() is needed.
    for offset in range(0, len(iterable), n):
        yield iterable[offset:offset + n]
def check_duplicates():
    """Sanity-check log file coverage.

    The number of log files summed over each individual analysis day must
    equal the number of log files matched across all days; otherwise some
    file is double-counted or dated outside the window.

    Raises:
        Exception: if the per-day total differs from the overall total.
    """
    per_day_total = sum(len(get_log_files_for_day(day)) for day in analysis_days)
    # An empty date string matches every log file regardless of day.
    if per_day_total != len(get_log_files_for_day('')):
        raise Exception('Log files inconsistency')
# TODO
def get_log_files_for_range_of_day(date, minutes_range):
    """Return {slot_index: [log file paths]} splitting *date* into equal slots.

    The day (1440 minutes) is divided into slots of *minutes_range* minutes;
    each slot maps to the gzipped log files whose filename timestamp starts
    at that slot's HH-MM.

    Fixes over the previous version: the glob pattern was built but never
    passed to glob.glob, the dict was never returned (the function always
    returned None), and the path hard-coded 'data/' instead of using
    pdns_logs_path like the sibling lookup helpers.
    """
    slot_files = {}
    slots_amount = int(1440 / minutes_range)
    for slot in range(slots_amount):
        total_mins = slot * minutes_range
        hours, minutes = divmod(total_mins, 60)
        time_range = '%02d-%02d' % (hours, minutes)
        slot_files[slot] = glob.glob(pdns_logs_path + '*' + date + '_' + time_range + '*.csv.gz')
    return slot_files
def get_log_files_for_hours_of_day(date):
    """Return {hour: [log file paths]} for each of the 24 hours of *date*.

    Matches gzipped CSVs whose filename contains '<date>_<HH>' under
    pdns_logs_path; hours with no files map to an empty list.
    """
    return {hour: glob.glob(pdns_logs_path + '*' + date + '_' + ('%02d' % hour) + '*.csv.gz')
            for hour in range(24)}
def get_log_files_for_day(date):
    """Return all gzipped log file paths whose name contains *date*.

    An empty *date* matches every log file. Fixed to read from the
    configured pdns_logs_path instead of the hard-coded 'data/': the rest
    of the file (get_log_files_for_hours_of_day) looks in pdns_logs_path
    = '/data/', so the old hard-coded path made check_duplicates compare
    counts from two different directories.
    """
    return glob.glob(pdns_logs_path + '*' + date + '*.csv.gz')
# Script entry point: run the serialization when executed directly.
if __name__ == "__main__":
    main()