"""Import gzipped pDNS capture CSV logs into the database, day by day."""

import csv
import datetime
import glob
import gzip
import os
import time

from progress.bar import Bar

import db

# TODO environment this (move configuration into environment variables)
analysis_start_date = datetime.date(2017, 5, 1)
analysis_days_amount = 31

# Root directory containing the gzipped pDNS capture CSV files.
# pdns_logs_path = 'data/'
pdns_logs_path = '/data/'

# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d')
                 for x in range(analysis_days_amount)]


def main():
    """Import every pDNS log file of the analysis window into the database.

    For each day of the window, walks the 24 hourly slots, reads each
    gzipped CSV capture file and inserts its rows into MariaDB in
    batches of 1000. Prints the total wall-clock duration at the end.
    """
    # check_duplicates() TODO readd
    start = time.time()
    for day in range(analysis_days_amount):
        log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
        progress_bar = Bar(analysis_days[day], max=24)
        for hour in range(24):
            progress_bar.next()
            for hour_file in log_files_hour[hour]:
                with gzip.open(hour_file, 'rt', newline='') as file:
                    reader = csv.reader(file)
                    all_rows = list(reader)
                    # Batched inserts (1000 rows per statement) are markedly
                    # faster than single-row inserts.
                    for log_entries in batch(all_rows, 1000):
                        db.mariadb_insert_logs(log_entries)
                        # db.mongodb_insert_logs(log_entries)
        progress_bar.finish()
    print('total duration: ' + str(time.time() - start) + 's')
    db.close()


def batch(iterable, n=1):
    """Yield successive chunks of at most *n* items from a sequence.

    The final chunk may be shorter than *n*. *iterable* must support
    len() and slicing (e.g. a list).
    """
    length = len(iterable)
    for ndx in range(0, length, n):
        # Slicing already clamps at the sequence end, so no min() is needed.
        yield iterable[ndx:ndx + n]


def check_duplicates():
    """Raise if the per-day file counts do not add up to the total file count.

    A mismatch indicates overlapping or missing log files in
    ``pdns_logs_path``.
    """
    days_cumulated = 0
    for day in analysis_days:
        days_cumulated += len(get_log_files_for_day(day))
    # An empty date matches every log file in the directory.
    all_logs = len(get_log_files_for_day(''))
    if days_cumulated != all_logs:
        raise Exception('Log files inconsistency')


# TODO
def get_log_files_for_range_of_day(date, minutes_range):
    """Return {slot_index: [log files]} for *date*, one slot per *minutes_range*.

    Slot 0 starts at 00:00; e.g. minutes_range=30 yields 48 slots. File
    names are expected to embed the slot start time as 'HH-MM'.
    """
    slot_files = {}
    slots_amount = 1440 // minutes_range  # minutes per day / slot length
    for slot in range(slots_amount):
        total_mins = slot * minutes_range
        hours, minutes = divmod(total_mins, 60)
        time_range = '%02d-%02d' % (hours, minutes)
        # Glob from the configured log directory, consistent with
        # get_log_files_for_hours_of_day (was a bare 'data/*' pattern string).
        slot_files[slot] = glob.glob(pdns_logs_path + '*' + date + '_' + time_range + '*.csv.gz')
    # Fixed: the original built the dict but fell off the end, returning None.
    return slot_files


def get_log_files_for_hours_of_day(date):
    """Return {hour (0-23): [log files]} for the given 'YYYY-MM-DD' *date*."""
    slot_files = {}
    slots_amount = 24
    for slot in range(slots_amount):
        slot_files[slot] = glob.glob(pdns_logs_path + '*' + date + '_' + ('%02d' % slot) + '*.csv.gz')
    return slot_files


def get_log_files_for_day(date):
    """Return all log files whose name contains *date* ('' matches all files).

    Uses the configured ``pdns_logs_path`` (was hard-coded to 'data/',
    which disagreed with the active '/data/' setting and would make
    check_duplicates() compare against the wrong directory).
    """
    log_files = pdns_logs_path + '*' + date + '*.csv.gz'
    return glob.glob(log_files)


if __name__ == "__main__":
    main()