iterate logs from db
This commit is contained in:
@@ -12,7 +12,9 @@ analysis_start_date = datetime.date(2017, 4, 7)
|
||||
analysis_days_amount = 3
|
||||
|
||||
# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
|
||||
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in range(analysis_days_amount)]
|
||||
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
|
||||
range(analysis_days_amount)]
|
||||
|
||||
|
||||
# mongodb
|
||||
|
||||
@@ -38,6 +40,10 @@ def main():
|
||||
progress_bar.next()
|
||||
# everything[day][hour] = {}
|
||||
for hour_files in log_files_hour[hour]:
|
||||
|
||||
# a bit faster
|
||||
# df = pandas.read_csv(log_file, compression='gzip', header=None)
|
||||
# print(df.iloc[0])
|
||||
with gzip.open(hour_files, 'rt', newline='') as file:
|
||||
reader = csv.reader(file)
|
||||
all_rows = list(reader)
|
||||
@@ -45,37 +51,23 @@ def main():
|
||||
# batch mode (batches of 1000 entries)
|
||||
for log_entries in batch(all_rows, 1000):
|
||||
db.mariadb_insert_logs(log_entries)
|
||||
db.mongodb_insert_logs(log_entries)
|
||||
# db.mongodb_insert_logs(log_entries)
|
||||
|
||||
# single mode
|
||||
# for log_entry in reader:
|
||||
# db.mariadb_insert_log(log_entry)
|
||||
# # db.mongodb_insert_log(log_entry)
|
||||
# single mode
|
||||
# for log_entry in reader:
|
||||
# db.mariadb_insert_log(log_entry)
|
||||
# # db.mongodb_insert_log(log_entry)
|
||||
|
||||
progress_bar.finish()
|
||||
|
||||
# log_entry[4] == TTL
|
||||
# if log_entry[4] in distinct_ttl_count:
|
||||
# distinct_ttl_count[log_entry[4]] += 1
|
||||
# else:
|
||||
# distinct_ttl_count[log_entry[4]] = 1
|
||||
#
|
||||
# everything[day][hour]['ttl'] = distinct_ttl_count
|
||||
|
||||
# a bit faster
|
||||
# df = pandas.read_csv(log_file, compression='gzip', header=None)
|
||||
# print(df.iloc[0])
|
||||
|
||||
# print('distinct TTLs: ' + str(len(everything[0][0]['ttl'].keys())))
|
||||
|
||||
print('total duration: ' + str(time.time() - start) + 's')
|
||||
db.close()
|
||||
|
||||
|
||||
def batch(iterable, n=1):
|
||||
l = len(iterable)
|
||||
for ndx in range(0, l, n):
|
||||
yield iterable[ndx:min(ndx + n, l)]
|
||||
length = len(iterable)
|
||||
for ndx in range(0, length, n):
|
||||
yield iterable[ndx:min(ndx + n, length)]
|
||||
|
||||
|
||||
def check_duplicates():
|
||||
|
||||
Reference in New Issue
Block a user