From 8d10f9bf9345b4a5c6daf482dcef17dc94aff15d Mon Sep 17 00:00:00 2001 From: Felix Steghofer Date: Fri, 1 Dec 2017 12:31:42 +0100 Subject: [PATCH] updated benchmarks --- src/benchmarks/benchmark.py | 84 ------------------ src/benchmarks/compare_days.sh | 13 ++- src/benchmarks/compare_plain-zipped.py | 40 +++++++++ src/benchmarks/load_day.py | 56 ++++++++++++ .../{benchmark.sh => plain-vs-zipped.sh} | 0 ...e.pcap-demchdc902n-2017-09-01_00-20-02.csv | 0 ...cap-demchdc902n-2017-09-01_00-20-02.csv.gz | Bin 7 files changed, 107 insertions(+), 86 deletions(-) delete mode 100755 src/benchmarks/benchmark.py create mode 100755 src/benchmarks/compare_plain-zipped.py create mode 100755 src/benchmarks/load_day.py rename src/benchmarks/{benchmark.sh => plain-vs-zipped.sh} (100%) rename src/benchmarks/{ => test-data}/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv (100%) rename src/benchmarks/{ => test-data}/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz (100%) diff --git a/src/benchmarks/benchmark.py b/src/benchmarks/benchmark.py deleted file mode 100755 index 6fcdde3..0000000 --- a/src/benchmarks/benchmark.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python3 -import csv -import gzip -import time -import glob - - -def compare_load_file(): - dur_p = load_file_plain() - dur_z = load_file_zipped() - - print('plain took: ' + str(dur_p) + ' s') - print('zipped took: ' + str(dur_z) + ' s') - print('(plain - zipped): ' + str(dur_p - dur_z) + ' s') - - -def load_file_plain(): - start_p = time.time() - with open('pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv', 'rt') as file_p: - for line in file_p: - row = line.split() - # print(row) - pass - return time.time() - start_p - - -def load_file_zipped(): - start_z = time.time() - with gzip.open('pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz', 'rt', newline='') as file_z: - reader = csv.reader(file_z) - for row in reader: - # print(row) - pass - return time.time() - start_z - - -def load_day_zipped(): - start_z = time.time() - - globbed = glob.glob('/home/felix/pdns/' + '*-2017-09-01*.csv.gz') - - for f in globbed: - with gzip.open(f, 'rt', newline='') as file: - reader = csv.reader(file) - - for row in reader: - pass - dur_z = time.time() - start_z - print('iterating day took: ' + str(dur_z) + ' s') - return dur_z - - -def benchmark_load_day(): - durs = [] - for i in range(10): - durs.append(load_day_zipped()) - print('all results: ' + str(durs)) - cleaned = ignore_outliers(durs) - print('cleaned results: ' + str(cleaned) - print('average: ' + str(mean(cleaned))) - - -def ignore_outliers(lst): - med = median(lst) - lst = [e for e in lst if e < med * 1.1] - return lst - -def median(lst): - sortedLst = sorted(lst) - lstLen = len(lst) - index = (lstLen - 1) // 2 - - if (lstLen % 2): - return sortedLst[index] - else: - return (sortedLst[index] + sortedLst[index + 1])/2.0 - - -def mean(lst): - return float(sum(lst)) / max(len(lst), 1) - - -if __name__ == '__main__': - benchmark_load_day() diff --git a/src/benchmarks/compare_days.sh b/src/benchmarks/compare_days.sh index 784dd78..71c05f3 100755 --- a/src/benchmarks/compare_days.sh +++ b/src/benchmarks/compare_days.sh @@ -1,5 +1,14 @@ #!/bin/bash -cd /run/media/felix/AE7E01B77E01797B/pDNS; -for i in {01..31}; do echo -n -e "day $i \t size: "; echo -n -e $(du -ch *"2017-10-$i"* | tail -1) " \t #files: "; ls *"2017-10-$i"* | wc -l; done +#cd /run/media/felix/AE7E01B77E01797B/pDNS; +cd /home/felix/sources/MastersThesis/src/DoresA/data; +month="04" + +for i in {01..31}; do + if compgen -G *"2017-$month-$i"* > /dev/null; then + echo -n -e "day $i \t size: "; + echo -n -e $(du -ch *"2017-$month-$i"* | tail -1) " \t #files: "; + ls *"2017-$month-$i"* | wc -l; + fi +done diff --git a/src/benchmarks/compare_plain-zipped.py b/src/benchmarks/compare_plain-zipped.py new file mode 100755 index 0000000..00af8f9 --- /dev/null +++ b/src/benchmarks/compare_plain-zipped.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +import gzip +import csv +import time + + +logs_dir = 'test-data/' + + +def compare_load_file(): + dur_p = load_file_plain() + dur_z = load_file_zipped() + + print('plain took: ' + str(dur_p) + ' s') + print('zipped took: ' + str(dur_z) + ' s') + print('(plain - zipped): ' + str(dur_p - dur_z) + ' s') + + +def load_file_plain(): + start_p = time.time() + with open(logs_dir + 'pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv', 'rt') as file_p: + for line in file_p: + row = line.split() + # print(row) + pass + return time.time() - start_p + + +def load_file_zipped(): + start_z = time.time() + with gzip.open(logs_dir + 'pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz', 'rt', newline='') as file_z: + reader = csv.reader(file_z) + for row in reader: + # print(row) + pass + return time.time() - start_z + + +if __name__ == '__main__': + compare_load_file() diff --git a/src/benchmarks/load_day.py b/src/benchmarks/load_day.py new file mode 100755 index 0000000..483ce86 --- /dev/null +++ b/src/benchmarks/load_day.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +import csv +import gzip +import time +import glob + + +def benchmark_load_day(): + durs = [] + for i in range(10): + durs.append(load_day_zipped()) + print('all results: ' + str(durs)) + cleaned = ignore_outliers(durs) + print('cleaned results: ' + str(cleaned)) + print('average: ' + str(mean(cleaned))) + + +def load_day_zipped(): + start_z = time.time() + + globbed = glob.glob('/home/felix/pdns/' + '*-2017-09-01*.csv.gz') + + for f in globbed: + with gzip.open(f, 'rt', newline='') as file: + reader = csv.reader(file) + + for row in reader: + pass + dur_z = time.time() - start_z + print('iterating day took: ' + str(dur_z) + ' s') + return dur_z + + +def ignore_outliers(lst): + med = median(lst) + lst = [e for e in lst if e < med * 1.1] + return lst + + +def median(lst): + sorted_lst = sorted(lst) + lst_len = len(lst) + index = (lst_len - 1) // 2 + + if lst_len % 2: + return sorted_lst[index] + else: + return (sorted_lst[index] + sorted_lst[index + 1])/2.0 + + +def mean(lst): + return float(sum(lst)) / max(len(lst), 1) + + +if __name__ == '__main__': + benchmark_load_day() diff --git a/src/benchmarks/benchmark.sh b/src/benchmarks/plain-vs-zipped.sh similarity index 100% rename from src/benchmarks/benchmark.sh rename to src/benchmarks/plain-vs-zipped.sh diff --git a/src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv b/src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv similarity index 100% rename from src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv rename to src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv diff --git a/src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz b/src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz similarity index 100% rename from src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz rename to src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz