From 8d10f9bf9345b4a5c6daf482dcef17dc94aff15d Mon Sep 17 00:00:00 2001
From: Felix Steghofer <felix.steghofer@gmail.com>
Date: Fri, 1 Dec 2017 12:31:42 +0100
Subject: [PATCH] Split benchmark.py into per-task scripts and move test data

---
 src/benchmarks/benchmark.py                   |  84 ------------------
 src/benchmarks/compare_days.sh                |  13 ++-
 src/benchmarks/compare_plain-zipped.py        |  40 +++++++++
 src/benchmarks/load_day.py                    |  56 ++++++++++++
 .../{benchmark.sh => plain-vs-zipped.sh}      |   0
 ...e.pcap-demchdc902n-2017-09-01_00-20-02.csv |   0
 ...cap-demchdc902n-2017-09-01_00-20-02.csv.gz | Bin
 7 files changed, 107 insertions(+), 86 deletions(-)
 delete mode 100755 src/benchmarks/benchmark.py
 create mode 100755 src/benchmarks/compare_plain-zipped.py
 create mode 100755 src/benchmarks/load_day.py
 rename src/benchmarks/{benchmark.sh => plain-vs-zipped.sh} (100%)
 rename src/benchmarks/{ => test-data}/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv (100%)
 rename src/benchmarks/{ => test-data}/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz (100%)

diff --git a/src/benchmarks/benchmark.py b/src/benchmarks/benchmark.py
deleted file mode 100755
index 6fcdde3..0000000
--- a/src/benchmarks/benchmark.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python3
-import csv
-import gzip
-import time
-import glob
-
-
-def compare_load_file():
-    dur_p = load_file_plain()
-    dur_z = load_file_zipped()
-    
-    print('plain took: ' + str(dur_p) + ' s')
-    print('zipped took: ' + str(dur_z) + ' s')
-    print('(plain - zipped): ' + str(dur_p - dur_z) + ' s')
-
-
-def load_file_plain():
-    start_p = time.time()
-    with open('pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv', 'rt') as file_p:
-        for line in file_p:
-            row = line.split()
-            # print(row)
-            pass
-    return time.time() - start_p
-
-
-def load_file_zipped():
-    start_z = time.time()
-    with gzip.open('pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz', 'rt', newline='') as file_z:
-        reader = csv.reader(file_z)
-        for row in reader:
-            # print(row)
-            pass
-    return time.time() - start_z
-
-
-def load_day_zipped():
-    start_z = time.time()
-
-    globbed = glob.glob('/home/felix/pdns/' + '*-2017-09-01*.csv.gz')
-
-    for f in globbed:
-        with gzip.open(f, 'rt', newline='') as file:
-            reader = csv.reader(file)
-
-            for row in reader:
-                pass
-    dur_z = time.time() - start_z
-    print('iterating day took: ' + str(dur_z) + ' s')
-    return dur_z
-
-
-def benchmark_load_day():
-    durs = []
-    for i in range(10):
-        durs.append(load_day_zipped())
-    print('all results: ' + str(durs))
-    cleaned = ignore_outliers(durs)
-    print('cleaned results: ' + str(cleaned)
-    print('average: ' + str(mean(cleaned)))
-
-
-def ignore_outliers(lst):
-    med = median(lst)
-    lst = [e for e in lst if e < med * 1.1]
-    return lst
-
-def median(lst):
-    sortedLst = sorted(lst)
-    lstLen = len(lst)
-    index = (lstLen - 1) // 2
-
-    if (lstLen % 2):
-        return sortedLst[index]
-    else:
-        return (sortedLst[index] + sortedLst[index + 1])/2.0
-
-
-def mean(lst):
-    return float(sum(lst)) / max(len(lst), 1)
-
-
-if __name__ == '__main__':
-    benchmark_load_day()
diff --git a/src/benchmarks/compare_days.sh b/src/benchmarks/compare_days.sh
index 784dd78..71c05f3 100755
--- a/src/benchmarks/compare_days.sh
+++ b/src/benchmarks/compare_days.sh
@@ -1,5 +1,14 @@
 #!/bin/bash
 
-cd /run/media/felix/AE7E01B77E01797B/pDNS;
-for i in {01..31}; do echo -n -e "day $i \t size: "; echo -n -e $(du -ch *"2017-10-$i"* | tail -1) " \t #files: "; ls *"2017-10-$i"* | wc -l; done
+#cd /run/media/felix/AE7E01B77E01797B/pDNS;
+cd /home/felix/sources/MastersThesis/src/DoresA/data || exit 1
+month="04"
+# For every day of $month that has capture files, print their total size and count.
+for i in {01..31}; do
+    # The pattern is quoted so that compgen itself, not the shell, expands it.
+    if compgen -G "*2017-$month-$i*" > /dev/null; then
+        echo -n -e "day $i \t size: $(du -ch -- *"2017-$month-$i"* | tail -1) \t #files: "
+        ls -- *"2017-$month-$i"* | wc -l
+    fi
+done
 
diff --git a/src/benchmarks/compare_plain-zipped.py b/src/benchmarks/compare_plain-zipped.py
new file mode 100755
index 0000000..00af8f9
--- /dev/null
+++ b/src/benchmarks/compare_plain-zipped.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Compare the time to parse one pDNS capture as plain versus gzipped CSV."""
+import csv
+import gzip
+import time
+
+logs_dir = 'test-data/'
+capture = 'pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv'
+
+
+def compare_load_file():
+    """Time both variants once; print each duration and their difference."""
+    dur_p = load_file_plain()
+    dur_z = load_file_zipped()
+    print('plain took: ' + str(dur_p) + ' s')
+    print('zipped took: ' + str(dur_z) + ' s')
+    print('(plain - zipped): ' + str(dur_p - dur_z) + ' s')
+
+
+def load_file_plain():
+    """Return the seconds needed to parse all rows of the uncompressed capture."""
+    return _time_csv_rows(open, logs_dir + capture)
+
+
+def load_file_zipped():
+    """Return the seconds needed to parse all rows of the gzipped capture."""
+    return _time_csv_rows(gzip.open, logs_dir + capture + '.gz')
+
+
+def _time_csv_rows(opener, path):
+    """Return seconds to csv-parse path; one parser keeps both timings comparable."""
+    start = time.perf_counter()
+    with opener(path, 'rt', newline='') as file:
+        for _ in csv.reader(file):
+            pass
+    return time.perf_counter() - start
+
+
+if __name__ == '__main__':
+    compare_load_file()
diff --git a/src/benchmarks/load_day.py b/src/benchmarks/load_day.py
new file mode 100755
index 0000000..483ce86
--- /dev/null
+++ b/src/benchmarks/load_day.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Benchmark iterating all gzipped pDNS CSV captures of a single day."""
+import csv
+import glob
+import gzip
+import time
+
+
+def benchmark_load_day(runs=10):
+    """Run load_day_zipped() `runs` times; print raw, cleaned and mean durations."""
+    durs = [load_day_zipped() for _ in range(runs)]
+    print('all results: ' + str(durs))
+    cleaned = ignore_outliers(durs)
+    print('cleaned results: ' + str(cleaned))
+    print('average: ' + str(mean(cleaned)))
+
+
+def load_day_zipped(logs_dir='/home/felix/pdns/', pattern='*-2017-09-01*.csv.gz'):
+    """Iterate every CSV row of the matching .gz files; return elapsed seconds."""
+    # perf_counter() is monotonic, so wall-clock adjustments cannot skew a run.
+    start_z = time.perf_counter()
+    for f in glob.glob(logs_dir + pattern):
+        with gzip.open(f, 'rt', newline='') as file:
+            # Rows are parsed and discarded; only the read cost is measured.
+            for row in csv.reader(file):
+                pass
+    dur_z = time.perf_counter() - start_z
+    print('iterating day took: ' + str(dur_z) + ' s')
+    return dur_z
+
+
+def ignore_outliers(lst):
+    """Keep values below 1.1 * median (drops slow runs); [] for empty input."""
+    if not lst:
+        return []
+    med = median(lst)
+    return [e for e in lst if e < med * 1.1]
+
+
+def median(lst):
+    """Return the median of non-empty lst (mean of both middle values if even)."""
+    sorted_lst = sorted(lst)
+    lst_len = len(lst)
+    index = (lst_len - 1) // 2
+    if lst_len % 2:
+        return sorted_lst[index]
+    return (sorted_lst[index] + sorted_lst[index + 1]) / 2.0
+
+
+def mean(lst):
+    """Return the arithmetic mean of lst, or 0.0 if lst is empty."""
+    return float(sum(lst)) / max(len(lst), 1)
+
+
+if __name__ == '__main__':
+    benchmark_load_day()
diff --git a/src/benchmarks/benchmark.sh b/src/benchmarks/plain-vs-zipped.sh
similarity index 100%
rename from src/benchmarks/benchmark.sh
rename to src/benchmarks/plain-vs-zipped.sh
diff --git a/src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv b/src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv
similarity index 100%
rename from src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv
rename to src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv
diff --git a/src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz b/src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz
similarity index 100%
rename from src/benchmarks/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz
rename to src/benchmarks/test-data/pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz