Merge branch 'master' of github.com:felixsteghofer/masters_thesis

This commit is contained in:
2017-10-27 16:21:41 +02:00
5 changed files with 32 additions and 23 deletions

View File

@@ -1,7 +1,11 @@
FROM python:3 FROM python:3.6-alpine
ENV PYTHONUNBUFFERED 1
# numpy scipy spellchecking etc
RUN apk add --no-cache mariadb-dev gcc gfortran musl-dev freetype-dev enchant
#ENV PYTHONUNBUFFERED 1
RUN mkdir /app RUN mkdir /app
WORKDIR /app WORKDIR /app
ADD requirements.txt /app/ ADD requirements.txt /app/
RUN pip install -r requirements.txt RUN pip install -r requirements.txt
ADD . /app/ VOLUME /app

View File

@@ -10,6 +10,7 @@ mongodb_db_name = 'doresa'
mongodb_collection_name = 'may' # tmp TODO remove mongodb_collection_name = 'may' # tmp TODO remove
sql_host = 'localhost' sql_host = 'localhost'
sql_port = 3306
sql_db_name = 'doresa' sql_db_name = 'doresa'
sql_user_name = 'doresa' sql_user_name = 'doresa'
sql_pw = '3qfACEZzbXY4b' sql_pw = '3qfACEZzbXY4b'
@@ -20,6 +21,9 @@ sql_table_name = 'pdns_logs_test'
if 'MYSQL_HOST' in os.environ: if 'MYSQL_HOST' in os.environ:
sql_host = os.environ['MYSQL_HOST'] sql_host = os.environ['MYSQL_HOST']
if 'MYSQL_PORT' in os.environ:
sql_port = int(os.environ['MYSQL_PORT'])
if 'MYSQL_DATABASE' in os.environ: if 'MYSQL_DATABASE' in os.environ:
sql_db_name = os.environ['MYSQL_DATABASE'] sql_db_name = os.environ['MYSQL_DATABASE']
@@ -35,7 +39,7 @@ mongo_db = mongo_client[mongodb_db_name]
pdns_logs_mongo = mongo_db[mongodb_collection_name] pdns_logs_mongo = mongo_db[mongodb_collection_name]
sql_connection = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name) sql_connection = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port)
sql_cursor = sql_connection.cursor() sql_cursor = sql_connection.cursor()
@@ -61,7 +65,11 @@ def mariadb_insert_logs(csv_entries):
values += [convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1], values += [convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1],
csv_entry[2], csv_entry[3], csv_entry[4]] csv_entry[2], csv_entry[3], csv_entry[4]]
sql_cursor.execute(inserts_sql, values) try:
sql_cursor.execute(inserts_sql, values)
except mariadb.OperationalError:
print('something happened') # TODO proper error handling
sql_connection.commit() sql_connection.commit()
@@ -110,7 +118,7 @@ def close():
mongo_client.close() mongo_client.close()
mariadb_create_table() #mariadb_create_table()
if __name__ == "__main__": if __name__ == "__main__":
exit() exit()

View File

@@ -1,10 +1,9 @@
version: '3' version: '3'
services: services:
mariadb: doresa_mariadb:
container_name: mariadb container_name: doresa_mariadb
image: mariadb image: mariadb
restart: unless-stopped
ports: ports:
- "3306:3306" - "3306:3306"
environment: environment:
@@ -13,19 +12,19 @@ services:
MYSQL_USER: doresa MYSQL_USER: doresa
MYSQL_PASSWORD: 3qfACEZzbXY4b MYSQL_PASSWORD: 3qfACEZzbXY4b
app: doresa_app:
build: . build: .
container_name: app container_name: doresa_app
restart: unless-stopped
command: python3 serialize_logs_to_db.py command: python3 serialize_logs_to_db.py
volumes: volumes:
- .:/app - .:/app
- ./data:/data - /media/backup/felix/uni/Mastersthesis/pDNS/2017.05:/data # path to zipped pDNS logs
environment: environment:
MYSQL_HOST: mariadb MYSQL_HOST: doresa_mariadb
MYSQL_PORT: 3306
MYSQL_DATABASE: doresa MYSQL_DATABASE: doresa
MYSQL_USER: doresa MYSQL_USER: doresa
MYSQL_PASSWORD: 3qfACEZzbXY4b MYSQL_PASSWORD: 3qfACEZzbXY4b
DATA_PATH:
depends_on: depends_on:
- mariadb - doresa_mariadb

View File

@@ -2,7 +2,6 @@ maxminddb==1.3.0
maxminddb-geolite2==2017.803 maxminddb-geolite2==2017.803
mysqlclient==1.3.12 mysqlclient==1.3.12
numpy==1.13.1 numpy==1.13.1
pandas==0.20.3
progress==1.3 progress==1.3
pyenchant==1.6.11 pyenchant==1.6.11
pymongo==3.5.1 pymongo==3.5.1

View File

@@ -3,16 +3,16 @@ import gzip
import glob import glob
import time import time
import datetime import datetime
import pandas
import os import os
from progress.bar import Bar from progress.bar import Bar
import db import db
# TODO environment this
analysis_start_date = datetime.date(2017, 5, 1) analysis_start_date = datetime.date(2017, 5, 1)
analysis_days_amount = 31 analysis_days_amount = 31
# pdns_logs_path = 'data/' # pdns_logs_path = 'data/'
pdns_logs_path = '/run/media/felix/ext/2017.05/' # tmp TODO remove pdns_logs_path = '/data/'
# e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09'] # e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09']
analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in
@@ -27,7 +27,7 @@ def main():
# everything = {} # everything = {}
# for log_file in ['data/pdns_capture.pcap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']: # for log_file in ['data/pdns_capture.pcap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']:
for day in range(analysis_days_amount): for day in range(analysis_days_amount):
log_files_hour = get_log_files_for_hours_of_day(analysis_days[day]) log_files_hour = get_log_files_for_hours_of_day(analysis_days[day])
# everything[day] = {} # everything[day] = {}
@@ -38,8 +38,7 @@ def main():
progress_bar.next() progress_bar.next()
# everything[day][hour] = {} # everything[day][hour] = {}
for hour_files in log_files_hour[hour]: for hour_files in log_files_hour[hour]:
# a bit faster, 10-15% (but pandas overhead)
# a bit faster
# df = pandas.read_csv(log_file, compression='gzip', header=None) # df = pandas.read_csv(log_file, compression='gzip', header=None)
# print(df.iloc[0]) # print(df.iloc[0])
with gzip.open(hour_files, 'rt', newline='') as file: with gzip.open(hour_files, 'rt', newline='') as file:
@@ -49,7 +48,7 @@ def main():
# batch mode (batches of 1000 entries) # batch mode (batches of 1000 entries)
for log_entries in batch(all_rows, 1000): for log_entries in batch(all_rows, 1000):
db.mariadb_insert_logs(log_entries) db.mariadb_insert_logs(log_entries)
db.mongodb_insert_logs(log_entries) #db.mongodb_insert_logs(log_entries)
# single mode # single mode
# for log_entry in reader: # for log_entry in reader: