diff --git a/src/DoresA/Dockerfile b/src/DoresA/Dockerfile index 8adcf82..b5e3b3e 100644 --- a/src/DoresA/Dockerfile +++ b/src/DoresA/Dockerfile @@ -1,7 +1,11 @@ -FROM python:3 -ENV PYTHONUNBUFFERED 1 +FROM python:3.6-alpine + +# numpy scipy spellchecking etc +RUN apk add --no-cache mariadb-dev gcc gfortran musl-dev freetype-dev enchant + +#ENV PYTHONUNBUFFERED 1 RUN mkdir /app WORKDIR /app ADD requirements.txt /app/ RUN pip install -r requirements.txt -ADD . /app/ \ No newline at end of file +VOLUME /app diff --git a/src/DoresA/db.py b/src/DoresA/db.py index bb8349b..c5af6ac 100644 --- a/src/DoresA/db.py +++ b/src/DoresA/db.py @@ -10,6 +10,7 @@ mongodb_db_name = 'doresa' mongodb_collection_name = 'may' # tmp TODO remove sql_host = 'localhost' +sql_port = 3306 sql_db_name = 'doresa' sql_user_name = 'doresa' sql_pw = '3qfACEZzbXY4b' @@ -20,6 +21,9 @@ sql_table_name = 'pdns_logs_test' if 'MYSQL_HOST' in os.environ: sql_host = os.environ['MYSQL_HOST'] +if 'MYSQL_PORT' in os.environ: + sql_port = int(os.environ['MYSQL_PORT']) + if 'MYSQL_DATABASE' in os.environ: sql_db_name = os.environ['MYSQL_DATABASE'] @@ -35,7 +39,7 @@ mongo_db = mongo_client[mongodb_db_name] pdns_logs_mongo = mongo_db[mongodb_collection_name] -sql_connection = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name) +sql_connection = mariadb.connect(host=sql_host, user=sql_user_name, passwd=sql_pw, db=sql_db_name, port=sql_port) sql_cursor = sql_connection.cursor() @@ -61,7 +65,11 @@ def mariadb_insert_logs(csv_entries): values += [convert_timestamp_to_sql_datetime(float(csv_entry[0])), csv_entry[1], csv_entry[2], csv_entry[3], csv_entry[4]] - sql_cursor.execute(inserts_sql, values) + try: + sql_cursor.execute(inserts_sql, values) + except mariadb.OperationalError: + print('something happened') # TODO proper error handling + sql_connection.commit() @@ -110,7 +118,7 @@ def close(): mongo_client.close() -mariadb_create_table() +#mariadb_create_table() if __name__ == "__main__": exit() diff --git a/src/DoresA/docker-compose.yml b/src/DoresA/docker-compose.yml index db6f28c..c20943d 100644 --- a/src/DoresA/docker-compose.yml +++ b/src/DoresA/docker-compose.yml @@ -1,10 +1,9 @@ version: '3' services: - mariadb: - container_name: mariadb + doresa_mariadb: + container_name: doresa_mariadb image: mariadb - restart: unless-stopped ports: - "3306:3306" environment: @@ -13,19 +12,19 @@ services: MYSQL_USER: doresa MYSQL_PASSWORD: 3qfACEZzbXY4b - app: + doresa_app: build: . - container_name: app - restart: unless-stopped + container_name: doresa_app command: python3 serialize_logs_to_db.py volumes: - .:/app - - ./data:/data + - /media/backup/felix/uni/Mastersthesis/pDNS/2017.05:/data # path to zipped pDNS logs environment: - MYSQL_HOST: mariadb + MYSQL_HOST: doresa_mariadb + MYSQL_PORT: 3306 MYSQL_DATABASE: doresa MYSQL_USER: doresa MYSQL_PASSWORD: 3qfACEZzbXY4b - DATA_PATH: depends_on: - - mariadb + - doresa_mariadb + diff --git a/src/DoresA/requirements.txt b/src/DoresA/requirements.txt index 57cba1a..45138c6 100644 --- a/src/DoresA/requirements.txt +++ b/src/DoresA/requirements.txt @@ -2,7 +2,6 @@ maxminddb==1.3.0 maxminddb-geolite2==2017.803 mysqlclient==1.3.12 numpy==1.13.1 -pandas==0.20.3 progress==1.3 pyenchant==1.6.11 pymongo==3.5.1 diff --git a/src/DoresA/serialize_logs_to_db.py b/src/DoresA/serialize_logs_to_db.py index d23229a..cf18b94 100644 --- a/src/DoresA/serialize_logs_to_db.py +++ b/src/DoresA/serialize_logs_to_db.py @@ -3,16 +3,16 @@ import gzip import glob import time import datetime -import pandas import os from progress.bar import Bar import db +# TODO environment this analysis_start_date = datetime.date(2017, 5, 1) analysis_days_amount = 31 # pdns_logs_path = 'data/' -pdns_logs_path = '/run/media/felix/ext/2017.05/' # tmp TODO remove +pdns_logs_path = '/data/' # e.g. analysis_days = ['2017-04-07', '2017-04-08', '2017-04-09'] analysis_days = [(analysis_start_date + datetime.timedelta(days=x)).strftime('%Y-%m-%d') for x in @@ -27,7 +27,7 @@ def main(): # everything = {} # for log_file in ['data/pdns_capture.pcap-sgsgpdc0n9x-2017-04-07_00-00-02.csv.gz']: - + for day in range(analysis_days_amount): log_files_hour = get_log_files_for_hours_of_day(analysis_days[day]) # everything[day] = {} @@ -38,8 +38,7 @@ def main(): progress_bar.next() # everything[day][hour] = {} for hour_files in log_files_hour[hour]: - - # a bit faster + # a bit faster, 10-15% (but pandas overhead) # df = pandas.read_csv(log_file, compression='gzip', header=None) # print(df.iloc[0]) with gzip.open(hour_files, 'rt', newline='') as file: @@ -49,7 +48,7 @@ def main(): # batch mode (batches of 1000 entries) for log_entries in batch(all_rows, 1000): db.mariadb_insert_logs(log_entries) - db.mongodb_insert_logs(log_entries) + #db.mongodb_insert_logs(log_entries) # single mode # for log_entry in reader: