howpublished={\url{http://www.team-cymru.org/IP-ASN-mapping.html}}
}

@inproceedings{porras2009foray,
  title={A Foray into Conficker's Logic and Rendezvous Points},
  author={Porras, Phillip A and Sa{\"\i}di, Hassen and Yegneswaran, Vinod},
  booktitle={LEET},
  year={2009}
}

@misc{TrendMicroOnline,
  author={{Trend Micro}},
  title={{A Look at Locky Ransomware's Recent Spam Activities}},
  month=jan,
  year={2018},
  howpublished={\url{https://blog.trendmicro.com/trendlabs-security-intelligence/look-locky-ransomwares-recent-spam-activities/}}
}

@misc{MariaDBOnline,
  author={{The MariaDB Foundation}},
  title={{MariaDB}},
  month=jan,
  year={2018},
  howpublished={\url{https://mariadb.org/}}
}

@misc{MongoDBOnline,
  author={{MongoDB, Inc.}},
  title={{MongoDB}},
  month=jan,
  year={2018},
  howpublished={\url{https://www.mongodb.com}}
}

@misc{RedisOnline,
  author={{Redis Labs}},
  title={{Redis}},
  month=jan,
  year={2018},
  howpublished={\url{https://redis.io/}}
}

@misc{SciKitOnline,
  author={{scikit-learn}},
  title={{scikit-learn - Decision Trees}},
  month=jan,
  year={2018},
  howpublished={\url{http://scikit-learn.org/stable/modules/tree.html#tree-algorithms-id3-c4-5-c5-0-and-cart}}
}

@misc{SciKitProbOnline,
  author={{scikit-learn}},
  title={{scikit-learn - Classification}},
  month=jan,
  year={2018},
  howpublished={\url{http://scikit-learn.org/stable/modules/tree.html#classification}}
}

@misc{DENICOnline,
  author={{DENIC e.G.}},
  title={{DENIC}},
  month=jan,
  year={2018},
  howpublished={\url{https://www.denic.de/}}
}

@misc{IANADNSClassesOnline,
  author={{IANA}},
  title={{Domain Name System (DNS) Parameters}},
  month=jan,
  year={2018},
  howpublished={\url{https://www.iana.org/assignments/dns-parameters/dns-parameters.xhtml}}
}

@inproceedings{Stone-Gross:2009:YBM:1653662.1653738,
  author={Stone-Gross, Brett and Cova, Marco and Cavallaro, Lorenzo and Gilbert, Bob and Szydlowski, Martin and Kemmerer, Richard and Kruegel, Christopher and Vigna, Giovanni},
  title={Your Botnet is My Botnet: Analysis of a Botnet Takeover},
  booktitle={Proceedings of the 16th ACM Conference on Computer and Communications Security},
  series={CCS '09},
  year={2009},
  publisher={ACM},
  address={New York, NY, USA}
}

@article{Lim2000,
  author={Lim, Tjen-Sien and Loh, Wei-Yin and Shih, Yu-Shan},
  title={A Comparison of Prediction Accuracy, Complexity, and Training Time of Thirty-Three Old and New Classification Algorithms},
  journal={Machine Learning},
  year={2000},
  month=sep,
  day={01},
  volume={40},
  number={3},
  pages={203--228},
  abstract={Twenty-two decision tree, nine statistical, and two neural network algorithms are compared on thirty-two datasets in terms of classification accuracy, training time, and (in the case of trees) number of leaves. Classification accuracy is measured by mean error rate and mean rank of error rate. Both criteria place a statistical, spline-based, algorithm called POLYCLASS at the top, although it is not statistically significantly different from twenty other algorithms. Another statistical algorithm, logistic regression, is second with respect to the two accuracy criteria. The most accurate decision tree algorithm is QUEST with linear splits, which ranks fourth and fifth, respectively. Although spline-based statistical algorithms tend to have good accuracy, they also require relatively long training times. POLYCLASS, for example, is third last in terms of median training time. It often requires hours of training compared to seconds for other algorithms. The QUEST and logistic regression algorithms are substantially faster. Among decision tree algorithms with univariate splits, C4.5, IND-CART, and QUEST have the best combinations of error rate and speed. But C4.5 tends to produce trees with twice as many leaves as those from IND-CART and QUEST.},
  issn={1573-0565},
  doi={10.1023/A:1007608224229},
  url={https://doi.org/10.1023/A:1007608224229}
}

@book{kernighan2006c,
  title={The C Programming Language},
  author={Kernighan, Brian W and Ritchie, Dennis M},
  year={2006}
}

@inproceedings{Jung:2004:ESS:1028788.1028838,
  author={Jung, Jaeyeon and Sit, Emil},
  title={An Empirical Study of Spam Traffic and the Use of DNS Black Lists},
  booktitle={Proceedings of the 4th ACM SIGCOMM Conference on Internet Measurement},
  series={IMC '04},
  year={2004},
  isbn={1-58113-821-0},
  location={Taormina, Sicily, Italy},
  pages={370--375},
  numpages={6},
  url={http://doi.acm.org/10.1145/1028788.1028838},
  doi={10.1145/1028788.1028838},
  acmid={1028838},
  publisher={ACM},
  address={New York, NY, USA},
  keywords={DNS black lists, Zipf-like distribution, spam traffic}
}

@inproceedings{ramachandran2006can,
  title={Can DNS-based blacklists keep up with bots?},
  author={Ramachandran, Anirudh and Dagon, David and Feamster, Nick},
  booktitle={CEAS},
  year={2006},
  organization={Citeseer}
}

@inproceedings{nazario2008net,
  title={As the net churns: Fast-flux botnet observations},
  author={Nazario, Jose and Holz, Thorsten},
  booktitle={2008 3rd International Conference on Malicious and Unwanted Software (MALWARE)},
  pages={24--31},
  year={2008},
  organization={IEEE}
}

@inproceedings{Deri:2012:TPD:2245276.2245396,
  author={Deri, Luca and Trombacchi, Lorenzo Luconi and Martinelli, Maurizio and Vannozzi, Daniele},
  title={Towards a Passive DNS Monitoring System},
  booktitle={Proceedings of the 27th Annual ACM Symposium on Applied Computing},
  series={SAC '12},
  year={2012},
  isbn={978-1-4503-0857-1},
  location={Trento, Italy},
  pages={629--630},
  numpages={2},
  url={http://doi.acm.org/10.1145/2245276.2245396},
  doi={10.1145/2245276.2245396},
  acmid={2245396},
  publisher={ACM},
  address={New York, NY, USA},
  keywords={domain name system, traffic measurement}
}

\chapter{Abuse of Domain Names}
\label{cha:abuse_of_domain_names}

The \gls{dns} makes it easy to browse the internet with human-readable domain names. It adds an extra layer to the TCP/IP model that allows administrators to reliably maintain services, especially for large applications which are served by many servers in different locations. Techniques like round robin \gls{dns} enable the efficient use of multiple machines, decrease access times for different users and enhance availability if single nodes in the machine cluster fail. While these techniques provide the described advantages, they can also be exploited by malicious applications. In this work, three major types of domain name misuse are taken into account.


\section{Malware}

This case shows an example of how domains can be used by attackers to control their software. More commonly, domains are used to connect to command and control servers or to communicate with other infected machines (see Section~\ref{sec:botnets}). To infect a machine, attackers often use so called \textit{droppers} or \textit{injectors}: small programs that do not ship the malicious code themselves but download further source code or binaries containing the harmful functionality. It is much easier for malware authors to use domains for this purpose instead of hard coding IP addresses, for several reasons: if machines that serve the downloadable content are confiscated by the police or taken down for other reasons, domains can simply be pointed to a redundant server, thus minimizing slowdowns in the distribution of the malware. Reliable endpoints are also used to maintain the malicious software and load additional code. As domains are comparably cheap (starting at a few cents per year, compared to at least \$10 per year for a dedicated IPv4 address), attackers can build a pool of many domains and thus compensate for take downs of some domain names. This could possibly change when IPv6 is widely adopted (with IPv6 addresses being much cheaper), but according to statistics of Google, only about 20\% of worldwide users accessing Google were IPv6 enabled (natively or using IPv6 to IPv4 bridges) \fsCite{googlecom_ipv6adoption}. This low adoption rate rules out IPv6 as the primary protocol for malware for obvious reasons.

\todo{Add here: techniques like fast-flux networks, domain-flux networks and domain generation algorithms (DGAs). Examples of malware that make use of such DGAs are Kraken/Bobax, the Srizbi bots and the Conficker worm.}

\subsection{Countermeasures}
\label{subsec:countermeasures}

\todo{see Kopis section 2 end, DNS blacklisting etc.}


\section{Phishing}
\label{sec:phishing}

A botnet is a network of machines, mostly computers, infected with malicious software and controlled as a group, without the owners' knowledge, by a human operator called the bot master or bot herder. Each infected machine is called a bot; and similar to how robots act independently when commanded by human operators, every node in the botnet performs actions as instructed by the botmaster. Botnets are mostly used for sending spam emails and running \gls{ddos} attacks.

To understand how botnets can be detected, mainly by considering how botnets make use of domain names, some basic concepts have to be introduced:

\subsection{Distribution}
\label{subsec:distribution}

\subsection{Fast-Flux service networks}
\label{subsec:fast-flux_service_networks}


\subsection{Architecture}
\label{subsec:architecture}


\subsection{Discovery}
\label{subsec:botnets_discovery}

A Fast-Flux service network is a technique, often used to serve illegal web pages or in botnets (\fsCite{nazario2008net}), to hide the actual location of core components like command and control servers (C\&C servers are used as an example here). Using DNS round robin, which e.g. helps legitimate services to reduce downtimes when a single node fails, command and control servers are hidden behind groups of bots which act as proxies and are accessible through a domain name. As botnets usually contain a large number of bots, these proxies can quickly be changed and leave no trace back to the actual C\&C server. To be able to quickly change the hiding proxy, the time to live of the domain names pointing to those proxies has to be set to a low value. This is one characteristic that can be used to distinguish legitimate from malicious services. Domain-Flux networks are the successor of Fast-Flux service networks and additionally use changing domain names for the proxies that hide the location of core components in the botnet. For this method to work properly, all bots in the botnet have to know under which domain the C\&C server is reachable. To be able to communicate with the C\&C server, a bot first generates the currently valid domain name using a domain generation algorithm (DGA), e.g. seeded with the current time, and is then able to send data through the proxy to the command and control server. Some examples of malware that use DGAs are the Srizbi botnet, the Conficker worm and the GameOver Zeus botnet. One major difference of algorithmically generated domains in contrast to legitimate domains is that they usually contain more numbers and fewer or no human-readable words.
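
To make the idea of a DGA more concrete, the following minimal sketch derives candidate domain names from the current date. It is purely illustrative and deliberately simplistic; it does not reproduce the algorithm of any of the malware families mentioned above.

\begin{lstlisting}[language={python}, caption={Illustrative sketch of a time-seeded DGA}, label={lst:dga_sketch}]
import hashlib
from datetime import date

def candidate_domains(day, count=10):
    # Both the bots and the botmaster can derive the same list
    # independently, since it only depends on the current date.
    domains = []
    for i in range(count):
        seed = (day.isoformat() + '-' + str(i)).encode()
        digest = hashlib.md5(seed).hexdigest()
        domains.append(digest[:12] + '.com')
    return domains

# The bot tries each candidate until one resolves to a C&C proxy.
print(candidate_domains(date(2017, 9, 1))[:3])
\end{lstlisting}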

\chapter{Conclusion}
\label{cha:conclusion}


\section{Limitations}
\label{sec:limitations}

\chapter{Development of DoresA}
\label{cha:development_of_doresa}

\todo{Remember: the system is operated in a mostly safe environment (little malware should be in the field).}

The last part of this work is the development of a dynamic domain reputation system. A lot of concepts for this system will be adopted from the previously evaluated systems: most concepts are taken from \textit{Exposure}, together with some general ideas of \textit{Notos} and \textit{Kopis}. There will also be some additional concepts investigated that have not yet been proposed by those systems. In general, there are some limitations to be taken into account which arise mostly from the specific type of data that is available for this work and where it has been monitored. The passive DNS logs that have been provided for this work have been collected on three recursive DNS servers of a large company in locations in Europe, Asia and the United States. As those logs contain sensitive data, the raw logs used in this work cannot be published, mostly for privacy reasons. It also has to be noted that the DNS requests are not available for this work for the same reason. The DNS responses should however be sufficient for the target of this work.

\todo{Unlike Exposure: do not initially filter out domains? (Alexa top 1000 and older than one year)}

\section{Initial Situation and Goals}
\label{sec:initial_situation_and_goals}

\section{Dataset preprocessing}
\label{sec:dataset_preprocessing}

Ultimately, this work should come up with an algorithm to find domains that are involved in malicious activities. Most of the recent related work has used machine learning techniques to build domain reputation scoring algorithms. As those publications have generally shown promising results (see Chapter~\ref{cha:evaluation_of_existing_systems}), this work also focuses on a dynamic approach involving machine learning algorithms. The network in which the logs for this work have been collected is different from most ISP or other public networks. A lot of effort is made to keep the network malware-free. This includes software solutions (like anti-virus software and firewalls) as well as a team that proactively and reactively monitors and removes malware. Another defensive task is to train the employees to be aware of current and upcoming threats (e.g. to pay attention to hyperlinks in emails, to distrust unknown USB sticks and to follow physical access guidelines). Although this should lead to a mostly malware-free network with few requests to malicious domains, 2017 has shown to be the year of ransomware (see Section~\ref{sec:malware}). Private internet users and companies have been infected with malware that encrypted their data and required the target to pay an amount of money to decrypt it. There are of course other ongoing threats that have existed for many years, like spam campaigns (\fsCite{TrendMicroOnline}). The particular task in this work is to discover whether a dynamic reputation system for domains is useful and applicable under these circumstances.


\section{System Architecture}
\label{sec:system_architecture}

The overall system takes a similar approach to the one first introduced by \textit{Exposure} (see Section~\ref{sec:exposure}). In general, this involves an architecture with four different modules. The \textit{Malicious/Benign Domains Collector} works at the beginning of the analysis and fetches malicious domains as well as known benign domains from several external services:
\begin{itemize}
\item \textit{Malware Prevention through Domain Blocking} list from malwaredomains.com, a professionally maintained list of domains involved in malicious activities like the distribution of malware and spyware (\fsCite{malwaredomainsInformationOnline}).
\item \textit{Phishtank}: A list that targets domains that are engaged in phishing activities (\fsCite{PhishtankInformationOnline}).
\item \textit{ZeuS Tracker}: Blocking list for domains and IP addresses involved in the ZeuS botnet as command and control (C\&C) servers.
\item \textit{Alexa} with a list of the most popular domains in various countries as well as a global overview (a total of 2000 domains).
\end{itemize}

The malicious domains lists from the first three services consisted of 28,367 individual entries when first collected. This information is later used to label benign and malicious domains in the training process. The \textit{Malicious/Benign Domains Collector} can be rerun at any time to keep up with known malicious and benign domains at a later stage and increase the accuracy of \textit{DoresA}. The second module \todo{ref system architecture image}, the \textit{Data Aggregation Module}, collects all passive DNS logs and persists them. The \textit{Data Aggregation Module} is also responsible for persisting information that is explicitly needed in the training step and thus consumed by the \textit{Training Module}. The \textit{Training Module}'s primary concern is to learn a model that holds information about the resource usage of certain DNS responses as well as labeling those data samples. Due to the limitation of available time, the training period has been reduced to three days (starting from the first of September 2017) of training time with a window of \todo{how many minutes roughly?}. The training model thus consisted of a total of \todo{how many in total} DNS responses and included resolutions for \todo{how many individual domains} individual domains. The accuracy of this model can also be increased by retraining the model e.g. once a day or week to keep up with new characteristics of malicious usage. This training model can then be used in the last module, the \textit{Classification Module}, to classify resolutions of unlabeled domains. The \textit{Classification Module} could e.g. be used to act as a real-time warning system when deployed in a network.
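
The labeling step can be pictured with the following minimal sketch. The file names are assumptions for illustration; the actual lists are produced by the \textit{Malicious/Benign Domains Collector} scripts.

\begin{lstlisting}[language={python}, caption={Sketch: labeling domains with the collector lists}, label={lst:labeling_sketch}]
def load_domains(path):
    # One domain per line; skip blank lines and comments.
    with open(path) as fh:
        return {line.strip().lower() for line in fh
                if line.strip() and not line.startswith('#')}

# Hypothetical file names for the combined collector output.
malicious = load_domains('malicious_domains.txt')
benign = load_domains('benign_domains.txt')

def label(domain):
    # 1 = known malicious, 0 = known benign, None = unlabeled
    d = domain.lower().rstrip('.')
    if d in malicious:
        return 1
    if d in benign:
        return 0
    return None
\end{lstlisting}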

The logs that are provided have been collected in different locations all over the world and are aggregated on a single machine as csv files. As operating on the raw csv logs in the training step has shown to be very inefficient \todo{benchmark here, roughly one week per day}, especially when performing multiple analysis cycles, a different solution for accessing the logs had to be found. Experiments with putting the raw passive DNS logs into a NoSQL database (MongoDB \fsCite{MongoDBOnline}) as well as a relational database (MariaDB \fsCite{MariaDBOnline}) did not show a significant decrease in data access time, so a slightly different approach has been used. Using an in-memory database (redis \fsCite{RedisOnline}) and keeping only the information that is needed for the analysis has shown to give much better results \todo{benchmark here}. It has to be stated though that while retaining most of the needed information, information like the timestamps of individual requests could not be kept. See Table \todo{redis table} for the data that is stored inside the redis instance. Using an in-memory database for this application led to a different challenge. Even though trimmed down to the minimum set of information, the data has an average size of \todo{numbers here} per day. For this reason, a machine with an appropriate amount of RAM had to be used. In this case, a total of 512 Gigabytes \todo{verify} of RAM with an Intel Xeon with 32 cores was available.
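
The following sketch shows how such trimmed-down aggregates could be kept in redis using the redis-py client. The key schema is an assumption for illustration and not necessarily the schema used by \textit{DoresA}.

\begin{lstlisting}[language={python}, caption={Sketch: per-domain aggregates in redis}, label={lst:redis_sketch}]
import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0)

def ingest(domain, ip, ttl):
    # Keep only per-domain aggregates instead of raw log lines;
    # individual timestamps are lost by design.
    r.sadd('ips:' + domain, ip)        # distinct IP addresses
    r.sadd('ttls:' + domain, ttl)      # distinct TTL values
    r.incr('count:' + domain)          # number of observed responses
    r.sadd('domains:' + ip, domain)    # domains sharing this IP

# One parsed DNS response from the csv logs:
ingest('example.com', '93.184.216.34', 3600)
\end{lstlisting}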


\todo{system architecture image}


\subsection{Decision Tree Classifier}
\label{subsec:decision_tree_classifier}

While evaluating previous work, mainly two classification algorithms have been shown to provide good results in this area. A decision tree classifier has some advantages over other classification systems: the training time is comparably low, especially in contrast to neural networks. It delivers easily interpretable results when plotting the resulting decision tree, it requires little data preparation (e.g. no normalization of the input is needed, unlike in many other algorithms, and it can handle both numerical and categorical inputs) and it is possible to validate the results of the training using techniques like cross-validation. In this work, the implementation of the Python library scikit-learn is used. scikit-learn implements an optimized version of the \textit{CART} (Classification and Regression Trees) algorithm, which is closely related to the C4.5 decision tree algorithm that is also used in \textit{Exposure}. For a detailed comparison of classification algorithms see \fsCite{Lim2000}.
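
A minimal training sketch with scikit-learn could look as follows; the feature values and parameters are toy assumptions, not the actual configuration of \textit{DoresA}.

\begin{lstlisting}[language={python}, caption={Sketch: training a decision tree with scikit-learn}, label={lst:dt_training_sketch}]
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# One row of feature values per domain, labels: 0 = benign,
# 1 = malicious. Toy values for illustration only.
X = [[3, 1, 12, 2, 3600.0, 0.0],
     [45, 7, 1, 0, 280.0, 140.5]]
y = [0, 1]

clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X, y)

# The learned tree stays interpretable and can be plotted:
export_graphviz(clf, out_file='tree.dot')
\end{lstlisting}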

\section{Feature Selection}
\label{sec:feature_selection}

The feature selection is primarily motivated by the results of the evaluation of previously proposed systems. As \textit{Exposure} has shown to be the system that shares the most similarities with the network and traffic that is available, most features are adopted from \textit{Exposure} in the first place. Due to the restricted analysis time, the \textit{Time-Based Features} can unfortunately not be used in this work. To recap, at least one week of traffic has to be trained on to benefit from those features. Apart from that, nearly all features of \textit{Exposure} could be used for the training. See Table~\ref{tab:doresa_features} for all features that are used to model the resource usage characteristics of domains used in legitimate and malicious activities. For a detailed explanation of why these features have been included, see Section~\ref{subsec:exposure_features}.


\begin{table}[!htbp]
\centering
\caption{DoresA: Features}
\label{tab:doresa_features}
\begin{tabularx}{\textwidth}{|l|X|}
\hline
\textbf{Feature Set} & \textbf{Feature Name} \\ \hline
\multirow{4}{*}{\textit{DNS Answer-Based Features}} & Number of distinct IP addresses \\ \cline{2-2}
 & Number of distinct countries \\ \cline{2-2}
 & Number of domains sharing the IP address \\ \cline{2-2}
 & Reverse DNS query results \\ \hline
\multirow{5}{*}{\textit{TTL Value-Based Features}} & Average TTL \\ \cline{2-2}
 & Standard deviation of TTL \\ \cline{2-2}
 & Number of distinct TTL values \\ \cline{2-2}
 & Number of TTL changes \\ \cline{2-2}
 & Percentage usage of specific TTL ranges \\ \hline
\multirow{2}{*}{\textit{Domain Name-Based Features}} & \% of numerical characters \\ \cline{2-2}
 & \% of the length of the longest meaningful substring (LMS) \\ \hline
\end{tabularx}
\end{table}
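
To illustrate how some of these features can be computed from the aggregated data, consider the following sketch; the TTL range boundary used here is an assumption for illustration.

\begin{lstlisting}[language={python}, caption={Sketch: computing TTL- and name-based features}, label={lst:feature_sketch}]
import statistics

def ttl_features(ttls):
    # TTL Value-Based Features from the observed TTL values
    # of one domain, in observation order.
    changes = sum(1 for a, b in zip(ttls, ttls[1:]) if a != b)
    return {
        'avg_ttl': statistics.mean(ttls),
        'stddev_ttl': statistics.pstdev(ttls),
        'distinct_ttls': len(set(ttls)),
        'ttl_changes': changes,
        # share of values in a low TTL range (boundary is an assumption)
        'pct_low_ttl': sum(t < 300 for t in ttls) / len(ttls),
    }

def numeric_ratio(domain):
    # Domain Name-Based Feature: share of numerical characters,
    # computed on the name without the label separators.
    chars = domain.replace('.', '')
    return sum(c.isdigit() for c in chars) / len(chars)

print(ttl_features([3600, 3600, 300, 300, 60]))
print(numeric_ratio('a1b2c3.example.com'))
\end{lstlisting}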

\todo{additional features?}


\section{Evaluation}
\label{sec:evaluation}


\section{Implementation}
\label{sec:implementation}

The implementation of \textit{DoresA} includes several different pieces of software. The main part is implemented in Python and consists of the \textit{Training Module} and the \textit{Classification Module}. Apart from the main application, the \textit{Malicious/Benign Domains Collector} is a collection of bash scripts that fetch the filter lists and combine them into lists that can easily be consumed by the main application. The \textit{Data Aggregation Module} is written in C (\fsCite{kernighan2006c}), mostly for performance reasons, as the logs are aggregated in real time and fed into the redis database. Most of the \textit{Data Aggregation Module} implementation has been available for this work but had to be extended to also persist all TTL changes for a domain.

The main application works in two modes. In the training mode, all entries are first loaded from the raw csv logs for the given period. The next step extracts and calculates the values that are needed for each feature and uses the filter lists gathered by the \textit{Malicious/Benign Domains Collector} to label the dataset. After this, the feature values along with the labels are persisted as serialized Python objects. This persistence step is needed for the final step of training, but it is also useful because if the training crashes or is stopped for some reason, it can be continued and picked up where the previous run left off. The last step uses the preprocessed features and the corresponding labels to build the decision model, i.e. generate the decision tree. The training can mostly (apart from the last step) be done in parallel to get a reasonable training time: the implementation in this work has efficiently been executed on 32 cores and took roughly two days for partially (see Section~\ref{sec:system_architecture}) training three days of input data \todo{figures}. In the second mode, the \textit{Classification Module} classifies a dataset as either benign or malicious. While the evaluated systems have a variable reputation score from zero to one, this system performs a binary classification of the dataset in the first place. This could be changed to a variable reputation score, e.g. using the probability for each class that can also be retrieved from the scikit-learn decision tree implementation (\fsCite{SciKitProbOnline}).
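
A minimal sketch of the classification mode; the feature vectors are toy assumptions standing in for the real training output.

\begin{lstlisting}[language={python}, caption={Sketch: binary decision and class probabilities}, label={lst:classification_sketch}]
from sklearn.tree import DecisionTreeClassifier

# Stand-in for the tree produced by the Training Module (toy data).
clf = DecisionTreeClassifier().fit([[3, 1, 12, 2, 3600.0, 0.0],
                                    [45, 7, 1, 0, 280.0, 140.5]],
                                   [0, 1])

sample = [[40, 6, 2, 0, 300.0, 120.0]]   # toy feature vector
print(clf.predict(sample))         # binary decision, e.g. [1]
print(clf.predict_proba(sample))   # per-class probabilities
\end{lstlisting}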

\todo{include picture of decision tree here}


\todo{include more graphs/pictures in general}


\chapter{Evaluation of existing Systems}
\label{cha:evaluation_of_existing_systems}

This chapter deals with previously published work on domain reputation scoring systems. While there exist different types of algorithms, only those that follow a similar approach are taken into account here: namely those that use passive DNS logs and machine learning to calculate the reputation score. \todo{why these two or three?}

\section{Results and Comparison}
\label{sec:results_and_comparison}

\todo{Exposure is much simpler, much less data available (IPs of malicious servers, honeypots, dyndns, ...)}

After investigating those three systems, we want to demonstrate the major differences and similarities. The results discussed here are the base for the implementation of our own algorithm. All three systems are based on machine learning techniques. Two of the systems use a decision tree classifier and \textit{Kopis} uses a random forest classifier, which is not significantly different from a decision tree but has some advantages in certain areas (see a detailed comparison in Section~\ref{sec:model_selection}). One major difference between these systems is the data they are working with. While \textit{Notos} and \textit{Exposure} operate on data collected at recursive DNS servers in lower DNS layers, \textit{Kopis} gathers traffic from a top level domain name server and two AuthNSs of major domain name registrars. As the data available for this work has also been gathered at RDNS servers in a lower DNS hierarchy and no data from higher DNS layers is available, most concepts of \textit{Kopis} can not be used for the system that is proposed in this work. Nevertheless, there are general aspects of \textit{Kopis} that can be useful, e.g. which sources have been used to build the knowledge base for the classification of test samples in the training or how the overall architecture has been designed. It also has to be noted though, that \textit{Kopis} is the only system that is able to operate without having reputation information for domains and IPs available. Having data available that is collected similarly to \textit{Notos} and \textit{Exposure} does not mean that all concepts and features can be applied in the new system. A company network has much different characteristics than a network operated by e.g. an ISP. The network in which the logs for this work have been collected is hardened with much more effort, so that malware should generally be rare. \textit{Notos} in particular uses public traffic from an ISP RDNS server that handles clients of the ISP network, which, by design, can not be taken care of like a closed company network and is much more likely to contain a lot of different malware. One major difference between \textit{Notos} and \textit{Exposure} is the complexity of the overall system. \textit{Notos}, being the first dynamic domain reputation system, uses a much higher number of features. Some of these features, like the network-based features (see Table~\ref{tab:notos_network-based_features}), are much more fine-grained (e.g. independently operating on the top level, second level and third level domains) compared to the similar group of features in \textit{Exposure} (see Table~\ref{tab:exposure_features}, \textit{DNS Answer-Based Features}). For this reason, \textit{Notos} also needs much more detailed reputation information, e.g. for the IP spaces. Although it does not have such fine-grained features, \textit{Exposure} shows detection rates similar to \textit{Notos}. Other general advantages of \textit{Exposure} over \textit{Notos} are the reduced training time (again, for example, due to fewer features) and that it does not need information about malware that has been gathered in self-hosted honeypots (which in fact, done right, is a completely different topic on its own and therefore not part of this work).

\todo{Difference between Notos and Exposure, see literature: Exposure Section 5.5.2}

It also has to be noted that while all three systems generally show a high detection rate, with a high true positive and low false positive rate, they can not be operated with a 100\% success rate and should always be deployed along with other detection systems like firewalls, malware detection software and/or traditional filter systems like DNS black- and whitelists. Dynamic reputation systems can however be used to find domains used in malicious activities before other systems are aware of the threat.

\todo{It is not possible to simply block everything, there are always false positives.}

\todo{Kopis is able to operate without IP reputation information.}

\todo{If time: read all 'limitations' sections again and compare here.}

\todo{No concepts of Kopis usable because of caching, no data available.}

\textit{Exposure} uses a total of fifteen features that have been chosen after several months of study with thousands of well-known benign and malicious domains. These features are grouped into four different categories which can be seen in Table~\ref{tab:exposure_features}.

The first group, the \textit{Time-Based Features}, has not been approached in publications before. These features investigate the time at which the request for domain \textit{d} has been issued. The main idea behind this group of features is to find malicious services that use techniques like \textit{domain flux} (see Section~\ref{subsec:fast-flux_service_networks}) to circumvent take downs and make their infrastructure more agile. ``[\textit{Domain flux}] often show a sudden increase followed by a sudden decrease in the number of requests'' \fsCite[Section 3.1]{Bilge:2014:EPD:2617317.2584679}. Domains of malicious services using a DGA only exist for a short period of time by design. \fsAuthor{Bilge:2014:EPD:2617317.2584679} defines the first feature as follows: ``A domain is defined to be a short-lived domain [...] if it is queried only between time \(t_0\) and \(t_1\), and if this duration is comparably short (e.g., less than several days).'' The next three features are subject to the change point detection (CPD) problem: change point detection is about the identification of (abrupt) changes in the distribution of values, for example in time series. \textit{Exposure} implemented a CPD algorithm based on the popular CUSUM (cumulative sum) algorithm. At first, the time series of request timestamps is split into periods of 3600 seconds (one hour was tested to work well). After that, all time intervals are iterated and for each interval, the average request count of the previous eight intervals \(P_t^-\) and the following eight intervals \(P_t^+\) is calculated. In the next step, the distance of these two values, \(d(t)=|P_t^--P_t^+|\), is calculated for each interval, and the resulting ordered sequence \(d(t)\) of distances is fed to the CUSUM algorithm to finally retrieve all change points (for more information on the implemented CPD algorithm, see \fsCite[Section 3.1]{Bilge:2014:EPD:2617317.2584679}). To calculate feature two (\textit{Daily similarity}), the Euclidean distance of the time series of each day for \textit{d} is calculated. Intuitively, a low distance means similar time series and thus a high daily similarity, whereas two days with a higher distance show a less similar request volume. All the features of this group naturally only perform well when a larger number of requests to \textit{d} over a significant period of time is available.
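
The computation of the \(d(t)\) sequence can be sketched as follows; this is a simplified reading of the description above, not Exposure's actual implementation.

\begin{lstlisting}[language={python}, caption={Sketch: distances fed into the CUSUM-based CPD}, label={lst:cpd_sketch}]
def change_point_distances(timestamps, interval=3600, window=8):
    if not timestamps:
        return []
    # Bucket the request timestamps of one domain into fixed
    # intervals and count the requests per interval.
    start = min(timestamps)
    slots = int((max(timestamps) - start) // interval) + 1
    counts = [0] * slots
    for ts in timestamps:
        counts[int((ts - start) // interval)] += 1
    # d(t) = |P_t^- - P_t^+|: average request count of the previous
    # vs. the following `window` intervals. The resulting sequence
    # would then be fed into the CUSUM algorithm.
    distances = []
    for t in range(window, slots - window + 1):
        p_minus = sum(counts[t - window:t]) / window
        p_plus = sum(counts[t:t + window]) / window
        distances.append(abs(p_minus - p_plus))
    return distances
\end{lstlisting}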

The next group of features (\textit{DNS Answer-Based Features}) investigates resolutions of the requested domain \textit{d}. While one domain can map to multiple IP addresses for benign services, most harmless services show a much smaller network profile in terms of e.g. location and \glspl{as}. Based on those findings, four features have been extracted: the number of distinct IP addresses, the number of different countries these IP addresses are assigned to, the number of other domains that share an IP address \textit{d} resolves to, and the number of results of the reverse DNS query for all IPs of \textit{d}. It is worth noting that some hosting providers also use one IP address for many domains, so an extra layer to prevent such false positives makes sense.

\subsection{Reputation Engine}
\label{subsec:exposure_reputation_engine}

The reputation classifier of \textit{Exposure} is implemented as a \textit{J48} decision tree algorithm. The performance of decision trees mainly depends on the quality of the training set. For this reason a representative set of training data, with malicious domains from various threat classes, has to be chosen. Sources that have been used to identify malicious and benign domains can be found in Section~\ref{subsec:exposure_architecture}. In total, a list of 3500 known bad as well as 3000 known good domains has been used for the initial training. In order to take advantage of the \textit{Time-Based Features}, the optimal training period has been observed to be seven days. The tree is then constructed using the feature attribute values and their corresponding labels. More specifically, the whole training set is iterated and each time a set of samples can be separated using one single attribute (with respect to the assigned label), it is branched out and a new leaf is created. Each branch is then split into more fine-grained subtrees as long as there is an \textit{information gain}, until all samples of a subset belong to the same class, i.e. are assigned the same label (see more on decision trees in Section~\ref{subsec:decision_tree_classifier}).
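
The notion of information gain used for choosing the splits can be illustrated with a short sketch:

\begin{lstlisting}[language={python}, caption={Sketch: entropy and information gain of a split}, label={lst:information_gain_sketch}]
from collections import Counter
from math import log2

def entropy(labels):
    total = len(labels)
    return -sum((n / total) * log2(n / total)
                for n in Counter(labels).values())

def information_gain(parent, left, right):
    # Reduction in entropy achieved by splitting `parent`
    # into the subsets `left` and `right`.
    w_l = len(left) / len(parent)
    w_r = len(right) / len(parent)
    return entropy(parent) - (w_l * entropy(left) + w_r * entropy(right))

# A split that separates both classes perfectly yields maximal gain:
print(information_gain(['b', 'b', 'm', 'm'], ['b', 'b'], ['m', 'm']))  # 1.0
\end{lstlisting}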

\subsection{Results}
\label{subsec:exposure_results}

The performance of classifiers with different feature sets has been tested using e.g. 10-fold cross validation. To find the model with the minimum error rate, all combinations of feature sets (\textit{Time-Based Features} as F1, \textit{DNS Answer-Based Features} as F2, \textit{TTL Value-Based Features} as F3 and \textit{Domain Name-Based Features} as F4) have been trained using the same decision tree algorithm. Figure~\ref{fig:exposure_miss-classifier_instances} shows the error rate of those different classification models. The \textit{Time-Based Features} show the smallest error when inspecting single feature sets only. Looking at models with multiple feature sets, the overall minimum error rate is produced when using all four feature groups. The dataset that was collected for the initial analysis contained roughly 100 billion DNS queries. As processing all of these requests is not feasible in practice, two filtering steps have been introduced. The first one filters out all requests to a domain in the top 1000 Alexa list. The assumption for this filter is that no malicious domain will get this popular without being detected in some form. This step removed about 20\% of the initial requests. The second step filters out all requests to domains that have been registered at least one year before the analysis. This filter applied to 45,000 domains (or 40 billion corresponding queries) and reduced the remaining traffic by another 50\%. The filtering process has been cross tested against the Alexa top list, McAfee WebAdvisor (formerly McAfee SiteAdvisor) \fsCite{MCAfeeWebAdvisorOnline}, Google Safe Browsing \fsCite{GoogleSafeBrowsingOnline} and Norton Safe Web \fsCite{NortonSafeWebOnline}, and only 0.09\% of the filtered domains have been reported to be risky. \fsAuthor{Bilge11exposure:finding} for this reason states that: ``We therefore believe that our filtering policy did not miss a significant number of malicious domains because of the pre-filtering we performed during the offline experiments.''
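
The described pre-filtering can be summarized in a small sketch; the inputs are placeholders, and obtaining registration dates (e.g. via whois) is outside the scope of this example.

\begin{lstlisting}[language={python}, caption={Sketch: Exposure's pre-filtering policy}, label={lst:prefilter_sketch}]
from datetime import datetime, timedelta

def keep_for_analysis(domain, registration_date, alexa_top_1000, now):
    # Step 1: drop domains in the Alexa top 1000.
    if domain in alexa_top_1000:
        return False
    # Step 2: drop domains registered at least one year
    # before the analysis.
    if registration_date <= now - timedelta(days=365):
        return False
    return True

# Example with placeholder values:
print(keep_for_analysis('example.com', datetime(2005, 1, 1),
                        {'google.com'}, datetime(2011, 1, 1)))  # False
\end{lstlisting}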

The accuracy of the classifier has been validated using two different methods. The first method was to classify the training set with 10-fold cross validation. This validation method splits the dataset into ten partitions (folds), each partition optimally containing roughly the same class label distribution. One fold is then used as the validation sample (testing set) and the remaining nine partitions are used as the training set. The training set is used to train the model, which is then cross validated with the testing set. This step is repeated ten times using the same partitions, each partition being the testing set once. The second method is to simply use 66\% of the dataset for training and the remaining 34\% as the testing set.
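
With scikit-learn (rather than the Weka J48 implementation used by \textit{Exposure}), 10-fold cross validation amounts to a single call; the synthetic data below only keeps the sketch self-contained.

\begin{lstlisting}[language={python}, caption={Sketch: 10-fold cross validation with scikit-learn}, label={lst:cv_sketch}]
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Synthetic stand-in for the labeled feature matrix.
X, y = make_classification(n_samples=200, n_features=6, random_state=0)

clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X, y, cv=10)  # one score per fold
print(scores.mean(), scores.std())
\end{lstlisting}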


\begin{figure}[!htbp]
\end{figure}

To test a more real-world scenario, another approach has been validated. This test case includes one month of data, from which 20\% of the known benign and known malicious domains have been extracted and not used for training the model (simulating zero knowledge about these domains during training). This model has been tested using the consecutive three weeks, including the 20\% benign and malicious samples as well as all other new, previously unseen (not known to the trained model) domains. This case has been repeated for four different months. Summarizing the results, \textit{Kopis} was able to classify new domains with an average \(TP_{rate}\) of 73.62\% and a \(FP_{rate}\) of 0.53\%. In contrast to the first results shown in this chapter (which showed a much higher \(TP_{rate}\) and a lower \(FP_{rate}\)), these results are achieved using zero knowledge of the tested domains and are as such still considered a good detection rate. This real-world value could be confirmed by detecting a previously unknown commercial botnet in China. This botnet has been identified within the first weeks of its appearance and could be removed from the internet in September 2010, before it could spread outside of China. The DDoS botnet was controlled with eighteen domain names which resolved to five IP addresses in China and one in the United States.

\todo{see section one for contributions}

\end{tabularx}
\end{table}

Figure~\ref{fig:notos_features} shows how the three different feature groups are computed individually and which inputs the corresponding feature sets use. All those features are then combined into a single vector that serves as input for the \textit{Reputation Engine}.

\begin{figure}[!htbp]
\centering
\caption{Notos: Computing network-based, zone-based, evidence-based features \fsCite[Figure 2]{Antonakakis:2010:BDR:1929820.1929844}}
\label{fig:notos_features}
\end{figure}


\subsection{Reputation Engine}

\section{Motivation}
\label{sec:motivation}

\todo{also check papers for motivations}

\section{Challenges}
\label{sec:challenges}

All of the investigated approaches use \gls{pdns} logs to generate a reputation score for a specific domain. These logs are generated on central \gls{dns} resolvers and capture the outgoing traffic of multiple users (see Section~\ref{subsec:passive_dns}), so one challenge of this work is handling huge volumes of data. With about seven Gigabytes \todo{verify} of uncompressed \gls{pdns} logs for a single day, various general issues might occur: general purpose computers nowadays usually have up to 16 Gigabytes of RAM (rarely 32 GB), which means that multiple tasks (i.e. building a training set) may not be performed purely in-memory. The time of analysis might also become a bottleneck. Simply loading one single day (see benchmark example~\ref{lst:load_and_iterate_one_day_of_compressed_pdns_logs}) of (compressed) logs from disk and iterating it without actual calculations takes roughly 148 seconds. To evaluate existing algorithms, certain requirements have to be met. Passive DNS logs usually contain sensitive data, which is one reason why most papers do not publish test data. For a precise evaluation the raw input data is needed. Some previously developed classifications have not completely disclosed the involved algorithms, so these have to be reconstructed as closely as possible, taking all available information into account.

\section{Goals}
\label{sec:goals}

The task of this work is to evaluate existing scoring mechanisms of domains in the special context of IT security, and also to research the potential for combining different measurement approaches. It ultimately shall come up with an improved and evaluated algorithm for determining the probability of a domain being related to hostile activities.

\section{Related Work}
\label{sec:related_work}

In the context of IT security, there exist several approaches for assigning a reputation score to a domain. Before 2010, the general idea of protecting a network against requests targeting malicious hosts in other networks was to establish static filter lists. This included both explicitly allowing requests as well as explicitly blocking requests to certain IP addresses or domain names. For example, \fsAuthor{Jung:2004:ESS:1028788.1028838} introduced an approach to block requests to certain domains using a DNS blacklist. As shown by \fsCite{ramachandran2006can} in 2006, this approach is not always suitable to keep up with the speed of malware authors. A different type of system has been established in 2010 when two algorithms were introduced, \textit{Notos} followed by \textit{Exposure}, that use machine learning to dynamically assign a reputation score to a domain by using the characteristics of how benign and malicious domains are usually configured and used, in terms of e.g. DNS resource usage or the global distribution of the machines that are used for malicious purposes.

\todo{machine learning vs others}

\section{Benchmarks}
\label{sec:benchmarks}

To get a better understanding of performance related challenges, some benchmarks are performed and described in this section. All benchmarks are performed on the same machine with 16 GB of DDR3 RAM with a clock speed of 1600 MT/s in dual channel, an Intel i7-3520M CPU @ 2900 MHz and a Samsung SSD 850 EVO with 250 GB (where not otherwise specified). Linux 4.13.12-1 has been used and Python scripts are executed with the Python interpreter in version 3.6.3. For consistency, no other software is running at the time of the benchmark execution (e.g. a desktop environment or heavy background processes). All benchmarks are run ten times and outliers that show a run time of more than 10\% above the statistical median are ignored. Even considering the mentioned precautions, it is not safe to assume completely equal initial situations at the time of execution on non real-time operating systems (like the one used), so these figures have to be treated with care and should only give a fundamental understanding of how long tasks are about to run.

\begin{lstlisting}[language={python}, caption={Benchmark: Load and iterate one day of compressed pdns logs}, label={lst:load_and_iterate_one_day_of_compressed_pdns_logs}]
start_z = time.time()
print('iterating day took: ' + str(time.time() - start_z) + ' s')

cleaned results: [155.0667760372162, 148.00951623916626, 147.8429672718048, 147.2554485797882, 147.1039183139801, 147.26967453956604, 147.13052105903625, 147.33162689208984, 147.20316672325134, 147.29751586914062]
average: 148.15111315250397 seconds
\end{lstlisting}

\section{Domain Name System}
\label{sec:DNS}

The \gls{dns} is one of the cornerstones of the internet as it is known today. Nearly every device connected to the internet uses DNS. Initial designs have been proposed in 1983 and evolved over the following four years into the first globally adopted standard, RFC 1034 \fsCite{rfc1034} (see also RFC 1035 for implementation and specification details \fsCite{rfc1035}). The main idea of the \gls{dns} is translating human-readable domain names to network addresses. There are many extensions to the initial design, including many security related features and enhancements, or the support for \gls{ipv6} in 1995.

In order to understand how the \gls{dns} is misused for malicious activities and how to prevent these attacks, it is necessary to explain some basic mechanisms.
|
||||
|
||||
|
||||
\subsection{Basics}
|
||||
@@ -35,15 +35,16 @@ The \gls{dns} primarily builds on two types of components: name servers and reso
|
||||
\subsubsection{Name space}
|
||||
\label{subsubsec:name_space}
|
||||
|
||||
The \gls{dns} is based on a naming system that consists of a hierarchical and logical tree structure called the domain namespace. It contains a single root node and an arbitrary number of nodes in subordinate levels of variable depth; the level directly below the root holds the \textit{top level domains} (\textit{TLDs}), followed by second level and third level domains, and so forth. Each node is uniquely identifiable through a \gls{fqdn} and usually represents a domain, machine or service in the network. Furthermore, every domain can be subdivided into more fine-grained domains. These can again be specific machines or domains, called subdomains. This subdividing is an important concept for the internet to continue to grow, and the instance in charge of a domain (e.g. a company or cooperative) is responsible for its maintenance and subdivision.
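To illustrate the hierarchy, the following minimal Python sketch (illustrative only) splits a \gls{fqdn} into its labels, ordered from the top level domain downwards:

\begin{lstlisting}[language={python}, caption={Sketch: splitting a FQDN into its hierarchy levels}, label={lst:fqdn_labels_sketch}]
def labels(fqdn):
    # strip the trailing root dot and order labels from the TLD downwards
    return list(reversed(fqdn.rstrip('.').split('.')))

print(labels('www.example.de.'))  # ['de', 'example', 'www']
\end{lstlisting}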
\subsubsection{\gls{dns} Resource Records}
\label{subsubsec:dns_resource_records}

\todo{TODO}
See Table~\ref{tab:resource_record_types} for a list of the built-in resource record types in the \gls{dns}. These record types serve different purposes and are used with varying frequency.
\begin{table}[!htbp]
\centering
\caption{Resource Record Types}
\label{tab:resource_record_types}
@@ -79,7 +80,7 @@ In this section we will introduce the actual payload a \gls{dns} request as well
\label{par:message_header}
The Message Header is obligatory for all types of communication and may not be empty. It contains different flags that are used to control the transaction. The header specifies, for example, which further sections are present, whether the message is a query or a response, and the opcode of the requested operation.
\begin{table}[!htbp]
\centering
\caption{Message Header}
\label{tab:message_header}
@@ -103,7 +104,7 @@ Table~\ref{tab:message_header} shows the template of a \gls{dns} message header.
\item \textbf{QR:} Query/Response Flag – one bit field indicating whether this message is a query (0) or a response (1)

\item \textbf{OPCODE:} Four bit field that specifies the kind of query for this message. It is set by the requester and copied into the response. Possible values for the opcode field can be found in Table~\ref{tab:message_header_opcodes}.
\begin{table}[!htbp]
\centering
\caption{Message Header Opcodes}
\label{tab:message_header_opcodes}
@@ -136,7 +137,7 @@ Table~\ref{tab:message_header} shows the template of a \gls{dns} message header.

\item \textbf{RCODE:} Response Code – only available in response messages, these four bits are used to indicate errors encountered while processing the query. Available error codes are listed in Table~\ref{tab:message_header_response_codes}. Error codes 0 to 5 were initially available, whereas error codes 6 to 10 were added for dynamic \gls{dns} as defined in RFC 2136 \fsCite{rfc2136}.
\begin{table}[!htbp]
\centering
\caption{Message Header Response Codes}
\label{tab:message_header_response_codes}
@@ -156,24 +157,13 @@ Table~\ref{tab:message_header} shows the template of a \gls{dns} message header.
10 & Not Zone & \begin{tabular}[c]{@{}l@{}}A name specified in the request is not contained \\ within the zone declared in the message.\end{tabular} \\ \bottomrule
\end{tabular}
\end{table}
Further response codes have become available since \gls{edns} was introduced, as it lifts the size restriction of the four bit RCODE field.
\item \textbf{QDCOUNT:} Unsigned 16 bit integer specifying the number of entries in the Question Section.

\item \textbf{ANCOUNT:} Unsigned 16 bit integer specifying the number of resource records in the Answer Section.

\item \textbf{NSCOUNT:} Unsigned 16 bit integer specifying the number of name server resource records in the Authority Records Section.

\item \textbf{ARCOUNT:} Unsigned 16 bit integer specifying the number of resource records in the Additional Records Section.
\end{itemize}
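Since the header has a fixed layout of twelve bytes, the fields above can be extracted with a few bit operations. The following Python sketch parses the header of a raw \gls{dns} message (the function name and dictionary layout are illustrative):

\begin{lstlisting}[language={python}, caption={Sketch: parsing a DNS message header}, label={lst:parse_dns_header_sketch}]
import struct

def parse_dns_header(message):
    # the header is always 12 bytes: ID, flags and four 16 bit counters
    ident, flags, qd, an, ns, ar = struct.unpack('!HHHHHH', message[:12])
    return {
        'id': ident,
        'qr': (flags >> 15) & 0x1,      # query (0) or response (1)
        'opcode': (flags >> 11) & 0xF,  # see the table of opcodes
        'rcode': flags & 0xF,           # see the table of response codes
        'qdcount': qd, 'ancount': an, 'nscount': ns, 'arcount': ar,
    }
\end{lstlisting}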
\paragraph{Question Section:}
\label{par:question_section}
\begin{table}[!htbp]
\centering
\caption{Question Section}
\label{tab:question_section}
@@ -187,43 +177,28 @@ Table~\ref{tab:message_header} shows the template of a \gls{dns} message header.

\begin{itemize}
\item \textbf{Question Name:} Contains a variably sized payload including the domain, zone name or general object that is the subject of the query, encoded using standard \gls{dns} name notation (see the encoding sketch after this list). The expected content depends on the Question Type: requesting an A record, for example, will typically require a host part, such as www.domain.tld, whereas a MX query will usually only contain a base domain name (domain.tld).
\item \textbf{Question Type:} Specifies the type of question being asked. This field contains a code number corresponding to a particular type of resource being requested; see Table~\ref{tab:resource_record_types} for common resource types and Table~\ref{tab:question_section_format} for special values.
\item \textbf{Question Class:} The class of the resource records being requested (an unsigned 16 bit value), usually Internet (IN). Question classes are assigned by the IANA, where a complete list can be found \fsCite{IANADNSClassesOnline}.
\end{itemize}
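As mentioned for the Question Name field, names are transmitted in standard \gls{dns} name notation: each label is prefixed with its length and the name is terminated by a zero byte. The following is a minimal sketch of encoding a Question Section entry, ignoring message compression (function names are illustrative):

\begin{lstlisting}[language={python}, caption={Sketch: encoding a Question Section entry}, label={lst:encode_question_sketch}]
import struct

def encode_qname(name):
    # length-prefixed labels, terminated by a zero byte (no compression)
    out = b''
    for label in name.rstrip('.').split('.'):
        out += bytes([len(label)]) + label.encode('ascii')
    return out + b'\x00'

def encode_question(name, qtype, qclass=1):
    # QNAME followed by 16 bit QTYPE and QCLASS (1 = Internet)
    return encode_qname(name) + struct.pack('!HH', qtype, qclass)
\end{lstlisting}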
There are more parameters that can be specified when requesting a resource, but they are of no further relevance here.
\begin{table}[!htbp]
\centering
\caption{Question Section Format}
\label{tab:question_section_format}
\begin{tabular}{@{}lll@{}}
\toprule
QType & Type & Description \\ \midrule
251 & IXFR & Request for an incremental zone transfer (RFC 1995 \fsCite{rfc1995}) \\
252 & AXFR & Request for a full zone transfer \\
253 & MAILB & Request for mailbox-like resources (now obsolete) \\
254 & MAILA & Request for mail agent resources (obsolete, MX records are used instead) \\
255 & * & Request for all records \\ \bottomrule
\end{tabular}
\end{table}
\subsection{Domain Names}
\label{subsec:domain_names}
\todo{TODO structure of a domain, etc. top-level, second-level, third-level}
The structure of domain names is generally managed by the corresponding registrar, e.g. the DENIC e.G. (\fsCite{DENICOnline}) for .de domains. This includes, for example, which characters are allowed in second-level domains and the overall registration process. In the .de space, the second-level domain must contain between one and 63 characters; all letters of the latin alphabet can be used, in addition to digits, the hyphen and the 93 permitted internationalized domain name characters. Additionally, the first, third, fourth and last character must not be a hyphen. Many registrars use similar rules, which makes it hard to distinguish valid from invalid domain names on a syntactic basis alone. A simplified check of these rules is sketched below.
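The following Python sketch checks the rules summarised above for the ASCII subset only; the internationalized domain name characters are omitted, and the function is illustrative rather than a complete implementation of the DENIC guidelines:

\begin{lstlisting}[language={python}, caption={Sketch: simplified validity check for .de second-level domains}, label={lst:de_label_check_sketch}]
import re

LABEL = re.compile(r'^[a-z0-9-]{1,63}$')  # ASCII subset only

def is_valid_de_label(label):
    label = label.lower()
    if not LABEL.match(label):
        return False
    # a hyphen must not appear as first, third, fourth or last character
    return all(label[pos] != '-'
               for pos in (0, 2, 3, len(label) - 1)
               if pos < len(label))
\end{lstlisting}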
\subsection{Resolution}
\label{subsec:resolution}

\subsubsection{Recursive}
\label{subsubsec:recursive}
Figure~\ref{fig:address_resolution} illustrates the process of how domain names are resolved from the perspective of a requesting machine. Each step here assumes that the request has not been performed before and therefore cannot be answered from a cache. In the first step, the \textit{Operating System} contacts the local resolver, e.g. a router in a private network or a dedicated resolver in a larger company. As the \textit{DNS Resolver} knows nothing about the domain, it contacts the \textit{Root NS}, which returns the address of the responsible top-level domain server (\textit{TLD NS} for .com in this example). The resolver then asks the \textit{TLD NS} server for the address of the second-level domain server that is in charge of the requested zone (e.g. google.com). Finally, the resolver queries the \textit{Google NS} server for the IP address of the \textit{Google Webserver} and sends it back to the \textit{Operating System}, which can then establish a connection to the \textit{Google Webserver}.
\todo{explain delegation (e.g. of TLDs) somewhere here}
Two different types of DNS requests are performed here. The \textit{Operating System} sends a recursive request to the \textit{DNS Resolver}, which itself successively sends iterative requests to the higher level DNS servers. Most public name servers do not allow recursive queries due to security risks (denial of service attacks).
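From the perspective of an application, only the recursive case is visible: the query is handed to the operating system, which forwards it to the configured resolver. A minimal sketch using the Python standard library:

\begin{lstlisting}[language={python}, caption={Sketch: a recursive lookup through the system resolver}, label={lst:recursive_lookup_sketch}]
import socket

def resolve(name):
    # the configured resolver performs the iterative part transparently
    return sorted({info[4][0] for info in socket.getaddrinfo(name, None)})

print(resolve('google.com'))
\end{lstlisting}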
\begin{figure}[!htbp]
@@ -232,10 +207,8 @@ QType & Type & Description \\
\caption{Address Resolution}
\label{fig:address_resolution}
\end{figure}
\subsection{Passive DNS}
\label{subsec:passive_dns}
A Passive DNS database contains a history of the DNS resolutions observed in a network. The traffic can be captured at any appropriate location, e.g. on a resolver. Such a database can be used in a variety of ways to harden a network against different threats. Projects like the Security Information Exchange (SIE) collect passive DNS data from multiple sources and analyse the databases to find e.g. inconsistencies in the resolutions (\fsCite{SIEOnline}). Passive DNS databases can also be used by researchers or service providers to find performance issues, identify anomalies or generate usage statistics \fsCite{Deri:2012:TPD:2245276.2245396}.
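Conceptually, such a database only has to record each observed resolution once, together with timestamps of its first and last occurrence. A minimal in-memory sketch (the key and tuple layout are assumptions; real deployments use persistent storage):

\begin{lstlisting}[language={python}, caption={Sketch: recording observations in a passive DNS store}, label={lst:passive_dns_store_sketch}]
import time

pdns = {}  # (qname, rrtype, rdata) -> (first_seen, last_seen)

def observe(qname, rrtype, rdata):
    key = (qname, rrtype, rdata)
    now = time.time()
    first_seen, _ = pdns.get(key, (now, now))
    pdns[key] = (first_seen, now)

observe('www.example.com', 'A', '93.184.216.34')
\end{lstlisting}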
@@ -1,3 +1,4 @@
\section*{Abstract}
\label{sec:Abstract}
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
@@ -8,19 +8,19 @@
\newglossaryentry{ransomware}
{
name={Ransomware},
description={Ransomware is a type of malicious software from cryptovirology that threatens to publish the victim's data or perpetually block access to it unless a ransom is paid.}
}

\newglossaryentry{rir}
{
name={Regional Internet Registry},
description={A Regional Internet Registry is an organization responsible for assigning regional internet resources, e.g. IP address blocks and ASNs.}
}

\newglossaryentry{lir}
{
name={Local Internet Registry},
description={A Local Internet Registry is an organization, e.g. an Internet Service Provider, that manages one or more blocks of IP addresses and ASNs, mostly for consumers and small companies.}
}

\newglossaryentry{ddos}
26
Thesis/main.tdo
Normal file
@@ -0,0 +1,26 @@
\contentsline {todo}{also check papers for motivations}{1}{section*.11}
\contentsline {todo}{verify}{1}{section*.13}
\contentsline {todo}{machine learning vs others}{2}{section*.16}
\contentsline {todo}{add somewhere here}{4}{section*.19}
\contentsline {todo}{see kopis section 2 end, DNS blacklisting etc}{4}{section*.21}
\contentsline {todo}{statistic about usage}{6}{section*.29}
\contentsline {todo}{TODO}{8}{section*.34}
\contentsline {todo}{do something with this}{11}{section*.46}
\contentsline {todo}{\url {http://www.tcpipguide.com/free/t_DNSNameNotationandMessageCompressionTechnique.htm}}{12}{section*.49}
\contentsline {todo}{TODO}{13}{section*.50}
\contentsline {todo}{all tables h!}{13}{section*.51}
\contentsline {todo}{TODO structure of a domain, etc. top-level, second-level, third-level}{13}{section*.54}
\contentsline {todo}{explain delegation (e.g. of TLDs) somewhere here}{13}{section*.57}
\contentsline {todo}{not referenced atm}{13}{section*.59}
\contentsline {todo}{literature exposure section 6.1}{15}{section*.62}
\contentsline {todo}{list of what is running}{15}{section*.66}
\contentsline {todo}{rename to survey?}{17}{section*.84}
\contentsline {todo}{why this two or three?}{17}{section*.85}
\contentsline {todo}{figure not referenced atm}{25}{section*.103}
\contentsline {todo}{explain domain flux}{30}{section*.117}
\contentsline {todo}{link comprehensive decision tree explanation}{33}{section*.120}
\contentsline {todo}{detailed explanation of 10 fold cross validation}{33}{section*.122}
\contentsline {todo}{make reduction more clear}{40}{section*.135}
\contentsline {todo}{see section one for contributions}{43}{section*.139}
\contentsline {todo}{==> Exposure much simpler, much less data available (ips of malicious servers, honeypots, dyndns,...) \par ==> difference notos exposure, see literature exposure section 5.5.2 \par ==> not possible to simply block everything, always false positives \par ==> read all 'limitations' sections again and compare here \par ==> no concepts of kopis because of caching, no data available}{44}{section*.141}
\contentsline {todo}{include more graphs/pictures in general}{46}{section*.151}