rush hour 2
This commit is contained in:
@@ -1,9 +1,10 @@
|
||||
\addchap{Agreement}
|
||||
Me \thesisauthor\ \matriculationid\ \art\ blabla TODO at \institute
|
||||
|
||||
I, \thesisauthor\ matriculation number \matriculationid\ hereby declare that the \art\ submitted to the \institute\ is my own unaided work. All direct or indirect sources used are acknowledged as references. I am aware that the thesis in digital form can be examined for the use of unauthorized aid and in order to determine whether the thesis as a whole or parts incorporated in it may be deemed as plagiarism. Further rights of reproduction and usage, however, are not granted here. This paper was not previously presented to another examination board and has not been published.
|
||||
|
||||
\location, \today
|
||||
|
||||
|
||||
\rule[-0.2cm]{5cm}{0.5pt}
|
||||
|
||||
\textsc{\thesisauthor}
|
||||
\textsc{\thesisauthor}
|
||||
|
||||
@@ -1,15 +1,3 @@
|
||||
\chapter{Appendix}
|
||||
\label{sec:Appendix}
|
||||
Intro
|
||||
|
||||
\section{FooBar}
|
||||
\label{sec:FooBar}
|
||||
|
||||
|
||||
\section{CD}
|
||||
\label{sec:cd}
|
||||
|
||||
\dirtree{%
|
||||
.1 / \DTcomment{root dir}.
|
||||
.2 subdir \DTcomment{sub dir}.
|
||||
}
|
||||
@@ -324,14 +324,6 @@
|
||||
howpublished={\url{http://scikit-learn.org/stable/modules/tree.html#tree-algorithms-id3-c4-5-c5-0-and-cart}}
|
||||
}
|
||||
|
||||
@misc{SciKitProbOnline,
|
||||
author={scikit-learn},
|
||||
title={{scikit-learn - Classification}},
|
||||
month=jan,
|
||||
year={2018},
|
||||
howpublished={\url{http://scikit-learn.org/stable/modules/tree.html#classification}}
|
||||
}
|
||||
|
||||
@misc{DENICOnline,
|
||||
author={DENIC e.G.},
|
||||
title={{DENIC}},
|
||||
@@ -348,6 +340,22 @@
|
||||
howpublished={\url{https://www.iana.org/assignments/dns-parameters/dns-parameters.xhtml}}
|
||||
}
|
||||
|
||||
@misc{WannaCryTwitterOnline,
|
||||
author={Darien Huss},
|
||||
title={{WannaCry propagation payload contains previously unregistered domain}},
|
||||
month=jan,
|
||||
year={2018},
|
||||
howpublished={\url{https://twitter.com/darienhuss/status/863083680528576512}}
|
||||
}
|
||||
|
||||
@misc{WhyDGOWinsOnline,
|
||||
author={LITAL ASHER-DOTAN},
|
||||
title={{THE FBI VS. GAMEOVER ZEUS: WHY THE DGA-BASED BOTNET WINS}},
|
||||
month=jan,
|
||||
year={2018},
|
||||
howpublished={\url{https://www.cybereason.com/blog/the-fbi-vs-gameover-zeus-why-the-dga-based-botnet-wins}}
|
||||
}
|
||||
|
||||
|
||||
@inproceedings{Stone-Gross:2009:YBM:1653662.1653738,
|
||||
author = {Stone-Gross, Brett and Cova, Marco and Cavallaro, Lorenzo and Gilbert, Bob and Szydlowski, Martin and Kemmerer, Richard and Kruegel, Christopher and Vigna, Giovanni},
|
||||
@@ -415,6 +423,15 @@ isbn="978-3-540-24668-8"
|
||||
address = {San Francisco, CA, USA},
|
||||
}
|
||||
|
||||
@ARTICLE{Friedman98additivelogistic,
|
||||
author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani},
|
||||
title = {Additive Logistic Regression: a Statistical View of Boosting},
|
||||
journal = {Annals of Statistics},
|
||||
year = {1998},
|
||||
volume = {28},
|
||||
pages = {2000}
|
||||
}
|
||||
|
||||
@Article{Lim2000,
|
||||
author="Lim, Tjen-Sien
|
||||
and Loh, Wei-Yin
|
||||
@@ -457,6 +474,19 @@ url="https://doi.org/10.1023/A:1007608224229"
|
||||
keywords = {DNS black lists, Zipf-like distribution, spam traffic},
|
||||
}
|
||||
|
||||
@INPROCEEDINGS{6151979,
|
||||
author={P. J. Nero and B. Wardman and H. Copes and G. Warner},
|
||||
booktitle={2011 eCrime Researchers Summit},
|
||||
title={Phishing: Crime that pays},
|
||||
year={2011},
|
||||
volume={},
|
||||
number={},
|
||||
pages={1-10},
|
||||
keywords={computer crime;computer forensics;financial data processing;unsolicited e-mail;cybercriminal;e-mail phishing;financial institution;forensic evidence;malicious Web site;phishing Web sites;Companies;Data mining;Education;Electronic mail;Interviews;Law enforcement;Security;Investigations;Justice Science;Phishing},
|
||||
doi={10.1109/eCrime.2011.6151979},
|
||||
ISSN={2159-1237},
|
||||
month={Nov},}
|
||||
|
||||
@inproceedings{ramachandran2006can,
|
||||
title={Can DNS-based blacklists keep up with bots?},
|
||||
author={Ramachandran, Anirudh and Dagon, David and Feamster, Nick},
|
||||
|
||||
@@ -1,28 +1,28 @@
|
||||
\chapter{Abuse of Domain Names}
|
||||
\chapter{Abusive use of Domain Names}
|
||||
\label{cha:abuse_of_domain_names}
|
||||
|
||||
The \gls{dns} makes it easy to browse the internet with human readable domain names. It adds an extra layer to the TCP/IP model that allows administrators to reliably maintain services, especially for large applications which are served by many servers in different locations. Using techniques like round robin \gls{dns} enables efficient use of multiple machines, decreases access time for different users and enhances availability if single nodes in the machine cluster fail. Although this led to the described advantages it can also be used by malicious applications. In this work three major types of domain name misuses are taken into account.
|
||||
The \gls{dns} makes it easy to browse the internet with human-readable domain names. It adds an extra layer on top of TCP/IP that allows administrators to reliably maintain services, especially for large applications which are served by many servers in different locations. Using techniques like round robin, where the DNS server has a list of available servers and returns one of those servers to each client in rotation, \gls{dns} enables efficient use of multiple machines, decreases access time for different users and enhances availability if single nodes in the machine cluster fail (by removing failing servers from the round robin rotation). Although this leads to the described advantages it can also be used by malicious applications. In this work three major types of domain name misuse are taken into account: malware, phishing and botnets.
|
||||
|
||||
|
||||
\section{Malware}
|
||||
\label{sec:malware}
|
||||
|
||||
On May 12th 2017, British security researchers discovered a malware which was spreading massively at the time, especially in central Europe. After successful attack the WannaCry called malware encrypted users and companies files and pretended that the only solution to get back the decrypted files was to pay an amount of about \$ 300 in bitcoins. Researchers quickly discovered a request that was made by the malware to an unregistered domain. The purpose of the very long nonsensical domain name (\texttt{iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com}) was not known at the time one of the researchers (Darien Huss) registered it. Afterwards Huss registered many thousands of requests every second to this domain. After more investigations it was clear that the domain was acting as a kill switch for the \gls{ransomware} and by registering the domain, further spreading could be slowed down \fsCite{theguardiancom_wannacry}.
|
||||
On May 12th 2017, British security researchers discovered malware which was spreading massively at the time, especially in central Europe. After a successful attack, the malware called ``WannaCry'' encrypted files and pretended that the only way to recover the files was to pay an amount of about \$ 300 in a cryptocurrency. This is a popular case of so-called ransomware. Ransomware in general is a type of malicious software that threatens to publish the victim's data or blocks access to it unless a ransom is paid. Researchers quickly discovered a request that was made by the malware to an unregistered domain. The purpose of the very long nonsensical domain name (\texttt{iuqerfsodp9ifjaposdfjhgosurijfaewrwergwea.com}) was not known at the time one of the researchers (\fsAuthor{WannaCryTwitterOnline}) registered it. Afterwards, Huss observed many thousands of requests every second to this domain. After more investigations it was clear that the domain was acting as a kill switch for the ransomware and by registering the domain, further spreading could be slowed down \fsCite{theguardiancom_wannacry}.
|
||||
|
||||
This case shows an example of how domains can be used by attackers to control their software. Usually domains are more often used to connect to command and control servers or to communicate with other infected machines (see Section~\ref{sec:botnets}). To infect a machine, attackers often use so called \textit{droppers} or \textit{injectors} that do not ship the malicious code in the first hand but that are little programs to download further source code or binaries that contain the harming functionality. It is much easier for malware authors to use domains for this purpose instead of hard coding the IP addresses for many reasons: If machines that serve the down-loadable content are e.g. confiscated by the police or taken down for other reasons, domains can simply be pointed to a redundant server and such minimizing slow downs in the distribution of the malware. Reliable endpoints are also used to maintain the malicious software and load additional code. As domains are comparably cheap (starting at a few cents per year compared to at least \$ 10 for a dedicated IPv4 address a year), attackers can build a pool of many domains and such compensate take downs of some domain names. This could possibly change when IPv6 is widely adopted (with IPv6 addresses being much cheaper) but according to statistics of Google, only about 20\% of worldwide users accessing google where IPv6 enabled (natively or using IPv6 to IPv4 bridges) \fsCite{googlecom_ipv6adoption}. This imposes the usage of IPv6 as the primary protocol in malware for obvious reasons.
|
||||
This case shows an example of how domains can be used by attackers to control their malware. Usually domains are more often used to connect to command and control servers or to communicate with other infected machines (see Section~\ref{sec:botnets}). To infect a machine, attackers often use so called \textit{droppers} or \textit{injectors} that do not ship the malicious code at first but are little programs to download further source code or binaries containing the harmful functionality. It is much easier for malware authors to use domains for this purpose instead of hard coding the IP addresses for many reasons: If machines that serve the down-loadable content are e.g. confiscated by the police or taken down for other reasons, domains can simply be pointed to a different server's IP address to maintain the malicious service. Reliable endpoints are also used to maintain the malicious software and load additional code. Domains generally have three advantages for malware authors over IP addresses: they are much cheaper (a few cents a year compared to at least \$ 10), the effort to point a domain to a new IP address is much lower than assigning a new IP to a machine and finally it is much faster. It follows that attackers can build a pool of many domains to compensate for take downs of some domain names. This could change when IPv6 is widely adopted (with IPv6 addresses being much cheaper) but according to statistics of Google, only about 20\% of worldwide users accessing google were IPv6 enabled (natively or using IPv6 to IPv4 bridges) \fsCite{googlecom_ipv6adoption}. This prevents the usage of IPv6 as the primary protocol in malware for obvious reasons.
|
||||
|
||||
|
||||
\section{Phishing}
|
||||
\label{sec:phishing}
|
||||
|
||||
Phishing describes malicious activities where attackers try to steal private information from internet users which are mostly used to gain financial benefit from. There are various different types of phishing attacks that have been identified. Starting long before emails and the world wide web had significant popularity, criminals used social engineering on phones to trick users into handing over private personal and financial information. This method is known as vishing (Voice phishing). In the mid 90s AOL was the number one provider of Internet access and the first big target of phishing activities like it is known today. At the time, people from the warez community used phishing to get passwords for AOL accounts. By impersonating AOL employees in instant messengers as well as email conversations they could obtain free internet access or financially harm people using their credit card information. With the success of the world wide web including the movement of more financial services to the internet criminals used another approach to trick users. By registering domains that look very much like a benign service and imitating the appearance of the corresponding benign website many internet users unknowingly put their banking credentials into fake sites and suffer financial harm. Those credentials may be sold on black markets e.g. in the dark web and can worth up to 5\% of the balance for online banking credentials according to the SecureWorks Counter Threat Unit \fsCite{rp-2016-underground-hacker-marketplace-report}.
|
||||
Phishing describes malicious activities where attackers try to steal private information from internet users which is mostly used to gain financial benefit (\fsCite{6151979}). There are various different types of phishing attacks that have been identified. Starting long before emails and the world wide web had significant popularity, criminals used social engineering on phones to trick users into handing over private personal and financial information. This method is known as vishing (voice phishing). In the mid 90s, AOL was the number one provider of Internet access and the first big target of phishing activities like it is known today. At the time, people from the warez community used phishing to get passwords for AOL accounts. By impersonating AOL employees in instant messengers as well as email conversations they could obtain free internet access or financially harm people using their credit card information. With the success of the world wide web including the movement of more financial services to the internet, criminals used another approach to trick users. By registering domains that look very much like a benign service and imitating the appearance of the corresponding benign website (also known as cybersquatting or domain squatting), many internet users unknowingly put their banking credentials into fake sites and suffer financial harm. Those credentials may be sold on black markets e.g. in the dark web and can be worth up to 5\% of the balance for online banking credentials according to the SecureWorks Counter Threat Unit \fsCite{rp-2016-underground-hacker-marketplace-report}.
|
||||
|
||||
|
||||
|
||||
\section{Botnets}
|
||||
\label{sec:botnets}
|
||||
|
||||
A Botnet is a network of mostly computers infected with malicious software and controlled as a group without the owners' knowledge under the remote control of a human operator called bot master or bot herder. Each infected machine is called a Bot; and similar to how robots are acting independently commanded by human operators, every node in the Botnet is performing actions as instructed by the Botmaster. Botnets are mostly used for sending spam emails and running \gls{ddos} attacks.
|
||||
A Botnet is a network of computers infected with malicious software and controlled as a group without the owners' knowledge under the remote control of a human operator called bot master or bot herder. Each infected machine is called a Bot; and similar to how robots are acting independently commanded by human operators, every node in the Botnet is performing actions as instructed by the Botmaster. The Botmaster is using central servers which are called command and control servers (C\&C) to communicate with the bots. Botnets are mostly used for sending spam emails and running \gls{ddos} attacks.
|
||||
|
||||
To understand how botnets can be detected, mainly considering how botnets make use of domain names, some basic concepts have to be introduced:
|
||||
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
\chapter{Conclusion}
|
||||
\label{cha:conclusion}
|
||||
|
||||
|
||||
\section{Limitations}
|
||||
\label{sec:limitations}
|
||||
All existing machine learning systems show a promising accuracy in detecting malicious domains with different feature sets. This shows that such systems can effectively detect domains that are involved in a variety of malicious activities like botnets, phishing and spam campaigns. The three most popular systems that have been published, \textit{Notos}, \textit{Exposure} and \textit{Kopis}, are however either hard to deploy and/or require a lot of manual work to get started and can generally be seen more as academic prototypes than mature products.
|
||||
|
||||
\section{Future Work}
|
||||
\label{sec:future_work}
|
||||
At the time of writing this thesis, no evaluation of the implemented algorithm could be finished. Future work can use this implementation and investigate the accuracy of this approach. Furthermore, built on top of this work, a monitoring system can be realized to proactively warn of requests to domains involved in malicious activities. To the best of my knowledge, no system that can easily be deployed to networks exists, neither commercial nor non-commercial.
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
\chapter{Development of DoresA}
|
||||
\label{cha:development_of_doresa}
|
||||
|
||||
The last part of this work the development of a dynamic domain reputation system. A lot of concepts for this system will be adopted from the previously evaluated systems, most concepts will be taken from \textit{Exposure} with some general ideas of \textit{Notos} and \textit{Kopis}. There will also be some additional concepts be investigated that are not yet proposed by the those systems. In general, there are some limitations to be taken into account which arise mostly by the specific type of data that is available for this work and where it has been monitored. The passive DNS logs that have been provided for this work have been collected on three recursive DNS servers in a large company in locations in Europe, Asia and the United States. As those logs do contain sensitive data, raw logs used in this work can not be published mostly due to privacy reasons. It also has to be noted, that the DNS requests are not available for this work for the same reason. The DNS responses should however be sufficient for the target of this work.
|
||||
|
||||
==> not like exposure: do not initially filter out domains? (alexa top 1000 and older than one year)
|
||||
The last part of this work is the development of a dynamic domain reputation system, \textit{DoresA} (or Domain reputation scoring Algorithm). A lot of concepts for this system will be adopted from the previously evaluated systems; most concepts will be taken from \textit{Exposure} with some general ideas of \textit{Notos} and \textit{Kopis}. In general, there are some limitations to be taken into account which arise mostly from the specific type of data that is available for this work and where it has been monitored. The passive DNS logs that have been provided for this work have been collected on three recursive DNS servers in a large company in locations in Europe, Asia and the United States. As those logs do contain sensitive data, raw logs used in this work can not be published, mostly due to privacy reasons. It also has to be noted that the DNS requests are not available for this work for the same reason.
|
||||
|
||||
\section{Initial Situation and Goals}
|
||||
\label{sec:initial_situation_and_goals}
|
||||
|
||||
Ultimately, this work should come up with an algorithm to find domains that are involved in malicious activities. Most of the latest approached work has been working with machine learning techniques to build domain reputation scoring algorithms. As those publications have generally shown promising results (see Section~\ref{cha:evaluation_of_existing_systems}), this work is also focusing on a dynamic approach with machine learning algorithms involved. The network, in which the logs for this work have been collected is different from most ISP or other public networks. There is a lot of effort made to keep the network malware-free. This includes both software solutions (like anti-virus software and firewalls) as well as a team that proactively and reactively monitors and removes malware. Another defensive task is to train the employees to be aware of current and upcoming threats (e.g., to pay attention on hyperlinks in emails, distrust public usb sticks and physical access guidelines). Although this should lead to a mostly malware free network with few requests to malicious domains, 2017 has shown to be the year of ransomware (see Section~\ref{sec:malware}). Private internet users and companies have been infected with malware that was encrypting their data and requiring the target to pay an amount of money to decrypt it. There are of course other ongoing threats that have existed for many years, like spam campaigns (\fsCite{TrendMicroOnline}). The particular task in this work is to discover whether a dynamic reputation system for domains is useful and is applicable under this circumstances.
|
||||
Ultimately, this work should come up with an algorithm to find domains that are involved in malicious activities. Most of the latest related work has been using machine learning techniques to build domain reputation scoring algorithms. As those publications have generally shown promising results (see Section~\ref{cha:evaluation_of_existing_systems}), this work is also focusing on a dynamic approach with machine learning algorithms involved. The network in which the logs for this work have been collected is different from most ISP or other public networks. There is a lot of effort made to keep the network malware-free. This includes both software solutions (like anti-virus software and firewalls) as well as a team that proactively and reactively monitors and removes malware. Another defensive task is to train the employees to be aware of current and upcoming threats (e.g., to pay attention to hyperlinks in emails, distrust public USB sticks and follow physical access guidelines). Although this should lead to a mostly malware free network with few requests to malicious domains, 2017 has shown to be the year of ransomware (see Section~\ref{sec:malware}). Private internet users and companies have been infected with malware that was encrypting their data and requiring the target to pay an amount of money to decrypt it. There are of course other ongoing threats that have existed for many years, like spam campaigns (\fsCite{TrendMicroOnline}). The particular task in this work is to discover whether a dynamic reputation system for domains is useful and applicable under these circumstances. The ultimate goal (not part of this work) is an automated warning system that triggers when a malicious domain is requested.
|
||||
|
||||
|
||||
\section{System Architecture}
|
||||
\label{sec:system_architecture}
|
||||
|
||||
The overall system will take an similar approach which was first introduced by \textit{Exposure} (see \ref{sec:exposure}). In general, this involves an architecture with four different modules. The \textit{Malicious/Benign Domains Collector} is working at the beginning of the analysis and is fetching malicious domains as well as known benign domains from several external services:
|
||||
The overall system will take a similar approach to the one first introduced by \textit{Exposure} (see \ref{sec:exposure}). In general, this involves an architecture with four different modules that can be seen in Figure~\ref{fig:doresa_system_architecture}. The \textit{Malicious/Benign Domains Collector} is working at the beginning of the analysis and is fetching malicious domains as well as known benign domains from several external services:
|
||||
\begin{itemize}
|
||||
\item \textit{Malware Prevention through Domain Blocking} list from malwaredomains.com which is a professionally maintained list with domains involved in malicious activities like the distribution of malware and spyware (\fsCite{malwaredomainsInformationOnline}).
|
||||
\item \textit{Phishtank}: A list that targets domains that are engaged in spam activities (\fsCite{PhishtankInformationOnline}).
|
||||
@@ -22,18 +20,33 @@ The overall system will take an similar approach which was first introduced by \
|
||||
\item \textit{Alexa} with a list of the most popular domains in various countries as well as a global overview (total of 2000 domains).
|
||||
\end{itemize}
|
||||
|
||||
The malicious domains list from those three services consisted of 28367 individual entries when first collected. This information is later used to label benign and malicious domains in the training process. The \textit{Malicious/Benign Domains Collector} can be rerun at any time to keep up with known malicious and benign domains at a later stage and increase the accuracy of \textit{DoresA}. The second module \todo{ref system architecture image}, \textit{Data Aggregation Module} is collecting all passive DNS logs and persisting those. The \textit{Data Aggregation Module} is also responsible for persisting information that is explicitly needed in the training step and such consumed by the \textit{Training Module}. This \textit{Training Module}'s primary concern is to learn a model that holds information about resource usage of certain DNS responses as well as labeling those data samples. Due to the limitation of available time, the training period has been reduced to three days (starting from the first of september in 2017) of training time with a window of \todo{how many minutes roughly?}. The training model thus consisted of a total of \todo{how many in total} DNS responses and included resolutions for \textit{how many individual domains} individual domains. The accuracy of this model can be also be increased by retraining the model e.v. once a day or week to keep up with new characteristics of malicious usage. This training model can then be used in the last module, the \textit{Classification Module}, to classify resolutions of unlabeled domains. The \textit{Classification Module} could e.g. be used to act as a real-time warning system when deployed in a network.
|
||||
|
||||
The logs that are provided have been collected in different locations all over the world and are aggregated on a single machine as csv files. As operating on the raw csv logs in the training step has shown to be very inefficient \todo{benchmark here, roughly one week per day}, especially when performing multiple analysis cycles, a different solution for accessing the logs had to be found. Experimenting with putting the raw passive DNS logs into a NoSQL database (MongoDB \fsCite{MongoDBOnline}) as well as a relational database (MariaDB \fsCite{MariaDBOnline}) did not show a significant decrease in accessing the data so a slightly different approach has been used. By using an in-memory database (redis \fsCite{RedisOnline}) and only keeping those information, that are needed for the analysis has shown to give much better results \todo{benchmark here}. It has to be stated though that while retaining most of the needed information, information like the timestamp of individual requests could not be kept. See Table \todo{redis table} for which data is stored inside the redis instance. Using an in-memory database for this application led to a different challenge. Even though trimmed down to the minimum set of information, the data has an average size of \todo{todo numbers here} per day. For this reason, a machine with an appropriate amount of internal RAM had to be used. In this case, a total of 512 Gigabyte \todo{verify} of RAM with an Intel Xeon with 32 cores was available.
|
||||
\begin{figure}[!htbp]
|
||||
\centering
|
||||
\includegraphics[width=.4\textwidth, clip=true]{content/Development_of_DoresA/doresa_architecture.pdf}
|
||||
\caption{DoresA: System Architecture}
|
||||
\label{fig:doresa_system_architecture}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\todo{system architecture image}
|
||||
The malicious domains list from those three services consisted of 28367 individual entries when first collected. This information is later used to label benign and malicious domains in the training process. The \textit{Malicious/Benign Domains Collector} can be rerun at any time to keep up with known malicious and benign domains at a later stage and increase the accuracy of \textit{DoresA}. The second module, the \textit{Data Aggregation Module}, is collecting all passive DNS logs and persisting those. The \textit{Data Aggregation Module} is also responsible for extracting and persisting all feature values that are needed in the training step and thus consumed by the \textit{Training Module}. This \textit{Training Module}'s primary concern is to learn a model that holds information about resource usage of certain DNS responses as well as labeling those data samples. Due to the limitation of available time, the training period has been reduced to three days (starting from the first of September in 2017) and for simplicity has been reduced to 1 million samples (which have been chosen randomly over the three days). The training model thus consisted of a total of 1 million DNS responses and included resolutions for \textit{how many individual domains} individual domains. The accuracy of this model can also be increased by retraining the model e.g. once a day or week to keep up with new characteristics of malicious usage. This training model can then be used in the last module, the \textit{Classification Module}, to classify resolutions (feature vectors) of unlabeled domains. The \textit{Classification Module} could e.g. be used to act as a real-time warning system when deployed in a network.
|
||||
|
||||
The logs that are provided have been collected in different locations all over the world and are aggregated on a single machine as csv files. As operating on the raw csv logs in the training step has shown to be very inefficient (with roughly one week of training time for one day), especially when performing multiple analysis cycles, a different solution for accessing the logs had to be found. Experimenting with putting the raw passive DNS logs into a NoSQL database (MongoDB \fsCite{MongoDBOnline}) as well as a relational database (MariaDB \fsCite{MariaDBOnline}) did not show a significant decrease in the time needed to access the data, so a slightly different approach has been used. Using an in-memory database (redis \fsCite{RedisOnline}) and keeping only the information that is needed for the analysis has shown to give much better results, e.g. one day for the training of 1 million samples. It has to be stated though that while retaining most of the needed information, information like the timestamp of individual requests could not be kept. The following attributes are stored inside the redis instance.
|
||||
\begin{itemize}
|
||||
\item \textbf{Resource record}, i.e. the domain name in this scope
|
||||
\item The \textbf{type of the resource record}, DoresA does only take A records into account as most features can not be extracted from other types. See Section~\ref{subsubsec:dns_resource_records} for an explanation of the DNS resource record types.
|
||||
\item All \textbf{TTL} values that this domain has had in the analysis period.
|
||||
\item \textbf{Resolution}: The IP addresses, that the record type resolved to.
|
||||
\item \textbf{First/last-seen}: Timestamps of when the domain has been seen for the first and last time.
|
||||
\item Additionally, all \textbf{reverse DNS} results are persisted, e.g. to find all historic domains that resolved to a known IP address.
|
||||
\end{itemize}
|
||||
|
||||
Using an in-memory database for this application led to a different challenge. Even though trimmed down to the minimum set of information, the data still requires a considerable amount of memory. For this reason, a machine with an appropriate amount of internal RAM had to be used. In this case, a total of 512 Gigabyte of RAM was available with an Intel Xeon CPU with 32 cores.
|
||||
|
||||
|
||||
\subsection{Decision Tree Classifier}
|
||||
\label{subsec:decision_tree_classifier}
|
||||
|
||||
While evaluating previous work, mainly two classification algorithms have shown to provide good results in this area. A decision tree classifier has some advantages over different other classification systems: the training time is comparably low, especially in contrast to neural networks. It delivers quite easily interpretable results when plotting the resulting decision tree, it requires little data preparation (e.g. no normalization of the input is needed like in many other algorithms and can handle both numerical and categorical inputs) and it is possible to validate the results of the training using techniques like cross-validation. In this work, the implementation of the python library scikit-learn is used. The current implementation of the scikit-learn algorithm is called \textit{CART} (Classification and Regression Trees) and is based on the C4.5 decision tree implementation that is also used in \textit{Exposure}. For a detailed comparison of classification algorithms see \fsCite{Lim2000}.
|
||||
While evaluating previous work, mainly two classification algorithms have shown to provide good results in this area. A decision tree classifier has some advantages over different other classification systems: the training time is comparably low, especially in contrast to neural networks. It delivers quite easily interpretable results when plotting the resulting decision tree, it requires little data preparation (e.g. no normalization of the input is needed like in many other algorithms and can handle both numerical and categorical inputs) and it is possible to validate the results of the training using techniques like cross validation. In this work, the implementation of the python library scikit-learn is used. The current implementation of the scikit-learn algorithm is called \textit{CART} (Classification and Regression Trees) and is based on the C4.5 decision tree implementation that is also used in \textit{Exposure}. For a detailed comparison of classification algorithms see \fsCite{Lim2000}.
|
||||
|
||||
|
||||
\section{Feature Selection}
|
||||
@@ -63,21 +76,23 @@ The feature selection is primarily motivated by the results of the evaluation of
|
||||
\end{tabularx}
|
||||
\end{table}
|
||||
|
||||
\todo{additional features?}
|
||||
|
||||
|
||||
\section{Implementation}
|
||||
\label{sec:implementation}
|
||||
|
||||
The implementation of \textit{DoresA} includes several different pieces of software. The main part is implemented in Python and consists of the \textit{Training Module} and the \textit{Classification Module}. Apart from the main application, the \textit{Malicious/Benign Domains Collector} is a collection of Bash scripts to fetch the filter lists and combine them into lists that can easily be consumed by the main application. The \textit{Data Aggregation Module} is written in C (\fsCite{kernighan2006c}), mostly for performance reasons as these logs are aggregated in real time and fed into the Redis database. Most of the \textit{Data Aggregation Module} implementation has been available for this work but had to be extended to also persist all TTL changes for a domain.
|
||||
|
||||
The main application is mainly working in two modes. In the training mode, all entries are first loaded from the raw csv logs for the given period. The next step extracts and calculates the values that are needed for each feature and uses the filter lists, gathered by the \textit{Malicious/Benign Domains Collector} to label the dataset. After this, the feature values along with the label is persisted as serialized python objects. This persistence step is on the one side needed to do the final step of training but can also be useful if for some reason, the training is crashing or stopped it can be continued and picked up where the previous training left off. The last step is using the preprocessed features and the corresponding labels to build the decision model, i.e. generate the decision tree. The training can mostly (apart from the last step) be done in parallel to get a reasonable training time – the implementation in this work has efficiently been executed on 32 cores and took roughly two days for partially (see \ref{sec:system_architecture}) training three days of input data \todo{figures}. In the second mode, the \textit{Classification Module} classifies a dataset as either being benign or malicious. While the evaluated systems do have a variable reputation score from zero to one, this system does a binary classification for the dataset in the first place. This could be changed to a variable reputation score, e.g. using the probability for each class that can also be retrieved by the scikit-learn decision tree implementation \todo{(see fscite SciKitProbOnline)}.
|
||||
The main application operates in two modes. In the training mode, all entries are first loaded from the raw CSV logs for the given period. The next step extracts and calculates the values that are needed for each feature and uses the filter lists, gathered by the \textit{Malicious/Benign Domains Collector}, to label the dataset. After this, the feature values along with the label are persisted as serialized Python objects. This persistence step is on the one hand needed to do the final step of training but can also be useful: if, for some reason, the training crashes or is stopped, it can be continued and picked up where the previous training left off. The last step is using the preprocessed features and the corresponding labels to build the decision model, i.e. generate the decision tree. The training can mostly (apart from the last step) be done in parallel to get a reasonable training time – the implementation in this work has efficiently been executed on 32 cores and took roughly two days for training a dataset with 1 million samples. In the second mode, the \textit{Classification Module} classifies a dataset as either being benign or malicious. While the evaluated systems do have a variable reputation score from zero to one, this system does a binary classification for the dataset in the first place. This could be changed to a variable reputation score, e.g. using the probability for each class that can also be retrieved by the scikit-learn decision tree implementation.
|
||||
|
||||
\todo{include picture of decision tree here}
|
||||
Figure~\ref{fig:doresa_selection_decision_tree} shows an excerpt of the resulting decision tree from the test training with 1 million data samples. \todo{describe what is seen on the decision tree excerpt}
|
||||
|
||||
\begin{figure}[!htbp]
|
||||
\centering
|
||||
\includegraphics[angle=90, width=.8\textwidth, clip=true]{content/Development_of_DoresA/doresa_example_tree.png}
|
||||
\caption{DoresA: Excerpt of resulting decision tree}
|
||||
\label{fig:doresa_selection_decision_tree}
|
||||
\end{figure}
|
||||
|
||||
|
||||
%\section{Evaluation}
|
||||
%\label{sec:evaluation}
|
||||
|
||||
\todo{include more graphs/pictures in general}
|
||||
|
||||
|
||||
BIN
Thesis/content/Development_of_DoresA/doresa_example_tree.png
Normal file
BIN
Thesis/content/Development_of_DoresA/doresa_example_tree.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 106 KiB |
@@ -1,12 +1,12 @@
|
||||
\chapter{Evaluation of existing Systems}
|
||||
\chapter{Evaluation of Existing Systems}
|
||||
\label{cha:evaluation_of_existing_systems}
|
||||
|
||||
This chapter deals with work around domain reputation scoring systems that has been released. While there exist different types of algorithms, only those that follow a similar approach are taken into account here: namely those that use passive DNS logs and machine learning to calculate the reputation score. \todo{why this two or three?}
|
||||
This chapter deals with work around domain reputation scoring systems that has been released. While there exist different types of algorithms, only those that follow a similar approach are taken into account here: namely those that use passive DNS logs and machine learning to calculate the reputation score. For an overview of existing approaches in general, refer to Section~\ref{sec:related_work}.
|
||||
|
||||
\section{Evaluation Scheme}
|
||||
\section{Evaluation Schema}
|
||||
\label{sec:evaluation_scheme}
|
||||
|
||||
For a comprehensive evaluation, all input and output as well as the exact implementations (and/or the corresponding parameters that have been used for the analysis) of the algorithm was needed. Unfortunately, none of the publications we are dealing with here have released any (raw) input data, specifically the passive DNS logs and the filter lists for the training set. Neither has any of the algorithm's actual implementation been published. For this reason the evaluation of the existing systems is focusing on the results that have individually been published. Most importantly the detection rate as well as the false positive rate. Another important fact for this overview is what data has actually been used for the training and classification and where the data has been obtained. Passive DNS logs may be collected in different stages of the DNS resolution and might, due to e.g. caching, lead to the extraction of different information. A resolver running on the users machine might obtain much more traffic and such benefit from e.g. time based patterns which are not possible at higher level DNS servers that are not able to collect that traffic because the response has been cached on resolvers in a lower (DNS-) hierarchy.
|
||||
For a comprehensive evaluation, all input and output as well as the exact implementations (and/or the corresponding parameters that have been used for the analysis) of the algorithm were needed. Unfortunately, none of the publications we are dealing with here have released any (raw) input data, specifically the passive DNS logs and the filter lists for the training set. Neither has any of the algorithm's actual implementation been published. For this reason the evaluation of the existing systems is focusing on the results that have individually been published, most importantly the detection rate as well as the false positive rate. Another important fact for this overview is what data has actually been used for the training and classification and where the data has been obtained. Passive DNS logs may be collected in different stages of the DNS resolution and might, e.g. due to caching, lead to the extraction of different information. A resolver running on the user's machine might obtain much more traffic and thus benefit from e.g. time based patterns which are not possible at higher level DNS servers that are not able to collect that traffic because the response has been cached on resolvers in a lower (DNS-) hierarchy.
|
||||
|
||||
|
||||
\input{content/Evaluation_of_existing_Systems/Notos/Notos.tex}
|
||||
@@ -22,7 +22,3 @@ For a comprehensive evaluation, all input and output as well as the exact implem
|
||||
After investigating those three systems, we want to demonstrate the major differences and similarities. The results discussed here are the base for the implementation of our own algorithm. All three systems are based on machine learning techniques. Two of the systems use a decision tree classifier and \textit{Kopis} uses a random forest classifier which is not significantly different from a decision tree but has some advantages in some areas (see a detailed comparison in Section~\ref{sec:model_selection}). One major difference of these systems is the data they are working with. While \textit{Notos} and \textit{Exposure} are operated with data collected at recursive DNS servers in lower DNS layers, \textit{Kopis} is gathering traffic from a top level domain name server and two AuthNS from major domain name registrars. As the data available for this work has also been gathered at RDNS servers in a lower DNS hierarchy and no data from higher DNS layers is available, most concepts of \textit{Kopis} can not be used for the system that is proposed in this work. Nevertheless there are general aspects of \textit{Kopis} that can be useful, e.g. which sources have been used to build the knowledge base for the classification of test samples in the training or how the overall architecture has been designed. It also has to be noted though, that \textit{Kopis} is the only system that is able to operate without having reputation information for domains and IPs available. Having data available that is collected similarly to \textit{Notos} and \textit{Exposure} does not mean that all concepts and features can be applied in the new system. A company network has quite different characteristics than a network operated by e.g. an ISP. The network in which the logs for this work have been collected is hardened with much more effort so that malware should generally be rarely found.
Especially \textit{Notos} uses public traffic from an ISP RDNS server that is handling clients of this ISP network which, by design, can not be taken care of like in a closed company network and is much more likely to contain a lot of different malware. One major difference between \textit{Notos} and \textit{Exposure} is the complexity of the overall system. \textit{Notos}, being the first dynamic domain reputation system, has a much higher amount of features that are used. Some of these features, like the Network-based features (see Table~\ref{tab:notos_network-based_features}) are much more fine-grained (e.g. independently operating on the top level, second level and third level domains) compared to the similar group of features in \textit{Exposure} (see Table~\ref{tab:exposure_features}, \textit{DNS Answer-Based Features}). For this reason, \textit{Notos} does also need much more detailed reputation information, e.g. for the IP spaces. Although not having such fine-grained features, \textit{Exposure} shows similar detection rates to \textit{Notos}. Another general advantage of \textit{Exposure} over \textit{Notos} is the reduced training time (again for example due to fewer features) and that it does not need information about malware that has been gathered in self-hosted honeypots (which in fact, done right, is a completely different topic on its own and therefore not part of this work).
|
||||
|
||||
It also has to be noted that while all three systems show a high detection rate in general with a high true positive and low false positive rate, they can not be operated with a 100\% success rate and should always be deployed along with other detection systems like firewalls, malware detection software and/or traditional filter systems like DNS black- and whitelists. Dynamic reputation systems can, however, be used to find domains used in malicious activities before other systems are aware of the threat.
|
||||
|
||||
|
||||
\todo{if time}
|
||||
==> read all 'limitations' sections again and compare here
|
||||
|
||||
@@ -4,17 +4,17 @@
|
||||
\subsection{General}
|
||||
\label{subsec:exposure_general}
|
||||
|
||||
\textit{Exposure} is ``a system that employs large-scale, passive DNS analysis techniques to detect domains that are involved in malicious activity'', which was first introduced in 2011 by the \textit{Institute Eurecom} in Sophia Antipolis, the \textit{Northeastern University} from Boston and the \textit{University of California} in Santa Barbara \fsCite{Bilge11exposure:finding}. \textit{Exposure} is the second published system to detect malicious domains using passive DNS data and is built on the key premise, that most malicious services are dependent on the domain name system and compared to benign services should expose enough differences in behavior for an automated discovery. The main analysis for \textit{Exposure} has been run on data of a period of 2.5 month with more than 100 billion DNS queries. \textit{Exposure} is not targeted at a specific threat but rather covers a wide variety of malicious activities like phishing, Fast-Flux services, spamming, botnets (using domain generation algorithms), etc. It uses fifteen features with nine features, that have not been proposed in previous research. Ultimately, \textit{Exposure} offers a real-time detection system which has been made available for the public in 2014 \fsCite{Bilge:2014:EPD:2617317.2584679}. Unfortunately, the service was not accessible at the time of this work.
|
||||
\textit{Exposure} is ``a system that employs large-scale, passive DNS analysis techniques to detect domains that are involved in malicious activity''~\fsCite{Bilge11exposure:finding}, which was first introduced in 2011 by the \textit{Institute Eurecom} in Sophia Antipolis, the \textit{Northeastern University} from Boston and the \textit{University of California} in Santa Barbara. \textit{Exposure} is the second published system to detect malicious domains using passive DNS data and is built on the key premise that most malicious services are dependent on the domain name system and compared to benign services should expose enough differences in behavior for an automated discovery, see Section~\ref{subsec:exposure_features} for what differences the features are targeted at. The main analysis for \textit{Exposure} has been run on data of a period of 2.5 months with more than 100 billion DNS queries. \textit{Exposure} is not targeted at a specific threat but rather covers a wide variety of malicious activities like phishing, Fast-Flux services, spamming, botnets (using domain generation algorithms), and similar others. It uses fifteen features, nine of which have not been proposed in previous research. Ultimately, \textit{Exposure} offers a real-time detection system which has been made available to the public in 2014 \fsCite{Bilge:2014:EPD:2617317.2584679}. Unfortunately, the service was not accessible at the time of this writing.
|
||||
|
||||
|
||||
\subsection{Architecture}
|
||||
\label{subsec:exposure_architecture}
|
||||
|
||||
For the distinction of benign and malicious domains to perform well, a large set of training data is used in \textit{Exposure} (seven days). The offline training has been powered by recursive DNS traffic (RDNS), gathered from the Security Information Exchange (SIE). Specifically, only the answer of the RDNS traffic has been used, that comprises of: the queried domain name, timestamp of the request, caching time TTL and the list of resolved IP addresses. The overall systems consists of five main components. The interaction of those models can be seen in Figure~\ref{fig:exposure_system_overview}.
|
||||
For the distinction of benign and malicious domains to perform well, a large set of training data is used in \textit{Exposure} (seven days). The offline training has been powered by recursive DNS traffic (RDNS), gathered from the Security Information Exchange (SIE). Specifically, only the answer of the RDNS traffic has been used, which comprises the queried domain name, the timestamp of the request, the caching time TTL and the list of resolved IP addresses. The overall system consists of five main components. How those modules are interacting with each other and which input data is required for each module can be seen in Figure~\ref{fig:exposure_system_overview}.
|
||||
|
||||
\begin{itemize}
|
||||
\item The \textit{Data Collector} module passively captures the DNS traffic in the monitored network.
|
||||
\item The \textit{Feature Attribution} component is attributing the captured domains with the desired features.
|
||||
\item The \textit{Feature Attribution} component is attributing the captured domains with a vector containing the associated features.
|
||||
\item The third component \textit{Malicious and Benign Domains Collector} is running in parallel to the first two modules and constantly gathers information about known good and known bad domains. These lists are used to label the output of the \textit{Feature Attribution} module afterwards, as it can be seen in picture~\ref{fig:exposure_system_overview}. The list of benign domains is extracted from the Alexa top list \fsCite{AlexaWebInformationOnline} and externally confirmed \gls{whois} data. The list of known malicious domains is collected from several external, both professionally provisioned and user maintained, sources and includes domains in different threat classes, e.g., malwaredomains.com \fsCite{malwaredomainsInformationOnline}, Phishtank \fsCite{PhishtankInformationOnline}, Anubis (no longer available), the Zeus Block List \fsCite{zeusblocklistInformationOnline} and domains from DGAs for Conficker \fsCite{porras2009foray} and Mebroot \fsCite{Stone-Gross:2009:YBM:1653662.1653738}.
|
||||
\item The labeled dataset is then fed into the \textit{Learning Module} and trains the domain detection model that is used in the final step. This classifier may also be retrained on a regular basis to keep up with malicious behavior (daily in \textit{Exposure}).
|
||||
\item The \textit{Classifier} uses the decision model to classify unlabeled (new) domains into benign and malicious groups. For this, the same feature vector that is produced by the \textit{Feature Attribution} module is used.
|
||||
@@ -35,13 +35,13 @@ For the distinction of benign and malicious domains to perform well, a large set
|
||||
\textit{Exposure} uses a total of fifteen features that have been chosen after several months of study with thousands of well-known benign and malicious domains. These features are grouped into four different categories which can be seen in Table~\ref{tab:exposure_features}.
|
||||
|
||||
The first group, \textit{Time-Based Features} has not been approached in publications before. These features investigate the time, at which the request with domain \textit{d} has been issued. The main idea behind this group of features is to find malicious services that use techniques like \textit{domain flux}
|
||||
(see Section~\ref{subsec:fast-flux_service_networks}) to circumvent take downs and make their infrastructure more agile. ``[\textit{Domain flux}] often show a sudden increase followed by a sudden decrease in the number of requests'' \fsCite[Section 3.1]{Bilge:2014:EPD:2617317.2584679}. Domains of malicious services using a DGA do only exist for a short period of time by design. \fsAuthor{Bilge:2014:EPD:2617317.2584679} defines the first feature as follows: ``A domain is defined to be a short-lived domain [...] if it is queried only between time \(t_0\) and \(t_1\), and if this duration is comparably short (e.g., less than several days).'' The next three features are subject to the change point detection (CPD) problem: Change point detection is about the identification of (abrupt) changes in the distribution of values, for example in time series. \textit{Exposure} implemented a CPD algorithm based on the popular CUSUM (cumulative sum) algorithm. At first, the time series of request timestamps is split into periods of 3600 seconds (one hour was tested to work well). After that, all time intervals are iterated and for each interval, the average request count of the previous eight hours \(P_t^-\) and following eight intervals \(P_t^+\) is calculated. In the next step, the distance of these two values is calculated \(d(t)=|P_t^--P_t^+|\) for each interval and the resulting ordered sequence \(d(t)\) of distances is fed to the CUSUM algorithm to finally get retrieve all change points (For more information on the implemented CPD algorithm, see \fsCite[Section 3.1]{Bilge:2014:EPD:2617317.2584679}). To calculate feature two (\textit{Daily similarity}), the Euclidean Distance of the time series of each day for \textit{d} is calculated. Intuitively, a low distance means similar time series and such high daily similarity whereas two days with higher distance do show a less similar request volume. 
All the features of this group do naturally only perform well when having a larger number of requests to \textit{d} over a significant period of time.
|
||||
(see Section~\ref{subsec:fast-flux_service_networks}) to circumvent take downs and make their infrastructure more agile. ``[\textit{Domain flux}] often show a sudden increase followed by a sudden decrease in the number of requests'' \fsCite[Section 3.1]{Bilge:2014:EPD:2617317.2584679}. Domains of malicious services using a DGA do only exist for a short period of time by design. \fsAuthor{Bilge:2014:EPD:2617317.2584679} defines the first feature as follows: ``A domain is defined to be a short-lived domain [...] if it is queried only between time \(t_0\) and \(t_1\), and if this duration is comparably short (e.g., less than several days).'' The next three features are subject to the change point detection (CPD) problem: Change point detection is about the identification of (abrupt) changes in the distribution of values, for example in time series. \textit{Exposure} implemented a CPD algorithm based on the popular CUSUM (cumulative sum) algorithm. At first, the time series of request timestamps is split into periods of 3600 seconds (one hour was tested to work well). After that, all time intervals are iterated and for each interval, the average request count of the previous eight hours \(P_t^-\) and following eight intervals \(P_t^+\) is calculated. In the next step, the distance of these two values is calculated \(d(t)=|P_t^--P_t^+|\) for each interval and the resulting ordered sequence \(d(t)\) of distances is fed to the CUSUM algorithm to finally retrieve all change points (For more information on the implemented CPD algorithm, see \fsCite[Section 3.1]{Bilge:2014:EPD:2617317.2584679}). To calculate the \textit{Daily similarity} features, the Euclidean Distance of the time series of each day for \textit{d} is calculated. Intuitively, a low distance denotes similar time series and thus high daily similarity whereas two days with higher distance do show a less similar request volume. 
All the features of this group naturally only perform well when having a larger number of requests to \textit{d} over a significant period of time.
|
||||
|
||||
The next group of Features (\textit{DNS Answer-Based Features}) investigates resolutions of the requested domain \textit{d}. While one domain can map to multiple IP addresses for benign services, most harmless services do show a much smaller network profile in terms of e.g. location and \glspl{as}. To satisfy those findings, four features have been extracted: The number of distinct IP addresses, the amount of different countries these IP addresses are assigned to, the number of other domains that share an IP address \textit{d} resolves to and the fourth feature is the amount of results of the reverse dns query for all IPs of \textit{d}. It is worth noting, that some hosting providers also use one IP address for many domains so an extra layer to prevent such false positives make sense.
|
||||
The next group of features (\textit{DNS Answer-Based Features}) investigates resolutions of the requested domain \textit{d}. While one domain can map to multiple IP addresses for benign services, most harmless services show a much smaller network profile in terms of e.g. location and \glspl{as}. To benefit from those findings, four features have been extracted: The number of distinct IP addresses, the amount of different countries these IP addresses are assigned to, the number of other domains that share an IP address \textit{d} resolves to and the fourth feature is the amount of results of the reverse DNS query for all IPs of \textit{d}. It is worth noting that some hosting providers also use one IP address for many domains and an extra layer helps prevent those false positives.
|
||||
|
||||
The \textit{TTL Value-Based Features} covers five individual features. Each answer for a DNS request contains the TTL attribute, which is the recommendation, configured by the operator of \textit{d}, of how long the resolution will be valid and should be cached for this reason. Whereas RFC 1033 recommends a TTL of one day (86400 seconds) \fsCite{RFC1033} it is getting more common, especially for content delivery networks to use much lower values (e.g. Cloudflare, one of the biggest managed DNS providers is using a default of 5 minutes). Botnets are also usually applying low TTL values to avoid long outages of C\&C servers and bots. As \fsAuthor{Bilge:2014:EPD:2617317.2584679} states, botnets do also change their TTL values more frequently and use values in different ranges depending on their availability. While applying a higher value to high bandwidth servers with low downtimes, home computers behind a digital subscriber line are much likely to fail and get lower TTL values. For this reason, all TTL values for a domain are checked against the following ranges: [0, 1], [1, 10], [10, 100], [100, 300], [300, 900], [900, inf].
|
||||
The \textit{TTL Value-Based Features} cover five individual features. Each answer for a DNS request contains the TTL attribute, which is the recommendation, configured by the operator of \textit{d}, of how long the resolution will be valid and should be cached for this reason. Whereas RFC 1033 recommends a TTL of one day (86400 seconds) \fsCite{RFC1033}, it is getting more common, especially for content delivery networks, to use much lower values (e.g. Cloudflare, one of the biggest managed DNS providers is using a default of 5 minutes). Botnets are also usually applying low TTL values to avoid long outages of C\&C servers and bots. As \fsAuthor{Bilge:2014:EPD:2617317.2584679} states, botnets also change their TTL values more frequently and use values in different ranges depending on their availability. While applying a higher value to high bandwidth servers with low downtimes, home computers behind a digital subscriber line are much more likely to fail and get lower TTL values. For this reason, all TTL values for a domain are checked against the following ranges (in seconds): [0, 1], [1, 10], [10, 100], [100, 300], [300, 900], [900, inf].
|
||||
|
||||
The last group of features are the \textit{Domain Name-Based Features}. Domain names of benign services mostly use easy to remember names which consist of valid words. Attackers often are not interested in human readable domain names. This is especially right for domains generated by a DGA. \textit{Exposure} extracts two statistical features out of the domain name, the first being the percentage of numerical characters and secondly the length of the longest (english) meaningful string (LMS).
|
||||
The last group of features is the \textit{Domain Name-Based Features}. Domain names of benign services mostly use easy-to-remember names which consist of valid words. Attackers often are not interested in human readable domain names. This is especially true for domains generated by a DGA. \textit{Exposure} extracts two statistical features out of the domain name, the first being the percentage of numerical characters and secondly the length of the longest (English) meaningful string (LMS).
|
||||
|
||||
|
||||
\begin{table}[!htbp]
|
||||
@@ -73,14 +73,14 @@ The last group of features are the \textit{Domain Name-Based Features}. Domain n
|
||||
\subsection{Reputation Engine}
|
||||
\label{subsec:exposure_reputation_engine}
|
||||
|
||||
The reputation classifier of \textit{Exposure} is implemented as a \textit{J48} decision tree algorithm. The performance of decision trees mainly depend on the quality of the training set. For this reason a representative set of training data, with malicious domains from various threat classes, has to be chosen. Sources that have been used to identify malicious and benign domains can be found in Section~\ref{subsec:exposure_architecture}. In total, a list of 3500 known bad as well as 3000 known good domains have been used for the initial training. In order to take advantage of the \textit{Time-Based Features}, the optimal training period has been observed to be seven days. The tree is then constructed using the feature attribute values and its corresponding labels. More specifically, the whole training set is iterated and each time, a set of samples can be separated using one single attribute (in perspective to the assigned label) it is branched out and a new leaf is created. Each branch is then split into more fine grained subtrees as long as there is an \textit{information gain}, which means that all samples of the subset belong to the same class, i.e. are assigned the same label (see more on decision trees in Section~\ref{subsec:decision_tree_classifier}).
|
||||
The reputation classifier of \textit{Exposure} is implemented as a \textit{J48} decision tree algorithm (see Section~\ref{sec:machine_learning} for details of decision trees; \textit{J48} is an implementation of the \textit{C4.5} decision tree algorithm). The performance of decision trees mainly depends on the quality of the training set. For this reason, a representative set of training data, with malicious domains from various threat classes, has to be chosen. Sources that have been used to identify malicious and benign domains can be found in Section~\ref{subsec:exposure_architecture}. In total, a list of 3500 known bad as well as 3000 known good domains have been used for the initial training. In order to take advantage of the \textit{Time-Based Features}, the optimal training period has been observed to be seven days. The tree is then constructed using the feature attribute values and their corresponding labels. More specifically, the whole training set is iterated and each time a set of samples can be separated using one single attribute (with respect to the assigned label), it is branched out and a new leaf is created. Each branch is then split into more fine-grained subtrees as long as there is an \textit{information gain}, which means that all samples of the subset belong to the same class, i.e. are assigned the same label (see more on decision trees in Section~\ref{subsec:decision_tree_classifier}).
|
||||
|
||||
\subsection{Results}
|
||||
\label{subsec:exposure_results}
|
||||
|
||||
The performance of classifiers with different feature sets has been tested using e.g. 10-fold cross validation. To find the model with the minimum error rate, all combinations of feature sets ({\textit{Time-Based Features} as F1, \textit{DNS Answer-Based Features} as F2, \textit{TTL Value-Based Features} F3 and \textit{Domain Name-Based Features} as F4) have been trained using the same decision tree algorithm. Figure~\ref{fig:exposure_miss-classifier_instances} shows the error rate of those different classification models. The \textit{Time-Based Features} are showing the smallest error when inspecting single feature sets only. Looking at models with multiple feature sets, the overall minimum error rate is produced when using all four feature groups. The total amount of requests in the dataset that was collected for the initial analysis counted roughly 100 billion DNS queries. As processing all of these requests is not feasible in practice, two filtering steps have been introduced. The first one filters out all requests to a domain in the top 1000 Alexa list. The assumption for this filter is that no malicious domain will get this popular without being detected in some form. This action reduced about 20\% of the initial requests. The second step filters out all requests to domains that have been registered at least one year before the analysis. This filter applied to 45.000 domains (or 40 billion corresponding queries) and reduced the remaining traffic by another 50\%. The filtering process has been cross tested against the Alexa top list, McAfee WebAdvisor (formerly McAfee SiteAdvisor) \fsCite{MCAfeeWebAdvisorOnline}, Google Safe Browsing \fsCite{GoogleSafeBrowsingOnline} and Norton Safe Web \fsCite{NortonSafeWebOnline} and only 0.09\% have been reported to be risky. 
\fsAuthor{Bilge11exposure:finding} for this reason states that: ``We therefore believe that our filtering policy did not miss a significant number of malicious domains because of the pre-filtering we performed during the offline experiments.''
|
||||
The performance of classifiers with different feature sets has been tested using e.g. 10-fold cross validation. See Section~\ref{subsec:notos_results} for more details on 10-fold cross validation. To find the model with the minimum error rate, all combinations of feature sets (\textit{Time-Based Features} as F1, \textit{DNS Answer-Based Features} as F2, \textit{TTL Value-Based Features} as F3 and \textit{Domain Name-Based Features} as F4) have been trained using the same decision tree algorithm. Figure~\ref{fig:exposure_miss-classifier_instances} shows the error rate of those different classification models. The \textit{Time-Based Features} are showing the smallest error when inspecting single feature sets only. Looking at models with multiple feature sets, the overall minimum error rate is produced when using all four feature groups. The total amount of requests in the dataset that was collected for the initial analysis counted roughly 100 billion DNS queries. As processing all of these requests is not feasible in practice, two filtering steps have been introduced. The first one filters out all requests to a domain in the top 1000 Alexa list. The assumption for this filter is that no malicious domain will get this popular without being detected in some form. This action reduced about 20\% of the initial requests. The second step filters out all requests to domains that have been registered at least one year before the analysis. This filter applied to 45,000 domains (or 40 billion corresponding queries) and reduced the remaining traffic by another 50\%. The filtering process has been cross tested against the Alexa top list, McAfee WebAdvisor (formerly McAfee SiteAdvisor) \fsCite{MCAfeeWebAdvisorOnline}, Google Safe Browsing \fsCite{GoogleSafeBrowsingOnline} and Norton Safe Web \fsCite{NortonSafeWebOnline} and only 0.09\% have been reported to be risky.
\fsAuthor{Bilge11exposure:finding} for this reason states that: ``We therefore believe that our filtering policy did not miss a significant number of malicious domains because of the pre-filtering we performed during the offline experiments.''
|
||||
|
||||
The accuracy of the classifier has been validated using two different methods. The first method was to classify the training set with 10-fold cross validation. This validation method splits the dataset in ten partitions/folds (each partition optimally containing roughly the same class label distribution). One fold is then used as the validation sample (testing set) and the remaining nine partitions are used as the training set. The training set is used to train the model which is then cross validated with the testing set. This step is repeated ten times using the same partitions, each partition being the testing set once. The second method is to simply use 66\% of the dataset for the training and the remaining 33\% as the testing set.
|
||||
The accuracy of the classifier has been validated using two different methods. The first method was to classify the training set with 10-fold cross validation. The second method is to simply use 66\% of the dataset for the training and the remaining 33\% as the testing set.
|
||||
|
||||
|
||||
\begin{figure}[!htbp]
|
||||
|
||||
@@ -36,7 +36,7 @@ The overall system architecture can be seen in Figure~\ref{fig:kopis_system_over
|
||||
|
||||
\textbf{Benign domain sources: }
|
||||
\begin{itemize}
|
||||
\item Domain and ip whitelists from DNSWL \fsCite{DNSWLOnline}
|
||||
\item Domain and IP whitelists from DNSWL \fsCite{DNSWLOnline}
|
||||
\item Address space of the top 30 Alexa domains \fsCite{AlexaWebInformationOnline}
|
||||
\item Dihe's IP-Index Browser \fsCite{DIHEOnline}
|
||||
\end{itemize}
|
||||
@@ -67,7 +67,7 @@ This group of features tries to map the requester diversity, i.e. where the requ
|
||||
\subsubsection{Requester Profile (RP)}
|
||||
\label{subsubsec:kopis_requester_profile}
|
||||
|
||||
The \textit{Requester Profile} features are aiming to separate requests that are coming from hardened networks (like enterprise networks) from less secure networks, e.g. ISP networks. Most smaller networks like enterprise or university networks are much better protected against malware in general and thus should show fewer requests to malicious domains. On the other hand, ISPs do usually not invest much effort into cleaning their network from malware and do not offer a high level of protection against malware propagation inside the network. As \textit{Kopis} is operating in the upper DNS layers, it is often not possible to simply measure the population behind the requesting RDNS server (due to e.g. caching \fsCite{10.1007/978-3-540-24668-8_15}) and a different metric has to be found to measure the size of the network a request has been submitted from. Assuming traffic, monitored from a large AuthNS in epoch \(E_t\) that has authority for a set of Domains \(D\) and all unique requesting IP addresses \(R\). For each requester IP \(R_k \in R\) the amount of different domains, queried by \(R_k\) in \(E_t\), is counted \(c_{t,k}\). A weight can then be applied to each requester \(R_k\) as \(w_{t,k} = \frac{c_{t,k}}{\max_{l=1}^{|R|}c_{t,l}}\). Subsequently, the more domains in \(D\) a requester \(R_k\) is querying, the higher the weight will be. This way, high weights are corresponding to larger networks and following the explanation above, the more likely it is that this requester is infected with malicious software. Given a domain \textit{d} and let \(R_d\) be the set of all requester IP addresses, the count \(c_{t,k}\) is computed for each epoch \(E_t\) like previously described. In the following, the count for each epoch is multiplied with each weight \(w_{t-n,k}\), where \(w_{t-n,k}\) are the weights of \textit{n} days before epoch \(E_t\), to get the set of weighted counts of \textit{d} during \(E_t\): \(WC_t(d) = \{c_{t,k} * w_{t-n,k}\}_k\).
Finally, five different feature values are calculated with the values of \(WC_t(d)\): the average, the \gls{biased_estimator} and \gls{unbiased_estimator} standard deviation and the biased and unbiased variance (see Glossary for an explanation of \gls{biased_estimator} and \gls{unbiased_estimator} standard deviation).
|
||||
The \textit{Requester Profile} features are aiming to separate requests that are coming from hardened networks (like enterprise networks) from less secure networks, e.g. ISP networks. Most smaller networks like enterprise or university networks are much better protected against malware in general and thus should show fewer requests to malicious domains. On the other hand, ISPs do usually not invest much effort into cleaning their network from malware and do not offer a high level of protection against malware propagation inside the network. As \textit{Kopis} is operating in the upper DNS layers, it is often not possible to simply measure the population behind the requesting RDNS server (due to e.g. caching \fsCite{10.1007/978-3-540-24668-8_15}) and a different metric has to be found to measure the size of the network a request has been submitted from. Assuming traffic, monitored from a large AuthNS in epoch \(E_t\) that has authority for a set of Domains \(D\) and all unique requesting IP addresses \(R\). For each requester IP \(R_k \in R\) the amount of different domains, queried by \(R_k\) in \(E_t\), is counted \(c_{t,k}\). A weight can then be applied to each requester \(R_k\) as \(w_{t,k} = \frac{c_{t,k}}{\max_{l=1}^{|R|}c_{t,l}}\). Subsequently, the more domains in \(D\) a requester \(R_k\) is querying, the higher the weight will be. This way, high weights are corresponding to larger networks and following the explanation above, the more likely it is that this requester is infected with malicious software. Given a domain \textit{d} and let \(R_d\) be the set of all requester IP addresses, the count \(c_{t,k}\) is computed for each epoch \(E_t\) like previously described. In the following, the count for each epoch is multiplied with each weight \(w_{t-n,k}\), where \(w_{t-n,k}\) are the weights of \textit{n} days before epoch \(E_t\), to get the set of weighted counts of \textit{d} during \(E_t\): \(WC_t(d) = \{c_{t,k} * w_{t-n,k}\}_k\).
Finally, five different feature values are calculated with the values of \(WC_t(d)\): the average, the biased and unbiased standard deviation and the biased and unbiased variance. The biased and unbiased estimators for the standard deviation of a random variable \textit{X} are defined as \(\sqrt{\frac{1}{N}\sum_{i=1}^N (X_i - \mu)^2}\) and \(\sqrt{\frac{1}{N-1}\sum_{i=1}^N (X_i - \mu)^2}\), respectively (with \(N\) being the number of samples and \(\mu\) the empirical mean).
|
||||
|
||||
|
||||
\subsubsection{Resolved-IPs Reputation (IPR)}
|
||||
@@ -86,7 +86,7 @@ The set of \textit{Resolved-IPs Reputation} features consists of nine individual
|
||||
\subsection{Results}
|
||||
\label{subsec:kopis_results}
|
||||
|
||||
\textit{Kopis} used DNS traffic captured at two major domain name registrars (AuthNS servers) between 01.01.2010 and 31.08.2010 as well as a country code top level domain server (.ca) from 26.08.2010 up to 18.10.2010. As the TLD server was operated in delegate-only mode, passive DNS traffic had to be additionally collected to get the resolutions for these queries. In total, this led to 321 million lookups a day on average. This amount of data showed to be a significant problem and the overall traffic size to be analysed had to be reduced. The most significant reduction was to remove all duplicate queries and only take unique requests into account. Finally, about 12.5 million daily unique requests remained on average \todo{make reduction more clear}. Using the \textit{KB}, that consists of various sources (see Section~\ref{subsec:kopis_architecture}), a sample with 225,429 unique RRs (corresponding to 28,915 unique domain names) could be split into groups with 27,317 malicious and 1,598 benign domains. All raw data was indexed in a relational database and was enriched with information like first and last seen timestamps. Like any system that uses a machine learning approach, it was important for \textit{Kopis} to select significant features and a period that was sufficient for the training to deliver good results. Figure~\ref{fig:kopis_train_period_selection} shows the \glspl{roc} (ROC) of different models, generated with data from periods of one up to five days and validated using 10-fold cross validation. According to \fsAuthor[Section 5.3]{Antonakakis:2011:DMD:2028067.2028094}: ``When we increased the observation window beyond the mark of five days we did not see a significant improvement in the detection results.'' Using these models, the best classification algorithm had to be found. This has been accomplished using a technique called model selection (see e.g. \fsCite{Kohavi:1995:SCB:1643031.1643047}).
The most accurate classifier for these models has shown to be the \textit{random forest} implementation with a true positive rate of 98.4\% and a false positive rate of 0.3\% (with training data from a period of five days). \textit{Random forest} is the combination of different decision trees, either trained on different training sets or using different sets of features. Unfortunately, the exact random forest classification implementation of \textit{Kopis} has not been published. Other classifiers that have been experimented with are: Naive Bayes, k-nearest neighbors, Support Vector Machines, MLP Neural Network and random committee.
|
||||
\textit{Kopis} used DNS traffic captured at two major domain name registrars (AuthNS servers) between 01.01.2010 and 31.08.2010 as well as a country code top level domain server (.ca) from 26.08.2010 up to 18.10.2010. As the TLD server was operated in delegate-only mode, passive DNS traffic had to be additionally collected to get the resolutions for these queries. In total, this led to 321 million lookups a day on average. This amount of data showed to be a significant problem and the overall traffic size to be analysed had to be reduced. The most significant reduction was to remove all duplicate queries and only take unique requests into account. Finally, about 12.5 million daily unique requests remained on average. Using the \textit{KB}, that consists of various sources (see Section~\ref{subsec:kopis_architecture}), a sample with 225,429 unique RRs (corresponding to 28,915 unique domain names) could be split into groups with 27,317 malicious and 1,598 benign domains. All raw data was indexed in a relational database and was enriched with information like first and last seen timestamps. Like any system that uses a machine learning approach, it was important for \textit{Kopis} to select significant features and a period that was sufficient for the training to deliver good results. Figure~\ref{fig:kopis_train_period_selection} shows the \glspl{roc} (ROC) of different models, generated with data from periods of one up to five days and validated using 10-fold cross validation. According to \fsAuthor[Section 5.3]{Antonakakis:2011:DMD:2028067.2028094}: ``When we increased the observation window beyond the mark of five days we did not see a significant improvement in the detection results.'' Using these models, the best classification algorithm had to be found. This has been accomplished using a technique called model selection (see e.g. \fsCite{Kohavi:1995:SCB:1643031.1643047}).
The most accurate classifier for these models has shown to be the \textit{random forest} implementation with a true positive rate of 98.4\% and a false positive rate of 0.3\% (with training data from a period of five days). \textit{Random forest} is the combination of different decision trees, either trained on different training sets or using different sets of features. Unfortunately, the exact random forest classification implementation of \textit{Kopis} has not been published. Other classifiers that have been experimented with are: Naive Bayes, k-nearest neighbors, Support Vector Machines, MLP Neural Network and random committee.
|
||||
|
||||
\begin{figure}[!htbp]
|
||||
\centering
|
||||
|
||||
@@ -4,18 +4,18 @@
|
||||
\subsection{General}
|
||||
\label{subsec:notos_general}
|
||||
|
||||
\textit{Notos} has been published in 2010 by \fsAuthor{Antonakakis:2010:BDR:1929820.1929844} at the Georgia Institute of Technology. It has been introduced as ``being the first [system] to create a comprehensive dynamic reputation system around domain names'' \fsCite[Section 1]{Antonakakis:2010:BDR:1929820.1929844}. \textit{Notos} is based on observations that malicious use of DNS usually can be distinguished from legitimate, professionally provisioned DNS services by unique characteristics. Fraudulent activities therefore usually utilize techniques to evade security countermeasures \fsCite{Antonakakis:2010:BDR:1929820.1929844}. This approach is mainly using passive historical DNS information that was obtained on multiple recursive resolvers distributed across the Internet. For building a model of how resources are typically used in legitimate and malicious applications, information about vicious ip addresses and domain names is collected from different sources like honeypots, malware analysis services and spam-traps. Using this model, new domains that have never been seen before can be dynamically assigned with a reputation score of how likely this new domain is involved in malicious activities. Malicious activities in the context of \textit{Notos} are roughly described as: ``if it [a domain] has been involved with botnet C\&C servers, spam campaigns, malware propagation, etc.'' \fsCite[Section 3]{Antonakakis:2010:BDR:1929820.1929844}
|
||||
\textit{Notos} has been published in 2010 by \fsAuthor{Antonakakis:2010:BDR:1929820.1929844} at the Georgia Institute of Technology. It has been introduced as ``being the first [system] to create a comprehensive dynamic reputation system around domain names'' \fsCite[Section 1]{Antonakakis:2010:BDR:1929820.1929844}. \textit{Notos} is based on the observation that malicious use of DNS usually can be distinguished from legitimate, professionally provisioned DNS services by unique characteristics. Fraudulent activities therefore usually utilize techniques to evade security countermeasures, like fast changing records \fsCite{Antonakakis:2010:BDR:1929820.1929844}. This approach is mainly using passive historical DNS information that was obtained on multiple recursive resolvers distributed across the Internet. For building a model of how resources are typically used in legitimate and malicious applications, information about vicious IP addresses and domain names is collected from different sources like honeypots, malware analysis services and spam-traps. Using this model, new domains that have never been seen before can be dynamically assigned with a reputation score of how likely this new domain is involved in malicious activities. Malicious activities in the context of \textit{Notos} are roughly described as: ``if it [a domain] has been involved with botnet C\&C servers, spam campaigns, malware propagation, etc.'' \fsCite[Section 3]{Antonakakis:2010:BDR:1929820.1929844}
|
||||
|
||||
\textit{Notos} uses some basic terminology which is shortly introduced here:
|
||||
\textit{Notos} uses some basic terminology which is briefly introduced here:
|
||||
\begin{itemize}
|
||||
\item A domain \textit{d} consists of several substrings which are described in \nameref{subsec:domain_names}. Abbreviations used in the following Sections are: \\
|
||||
\textbf{Top-level domain:} TLD, where \(TLD(d)\) is the top-level domain of \textit{d} \\
|
||||
\textbf{Second-level domain:} \(2LD(d)\) being the second-level domain of domain \textit{d} \\
|
||||
\textbf{Third-level domain: } \(3LD(d)\) containing the three rightmost substrings separated by period for \textit{d}
|
||||
\item Given domain \(d\) \(Zone(d)\) describes the set of domains that include \textit{d} and all subdomains of \textit{d}
|
||||
\item \(D = \{d_1, d_2, ..., d_m\}\) representing a set of domains and \(A(D)\) all IP addresses that, at any time, any domain \(d \in D\) resolved to
|
||||
\item \(BGP(a)\) consists of all IP addresses that are residing in the same \gls{bgp} prefix as \textit{a}
|
||||
\item Analogously, \(AS(a)\) as the set of IP addresses located in the same \gls{as} as \textit{a}
|
||||
\textbf{Top-level domain:} TLD, where \(TLD(d)\) is the top-level domain of \textit{d}. \\
|
||||
\textbf{Second-level domain:} \(2LD(d)\) being the second-level domain of domain \textit{d}. \\
|
||||
\textbf{Third-level domain: } \(3LD(d)\) containing the three rightmost substrings separated by period for \textit{d}.
|
||||
\item Given domain \(d\) \(Zone(d)\) describes the set of domains that include \textit{d} and all subdomains of \textit{d}.
|
||||
\item \(D = \{d_1, d_2, ..., d_m\}\) representing a set of domains and \(A(D)\) all IP addresses that, at any time, any domain \(d \in D\) resolved to.
|
||||
\item \(BGP(a)\) consists of all IP addresses that are residing in the same \gls{bgp} prefix as \textit{a}.
|
||||
\item Analogously, \(AS(a)\) as the set of IP addresses located in the same \gls{as} as \textit{a}.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
\label{subsec:notos_architecture}
|
||||
|
||||
The main goal of \textit{Notos} is to assign a dynamic reputation score to domain names. Domains that are likely to be involved in malicious activities are tagged with a low reputation score, whereas legitimate Internet services are assigned with a high reputation score.
|
||||
\textit{Notos'} primary source of information is a database that contains historical data about domains and resolved ip addresses. This database is built using DNS traffic from two recursive ISP DNS servers (RDNS) and pDNS logs collected by the Security Information Exchange (SIE) which covers authoritative name servers in North America and Europe. For building a list of known malicious domain names, several honeypots and spam-traps have been deployed. A large list of known benign domains has been gathered from the top sites list on \textit{alexa.com} which ranks the most popular websites in several regions \fsCite{AlexaWebInformationOnline}. These two lists are referred to as the \textit{knowledge base} and are used to train the reputation training model.
|
||||
\textit{Notos'} primary source of information for the training and classification is a database that contains historical data about domains and resolved IP addresses. This database is built using DNS traffic from two recursive ISP DNS servers (RDNS) and pDNS logs collected by the Security Information Exchange (SIE) which covers authoritative name servers in North America and Europe. For building a list of known malicious domain names, several honeypots and spam-traps have been deployed. A large list of known benign domains has been gathered from the top sites list on \textit{alexa.com} which ranks the most popular websites in several regions \fsCite{AlexaWebInformationOnline}. These two lists are referred to as the \textit{knowledge base} and are used to train the reputation training model.
|
||||
|
||||
|
||||
To assign a reputation score to a domain \textit{d}, the most current set of IP addresses \(A_{c}(d) = \left\{a_{i}\right\}_{i=1..m}\) to which \textit{d} points is first fetched. Afterwards the pDNS database is queried for several information for this domain \textit{d}. The \textit{Related Historic IPs (RHIPs)} is the set of all IP addresses that ever pointed to this domain. In case domain \textit{d} is a third-level domain, all IP addresses that pointed to the corresponding second-level domain are also included. See Chapter~\ref{subsec:domain_names} for more information on the structure of domain names. If \textit{d} is a second-level domain, then all IPs that are pointed to from any of the third-level subdomains are also added to the RHIPs. In the next step, the set of \textit{Related Historic Domains (RHDNs)} is queried and covers all domains that are related to the currently processed domain \textit{d}. Specifically, all domains which ever resolved to an IP address that is residing in any of the ASNs of those IPs that \textit{d} currently resolves to.
|
||||
@@ -38,7 +38,7 @@ There are three types of features extracted from the database for \textit{Notos}
|
||||
\end{enumerate}
|
||||
\end{quote}
|
||||
|
||||
Figure~\ref{fig:notos_system_overview} shows the overall system architecture of \textit{Notos}. After all the features are extracted from the passive DNS database and prepared for further steps, the reputation engine is initialized. \textit{Notos'} reputation engine is operating in two modes. In offline mode, the reputation model is constructed for a set of domains using the feature set of each domain and the classification which can be calculated using the \textit{knowledge base} with black- and whitelist (also referred to as training). This model can later be used in the online mode to dynamically assign a reputation score. In online mode, the same features that are used for the initial training are extracted for a new domain (resource record or RR, see Section~\nameref{subsubsec:dns_resource_records}) and \textit{Notos} queries the trained reputation engine for the dynamic reputation rating (see Figure~\ref{fig:notos_online_offline_mode}). The data for labeling domains and IPs originates from various sources: the blacklist primarily consists of filter lists from malware services like malwaredomainlist.com and malwaredomains.com. Additional IP and domain labeling blacklists are the Sender Policy Block from Spamhaus and the ZeuS blocklist from ZeuS Tracker. The base has been downloaded before the main analysis period (fifteen days from the first of August 2009) and as filter lists usually lag behind state-of-the-art malware, the blacklists have continuously been updated. The whitelist was built using the top 500 popular Alexa websites. Additionally, the 18 most common second level domains from various content delivery networks for classifying the CDN clusters and a list of 464 dynamic DNS 2LD for identifying domains and IPs in dynamic DNS zones have been gathered.
|
||||
Figure~\ref{fig:notos_system_overview} shows the overall system architecture of \textit{Notos}. After all the features are extracted from the passive DNS database and prepared for further processing, the reputation engine is initialized. \textit{Notos'} reputation engine is operating in two modes. In offline mode, the reputation model is constructed for a set of domains using the feature set of each domain and the classification which can be calculated using the \textit{knowledge base} with black- and whitelist (also referred as training). This model can later be used in the online mode to dynamically assign a reputation score. In online mode, the same features that are used for the initial training are extracted for a new domain (resource record or RR, see Section~\nameref{subsubsec:dns_resource_records}) and \textit{Notos} uses the trained reputation engine to calculate a dynamic reputation rating (see Figure~\ref{fig:notos_online_offline_mode}). The data for labeling domains and IPs originates from various sources: the blacklist primarily consists of filter lists from malware services like malwaredomainlist.com and malwaredomains.com. Additional IP and domain labeling blacklists are the Sender Policy Block from Spamhaus (\fsCite{SBLOnline}) and the ZeuS blocklist from ZeuS Tracker (\fsCite{zeusblocklistInformationOnline}). The base has been downloaded before the main analyzation period (fifteen days from the first of August 2009) and as filter lists usually lag behind state-of-the art malware, the blacklists have continuously been updated. The whitelist was built using the top 500 popular Alexa websites. Additionally, the 18 most common second level domains from various content delivery networks for classifying the CDN clusters and a list of 464 dynamic DNS 2LD for identifying domains and IPs in dynamic DNS zones have been gathered.
|
||||
|
||||
\begin{figure}[!htbp]
|
||||
\centering
|
||||
@@ -58,12 +58,12 @@ Figure~\ref{fig:notos_system_overview} shows the overall system architecture of
|
||||
\subsection{Features}
|
||||
\label{subsec:notos_features}
|
||||
|
||||
In this Section, all statistical features are listed and a short explanation, for what reason those have been chosen, is introduced.
|
||||
In this Section, all statistical features are listed and a short explanation, for what reason those have been chosen to distinguish malicious from legitimate domains, is introduced.
|
||||
|
||||
\subsubsection{Network-based features}
|
||||
\label{subsubsec:notos_network-based_features}
|
||||
|
||||
The first group of features handles network-related keys. This group mostly describe how the owning operators of \textit{d} allocate network resources to achieve different goals. While most legitimate and professionally operated internet services have a rather stable network profile, malicious usage usually involves short living domain names and IP addresses with high agility to circumvent blacklisting and other simple types of resource blocking. Botnets usually contain machines in many different networks (\glspl{as} and \glspl{bgp}) operated by different organizations in different countries. Appropriate companies mostly acquire bigger IP blocks and thus use consecutive IPs for their services in the same address space. This homogeneity also applies to other registration related information like registrars and registration dates. To measure this level of agility and homogeneity, eighteen statistical network-based features are extracted from the RHIPs (see Table~\ref{tab:notos_network-based_features}).
|
||||
The first group of features handles network-related keys. This group mostly describe how the owning operators of \textit{d} allocate network resources to achieve different goals. While most legitimate and professionally operated internet services have a rather stable network profile, malicious usage usually involves short living domain names and IP addresses with high agility to circumvent blacklisting and other simple types of resource blocking. Botnets usually contain machines in many different networks (\glspl{as} and \glspl{bgp}) operated by different organizations in different countries. Appropriate companies mostly acquire bigger IP blocks and thus use consecutive IPs for their services in the same address space. This homogeneity also applies to other registration related information like registrars and registration dates. To measure this level of agility and homogeneity, eighteen statistical network-based features are extracted from the RHIPs (see Table~\ref{tab:notos_network-based_features}).
|
||||
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
@@ -96,7 +96,7 @@ The first group of features handles network-related keys. This group mostly desc
|
||||
\subsubsection{Zone-based features}
|
||||
\label{subsubsec:notos_zone-based_features}
|
||||
|
||||
The second group is about zone-based features and is extracted from the RHDNs. In contrast to the network-based features which compares characteristics of the historic IPs, the zone-based features handles characteristics of all historically involved domains. While legitimate services often involve many domains, they usually share similarities. ``For example, google.com, googlesyndication.com, googlewave.com, etc., are all related to Internet services provided by Google, and contain the string 'google' in their name.''. In contrast, randomly generated domains used in spam campaigns are rarely sharing similarities. By calculating the mean, median and standard deviation for some key, the ``summarize [of] the shape of its distribution'' is investigated \fsCite[Section 3.2.2]{Antonakakis:2010:BDR:1929820.1929844}. To calculate this level of diversity, seventeen features are extracted which can be found in Table~\ref{tab:notos_zone-based_features}:
|
||||
The second group is about zone-based features and is extracted from the RHDNs. In contrast to the network-based features which compares characteristics of the historic IPs, the zone-based features handles characteristics of all historically involved domains. While legitimate services often involve many domains, they usually share similarities. For example, google.com, googlemail.com, googleplus.com, etc., are all services provided by Google and contain the string 'google' in their domains. In contrast, randomly generated domains used in spam campaigns are rarely sharing similarities. By calculating the mean, median and standard deviation for some key, the shape of its distribution is summarized and investigated \fsCite{Antonakakis:2010:BDR:1929820.1929844}. To calculate this level of diversity, seventeen features are extracted which can be found in Table~\ref{tab:notos_zone-based_features}:
|
||||
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
@@ -128,7 +128,7 @@ The second group is about zone-based features and is extracted from the RHDNs. I
|
||||
\subsubsection{Evidence-based features}
|
||||
\label{subsubsec:notos_evidence-based_features}
|
||||
|
||||
For the evidence-based features, public information and exclusively collected data from honeypots and spam-traps is collected. This \textit{knowledge base} primarily helps to discover if a domain \textit{d} is in some way interacting with known malicious IPs and domains. As domain names are much cheaper than ip addresses, malware authors tend to reuse IPs with updated domain names. The blacklist features detect the reuse of known malicious resources like IP addresses, \gls{bgp} prefixes and \glspl{as}.
|
||||
For the evidence-based features, public information and data from honeypots and spam-traps was collected. This \textit{knowledge base} primarily helps to discover if a domain \textit{d} is in some way interacting with known malicious IPs and domains. As domain names are much cheaper to obtain than IP addresses, malware authors tend to reuse IPs with updated domain names. The blacklist features indicate the reuse of known malicious resources like IP addresses, \gls{bgp} prefixes and \glspl{as}.
|
||||
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
@@ -159,7 +159,7 @@ Figure~\ref{fig:notos_features} shows how the three different feature groups are
|
||||
\subsection{Reputation Engine}
|
||||
\label{subsec:notos_reputation_engine}
|
||||
|
||||
The reputation engine is used to dynamically assign a reputation score to a domain \textit{d}. In the first step, the engine has to be trained with the available training set (temporal defined as the \textit{training period}). The training is performed in an offline fashion which means all data is statically available at the beginning of this step. The training mode consists of three modules: The \textit{Network Profile Model} is a model of how known good domains are using resources. This model uses popular content delivery networks (e.g., Akamai, Amazon CloudFront) and large sites (e.g., google.com, yahoo.com) as a base. In total the \textit{Network Profile Model} consists of five classes of domains: \textit{Popular Domains}, \textit{Common Domains}, \textit{Akamai Domains}, \textit{CDN Domains} and \textit{Dynamic DNS Domains}. The second module \textit{Domain Name Clusters} performs a general clustering of all domains (respectively their statistical feature vectors) of the training set. There are two consecutive clustering processes: The \textit{network-based} clustering aims to group domains with similar agility characteristics. To refine those clusters, a \textit{zone-based} clustering is performed which groups domains that are similar in terms of its RHDNs (see explanation for the \textit{zone-based features}). Those clusters of domains with similar characteristics can then be used to identify mostly benign and malicious sets of domains. In the last step of the offline mode, the \textit{Reputation Function} is build. 
As seen in Figure~\ref{fig:notos_online_offline_mode} this module takes the results of the \textit{Network Profile Model} (\(NM(d_i)\)) and the \textit{Domain Name Clusters} (\(DC(d_i)\)) for each domain \textit{d} in \(d_i, i = 1..n\) as inputs, calculates an \textit{Evidence Features Vector} \(EV(d_i)\), which basically checks if \(d_i\) or any of its resolved IPs is known to be benign or malicious, and builds a model that can assign a reputation score between zero and one to \textit{d}. This \textit{Reputation Function} is implemented as a statistical classifier. These three modules form the reputation model that can be used in the last step to compute the reputation score. A rebuild of the training model can be done at any time, for example given an updated training set.
|
||||
The reputation engine is used to dynamically assign a reputation score to a domain \textit{d}. In the first step, the engine has to be trained with the available training set (temporal defined as the \textit{training period}). The training is performed in an offline fashion which means all data is statically available at the beginning of this step. The training mode consists of three modules: The \textit{Network Profile Model} is a model of how known good domains are using resources. This model uses popular content delivery networks (e.g., Akamai, Amazon CloudFront) and large sites (e.g., google.com, yahoo.com) as a base. In total the \textit{Network Profile Model} consists of five classes of domains: \textit{Popular Domains}, \textit{Common Domains}, \textit{Akamai Domains}, \textit{CDN Domains} and \textit{Dynamic DNS Domains}. The second module \textit{Domain Name Clusters} performs a general clustering of all domains (respectively their statistical feature vectors) of the training set. There are two consecutive clustering processes: The \textit{network-based} clustering aims to group domains with similar characteristics in terms of the agility, e.g. how often DNS resources are changed. To refine those clusters, a \textit{zone-based} clustering is performed which groups domains that are similar in terms of its RHDNs (see explanation for the \textit{zone-based features}). Those clusters of domains with similar characteristics can then be used to identify mostly benign and malicious sets of domains. In the last step of the offline mode, the \textit{Reputation Function} is built. 
As seen in Figure~\ref{fig:notos_online_offline_mode} this module takes the results of the \textit{Network Profile Model} (\(NM(d_i)\)) and the \textit{Domain Name Clusters} (\(DC(d_i)\)) for each domain \textit{d} in \(d_i, i = 1..n\) as inputs, calculates an \textit{Evidence Features Vector} \(EV(d_i)\), which basically checks if \(d_i\) or any of its resolved IPs is known to be benign or malicious, and builds a model that can assign a reputation score between zero and one to \textit{d}. This \textit{Reputation Function} is implemented as a statistical classifier. These three modules form the reputation model that can be used in the last step to compute the reputation score. A rebuild of the training model can be done at any time, for example given an updated training set.
|
||||
|
||||
The final stage of the reputation engine is the online (streaming like) mode. Any considered domain \textit{d} is first supplied to the \textit{network profiles} module which returns a probability vector \(NM(d) = \{c_1, c_2, ..., c_5\}\) of how likely \textit{d} belongs to one of the five classes (e.g. probability \(c_1\) that \textit{d} belongs to \textit{Popular Domains}). \(DC(d)\) is the resulting vector of the \textit{domain clusters} module and can be broken down into the following parts: For the domain \textit{d} of interest, the network-based features are extracted and the closest network-based cluster \(C_d\), generated in the training mode by the \textit{Domain Name Clusters} module, is calculated. The following step takes all zone-based feature vectors \(v_j \in C_d\) and eliminates those vectors that do not fulfill \(dist(z_d , v_j ) < R\), where \(z_d\) is the zone-based feature vector for \textit{d} and \textit{R} being a predefined radius; or \(v_j \in KNN(z_d)\), with \(KNN(z_d)\) being the k nearest-neighbors of \(z_d\). Each vector \(v_i\) of the resulting subset \(V_d \subseteq C_d\) is then assigned one of these eight labels: \textit{Popular Domains}, \textit{Common Domains}, \textit{Akamai}, \textit{CDN}, \textit{Dynamic DNS}, \textit{Spam Domains}, \textit{Flux Domains}, and \textit{Malware Domains}. The next step is to calculate the five statistical features that form the resulting vector \(DC(d) = \{l_1, l_2, ..., l_5\}\).
|
||||
|
||||
@@ -177,7 +177,7 @@ Having the \textit{Network Profile Model} \(NM(d)\), the \textit{Domain Name Clu
|
||||
|
||||
In the last section of the evaluation of \textit{Notos}, experimental results that have been published are listed. This covers metrics about the usage of raw data, lessons learned in the analysis process (i.e. examined algorithms) and final results like precision and accuracy of the classification.
|
||||
|
||||
\textit{Notos} being the first dynamic reputation system in the context of domain names, it is able to identify malicious domain names before they appear in public filter lists. To be able to assign reputation scores to new domains, \fsAuthor{Antonakakis:2010:BDR:1929820.1929844} used historic passive dns logs of a time span of 68 days with a total volume of 27,377,461 unique, successful A-type resolutions mainly from two recursive ISP DNS servers in North America (plus pDNS logs from various networks, aggregated by the SIE \ref{subsec:notos_architecture}). Figure~\ref{fig:notos_volume_new_rr} shows that after a few days, the number of new domains (RR) stabilizes at about 100,000 to 150,000 new domains a day compared to a much higher total load of unique resource records (about 94.7\% duplicates) (see Figure~\ref{fig:notos_total_volume_unique_rr}). The amount of new IPs is analogously nearly constant. After few weeks, even big content delivery networks with a large (but nearly constant) number of IP addresses will get scanned, in contrast to botnets where continuously new machines are infected. The authors follow that a relatively small pDNS database is therefor sufficient for \textit{Notos} to produce good results.
|
||||
\textit{Notos} being the first dynamic reputation system in the context of domain names, it is able to identify malicious domain names before they appear in public filter lists. To be able to assign reputation scores to new domains, \fsAuthor{Antonakakis:2010:BDR:1929820.1929844} used historic passive DNS logs of a time span of 68 days with a total volume of 27,377,461 unique, successful A-type resolutions mainly from two recursive ISP DNS servers in North America (plus pDNS logs from various networks, aggregated by the SIE, see Section~\ref{subsec:notos_architecture}). Figure~\ref{fig:notos_volume_new_rr} shows that after a few days, the number of new domains (RR) stabilizes at about 100,000 to 150,000 new domains a day compared to a much higher total load of unique resource records (about 94.7\% duplicates) (see Figure~\ref{fig:notos_total_volume_unique_rr}). The amount of new IPs is analogously nearly constant. After a few weeks, even big content delivery networks with a large (but nearly constant) number of IP addresses will get scanned, in contrast to botnets where continuously new machines are infected. The authors infer that a relatively small pDNS database is therefore sufficient for \textit{Notos} to produce good results.
|
||||
|
||||
\begin{figure}[!htbp]
|
||||
\centering
|
||||
@@ -193,10 +193,10 @@ In the last Section of the evaluation of \textit{Notos}, experimental results th
|
||||
\label{fig:notos_total_volume_unique_rr}
|
||||
\end{figure}
|
||||
|
||||
To get optimal results with the \textit{Reputation Function}, several classifiers have been tested and selected for the given circumstances (time complexity, detection results and precision [true positives over all positives]). A decision tree with Logit-Boost strategy has shown to provide the best results with a low false positive rate (FP) of 0.38\% and a high true positive rate (TP) of 96.8\%. These results have been verified using a 10-fold cross-validation with a reputation score threshold of 0.5. For this validation, a dataset of 20,249 domains with 9,530 known bad RR has been used. As the list of known good domains, the Alexa top 500 websites have been used. Taking a bigger amount of Alexa popular sites has shown to decrease accuracy of the overall system, i.e. 100,000 entries showed a TP of 80.6\% and a FP of 0.6\%. To compare \textit{Notos}' performance with static filter lists, a pre-trained instance has been fed with 250,000 unique domains collected on 1. August 2009. 10,294 distinct entries have been reported with a reputation score below 0.5. 7,984 of this 10,294 or 77.6\% could be found in at least one blacklist (see Section~\nameref{subsec:notos_architecture} for a list of included blacklists). The remaining 22.4\% could not be precisely revealed. It is worth stating that 7,980 of the 7,984 confirmed bad domain names were assigned a reputation score of less than or equal to 0.15.
|
||||
To get optimal results with the \textit{Reputation Function}, several classifiers have been tested and selected for the given circumstances (time complexity, detection results and precision [true positives over all positives]). A decision tree with Logit-Boost strategy (see \fsCite{Friedman98additivelogistic} for implementation details) has shown to provide the best results with a low false positive rate (FP) of 0.38\% and a high true positive rate (TP) of 96.8\%. These results have been verified using a 10-fold cross-validation with a reputation score threshold of 0.5. This 10-fold cross-validation method splits the dataset into ten partitions/folds (each partition optimally containing roughly the same class label distribution). One fold is then used as the validation sample (testing set) and the remaining nine partitions are used as the training set. The training set is used to train the model which is then cross-validated with the testing set. This step is repeated ten times using the same partitions, each partition being the testing set once. For the validation in \textit{Notos}, a dataset of 20,249 domains with 9,530 known bad RR has been used. As the list of known good domains, the Alexa top 500 websites have been used. Taking a bigger amount of Alexa popular sites has shown to decrease accuracy of the overall system, which could be interpreted as smaller/less popular sites are more likely to get compromised. To compare \textit{Notos}' performance with static filter lists, a pre-trained instance has been fed with 250,000 unique domains collected on 1 August 2009. 10,294 distinct entries have been reported with a reputation score below 0.5. 7,984 of these 10,294 or 77.6\% could be found in at least one blacklist (see Section~\nameref{subsec:notos_architecture} for a list of included blacklists). The remaining 22.4\% could not be precisely revealed. 
It is worth stating that 7,980 of the 7,984 confirmed bad domain names were assigned a reputation score of less than or equal to 0.15.
|
||||
|
||||
|
||||
\subsection{Limitations}
|
||||
\label{subsec:notos_limitations}
|
||||
|
||||
As \textit{Notos} is mainly using historic DNS information, new domain names that resolve to IP addresses with previously unseen prefixes, can not be reliably classified. Once IPv6, with its huge address space, will get the overall standard, a precise reputation score based on the fine grain IP address features will get harder.
|
||||
As \textit{Notos} is mainly using historic DNS information, new domain names that resolve to IP addresses with previously unseen prefixes cannot be reliably classified. Once IPv6, with its huge address space, becomes the overall standard, computing a precise reputation score based on the fine-grained IP address features will become harder.
|
||||
@@ -1,26 +1,25 @@
|
||||
\chapter{Introduction}
|
||||
\label{cha:Introduction}
|
||||
|
||||
The domain name system (\gls{dns}) has been one of the corner stones of the internet for a long time. It acts as a hierarchical, bidirectional translation device between mnemonic domain names and network addresses. It also provides service lookup or enrichment capabilities for a range of application protocols like HTTP, SMTP, and SSH. In the context of defensive IT security, investigating aspects of the \gls{dns} can facilitate protection efforts tremendously. Estimating the reputation of domains can help in identifying hostile activities. Such a score can, for example, consider features like quickly changing network blocks for a given domain or clustering of already known malicious domains and newly observed ones.
|
||||
The domain name system (\gls{dns}) has been one of the cornerstones of the internet for a long time. It acts as a hierarchical, bidirectional translation device between mnemonic domain names and network addresses. It also provides service lookup or enrichment capabilities for a range of application protocols like HTTP, SMTP, and SSH (e.g. verifying SSH host keys using DNS). In the context of defensive IT security, investigating aspects of the \gls{dns} can facilitate protection efforts tremendously. Estimating the reputation of domains can help in identifying hostile activities. Such a score can, for example, consider features like quickly changing network blocks for a given domain or clustering of already known malicious domains and newly observed ones.
|
||||
|
||||
|
||||
\section{Motivation}
|
||||
\label{sec:motivation}
|
||||
|
||||
|
||||
\todo{also check papers for motivations}
|
||||
Malware like botnets, phishing sites and spam heavily rely on the domain name system to either hide behind proxies or communicate with command and control servers. Malware authors are getting more and more creative in bypassing traditional countermeasures. Using techniques like domain generation algorithms and fast-flux service networks make it hard to eliminate the roots of, for example, botnets. The ZeuS botnet family has existed since 2007 and its further propagation could not be stopped until today (\fsCite{WhyDGOWinsOnline}). This leads to a situation where static filter lists cannot keep pace with evolving malware authors. To eliminate malware in the long run, malware has to be stopped before it can be widely spread across the internet. There are three major systems that have been proposed as dynamic domain reputation systems using passive DNS data in the past. With passive DNS databases getting more common, setting up a domain reputation system using pDNS data promises a lightweight monitoring system.
|
||||
|
||||
|
||||
\section{Challenges}
|
||||
\label{sec:challenges}
|
||||
|
||||
All of the investigated approaches are using \gls{pdns} logs to generate a reputation score for a specific domain. These logs are generated on central \gls{dns} resolvers and capture outgoing traffic of multiple users (see Section~\ref{subsec:passive_dns}), one challenge of this work is handling huge volumes of data. With about seven Gigabytes \todo{verify} of uncompressed \gls{pdns} logs for a single day, various general issues might occur: General purpose computers nowadays usually have up to 16 Gigabytes of RAM (rarely 32 GB) which concludes that multiple tasks (i.e. building a training set) may not be performed purely in-memory. The time of analysis might also become a bottleneck. Simply loading one single day (see benchmark example~\ref{lst:load_and_iterate_one_day_of_compressed_pdns_logs}) of (compressed) logs from disk and iterating it without actual calculations takes roughly 148 seconds. To evaluate existing algorithms certain requirements have to be met. Passive DNS logs usually contain sensitive data which is one reason why most papers do not publish test data. For a precise evaluation the raw input data is needed. Some previously developed classifications have not completely disclosed the involved algorithms so these have to be reconstructed as close as possible taking all available information into account.
|
||||
All of the investigated approaches are using \gls{pdns} logs to generate a reputation score for a specific domain. These logs are monitored on central \gls{dns} resolvers and capture the lookup results of large numbers of users (see Section~\ref{subsec:passive_dns}), so one challenge of this work is handling huge volumes of data. With about seven Gigabytes of uncompressed \gls{pdns} logs for a single day, various general issues might occur: General purpose computers nowadays usually have up to 16 Gigabytes of RAM (rarely 32 GB) which means that multiple tasks (i.e. building a training set) may not be performed purely in-memory. The time of analysis might also become a bottleneck. Simply loading one single day (see benchmark example~\ref{lst:load_and_iterate_one_day_of_compressed_pdns_logs}) of (compressed) logs from disk and iterating it without actual calculations takes roughly 148 seconds. To evaluate existing algorithms certain requirements have to be met. Passive DNS logs usually contain sensitive data which is one reason why most papers do not publish test data. For a precise evaluation the raw input data is needed. Some previously developed classifications have not completely disclosed the involved algorithms so these have to be reconstructed as closely as possible taking all available information into account.
|
||||
|
||||
|
||||
\section{Goals}
|
||||
\label{sec:goals}
|
||||
|
||||
The task of this work is to evaluate existing scoring mechanisms of domains in the special context of IT security, and also research the potential for combining different measurement approaches. It ultimately shall come up with an improved and evaluated algorithm for determining the probability of a domain being related to hostile activities.
|
||||
The task of this work is to evaluate existing scoring mechanisms of domains in the special context of IT security, and also research the potential for combining different measurement approaches. It ultimately shall come up with an improved algorithm by combining existing algorithms for determining the probability of a domain being related to hostile activities.
|
||||
|
||||
|
||||
\section{Related Work}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
\section{Domain Name System}
|
||||
\label{sec:DNS}
|
||||
|
||||
The \gls{dns} is one of the cornerstone of the internet as it is known today. Nearly every device, connected to the internet is using DNS. Initial designs have been proposed in 1983 and evolved over the following four years into the first globally adapted standard RFC 1034 \fsCite{rfc1034} (see also RFC 1035 for implementation and specification details \fsCite{rfc1035}). The main idea of the \gls{dns} is translating human readable domain names to network addresses. There are many extensions to the initial design including many security related features and enhancements or the support for \gls{ipv6} in 1995.
|
||||
The \gls{dns} is one of the cornerstones of the internet as it is known today. Nearly every device connected to the internet is using DNS. Initial designs have been proposed in 1983 and evolved over the following four years into the first globally adopted standard RFC 1034 \fsCite{rfc1034} (see also RFC 1035 for implementation and specification details \fsCite{rfc1035}). The main idea of the \gls{dns} is translating human readable domain names to network addresses. There are many extensions to the initial design including many security related features and enhancements or the support for \gls{ipv6} in 1995.
|
||||
|
||||
In order to understand how the \gls{dns} is misused for malicious activities and how to prevent these attacks, it is necessary to explain some basic mechanisms.
|
||||
|
||||
@@ -9,15 +9,15 @@ In order to understand how the \gls{dns} is misused for malicious activities and
|
||||
\subsection{Basics}
|
||||
\label{subsec:basics}
|
||||
|
||||
In the early days of the internet the mapping between host names and ip addresses has been accomplished using a single file, \texttt{HOSTS.TXT}. This file was maintained on a central instance, the \gls{sri-nic}, and distributed to all hosts in the internet via \gls{ftp}. As this file grew and more machines got connected to the internet, the costs for distributing the mappings were increasing up to an unacceptable effort. Additionally, the initial trend of the internet, the \gls{arpanet} connecting multiple hosts together into one network, got outdated. The new challenge of the internet was to connect multiple local networks (which itself contain many machines) into a global, interactive and \gls{tcp/ip} based grid. With the amount of machines quickly increasing and the costs for distributing the \texttt{HOSTS.TXT} file exponentially rising, a new system for a reliable and fast resolution of addresses to host names had to be developed.
|
||||
In the early days of the internet the mapping between host names and IP addresses has been accomplished using a single file, \texttt{HOSTS.TXT}. This file was maintained on a central instance, the \gls{sri-nic}, and distributed to all hosts in the internet via \gls{ftp}. As this file grew and more machines got connected to the internet, the costs for distributing the mappings were increasing up to an unacceptable effort. Additionally, the initial trend of the internet, the \gls{arpanet} connecting multiple hosts together into one network, got outdated. The new challenge of the internet was to connect multiple local networks (which itself contain many machines) into a global, interactive and \gls{tcp/ip} based grid. With the amount of machines quickly increasing and the costs for distributing the \texttt{HOSTS.TXT} file rising, a new system for a reliable and fast resolution of addresses to host names had to be developed.
|
||||
|
||||
\citeauthor{mockapetris1988development} proposed five conditions that had to be met by the base design of \gls{dns} \fsCite[p. 124]{mockapetris1988development}:
|
||||
|
||||
\begin{itemize}
|
||||
\item Provide at least all of the same information as HOSTS.TXT.
|
||||
\item Provide at least the same information as HOSTS.TXT.
|
||||
\item Allow the database to be maintained in a distributed manner.
|
||||
\item Have no obvious size limits for names, name components, data associated with a name, etc.
|
||||
\item Interoperate across the DARPA Internet as many other environments as possible.
|
||||
\item Interoperate across the DARPA Internet and as many other environments as possible.
|
||||
\item Provide tolerable performance.
|
||||
\end{itemize}
|
||||
|
||||
@@ -28,14 +28,21 @@ In general, avoid as many constraints and support as many implementation structu
|
||||
\subsubsection{Architecture}
|
||||
\label{subsubsec:architecture}
|
||||
|
||||
The \gls{dns} primarily builds on two types of components: name servers and resolvers. A name server holds information that can be used to handle incoming requests e.g. to resolve a domain name into an ip address. Although resolving domain names into ip addresses might be the primary use case, name servers can possess arbitrary information and provide service to retrieve this information. A resolver interacts with client software and implements algorithms to find a name server that holds the information requested by the client. Depending on the functionality needed, these two components may be split to different machines and locations or running on one machine. Where in former days the power of a workstation may not has been sufficient to run a resolver on, today it is more interesting to benefit from cached information for performance reasons. In a company network it is common to have multiple resolvers e.g. one per organizational unit.
|
||||
The \gls{dns} primarily builds on two types of components: name servers and resolvers. A name server holds information that can be used to handle incoming requests e.g. to resolve a domain name into an IP address. Although resolving domain names into IP addresses might be the primary use case, name servers can possess arbitrary information (within the limits of DNS record types, see Table~\ref{tab:resource_record_types}) and provide service to retrieve this information. A resolver interacts with client software and implements algorithms to find a name server that holds the information requested by the client (see also Section~\ref{subsec:resolution} for how the resolution is working). Depending on the functionality needed, these two components may be split to different machines and locations or running on one machine. Whereas in former days the bandwidth of a workstation may not have been sufficient to run a resolver on, today it is more interesting to benefit from cached information for performance reasons. In a company network it is common to have multiple resolvers e.g. one per organizational unit.
|
||||
|
||||
|
||||
|
||||
\subsubsection{Name space}
|
||||
\label{subsubsec:name_space}
|
||||
|
||||
The \gls{dns} is based on a naming system that consists of a hierarchical and logical tree structure and is called the domain namespace. It contains a single root node (\textit{top level domain} or \textit{TLD})and an arbitrary amount of nodes in subordinate levels in variable depths (descending called second level, third level domain, and so forth). Each node is uniquely identifiable through a \gls{fqdn} and usually represents a domain, machine or service in the network. Furthermore, every domain can be subdivided into more fine-grained domains. These can again be specific machines or domains, called subdomains. This subdividing is an important concept for the internet to continue to grow and each responsible instance of a domain (e.g. a company or cooperative) is responsible for the maintenance and subdivision of the domain.
|
||||
The \gls{dns} is based on a naming system that consists of a hierarchical and logical tree structure and is called the domain namespace. It contains a single root node (\textit{top level domain} or \textit{TLD}) and an arbitrary amount of nodes in subordinate levels in variable depths (descending called second level, third level domain, and so forth). Each node is uniquely identifiable through a \gls{fqdn} and usually represents a domain, machine or service in the network. The FQDN can be constructed by fully iterating the DNS tree, see Figure~\ref{fig:dns_tree_web_de} for an example of what the DNS tree for www.web.de looks like (note that the Root node is often abbreviated with a simple dot). Furthermore, every domain can be subdivided into more fine-grained domains. These can again be specific machines or domains, called subdomains. This subdividing is an important concept for the internet to continue to grow and each responsible instance of a domain (e.g. a company or cooperative) is responsible for the maintenance and subdivision of the domain.
|
||||
|
||||
\begin{figure}[!htbp]
|
||||
\centering
|
||||
\includegraphics[width=.4\textwidth, clip=true]{content/Technical_Background/DNS/dns_tree_web_de.png}
|
||||
\caption{DNS: Tree structure}
|
||||
\label{fig:dns_tree_web_de}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\subsubsection{\gls{dns} Resource Records}
|
||||
@@ -46,7 +53,7 @@ See Table~\ref{tab:resource_record_types} for an list of built-in resource types
|
||||
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
\caption{Resource Record Types}
|
||||
\caption{DNS: Resource Record Types}
|
||||
\label{tab:resource_record_types}
|
||||
\begin{tabular}{@{}llll@{}}
|
||||
\toprule
|
||||
@@ -66,7 +73,7 @@ Value & Text Code & Type
|
||||
\subsubsection{Payload}
|
||||
\label{subsubsec:payload}
|
||||
|
||||
In this section we will introduce the actual payload a \gls{dns} request as well as the response is built on. The format of each message that is shared between a resolver and \gls{dns} server has been initially defined in RFC 1035 \fsCite{rfc1035} and consecutively extended with new opcodes, response codes etc. This general format applies to both requests as well as responses and consists of five sections:
|
||||
In this section we will introduce the actual payload a \gls{dns} request as well as the response are built on. The format of each message that is shared between a resolver and \gls{dns} server has been initially defined in RFC 1035 \fsCite{rfc1035} and consecutively extended with new opcodes, response codes etc. This general format applies to both requests as well as responses and consists of five sections:
|
||||
|
||||
\begin{enumerate}
|
||||
\item Message Header
|
||||
@@ -77,12 +84,12 @@ In this section we will introduce the actual payload a \gls{dns} request as well
|
||||
\end{enumerate}
|
||||
|
||||
\paragraph{Message Header:}
|
||||
\label{par:message_header}with
|
||||
\label{par:message_header}
|
||||
The Message Header is obligatory for all types of communication and may not be empty. It contains different types of flags that are used to control the transaction. The header specifies e.g. which further sections are present, whether the message is a query or a response and more specific opcodes.
|
||||
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
\caption{Message Header}
|
||||
\caption{DNS: Message Header}
|
||||
\label{tab:message_header}
|
||||
\begin{tabular}{@{}cccccccccccccccc@{}}
|
||||
\toprule
|
||||
@@ -99,14 +106,14 @@ QR & \multicolumn{4}{c}{OPCODE} & AA & TC & RD & RA & Z & AD & CD & \multicolumn
|
||||
Table~\ref{tab:message_header} shows the template of a \gls{dns} message header. In the following listing, an explanation for the respective variables and flags is given:
|
||||
|
||||
\begin{itemize}
|
||||
\item \textbf{Message ID:} 16 bit identifier supplied by the requester (any kind of software that generates a request) and resend back unchanged by the responder to identify the transaction and enables the requester to match up replies to outstanding request.
|
||||
\item \textbf{Message ID:} 16 bit identifier supplied by the requester (any kind of software that generates a request) and sent back unchanged by the responder to identify the transaction, enabling the requester to match up replies to outstanding requests.
|
||||
|
||||
\item \textbf{QR:} Query/Response Flag – one bit field indicating whether this message is a query (0) or a response (1)
|
||||
|
||||
\item \textbf{OPCODE:} Four bit field that specifies the kind of query for this message. This is set by the requester and copied into the response. Possible values for the opcode field can be found in Table~\ref{tab:message_header_opcodes}
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
\caption{Message Header Opcodes}
|
||||
\caption{DNS: Message Header Opcodes}
|
||||
\label{tab:message_header_opcodes}
|
||||
\begin{tabular}{@{}lll@{}}
|
||||
\toprule
|
||||
@@ -139,7 +146,7 @@ Table~\ref{tab:message_header} shows the template of a \gls{dns} message header.
|
||||
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
\caption{Message Header Response Codes}
|
||||
\caption{DNS: Message Header Response Codes}
|
||||
\label{tab:message_header_response_codes}
|
||||
\begin{tabular}{@{}lll@{}}
|
||||
\toprule
|
||||
@@ -165,7 +172,7 @@ Table~\ref{tab:message_header} shows the template of a \gls{dns} message header.
|
||||
|
||||
\begin{table}[!htbp]
|
||||
\centering
|
||||
\caption{Question Section}
|
||||
\caption{DNS: Question Section}
|
||||
\label{tab_question_section}
|
||||
\begin{tabular}{@{}ccccccccc@{}}
|
||||
\toprule
|
||||
@@ -177,7 +184,7 @@ Table~\ref{tab:message_header} shows the template of a \gls{dns} message header.
|
||||
|
||||
|
||||
\begin{itemize}
|
||||
\item \textbf{Question Name:} Contains a variably sized payload including the domain, zone name or general object that is subject of the query. Encoded using standard \gls{dns} name notation. Depending on the Question Type, for example requesting an A Record will typically require an host part, such as www.domain.tld. A MX query will usually only contain a base domain name (domain.tld).
|
||||
\item \textbf{Question Name:} Contains a variably sized payload including the domain, zone name or general object that is subject of the query. Encoded using standard \gls{dns} name notation. Depending on the Question Type, for example requesting an A Record will require a host part, such as www.domain.tld. An MX query will usually only contain a base domain name (domain.tld).
|
||||
|
||||
\item \textbf{Question Type:} Specifies the type of question being asked. This field may contain a code number corresponding to a particular type of resource being requested, see Table~\ref{tab:resource_record_types} for common resource types.
|
||||
|
||||
@@ -204,11 +211,11 @@ There are mainly two different types of DNS requests that are performed here. Th
|
||||
\begin{figure}[!htbp]
|
||||
\centering
|
||||
\includegraphics[scale=.5, clip=true]{content/Technical_Background/DNS/DNS_address-resolution.pdf}
|
||||
\caption{Address Resolution}
|
||||
\caption{DNS: Address Resolution}
|
||||
\label{fig:address_resolution}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Passive DNS}
|
||||
\label{subsec:passive_dns}
|
||||
|
||||
A Passive DNS database is a database that contains a history of all resolved DNS queries in a network. The traffic can be observed at any appropriate location in a network, e.g. on a resolver. A Passive DNS database can be used in a variety of actions to harden a network from different threats. Projects like the Security Information Exchange (SIE) collect passive DNS data from multiple sources and analyse the databases to find e.g. inconsistencies in the resolutions (\fsCite{SIEOnline}). Passive DNS databases can also be used by researchers or service providers to find performance issues, identify anomalies or generate usage statistics \fsCite{Deri:2012:TPD:2245276.2245396}.
|
||||
A Passive DNS database is a database that contains a history of all resolved DNS queries in a network. The traffic can be observed at any appropriate location in a network, e.g. on a resolver. The main advantage of passively collecting DNS traffic is that no operational changes are needed to collect logs of resolutions; one simple way is to mirror the DNS port on the resolver and persist the traffic into files. A Passive DNS database can be used in a variety of actions to harden a network from different threats. Projects like the Security Information Exchange (SIE) collect passive DNS data from multiple sources and analyse the databases to find e.g. inconsistencies in the resolutions (\fsCite{SIEOnline}). Passive DNS databases can also be used by researchers or service providers to find performance issues, identify anomalies or generate usage statistics \fsCite{Deri:2012:TPD:2245276.2245396}.
|
||||
|
||||
BIN
Thesis/content/Technical_Background/DNS/dns_tree_web_de.png
Normal file
BIN
Thesis/content/Technical_Background/DNS/dns_tree_web_de.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 18 KiB |
@@ -2,5 +2,14 @@
|
||||
\label{cha:technical_background}
|
||||
|
||||
\input{content/Technical_Background/DNS/DNS}
|
||||
\input{content/Technical_Background/Detecting_Malicious_Domain_Names/Detecting_Malicious_Domain_Names}
|
||||
\input{content/Technical_Background/Benchmarks/Benchmarks}
|
||||
|
||||
\section{Machine Learning}
|
||||
\label{sec:machine_learning}
|
||||
|
||||
Machine learning is a broad field in computer science that aims to give computers the ability to learn without being explicitly programmed for a special purpose. There are many different approaches available that have advantages and disadvantages in different areas. Machine learning in this work is mostly limited to decision tree learning. Decision tree learning is an approach that is generally adopted from how humans make decisions. Given a set of attributes, humans are able to decide, e.g. whether to buy one or another product. Machine learning algorithms use a technique called training to build a model which can later be used to make decisions. A decision tree consists of three components: a node represents the test of a certain attribute to split up the tree, leaves are terminal nodes and represent the prediction (the class or label) of the path from the root node to the leaf, and edges correspond to the results of a test and establish a connection to the next node or leaf. The training is performed in multiple steps: given an arbitrarily large dataset (training set) with a fixed set of features (attributes), each sample in the training set is assigned a label. The number of labels is arbitrary (but limited); in a binary classification there are two different labels (e.g. malicious or benign in the case of domains). In the first step of the training, the whole training set is iterated and each time a set of samples can be separated using one single attribute (with respect to the assigned label), it is branched out and a new leaf is created. Each branch is then split into more fine-grained subtrees as long as there is an \textit{information gain}, until all samples of a subset belong to the same class, i.e. are assigned the same label. The model can later be queried with an unlabeled data sample and returns the probability with which the sample can be assigned to a class/label.
|
||||
|
||||
This way, given a labeled training set of limited size and by learning the characteristics of its labeled samples, unlabeled data can be classified.
|
||||
|
||||
|
||||
%\input{content/Technical_Background/Detecting_Malicious_Domain_Names/Detecting_Malicious_Domain_Names}
|
||||
\input{content/Technical_Background/Benchmarks/Benchmarks}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
\section*{Abstract}
|
||||
\label{sec:Abstract}
|
||||
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
|
||||
|
||||
\todo{write abstract}
|
||||
|
||||
|
||||
@@ -16,8 +16,8 @@ Submitted by: & \quad \thesisauthor\\[1.2ex]
|
||||
Study field: & \quad \course\\[1.2ex]
|
||||
Matriculation number: & \quad \matriculationid\\[1.2ex]
|
||||
Primary Reviewer: & \quad \firstreview\\[1.2ex]
|
||||
%Secondary Reviewer: & \quad \secondreview\\[1.2ex]
|
||||
Mentor: & \quad \mentor\\[1.2ex]
|
||||
Secondary Reviewer: & \quad \secondreview\\[1.2ex]
|
||||
Supervisor: & \quad \mentor\\[1.2ex]
|
||||
\end{tabular}
|
||||
|
||||
\thesisyear\\[9ex]
|
||||
|
||||
@@ -5,12 +5,6 @@
|
||||
of rules and specifications that a software program can follow to access and make use of the services and resources provided by another particular software program that implements that API.}
|
||||
}
|
||||
|
||||
\newglossaryentry{ransomware}
|
||||
{
|
||||
name={Ransomware},
|
||||
description={Ransomware is a type of malicious software from cryptovirology that threatens to publish the victim's data or perpetually block access to it unless a ransom is paid.}
|
||||
}
|
||||
|
||||
\newglossaryentry{rir}
|
||||
{
|
||||
name={Regional Internet Registry},
|
||||
@@ -32,36 +26,24 @@
|
||||
\newglossaryentry{as}
|
||||
{
|
||||
name={AS},
|
||||
description={An Autonomous System is a set of different networks in the Internet that allows to consistently route between those networks (i.e. an Internet Service Provider) and that exports a single interface for other AS. Each Autonomous System is assigned a officially registered unique Autonomous System Number (ASN).}
|
||||
description={An Autonomous System references a set of one or more networks in the Internet that allows consistent routing between those networks (i.e. an Internet Service Provider) and that exports a single interface to other Autonomous Systems. Each Autonomous System is assigned an officially registered, unique Autonomous System Number (ASN).}
|
||||
}
|
||||
|
||||
\newglossaryentry{bgp}
|
||||
{
|
||||
name={BGP},
|
||||
description={The Border Gateway Protocol, also known as the Exterior-Gateway-Protocol (EGP), is the protocol to connect different Autonomous Systems in the Internet. Is is used to share several information for IP blocks to allow routing between different Autonomous Systems.}
|
||||
description={The Border Gateway Protocol, also known as the Exterior-Gateway-Protocol (EGP), is a protocol to route traffic between different Autonomous Systems in the Internet. It is used to share information about IP blocks to allow routing between different Autonomous Systems.}
|
||||
}
|
||||
|
||||
\newglossaryentry{whois}
|
||||
{
|
||||
name={Whois},
|
||||
description={Whois is a protocol, used to gather information about owners of domains in the domain name system and IP addresses, specified in RFC 1834.}
|
||||
}
|
||||
|
||||
\newglossaryentry{biased_estimator}
|
||||
{
|
||||
name={biased},
|
||||
description={The biased estimator for the standard deviation of a random variable \textit{X} is defined as \(\sqrt{\sum_{i=1}^N \frac{1}{N}(\bar{X}_i - \mu)^2}\)}
|
||||
}
|
||||
|
||||
\newglossaryentry{unbiased_estimator}
|
||||
{
|
||||
name={unbiased},
|
||||
description={The unbiased estimator for the standard deviation of a random variable \textit{X} is defined as \(\sqrt{\sum_{i=1}^N \frac{1}{N-1}(\bar{X}_i - \mu)^2}\)}
|
||||
description={Whois is a protocol, specified in RFC 1834, used to gather information about the owners of domains in the domain name system and of IP addresses.}
|
||||
}
|
||||
|
||||
\newglossaryentry{roc}
|
||||
{
|
||||
name={receiver operating characteristic curve},
|
||||
name={Receiver Operating Characteristic Curve},
|
||||
description={The ROC curve is a graphical plot of the true positive rate against the false positive rate at varying discrimination thresholds and highlights the performance of a binary classifier.}
|
||||
}
|
||||
|
||||
|
||||
@@ -62,8 +62,8 @@
|
||||
% ------------------------------------------------------------------------------
|
||||
\pagenumbering{Roman}
|
||||
|
||||
% Table of content depth
|
||||
\setcounter{secnumdepth}{1}
|
||||
% Table of content depth and numbering of sections to a specific depth
|
||||
\setcounter{secnumdepth}{2}
|
||||
\setcounter{tocdepth}{1}
|
||||
|
||||
\tableofcontents
|
||||
|
||||
@@ -4,16 +4,16 @@
|
||||
\usepackage[utf8]{inputenc}
|
||||
\newcommand{\fsTitle}{}
|
||||
\newcommand{\fsSubTitle}{Evaluation of domain reputation scoring algorithms in the context of IT-Security and development of a domain reputation scoring algorithm}
|
||||
\newcommand{\art}{Master's thesis}
|
||||
\newcommand{\art}{Master thesis}
|
||||
\newcommand{\field}{IT-Security}
|
||||
\newcommand{\thesisauthor}{Felix Steghofer}
|
||||
\newcommand{\course}{Informatik}
|
||||
\newcommand{\matriculationid}{61443}
|
||||
\newcommand{\firstreview}{Prof. Dr. rer. nat. Joachim Posegga}
|
||||
\newcommand{\secondreview}{Prof. Dr. }
|
||||
\newcommand{\mentor}{Thomas Penteker}
|
||||
\newcommand{\firstreview}{Professor Dr. Joachim Posegga}
|
||||
\newcommand{\secondreview}{Professor Dr. Hans Reiser}
|
||||
\newcommand{\mentor}{M. Sc. Thomas Penteker}
|
||||
\newcommand{\location}{Passau}
|
||||
\newcommand{\thesisyear}{2017}
|
||||
\newcommand{\thesisyear}{2018}
|
||||
%change to res/img/Logo_UniPassau_small_bw.png for a black and white version
|
||||
\newcommand{\logo}{res/img/Logo_UniPassau_small.png}
|
||||
\newcommand{\institute}{Universität Passau}
|
||||
|
||||
Reference in New Issue
Block a user