Notos evaluation first part

2018-01-02 19:12:28 +01:00
parent 45a9e74c14
commit 48f8343a79
8 changed files with 247 additions and 3 deletions
--- a/Thesis/.gitignore
+++ b/Thesis/.gitignore
@@ -14,6 +14,8 @@
 *.ind
 *.lof
 *.lot
+*.glg
+*.gls


 # vscode
--- a/Thesis/.latexmkrc
+++ b/Thesis/.latexmkrc
@@ -0,0 +1,17 @@
+# latexmk configuration for the thesis build.
+
+# 1 selects PDF generation through the $pdflatex command defined below
+# (per the latexmk manual).
+$pdf_mode = 1;
+
+# Run pdflatex with SyncTeX enabled and abort on the first error. %O and %S
+# are latexmk placeholders (options and source file, per the latexmk manual).
+$pdflatex = "pdflatex -synctex=1 -halt-on-error %O %S";
+
+# Custom dependencies: build .gls from .glo and .acr from .acn by calling
+# run_makeglossaries.
+add_cus_dep('glo', 'gls', 0, 'run_makeglossaries');
+add_cus_dep('acn', 'acr', 0, 'run_makeglossaries');
+
+# Invoke makeglossaries on the name passed as first argument; -q is added
+# when $silent is set (presumably latexmk's quiet-mode flag -- confirm).
+# NOTE(review): the name is interpolated into a single-quoted shell string,
+# so a path containing a single quote would break the command.
+sub run_makeglossaries {
+  if ( $silent ) {
+    system "makeglossaries -q '$_[0]'";
+  }
+  else {
+    system "makeglossaries '$_[0]'";
+  };
+}
+
+# File that latexmk processes when it is started without arguments.
+@default_files = ('main.tex');
--- a/Thesis/bibliography.bib
+++ b/Thesis/bibliography.bib
@@ -83,6 +83,22 @@
  year = 1997
 }

+@inproceedings{Antonakakis:2010:BDR:1929820.1929844,
+ author = {Antonakakis, Manos and Perdisci, Roberto and Dagon, David and Lee, Wenke and Feamster, Nick},
+ title = {Building a Dynamic Reputation System for DNS},
+ booktitle = {Proceedings of the 19th USENIX Conference on Security},
+ series = {USENIX Security'10},
+ year = {2010},
+ isbn = {888-7-6666-5555-4},
+ location = {Washington, DC},
+ pages = {18--18},
+ numpages = {1},
+ url = {http://dl.acm.org/citation.cfm?id=1929820.1929844},
+ acmid = {1929844},
+ publisher = {USENIX Association},
+ address = {Berkeley, CA, USA},
+} 
+
@misc{theguardiancom_wannacry,
  author = {Nadia Khomami and Olivia Solon},
  month = {May},
--- a/Thesis/content/Evaluation_of_existing_Systems/Evaluation_of_existing_Systems.tex
+++ b/Thesis/content/Evaluation_of_existing_Systems/Evaluation_of_existing_Systems.tex
@@ -1,10 +1,174 @@
-\chapter{Evaluation of existing Systems}
+\chapter{Evaluation of existing Systems} \todo{rename to survey?}
 \label{cha:evaluation_of_existing_systems}

+This chapter deals with work around domain reputation scoring systems that has been released. While there exist different types of algorithms, only those that follow a similar approach are taken into account here: namely those that use passive DNS logs and machine learning to calculate the reputation score. \todo{why this two or three?}

 \section{Evaluation Scheme}
 \label{sec:evaluation_scheme}

+For a comprehensive evaluation, all input and output as well as the exact implementations (and/or the corresponding parameters that have been used for the analysis) of the algorithms would be needed. Unfortunately, none of the publications we are dealing with here have released any (raw) input data, specifically the passive DNS logs and the filter lists for the training set. Nor has any of the algorithms' actual implementations been published. For this reason, the evaluation of the existing systems focuses on the results that have been published individually, most importantly the detection rate and the false positive rate. Another important aspect for this overview is what data has actually been used for training and classification and where the data has been obtained. Passive DNS logs may be collected at different stages of the DNS resolution and might, e.g. due to caching, lead to the extraction of different information. A resolver running on the user's machine might observe much more traffic and thus benefit from e.g. time-based patterns which are not available at higher-level DNS servers, because those servers never see queries whose responses have already been served from the cache of a resolver lower in the DNS hierarchy.
+
+
+\section{Notos}
+\label{sec:notos}
+
+\subsection{General}
+\label{subsec:notos_general}
+
+\textit{Notos} was published in 2010 by \fsAuthor{Antonakakis:2010:BDR:1929820.1929844} at the Georgia Institute of Technology. It has been introduced as ``being the first [system] to create a comprehensive dynamic reputation system around domain names'' \fsCite[Section 1]{Antonakakis:2010:BDR:1929820.1929844}. \textit{Notos} is based on the observation that malicious use of DNS can usually be distinguished from legitimate, professionally provisioned DNS services by unique characteristics, because fraudulent activities usually utilize techniques to evade security countermeasures \fsCite{Antonakakis:2010:BDR:1929820.1929844}. This approach mainly uses passive historical DNS information that was obtained from multiple recursive resolvers distributed across the Internet. For building a model of how resources are typically used in legitimate and malicious applications, information about malicious IP addresses and domain names is collected from different sources like honeypots, malware analysis services and spam-traps. Using this model, new domains that have never been seen before can be dynamically assigned a reputation score that indicates how likely it is that the domain is involved in malicious activities. Malicious activities in the context of \textit{Notos} are roughly described as: ``if it [a domain] has been involved with botnet C\&C servers, spam campaigns, malware propagation, etc.'' \fsCite[Section 3]{Antonakakis:2010:BDR:1929820.1929844}.
+
+\textit{Notos} uses some basic terminology which is shortly introduced here:
+\begin{itemize}
+    \item A domain \textit{d} consists of several substrings which are described in \nameref{subsec:domain_names}. Abbreviations used in the following Sections are: \\
+        \textbf{Top-level domain:} TLD, where \(TLD(d)\) is the top-level domain of \textit{d} \\
+        \textbf{Second-level domain:} \(2LD(d)\) being the second-level domain of domain \textit{d} \\
+        \textbf{Third-level domain: } \(3LD(d)\) containing the three rightmost substrings separated by period for \textit{d}
+    \item Given a domain \(d\), \(Zone(d)\) describes the set of domains that includes \textit{d} and all subdomains of \textit{d}
+    \item \(D = \{d_1, d_2, ..., d_m\}\) representing a set of domains and \(A(D)\) all IP addresses that, at any time, any domain \(d \in D\) resolved to
+    \item \(BGP(a)\) consists of all IP addresses that reside in the same \gls{bgp} prefix as \textit{a}
+    \item Analogously, \(AS(a)\) is the set of IP addresses located in the same \gls{as} as \textit{a}
+\end{itemize}
+
+\subsection{Architecture}
+\label{subsec:notos_architecture}
+
+The main goal of \textit{Notos} is to assign a dynamic reputation score to domain names. Domains that are likely to be involved in malicious activities are tagged with a low reputation score, whereas legitimate Internet services are assigned with a high reputation score. 
+\textit{Notos'} primary source of information is a database that contains historical data about domains and resolved IP addresses. This database is built using DNS traffic from two recursive ISP DNS servers (RDNS) and pDNS logs collected by the Security Information Exchange (SIE) which covers authoritative name servers in North America and Europe. For building a list of known malicious domain names, several honeypots and spam-traps have been deployed. A large list of known good domains has been gathered from the top sites list on \textit{alexa.com} which ranks the most popular websites in several regions. These two lists are referred to as the \textit{knowledge base} and are used to train the reputation model.
+
+
+To assign a reputation score to a domain \textit{d}, the most current set of IP addresses \(A_{c}(d) = \left\{a_{i}\right\}_{i=1..m}\) to which \textit{d} points is first fetched. Afterwards, the pDNS database is queried for several pieces of information about this domain \textit{d}. The \textit{Related Historic IPs (RHIPs)} are the set of all IP addresses that this domain has ever pointed to. In case domain \textit{d} is a third-level domain, all IP addresses that the corresponding second-level domain has pointed to are also included. See Section~\ref{subsec:domain_names} for more information on the structure of domain names. If \textit{d} is a second-level domain, then all IPs that are pointed to from any of the third-level subdomains are also added to the RHIPs. In the next step, the set of \textit{Related Historic Domains (RHDNs)} is queried; it covers all domains that are related to the currently processed domain \textit{d}. Specifically, it contains all domains that have ever resolved to an IP address residing in any of the ASNs of those IPs that \textit{d} currently resolves to. \todo{understandable?}
+
+There are three types of features extracted from the database for \textit{Notos} that are used for training the reputation model (quotation from \fsCite[Section 3.1]{Antonakakis:2010:BDR:1929820.1929844}):
+
+\begin{quote}
+\begin{enumerate}
+    \item \textbf{Network-based features:} The first group of statistical features is extracted from the set of RHIPs. We measure quantities such as the total number of IPs historically associated with \textit{d}, the diversity of their geographical location, the number of distinct autonomous systems (ASs) in which they reside, etc.
+    \item \textbf{Zone-based features:} The second group of features we extract are those from the RHDNs set. We measure the average length of domain names in RHDNs, the number of distinct TLDs, the occurrence frequency of different characters, etc.
+    \item \textbf{Evidence-based features:} The last set of features includes the measurement of quantities such as the number of distinct malware samples that contacted the domain \textit{d},     the number of malware samples that connected to any of the IPs pointed by \textit{d}, etc.
+\end{enumerate}
+\end{quote}
+
+Figure~\ref{fig:notos_system_overview} shows the overall system architecture of \textit{Notos}. After all the features are extracted from the passive DNS database and prepared for further steps, the reputation engine is initialized. \textit{Notos'} reputation engine operates in two modes. In offline mode, the reputation model is constructed for a set of domains using the feature set of each domain and the classification which can be calculated using the \textit{knowledge base} with black- and whitelist (also referred to as training). This model can later be used in the online mode to dynamically assign a reputation score. In online mode, the same features that are used for the initial training are extracted for a new domain (resource record or RR, see Section~\nameref{subsubsec:dns_resource_records}) and \textit{Notos} queries the trained reputation engine for the dynamic reputation rating (see Figure~\ref{fig:notos_online_offline_mode}).
+
+\todo{better explain EV, NM, DC}
+\begin{figure}[!htbp]
+    \centering
+    \includegraphics[scale=.3, clip=true]{content/Evaluation_of_existing_Systems/Notos_System_overview.png}
+    \caption{Notos: System overview \fsCite[Figure 1]{Antonakakis:2010:BDR:1929820.1929844}}
+    \label{fig:notos_system_overview}
+\end{figure}
+
+\begin{figure}[!htbp]
+    \centering
+    \includegraphics[scale=.3, clip=true]{content/Evaluation_of_existing_Systems/Notos_offline-online_mode.png}
+    \caption{Notos: online and offline mode \fsCite[Figure 3]{Antonakakis:2010:BDR:1929820.1929844}}
+    \label{fig:notos_online_offline_mode}
+\end{figure}
+
+
+\subsection{Features}
+\label{subsec:notos_features}
+
+In this Section, all statistical features are listed and a short explanation, for what reason those have been chosen, is introduced.
+
+The first group of features handles network-related keys. This group mostly describes how the operators owning \textit{d} allocate network resources to achieve different goals. While most legitimate and professionally operated Internet services have a rather stable network profile, malicious usage usually involves short-lived domain names and IP addresses with high agility to circumvent blacklisting and other simple types of resource blocking. Botnets usually contain machines in many different networks (\glspl{as} and \gls{bgp} prefixes) operated by different organizations in different countries. Legitimate companies mostly acquire bigger IP blocks and thus use consecutive IPs in the same address space for their services. This homogeneity also applies to other registration-related information like registrars and registration dates. To measure this level of agility and homogeneity, eighteen statistical network-based features are extracted from the RHIPs (see Table~\ref{tab:notos_network-based_features}).
+
+\begin{table}[!htbp]
+    \centering
+    \caption{Notos: Network-based features}
+    \label{tab:notos_network-based_features}
+    \begin{tabularx}{\textwidth}{|l|X|}
+    \hline
+    \textbf{Feature Source}                         & \textbf{Feature}                                                            \\ \hline
+    \multirow{9}{*}{\textit{BGP}}          & \# of distinct BGP prefixes related to \(BGP(A(d))\)                  \\ \cline{2-2} 
+                                           & \# of countries in which these BGP prefixes reside                 \\ \cline{2-2} 
+                                           & \# of organizations that own these BGP prefixes                    \\ \cline{2-2} 
+                                           & \# of distinct IP addresses in the sets \(A_{3LD}(d)\)                  \\ \cline{2-2} 
+                                           & \# of distinct IP addresses in the sets \(A_{2LD}(d)\)                  \\ \cline{2-2} 
+                                           & \# of distinct BGP prefixes related to \(BGP(A_{3LD}(d))\)           \\ \cline{2-2}
+                                           & \# of distinct BGP prefixes related to \(BGP(A_{2LD}(d))\)           \\ \cline{2-2}
+                                           & \# of countries in which \(BGP(A_{3LD}(d))\) reside                  \\ \cline{2-2}
+                                           & \# of countries in which \(BGP(A_{2LD}(d))\) reside                  \\ \hline
+    \multirow{3}{*}{\textit{ASN}}          & \# of distinct autonomous systems related to \(AS(A(d))\)              \\ \cline{2-2} 
+                                           & \# of distinct autonomous systems related to \(AS(A_{3LD}(d))\)        \\ \cline{2-2}
+                                           & \# of distinct autonomous systems related to \(AS(A_{2LD}(d))\)        \\ \hline
+    \multirow{6}{*}{\textit{Registration}} & \# of distinct registrars associated with the IPs in the \(A(d)\) set  \\ \cline{2-2} 
+                                           & diversity in the registration dates related to the IPs in \(A(d)\)     \\ \cline{2-2} 
+                                           & \# of distinct registrars associated with the IPs in the \(A_{3LD}(d)\) \\ \cline{2-2} 
+                                           & \# of distinct registrars associated with the IPs in the \(A_{2LD}(d)\) \\ \cline{2-2} 
+                                           & diversity in the registration dates for the IPs in \(A_{3LD}(d)\)       \\ \cline{2-2} 
+                                           & diversity in the registration dates for the IPs in \(A_{2LD}(d)\)       \\ \hline
+\end{tabularx}
+\end{table}
+
+The second group is about zone-based features and is extracted from the RHDNs. In contrast to the network-based features, which compare characteristics of the historic IPs, the zone-based features handle characteristics of all historically involved domains. While legitimate services often involve many domains, these domains usually share similarities. ``For example, google.com, googlesyndication.com, googlewave.com, etc., are all related to Internet services provided by Google, and contain the string 'google' in their name.'' \fsCite[Section 3.2.2]{Antonakakis:2010:BDR:1929820.1929844}. In contrast, randomly generated domains used in spam campaigns rarely share similarities. To calculate this level of diversity, seventeen features are extracted which can be found in Table~\ref{tab:notos_zone-based_features}:
+
+\begin{table}[!htbp]
+    \centering
+    \caption{Notos: Zone-based features}
+    \label{tab:notos_zone-based_features}
+    \begin{tabularx}{\textwidth}{|l|X|}
+    \hline
+    \textbf{Feature Source}           & \textbf{Feature}                                                                                            \\ \hline
+    \multirow{12}{*}{\textit{String}} & \# of distinct domain names in RHDNs                                                                        \\ \cline{2-2} 
+                                      & average \# of distinct domain names in RHDNs                                                                \\ \cline{2-2} 
+                                      & standard deviation of \# of distinct domain names in RHDNs                                                  \\ \cline{2-2} 
+                                      & mean of the occurrence frequency of each single character in the domain name strings in RHDNs               \\ \cline{2-2} 
+                                      & median of the occurrence frequency of each single character in the domain name strings in RHDNs             \\ \cline{2-2} 
+                                      & standard deviation of the occurrence frequency of each single character in the domain name strings in RHDNs \\ \cline{2-2} 
+                                      & mean distribution of 2-grams (i.e. pairs of characters)                                                     \\ \cline{2-2} 
+                                      & median distribution of 2-grams                                                                              \\ \cline{2-2} 
+                                      & standard deviation of 2-grams                                                                               \\ \cline{2-2} 
+                                      & mean distribution of 3-grams (i.e. triples of characters)                                                   \\ \cline{2-2} 
+                                      & median distribution of 3-grams                                                                              \\ \cline{2-2} 
+                                      & standard deviation of the distribution of 3-grams                                                           \\ \hline
+    \multirow{5}{*}{\textit{TLD}}     & \# of distinct TLD strings of each domain \(d_i\) in the RHDNs set     \\ \cline{2-2} 
+                                      & ratio between \# of domains \(d_i\) whose \(TLD(d_i)=".com"\) and the total \# of TLD different from ".com"      \\ \cline{2-2} 
+                                      & mean of the occurrence frequency of the TLD strings                                                         \\ \cline{2-2} 
+                                      & median of the occurrence frequency of the TLD strings                                                       \\ \cline{2-2} 
+                                      & standard deviation of the occurrence frequency of the TLD strings                                           \\ \hline
+\end{tabularx}
+\end{table}
+
+For the evidence-based features, public information and exclusively collected data from honeypots and spam-traps are used. This \textit{knowledge base} primarily helps to discover if a domain \textit{d} is in some way interacting with known malicious IPs and domains. As domain names are much cheaper than IP addresses, malware authors tend to reuse IPs with updated domain names. The blacklist features detect the reuse of known malicious resources like IP addresses, \gls{bgp} prefixes and \glspl{as}.
+
+\begin{table}[!htbp]
+    \centering
+    \caption{Notos: Evidence-based features}
+    \label{tab:notos_evidence-based_features}
+    \begin{tabularx}{\textwidth}{|l|X|}
+    \hline
+    \textbf{Feature Source}             & \textbf{Feature}                                                                               \\ \hline
+    \multirow{3}{*}{\textit{Honeypot}}  & \# of distinct malware samples that, when executed, try to contact \(d\) or any IP address in \(A(d)\) \\ \cline{2-2} 
+                                        & \# of malware samples that contact any IP address in \(BGP(A(d))\)                               \\ \cline{2-2}
+                                        & \# of samples that contact any IP address in \(AS(A(d))\)                                          \\ \hline
+    \multirow{3}{*}{\textit{Blacklist}} & \# of IP addresses in \(A(d)\) that are listed in public IP blacklists                             \\ \cline{2-2} 
+                                        & \# of IPs in \(BGP(A(d))\) that are listed in public IP blacklists                               \\ \cline{2-2}
+                                        & \# of IPs in \(AS(A(d))\) that are listed in public IP blacklists                                  \\ \hline
+    \end{tabularx}
+    \end{table}
+
+\todo{all formulas explained?}
+
+
+
+\begin{figure}[!htbp]
+    \centering
+    \includegraphics[scale=.3, clip=true]{content/Evaluation_of_existing_Systems/Notos_features.png}
+    \caption{Notos: Computing network-based, zone-based, evidence-based features \fsCite[Figure 2]{Antonakakis:2010:BDR:1929820.1929844}}
+    \label{fig:notos_features}
+\end{figure}
+\todo{not referenced atm}
+
+
+\subsection{Reputation Engine}
+\label{subsec:notos_reputation_engine}
+
+The reputation engine is used to dynamically assign a reputation score to a domain \textit{d}. In order to be able to achieve this, the engine has to be trained first. The training clusters 
+
+
+==> a low false positive rate (0.38\%) and high true positive rate (96.8\%).
+
 \section{Exposure}
 \label{sec:exposure}

--- a/Thesis/content/Technical_Background/DNS/DNS.tex
+++ b/Thesis/content/Technical_Background/DNS/DNS.tex
@@ -3,7 +3,7 @@

 The \gls{dns} is one of the cornerstone of the internet as it is known today. \todo{statistic about usage}. Initial designs have been proposed in 1983 and evolved over the following four years into the first globally adapted standard RFC 1034 \fsCite{rfc1034} (RFC 1035 for implementation and specification details \fsCite{rfc1035}). The main idea of the \gls{dns} is translating human readable domain names to network addresses. There are many extensions to the initial design including many security related features and enhancements or the support for \gls{ipv6} in 1995. 

-In order to understand how the \gls{dns} is misused for hostile activities and how to prevent these attacks it is necessary to explain some basic mechanisms.
+In order to understand how the \gls{dns} is misused for hostile activities and how to prevent these attacks, it is necessary to explain some basic mechanisms.


 \subsection{Basics}
@@ -223,7 +223,7 @@ QType & Type  & Description                                                  \\

 \subsection{Domain Names}
 \label{subsec:domain_names}
-\todo{TODO structure of a domain, etc.}
+\todo{TODO structure of a domain, etc. top-level, second-level, third-level}


 \subsection{Resolution}
--- a/Thesis/glossar.tex
+++ b/Thesis/glossar.tex
@@ -29,6 +29,18 @@
    description={Distributed Denial-of-Service is an attack where multiple machines are used to generate as much workload as needed to cause downtimes of a service or machine and make benign usage impossible.}
 }

+\newglossaryentry{as}
+{
+    name={AS},
+    description={An Autonomous System is a set of different networks in the Internet that allows consistent routing between those networks (e.g. an Internet Service Provider) and that exposes a single interface to other Autonomous Systems. Each Autonomous System is assigned an officially registered unique Autonomous System Number (ASN).}
+}
+
+\newglossaryentry{bgp}
+{
+    name={BGP},
+    description={The Border Gateway Protocol, an exterior gateway protocol (EGP), is the protocol used to connect different Autonomous Systems in the Internet. It is used to share routing information about IP blocks to allow routing between different Autonomous Systems.}
+}
+
 \newacronym{sri-nic}{SRI-NIC}{Stanford Research Institute - Network Information Center}

 \newacronym{dns}{DNS}{Domain Name System}
--- a/Thesis/packages.tex
+++ b/Thesis/packages.tex
@@ -120,10 +120,12 @@

 % Long tables ------------------------------------------------------------------
 \usepackage{booktabs}
+\usepackage{multirow}
 \usepackage{longtable}
 \usepackage{array}
 \usepackage{ragged2e}
 \usepackage{lscape}
+\usepackage{tabularx}

 % Columndefinitions with defined width and aligned right -----------------------
 \newcolumntype{w}[1]{>{\raggedleft\hspace{0pt}}p{#1}}
--- a/src/benchmarks/compare_load-files.py
+++ b/src/benchmarks/compare_load-files.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+import csv
+import gzip
+import time
+
+def compare_load_file():
+    dur_p = load_file_plain()
+    dur_z = load_file_zipped()
+    
+    print('plain took: ' + str(dur_p) + ' s')
+    print('zipped took: ' + str(dur_z) + ' s')
+    print('(plain - zipped): ' + str(dur_p - dur_z) + ' s')
+
+
+def load_file_plain():
+    start_p = time.time()
+    with open('pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv', 'rt') as file_p:
+        for line in file_p:
+            row = line.split()
+            # print(row)
+            pass
+    return time.time() - start_p
+
+
+def load_file_zipped():
+    start_z = time.time()
+    with gzip.open('pdns_capture.pcap-demchdc902n-2017-09-01_00-20-02.csv.gz', 'rt', newline='') as file_z:
+        reader = csv.reader(file_z)
+        for row in reader:
+            # print(row)
+            pass
+    return time.time() - start_z