mostly finished Kopis

This commit is contained in:
2018-01-23 16:59:08 +01:00
parent 3586762494
commit 817b68b025
7 changed files with 118 additions and 4 deletions

View File

@@ -261,6 +261,22 @@
howpublished={\url{https://www.farsightsecurity.com/solutions/security-information-exchange/}}
}
@misc{SBLOnline,
author={The Spamhaus Project, Ltd},
title={{The Spamhaus Block List}},
month=dec,
year={2017},
howpublished={\url{https://www.spamhaus.org/sbl/}}
}
@misc{CymruOnline,
author={Team Cymru},
title={{Team Cymru - IP TO ASN MAPPING}},
month=dec,
year={2017},
howpublished={\url{http://www.team-cymru.org/IP-ASN-mapping.html}}
}
@inproceedings{porras2009foray,
title={A Foray into Conficker's Logic and Rendezvous Points.},
@@ -301,3 +317,36 @@
url = {http://www.rfc-editor.org/rfc/rfc1033.txt},
note = {\url{http://www.rfc-editor.org/rfc/rfc1033.txt}},
}
@InProceedings{10.1007/978-3-540-24668-8_15,
author="Wessels, Duane
and Fomenkov, Marina
and Brownlee, Nevil
and claffy, kc",
editor="Barakat, Chadi
and Pratt, Ian",
title="Measurements and Laboratory Simulations of the Upper DNS Hierarchy",
booktitle="Passive and Active Network Measurement",
year="2004",
publisher="Springer Berlin Heidelberg",
address="Berlin, Heidelberg",
pages="147--157",
abstract="Given that the global DNS system, especially at the higher root and top-levels, experiences significant query loads, we seek to answer the following questions: (1) How does the choice of DNS caching software for local resolvers affect query load at the higher levels? (2) How do DNS caching implementations spread the query load among a set of higher level DNS servers? To answer these questions we did case studies of workday DNS traffic at the University of California San Diego (USA), the University of Auckland (New Zealand), and the University of Colorado at Boulder (USA). We also tested various DNS caching implementations in fully controlled laboratory experiments. This paper presents the results of our analysis of real and simulated DNS traffic. We make recommendations to network administrators and software developers aimed at improving the overall DNS system.",
isbn="978-3-540-24668-8"
}
@inproceedings{Kohavi:1995:SCB:1643031.1643047,
author = {Kohavi, Ron},
title = {A Study of Cross-validation and Bootstrap for Accuracy Estimation and Model Selection},
booktitle = {Proceedings of the 14th International Joint Conference on Artificial Intelligence - Volume 2},
series = {IJCAI'95},
year = {1995},
isbn = {1-55860-363-8},
location = {Montreal, Quebec, Canada},
pages = {1137--1143},
numpages = {7},
url = {http://dl.acm.org/citation.cfm?id=1643031.1643047},
acmid = {1643047},
publisher = {Morgan Kaufmann Publishers Inc.},
address = {San Francisco, CA, USA},
}

View File

@@ -4,7 +4,7 @@
\subsection{General}
\label{subsec:kopis_general}
The last evaluated system is called \textit{Kopis} and has been proposed in 2011 by \fsAuthor{Antonakakis:2011:DMD:2028067.2028094}, the authors that also released \nameref{sec:notos}, at the Georgia Institute of Technology and the University of Georgia. \textit{Kopis} follows a slightly different approach compared to the previous two systems, \textit{Notos} and \textit{Exposure}. Instead of collecting passively monitored DNS traffic from a (limited) number of different recursive DNS servers in various locations, \textit{Kopis} uses requests registered in the upper DNS layers, e.g. at top-level domain servers and authoritative name servers. See Figure~\ref{fig:kopis_data_sources} for an overview of where these three different systems aggregate logs to perform traffic analysis. Operating in the upper DNS layers, \textit{Kopis} is not only able to extract significantly different classes of features compared to \textit{Notos} and \textit{Exposure} but also has to deal with different challenges like DNS caching.
\begin{figure}[!htbp]
\centering
@@ -54,21 +54,67 @@ The first step of extracting features out of the captured traffic for each dns q
\subsubsection{Requester Diversity (RD)}
\label{subsubsec:kopis_requester_diversity}
This group of features tries to map the requester diversity, i.e. where the requests originate, into values that can be used in the \textit{Feature Computation} function. In general, this aims to determine whether the machines querying a domain \textit{d} are globally distributed or confined to a bounded location. It is first important to note that, to map an IP address to its corresponding ASN, country and BGP prefix, the Team Cymru IP TO ASN MAPPING database has been leveraged \fsCite{CymruOnline}. This set of features is motivated by the premise that the machines involved with a domain used for malicious purposes usually show a different distribution than those used legitimately: while benign services exhibit a consistent pattern of IP addresses looking up \textit{d}, malicious domains are queried by many machines from different locations around the world, e.g. bots in a botnet or spambots involved in a spam campaign, recalling that botnets are usually not targeted at specific geographical regions. Figure~\ref{fig:kopis_requester_distribution} shows the distribution of the ASNs as well as the country codes, calculated by the cumulative distribution function (CDF). In both cases, benign domains have either a low or a very high diversity (a bimodal distribution). In contrast, malicious domains show a larger spectrum of diversities, mainly depending on how successfully the malware is spreading. Three value sets are involved here: for all requester IP addresses \(\{R_j\}_{j=1..m}\), the BGP prefixes, the autonomous system numbers and the country codes (CC) are resolved. After this, the distribution of the occurrence frequency of these three sets is computed, and for each distribution the mean, standard deviation and variance are calculated (a total of nine features).
Another four features are extracted simply from the total number of distinct IP addresses, the number of BGP prefixes of these IPs, the total number of different ASNs and the total number of distinct countries these IPs reside in.
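The thirteen \textit{Requester Diversity} features described above can be sketched as follows. This is a minimal illustration, not the original implementation: the input format (a list of records with \texttt{ip}, \texttt{bgp}, \texttt{asn} and \texttt{cc} fields, assumed to be pre-resolved via an IP-to-ASN mapping) and the function name are assumptions.

```python
from collections import Counter
from statistics import mean, pstdev, pvariance

def rd_features(requesters):
    """Sketch of the 13 RD features for one domain d.

    requesters: list of dicts with hypothetical keys
    'ip', 'bgp', 'asn', 'cc' (one record per observed requester IP).
    """
    feats = []
    # Nine statistical features: mean, standard deviation and variance of
    # the occurrence-frequency distribution of BGP prefixes, ASNs and
    # country codes (population statistics used here as one possible choice).
    for key in ("bgp", "asn", "cc"):
        freqs = list(Counter(r[key] for r in requesters).values())
        feats += [mean(freqs), pstdev(freqs), pvariance(freqs)]
    # Four diversity counts: distinct IPs, BGP prefixes, ASNs and countries.
    for key in ("ip", "bgp", "asn", "cc"):
        feats.append(len({r[key] for r in requesters}))
    return feats
```

With three requesters, two of which share a BGP prefix, ASN and country, the first feature (mean BGP-prefix occurrence frequency) is 1.5 and the distinct-IP count is 3.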
\begin{figure}[!htbp]
\centering
\includegraphics[width=.9\textwidth, clip=true]{content/Evaluation_of_existing_Systems/Kopis/kopis_requester_distribution.png}
\caption{Kopis: Requester distribution of autonomous system numbers (a) and country codes (b) for benign and malware-related domains \fsCite[Figure 4]{Antonakakis:2011:DMD:2028067.2028094}}
\label{fig:kopis_requester_distribution}
\end{figure}
\subsubsection{Requester Profile (RP)}
\label{subsubsec:kopis_requester_profile}
The \textit{Requester Profile} features aim to separate requests coming from hardened networks (like enterprise networks) from those originating in less secure networks, e.g. ISP networks. Most smaller networks like enterprise or university networks are much better protected against malware in general and as such should show fewer requests to malicious domains. ISPs, on the other hand, usually do not invest much effort into cleaning their networks from malware and do not offer a high level of protection against malware propagation inside the network. As \textit{Kopis} operates in the upper DNS layers, it is often not possible to simply measure the population behind the requesting RDNS server (due to e.g. caching \fsCite{10.1007/978-3-540-24668-8_15}), so a different metric has to be found to measure the size of the network a request has been submitted from. Assume traffic is monitored at a large AuthNS in epoch \(E_t\) that has authority over a set of domains \(D\), with \(R\) being the set of all unique requesting IP addresses. For each requester IP \(R_k \in R\), the number of different domains queried by \(R_k\) in \(E_t\) is counted as \(c_{t,k}\). A weight can then be assigned to each requester \(R_k\) as \(w_{t,k} = \frac{c_{t,k}}{\max_{l=1}^{|R|}c_{t,l}}\). Consequently, the more domains in \(D\) a requester \(R_k\) queries, the higher its weight will be. This way, high weights correspond to larger networks and, following the explanation above, the more likely it is that this requester is infected with malicious software. Given a domain \textit{d} and letting \(R_d\) be the set of all requester IP addresses, the count \(c_{t,k}\) is computed for each epoch \(E_t\) as previously described. The count for each epoch is then multiplied with the weight \(w_{t-n,k}\), i.e. the weight from \textit{n} days before epoch \(E_t\), to get the set of weighted counts of \textit{d} during \(E_t\): \(WC_t(d) = \{c_{t,k} \cdot w_{t-n,k}\}_k\).
Finally, five different feature values are calculated from the values of \(WC_t(d)\): the average, the \gls{biased_estimator} and \gls{unbiased_estimator} standard deviation, and the biased and unbiased variance (see Glossary for an explanation of the \gls{biased_estimator} and \gls{unbiased_estimator} standard deviation).
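The weighting and the five resulting \textit{Requester Profile} features can be sketched like this. The data layout (dicts mapping requester IP to \(c_{t,k}\)) and the function names are assumptions for illustration; the biased/unbiased statistics map onto Python's population (\texttt{pstdev}, \texttt{pvariance}) and sample (\texttt{stdev}, \texttt{variance}) estimators.

```python
from statistics import mean, pstdev, stdev, pvariance, variance

def requester_weights(counts_t):
    """w_{t,k} = c_{t,k} / max_l c_{t,l}.

    counts_t: hypothetical dict mapping requester IP -> c_{t,k}, the number
    of distinct domains in D that requester queried during epoch E_t.
    """
    c_max = max(counts_t.values())
    return {k: c / c_max for k, c in counts_t.items()}

def weighted_counts(counts_t, weights_prev):
    """WC_t(d): each requester's count in E_t times its weight w_{t-n,k}
    from n days earlier; requesters without an earlier weight default to 0."""
    return [c * weights_prev.get(k, 0.0) for k, c in counts_t.items()]

def rp_features(wc):
    """The five RP features: average, biased (population) and unbiased
    (sample) standard deviation, and biased and unbiased variance."""
    return [mean(wc), pstdev(wc), stdev(wc), pvariance(wc), variance(wc)]
```

For two requesters with counts 4 and 2 and earlier weights 1.0 and 0.5, the weighted counts are \([4.0, 1.0]\), giving an average of 2.5, a biased variance of 2.25 and an unbiased variance of 4.5.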
\subsubsection{Resolved-IPs Reputation (IPR)}
\label{subsubsec:kopis_resolved-ips_reputation}
The set of \textit{Resolved-IPs Reputation} features consists of nine individual statistical features. This group is much like the \nameref{subsubsec:notos_evidence-based_features} in \textit{Notos} (see Section~\ref{subsubsec:notos_evidence-based_features}) and aims at finding malicious usage of the address space pointed to by \textit{d}. The assumption here is that it is much more likely that a domain \textit{d} is involved in malicious activities if it uses resources that are known to be used by attackers. Let \(IPs(d,t)\) be the set of all resolved addresses during epoch \(E_t\); then \(BGP(d,t)\) are all corresponding BGP prefixes and \(AS(d,t)\) all autonomous system numbers to which the IP addresses \(IPs(d,t)\) belong. There are three different groups of features that model the historic reputation of the address space:
\begin{itemize}
\item \textit{Malware Evidence: } contains three individual features: the number of IP addresses in the last month (with respect to \(E_t\)) that have been pointed to by any malicious domain, and likewise the number of BGP prefixes and AS numbers that a malicious domain has been resolved to.
\item \textit{SBL Evidence: } using the domains in the Spamhaus Block List \fsCite{SBLOnline}, the average number of IP addresses, BGP prefixes and ASNs that have been pointed to by these domains is calculated.
\item \textit{Whitelist Evidence: } the list of domains \(WL\) that are suspected to be legitimate is constructed from the DNS whitelist of DNSWL \fsCite{DNSWLOnline} and the top 30 popular domains from Alexa \fsCite{AlexaWebInformationOnline}. Then the set of known good IPs \(WL_{IPs}\) is resolved from all domains in the whitelist \(WL\). With \(IPs(d,t)\) being all addresses that \textit{d} points to (as in the first two groups), the number of matching IP addresses as well as the number of ASNs and BGP prefixes that include IP addresses of \(WL_{IPs}\) is calculated.
\end{itemize}
\end{itemize}
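The per-evidence-source overlap described above can be sketched generically: given the IPs a domain resolved to and one evidence set (malware-related or whitelisted IPs), count overlapping IPs plus the BGP prefixes and ASNs shared between the two sides. The function name, the dict-based IP-to-prefix/ASN lookup and the return layout are assumptions; the original paper's exact computation may differ in detail.

```python
def ipr_evidence(resolved_ips, evidence_ips, ip_to_bgp, ip_to_asn):
    """Three features for one evidence source: (1) IPs of d that appear in
    the evidence set, (2) BGP prefixes and (3) ASNs of d's IPs that also
    contain evidence IPs. Lookups are plain dicts ip -> prefix / ip -> asn."""
    overlap = resolved_ips & evidence_ips
    d_bgps = {ip_to_bgp[ip] for ip in resolved_ips if ip in ip_to_bgp}
    d_asns = {ip_to_asn[ip] for ip in resolved_ips if ip in ip_to_asn}
    ev_bgps = {ip_to_bgp[ip] for ip in evidence_ips if ip in ip_to_bgp}
    ev_asns = {ip_to_asn[ip] for ip in evidence_ips if ip in ip_to_asn}
    return len(overlap), len(d_bgps & ev_bgps), len(d_asns & ev_asns)
```

Calling this once per evidence source (malware, SBL, whitelist) yields the nine IPR features; e.g. a domain sharing a prefix and ASN, but no exact IP, with an evidence set scores \((0, 1, 1)\).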
\subsection{Reputation Engine}
\label{subsec:kopis_reputation_engine}
\subsection{Results}
\label{subsec:kopis_results}
\textit{Kopis} used DNS traffic captured at two major domain name registrars (AuthNS servers) between 01.01.2010 and 31.08.2010 as well as at a country code top level domain server (.ca) from 26.08.2010 up to 18.10.2010. As the TLD server was operated in delegate-only mode, passive DNS traffic had to be additionally collected to get the resolutions for these queries. In total, this led to 321 million lookups a day on average. This amount of data proved to be a significant problem, and the overall traffic size to be analysed had to be reduced. The most significant reduction was to remove all duplicate queries and only take unique requests into account. Finally, about 12.5 million daily unique requests remained on average \todo{make reduction more clear}. Using the \textit{KB}, which consists of various sources (see Section~\ref{subsec:kopis_architecture}), a sample with 225,429 unique RRs (corresponding to 28,915 unique domain names) could be split into groups with 27,317 malicious and 1,598 benign domains. All raw data was indexed in a relational database and enriched with information like first and last seen timestamps. Like any system that uses a machine learning approach, it was important for \textit{Kopis} to select significant features and a training period that was sufficient to deliver good results. Figure~\ref{fig:kopis_train_period_selection} shows the \glspl{roc} (ROC) of different models, generated with data from periods of one up to five days and validated using 10-fold cross validation. According to \fsAuthor[Section 5.3]{Antonakakis:2011:DMD:2028067.2028094}: ``When we increased the observation window beyond the mark of five days we did not see a significant improvement in the detection results.'' Using these models, the best classification algorithm had to be found. This has been accomplished using a technique called model selection (see e.g. \fsCite{Kohavi:1995:SCB:1643031.1643047}).
The most accurate classifier for these models has proven to be the \textit{random forest} implementation, with a true positive rate of 98.4\% and a false positive rate of 0.3\% (with training data from a period of five days). \textit{Random forest} is a combination of different decision trees, each trained on different training sets or using different sets of features. Unfortunately, the exact random forest classification implementation of \textit{Kopis} has not been published. Other classifiers that have been experimented with are: Naive Bayes, k-nearest neighbors, Support Vector Machines, MLP Neural Network and random committee.
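The 10-fold cross validation used to compare these models partitions the labelled sample into ten disjoint folds, each serving once as the validation set while the remaining nine are used for training. A minimal pure-Python sketch of such a splitter (function name and shuffling seed are illustrative, not from the paper):

```python
import random

def k_fold_indices(n, k=10, seed=0):
    """Yield (train, val) index lists for k-fold cross validation over
    n samples: indices are shuffled once, split into k disjoint folds,
    and each fold serves exactly once as the validation set."""
    idx = list(range(n))
    random.Random(seed).shuffle(idx)
    folds = [idx[i::k] for i in range(k)]
    for i in range(k):
        val = folds[i]
        train = [j for f in folds[:i] + folds[i + 1:] for j in f]
        yield train, val
```

Averaging the classifier's true and false positive rates over the ten validation folds gives the figures reported above; the same splitter underlies the model-selection comparison across the candidate classifiers.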
\begin{figure}[!htbp]
\centering
\includegraphics[width=.9\textwidth, clip=true]{content/Evaluation_of_existing_Systems/Kopis/kopis_train_period_selection.png}
\caption{Kopis: ROCs from datasets with different window sizes \fsCite[Figure 6]{Antonakakis:2011:DMD:2028067.2028094}}
\label{fig:kopis_train_period_selection}
\end{figure}
To get an idea of the overall detection performance, data from a period of 155 days has been analysed. In total, 150 different classification models have been produced (each trained on fifteen days with a one day sliding window, i.e. one model overlapping the next by fourteen days). Again, 10-fold cross validation has been used to obtain the false positive rates \(FP_{rates}\) and the true positive rates \(TP_{rates}\) of each of these models. The random forest classifier showed an average \(FP_{rate}\) of 0.5\% as well as a minimum and maximum \(FP_{rate}\) of 0.2\% and 1.1\% (or respectively 8, 3 and 18 domains). The \(TP_{rates}\) show an average of 99.1\% (27,072 domains), a minimum of 98.1\% (27,071 domains) and a maximum of 99.8\% (27,262 domains).
Again using those 150 datasets, the performance of each individual feature set as well as different combinations of the feature groups has been investigated. The feature group with \textit{Resolved-IPs Reputation (IPR)} features contains the most information about resource usage (e.g. filter lists, see Section~\ref{subsubsec:kopis_resolved-ips_reputation}) compared to the \textit{Requester Diversity (RD)} and \textit{Requester Profile (RP)} feature sets. Figures~\ref{fig:kopis_long_term_tp-rate} and \ref{fig:kopis_long_term_fp-rate} show, however, that the combination of \textit{RD} and \textit{RP} achieves a similar detection rate, both in terms of the true positive and the false positive rate, compared to the \textit{IPR} features. This fact is interesting when no reputation data is available, or considering future cases where IPv6 will play a much bigger role and the current reputation systems first have to be adapted to it. As has been shown for the previous two systems, \textit{Notos} and \textit{Exposure}, using all three feature groups again results in the highest detection rate for malicious and benign domains.
\begin{figure}[!htbp]
\centering
\includegraphics[width=.9\textwidth, clip=true]{content/Evaluation_of_existing_Systems/Kopis/kopis_long_term_TP-rate.png}
\caption{Kopis: \(TP_{rate}\) of the detection accuracy for different (combined) feature sets in \textit{Kopis} \fsCite[Figure 7]{Antonakakis:2011:DMD:2028067.2028094}}
\label{fig:kopis_long_term_tp-rate}
\end{figure}
\begin{figure}[!htbp]
\centering
\includegraphics[width=.9\textwidth, clip=true]{content/Evaluation_of_existing_Systems/Kopis/kopis_long_term_FP-rate.png}
\caption{Kopis: \(FP_{rate}\) of the detection accuracy for different (combined) feature sets in \textit{Kopis} \fsCite[Figure 8]{Antonakakis:2011:DMD:2028067.2028094}}
\label{fig:kopis_long_term_fp-rate}
\end{figure}
To test a more realistic scenario, another approach has been validated. This test case includes one month of data, from which 20\% of the known benign and known malicious domains have been extracted and not used for training the model (simulating zero knowledge about these domains during training). This model has been tested on the consecutive three weeks, including the 20\% benign and malicious samples as well as all other new, previously unseen (not known to the trained model) domains. This procedure has been repeated for four different months. Summarizing the results, \textit{Kopis} was able to classify new domains with an average \(TP_{rate}\) of 73.62\% and a \(FP_{rate}\) of 0.53\%. In contrast to the first results shown in this chapter (which showed a much higher \(TP_{rate}\) and a lower \(FP_{rate}\)), these results are achieved with zero knowledge of the tested domains and are as such still considered a good detection rate. This real-world value could be confirmed by detecting a previously unknown commercial botnet in China. This botnet has been identified within the first weeks of its appearance and could be removed from the internet in September 2010, before it could spread outside of China. The DDoS botnet was controlled via eighteen domain names which resolved to five IP addresses in China and one in the United States.
\todo{see section one for contributions}

Binary file not shown.

After

Width:  |  Height:  |  Size: 182 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 192 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 69 KiB

View File

@@ -47,6 +47,25 @@
description={Whois is a protocol, used to gather information about owners of domains in the domain name system and IP addresses, specified in RFC 1834.}
}
\newglossaryentry{biased_estimator}
{
name={biased},
description={The biased estimator for the standard deviation of a random variable \textit{X} is defined as \(\sqrt{\frac{1}{N}\sum_{i=1}^{N}(X_i - \bar{X})^2}\)}
}
\newglossaryentry{unbiased_estimator}
{
name={unbiased},
description={The unbiased estimator for the standard deviation of a random variable \textit{X} is defined as \(\sqrt{\frac{1}{N-1}\sum_{i=1}^{N}(X_i - \bar{X})^2}\)}
}
\newglossaryentry{roc}
{
name={receiver operating characteristic curve},
description={The ROC curve is a graphical plot of the true positive rate against the false positive rate and highlights the performance of a binary classifier.}
}
\newacronym{sri-nic}{SRI-NIC}{Stanford Research Institute - Network Information Center}
\newacronym{dns}{DNS}{Domain Name System}