diff --git a/thesis.tex b/thesis.tex index 87366f4..90579ed 100644 --- a/thesis.tex +++ b/thesis.tex @@ -89,6 +89,7 @@ \hyphenation{Proxi-max} \hyphenation{rie-mann} \hyphenation{Web-RTC} +\hyphenation{Java-Script} \usepackage{yfonts} @@ -116,19 +117,21 @@ \newcommand{\gobblecomma}[1]{\@gobble{#1}\ignorespaces} \makeatother \index{Amazon CloudFront!zzz@\gobblecomma|seealso {meek-amazon}} +\index{Android!zzz@\gobblecomma|seealso {Orbot}} \index{App Engine|see {Google App Engine}} -\index{appspot.com@\nolinkurl{appspot.com}|see {Google App Engine}} +\index{appspot.com!zzz@\gobblecomma|seealso {Google App Engine}} \index{APT29|see {Cozy Bear}} \index{AS|see {autonomous system}} \index{Azure|see {Microsoft Azure}} -\index{broker|see {Snowflake, broker}} -\index{ciphersuite|see {TLS, fingerprinting}} +\index{bridges|see {Tor bridges}} +\index{ciphersuite|see {TLS fingerprinting}} \index{CDN|see {content delivery network}} \index{CN|see {China; common name (X.509)}} \index{China!zzz@\gobblecomma|seealso {Great Firewall of China}} \index{CloudFront|see {Amazon CloudFront}} +\index{Datagram TLS|see {DTLS}} \index{decoy routing|see {refraction networking}} -\index{default Tor bridge|see {Tor Bridge, default}} +\index{default Tor bridge|see {Tor bridges, default}} \index{domain fronting!zzza@\gobblecomma|seealso {front domain}} \index{domain fronting!zzzb@\gobblecomma|seealso {meek}} \index{Domain Name System|see {DNS}} @@ -140,30 +143,40 @@ \index{GFW|see {Great Firewall of China}} \index{Google App Engine!zzz@\gobblecomma|seealso {meek-google}} \index{Hypertext Transfer Protocol|see {HTTP}} +\index{Interactive Connectivity Establishment|see {ICE}} \index{injection|see {packet injection}} \index{ISP|see {Internet service provider}} +\index{microblogging!zzz@\gobblecomma|seealso {Twitter; Sina Weibo}} \index{Microsoft Azure!zzz@\gobblecomma|seealso {meek-azure}} \index{MITM|see {man in the middle}} \index{NDSS|see {Network and Distributed System Security Symposium}} 
+\index{NAT|see {network address translation}} \index{NIDS|see {intrusion detection}} \index{network intrusion detection system|see {intrusion detection}} +\index{nickname|see {Tor bridges, nickname}} \index{OpenSSH|see {obfuscated-openssh}} \index{overblocking|see {false positive}} \index{PETS|see {Privacy Enhancing Technologies Symposium}} \index{pluggable transports!zzz@\gobblecomma|seealso {flash proxy; FTE; meek; obfs2; obfs3; obfs4; ScrambleSuit; Snowflake}} -\index{port scanning!zzz@\gobblecomma|seealso {active probing}} +\index{port scanning!zzza@\gobblecomma|seealso {active probing}} +\index{port scanning!zzzb@\gobblecomma|seealso {hybrid idle scan}} \index{precision|see {false positives}} \index{proxy discovery problem|see {proxy distribution problem}} \index{recall|see {false negatives}} -\index{SNI|see {Server Name Indication}} +\index{Server Name Indication|see {SNI}} +\index{Secure Real-time Transport Protocol|see {SRTP}} \index{Secure Sockets Layer|see {TLS}} +\index{Session Traversal Utilities for NAT|see {STUN}} +\index{SRTP with Security Descriptions|see {SDES}} \index{SSL|see {TLS}} \index{TCP!flags|see {ACK; SYN; RST}} +\index{time to live|see {TTL}} +\index{TLS!zzz@\gobblecomma|seealso {DTLS}} \index{Transport Layer Security|see {TLS}} \index{Transmission Control Protocol|see {TCP}} \index{type~I error|see {false positive}} \index{type~II error|see {false negative}} -\index{time to live|see {TTL}} +\index{Traversal Using Relays around NAT|see {TURN}} \index{User Datagram Protocol|see {UDP}} \index{virtual private network|see {VPN}} \index{VoIP|see {voice over IP}} @@ -172,9 +185,10 @@ \index{Secure Shell|see {SSH}} \index{Uniform Resource Locator|see {URL}} \index{U.S.|see {United States of America}} +\index{web browser!zzz@\gobblecomma|seealso {Chrome; Firefox; Tor Browser}} \index{World Wide Web!zzz@\gobblecomma|seealso {HTTP; HTTPS}} -\index{Tor bridge!zzz@\gobblecomma|seealso {Azadi, cymrubridge31, cymrubridge33, fdctorbridge01, GreenBelt, 
JonbesheSabz, LeifEricson, Lisbeth, MaBishomarim, Mosaddegh, ndnop3, ndnop4, ndnop5, noether, NX01, riemann}} +\index{Tor!bridges!zzz@\gobblecomma|seealso {Azadi, cymrubridge31, cymrubridge33, fdctorbridge01, GreenBelt, JonbesheSabz, LeifEricson, Lisbeth, MaBishomarim, Mosaddegh, ndnop3, ndnop4, ndnop5, noether, NX01, riemann}} \begin{document} @@ -364,7 +378,7 @@ Some other censorship-related topics that are \emph{not} in scope include: \begin{itemize} \item domain takedowns (affecting all clients globally) \item server-side blocking (servers that refuse to serve certain clients) -\item forum moderation and deletion of social media posts +\item forum moderation and deletion of social media posts\index{forum moderation}\index{social media} \item anything that takes place entirely within the censor's network and does not cross the border \item deletion-resistant publishing in the vein of @@ -433,7 +447,7 @@ but I~have less experience (especially implementation experience) with other systems, particularly those -that are developed in languages other than English\index{English}. +that are developed in languages other than English\index{English language}. And while I~have plenty of operational experience---deploying and maintaining systems with real users---I~have not been in a situation where I~needed @@ -600,7 +614,7 @@ and try to predict the reactions of a censor, real-world testing is expensive. If you really want to test a design against a censor, not only must you write and deploy an implementation, -integrate it with client-facing software like web browser, +integrate it with client-facing software like web browsers\index{web browser}, and work out details of its distribution---you must also attract enough users to merit a censor's attention. @@ -708,21 +722,11 @@ while still blocking most of what they intend to. (Another way to think of it is: reducing false positives\index{false positive} without increasing false negatives\index{false negative}.) 
-For example, it has been repeatedly documented---by -Clayton et~al.~\indexauthors{\cite{Clayton2006a}}, -Winter and Lindskog~\indexauthors{\cite{Winter2012a}}, -and Fifield, Tsai, and Zhong~(\autoref{chap:proxy-probe}), -for example---that the Great Firewall\index{Great Firewall of China} -prefers to block individual ports -rather than blocking an entire IP address, +For example, Winter and Lindskog~\indexauthors{\cite{Winter2012a}} +observed that the Great Firewall\index{Great Firewall of China} +preferred to block individual ports +rather than entire IP addresses, probably in a bid to reduce collateral damage. -In \autoref{chap:domain-fronting} we will see a system -whose blocking resistance is based on widely used web services---the -argument is that to block the circumvention system, -the censor would have to block the entire web service. -However this argument requires that the circumvention system's -use of the web service be indistinguishable\index{distinguishability} from other uses---otherwise -the censor may selectively block only the connections used for circumvention. Local circumstances may serve to reduce collateral damage: for example if a domestic replacement exists for a foreign service, the censor may block @@ -981,7 +985,7 @@ ScrambleSuit, and obfs4---which I~like because they illustrate the mutual advances of censors and circumventors over several years. -obfs2\index{obfs2|textbf}~\cite{obfs2}, which debuted in 2012 in response to +obfs2\index{obfs2}~\cite{obfs2}, which debuted in 2012 in response to blocking in Iran\index{Iran}~\cite{tor-blog-obfsproxy-next-step-censorship-arms-race}, uses very simple obfuscation inspired by obfuscated-openssh: it is essentially equivalent to sending an encryption key, @@ -994,7 +998,7 @@ and it is vulnerable to active probing\index{active probing} attacks, where the censor speculatively connects to servers to see what protocols they use. 
However, it sufficed against the keyword-\index{keywords} and pattern-based censors of its era. -obfs3\index{obfs3|textbf}~\cite{obfs3}---first available in 2013 +obfs3\index{obfs3}~\cite{obfs3}---first available in 2013 but not really released to users until 2014~\cite{tor-blog-tor-browser-36-released}---was designed to fix the passive detectability of its predecessor. @@ -1006,7 +1010,7 @@ and remains vulnerable to active probing\index{active probing}. had begun active-probing for obfs2 by January 2013, and for obfs3 by August 2013---see \autoref{tab:active-probing-timeline}.) -ScrambleSuit\index{ScrambleSuit|textbf}~\cite{Winter2013b}, +ScrambleSuit\index{ScrambleSuit}~\cite{Winter2013b}, first available to users in 2014~\cite{tor-blog-tor-browser-364-and-40-alpha-1-are-released}, arose in response to the active-probing of obfs3. Its innovations were the use of an out-of-band secret @@ -1207,7 +1211,7 @@ In contrast, disposable proxies may last only minutes or hours. Setting up a Tor bridge or even something lighter-weight like a SOCKS\index{SOCKS} proxy still requires installing some software on a server somewhere. -Flash proxy and Snowflake proxies have a low set-up and tear-down cost: +The proxies of flash proxy and Snowflake have a low set-up and tear-down cost: you can run one just by visiting a web page. These designs do not need a sophisticated proxy distribution strategy as long as the rate of proxy creation is kept higher than the censor's @@ -1379,7 +1383,7 @@ give rise to an unavoidable ``eavesdropper's dilemma\index{eavesdropper's dilemm Monitor evasion techniques can be used to reduce -a censor's sphere of visibility---eliminating certain +a censor's sphere of visibility---remove certain traffic features from its consideration. Crandall et~al.~\indexauthors{\cite{Crandall2007a}} in 2007 suggested using IP fragmentation\index{fragmentation} to prevent keyword\index{keywords} matching. @@ -1456,7 +1460,7 @@ were all of this kind. 
These systems were effective against their censors of their day---at least with respect to the blocking of destinations. They had the major advantage of requiring no -special client-side software other than a web browser\index{web browser+}. +special client-side software other than a web browser\index{web browser}. The difficulty they faced was second-order blocking as censors discovered and blocked the proxies themselves. Circumvention designers deployed some countermeasures; @@ -1488,7 +1492,7 @@ such as tweaking a protocol or using an alternative DNS\index{DNS} server. (We see the same progression play out again when countries first begin to experiment with censorship, such as in Turkey\index{Turkey} in 2014, where alternative DNS servers\index{DNS} -briefly sufficed to circumvent a block of Twitter\index{Twitter}~\cite{theguardian-how-to-get-around-turkeys-twitter-ban}.\index{DNS!blocking}) +briefly sufficed to circumvent a block of Twitter\index{Twitter}\index{social media}~\cite{theguardian-how-to-get-around-turkeys-twitter-ban}.\index{DNS!blocking}) Not only censors were changing---the world around them was changing as well. In field of circumvention, which is so heavily affected by concerns @@ -1519,8 +1523,8 @@ just as censors do. \chapter{Understanding censors} \label{chap:censor-modeling} -The main tool we have to build relevant threat models -is the natural study of censors. +The main tool we have to build relevant threat models\index{modeling} +is the study of censors. The study of censors is complicated by difficulty of access: censors are not forthcoming about their methods. % firewall vendors don't allow you to study them? @@ -1528,19 +1532,16 @@ Researchers are obligated to treat censors as a black box, drawing inferences about their internal workings from their externally visible characteristics. The easiest thing to learn is the censor's \emph{what}---the -destinations that are blocked. +destinations and contents that are blocked. 
Somewhat harder is the investigation into \emph{where} and \emph{how}, the specific technical mechanisms used to effect censorship and where they are deployed in the network. Most difficult to infer is the \emph{why}, the motivations and goals that underlie an apparatus of censorship. -The \emph{why} gets to the heart of why circumvention is even possible: -a censoring firewall's true purpose is not only blocking; -the blocking is done in pursuit of some other goal. -From the survey of measurement studies -we may draw some general conclusions. +From past measurement studies +we may draw a few general conclusions. Censors change over time, and not always in the direction of more restrictions. Censorship differs greatly @@ -1549,26 +1550,26 @@ but in mechanism and motivation. However it is reasonable to assume a basic set of capabilities that many censors have in common: \begin{itemize} -\item blocking of specific IP addresses and ports +\item blocking of specific IP addresses and ports\index{blocking!by address} \item control of default DNS servers\index{DNS} \item blocking DNS queries\index{DNS!blocking} \item injection of false DNS responses\index{DNS!poisoning} \item injection of TCP\index{TCP} RSTs\index{RST (TCP flag)} -\item pattern matching / keyword filtering -\item application protocol parsing (``deep packet inspection'') -\item participation in a circumvention system as a client -\item scanning to discover proxies -\item throttling connections -\item temporary total shutdowns +\item keyword\index{keyword filtering} filtering in unencrypted\index{encryption} contents +\item application protocol parsing (``deep packet inspection''\index{deep packet inspection}) +\item participation in a circumvention system as a client\index{insider attack} +\item scanning to discover proxies\index{port scanning}\index{active probing} +\item throttling connections\index{throttling} +\item temporary total shutdowns\index{shutdown} \end{itemize} -Not all censors will be able 
to---or be motivated to---do all of these. +Not all censors will be able---or motivated---to do all of these. As the amount of traffic to be handled increases, -in-path attacks such as throttling become relatively more expensive. +in-path attacks such as throttling\index{throttling} become relatively more expensive. Whether a particular act of censorship even makes sense will depend on a local cost--benefit analysis, -a weighing of the expected gains against the potential collateral damage. -Some censors may be able to tolerate a brief total shutdown, -while for others the importance of the Internet is too great +a weighing of the expected gains against the potential collateral damage\index{collateral damage}. +Some censors may be able to tolerate a brief total shutdown\index{shutdown}, +while for others the importance of Internet connectivity is too great for such a blunt instrument. The Great Firewall of China (GFW)\index{Great Firewall of China}, @@ -1580,6 +1581,21 @@ But the GFW is in many ways an outlier, and not representative of other censors. A~worldwide view is needed. +\index{ethics|(} +Building accurate models of censor behavior is not only +needed for the purpose of circumvention. +It also has implications for ethical measurement~\cites[\S 5]{Wright2011a}[\S 2]{Jones2015a}. +For example, a common way to test for censorship is +to ask volunteers to run software that connects to potentially censored +destinations and records the results. +This potentially puts volunteers at risk. +Suppose the software accesses a destination +that violates local law. +Could the volunteer be held liable for the access? +Quantifying the degree of risk depends on modeling\index{modeling} +how a censor will react to a given stimulus~\cite[\S 2.2]{Crandall2015a}. +\index{ethics|)} + % Past measurement studies have done well at % determining the technical aspects of censorship, % for example where in the network censorship routers are located. 
@@ -1607,124 +1623,105 @@ A~worldwide view is needed. % In general, contemporary threat models tend to ignore % resource limitations on the part of the censor. -\todo[inline]{ -Questions of ethics are tied to models of censors; -e.g., will a censor arrest/harm someone who is caught circumventing? -What URLs are ``safe'' to probe in a measurement system? -Wright et~al.\ ``Fine-Grained Censorship Mapping: Information Sources, Legality and Ethics''~\cite{Wright2011a}. -Jones et~al.\ ``Ethical Concerns for Censorship Measurement''~\cite{Jones2015a-local}. -Crandall et~al.\ ``Forgive Us our SYNs: Technical and Ethical Considerations for Measuring Internet Filtering''~\cite{Crandall2015a}. -} - \section{Censorship measurement studies} \label{sec:measurement-surveys} -A~large part of censorship research is composed of +A~large part of research on censorship is composed of studies of censor behavior in the wild. In this section I~summarize past studies, which, taken together, present a picture of censor behavior in general. They are based on those in an evaluation study done by -me and others in 2016~\cite[\S IV.A]{Tschantz2016a-local}. -The studies are diverse and hard to categorize. -Here, I have grouped them according whether they -were one-time measurements or long-term projects, -and whether they looked at more than one censor (or country). - -Thus published knowledge about censors' capabilities -consists mostly of a series of ``spot checks'' -with blank areas between them. -There have been a few designs proposed to -do ongoing measurements of censorship, -such as ConceptDoppler~\cite{Crandall2007a} in 2007 and -CensMon~\cite{Sfakianakis2011a} in 2011, -but these have not lasted long in practice, -and for the most part there is an unfortunate lack of longitudinal -and cross-country measurements. - -\todo[inline]{ -Zittrain and Edelman ``Internet filtering in China''~\cite{Zittrain2003a}. -\textsl{Access Denied}~\cite{OpenNet2008AccessDenied}. 
-Wright ``Regional Variation in Chinese Internet Filtering''~\cite{Wright2012a}. -Mathrani and Alipour ``Website Blocking Across Ten Countries: A Snapshot''~\cite{Mathrani2010a}. -Aase et~al.\ ``Whiskey, Weed, and Wukan on the World Wide Web: On Measuring Censors' Resources and Motivations''~\cite{Aase2012a}. -Dalek et~al.\ ``O Pakistan, We Stand on Guard for Thee''~\cite{CitizenLab2013opakistan}. -Marquis-Boire et~al.\ ``Planet Blue Coat''\cite{Marquis2013planet}. -Anderson ``Splinternet Behind the Great Firewall of China''~\cite{Anderson2012splinternet}. -Dalek et~al.\ ``A Method for Identifying and Confirming the Use of URL Filtering Products for Censorship''~\cite{Dalek2013a-local}. -Gill et~al.\ ``Characterizing Web Censorship Worldwide: Another Look at the OpenNet Initiative Data''~\cite{Gill2015a}. -Aceto and Pescapè ``Analyzing Internet Censorship in Pakistan''~\cite{Aceto2016a}. -Gwagwa ``A study of {Internet}-based information controls in {Rwanda}''~\cite{Gwagwa_a_study_of_internet-based_information_controls_in_rwanda}. -} +me and others in 2016~\indexauthors{\cite[\S IV.A]{Tschantz2016a-local}}. +The studies are diverse and there are many +possible ways to categorize them. +Here, I have divided them into +one-time experiments and generic measurement platforms. +% Wright ``Regional Variation in Chinese Internet Filtering''~\cite{Wright2012a}. +% Mathrani and Alipour ``Website Blocking Across Ten Countries: A Snapshot''~\cite{Mathrani2010a}. +% Gill et~al.\ ``Characterizing Web Censorship Worldwide: Another Look at the OpenNet Initiative Data''~\cite{Gill2015a}. +% Gwagwa ``A study of {Internet}-based information controls in {Rwanda}''~\cite{Gwagwa_a_study_of_internet-based_information_controls_in_rwanda}. \subsection*{One-shot studies} One of the earliest technical studies of censorship occurred -in a place you might not expect, the German state of -North Rhein-Westphalia. 
-In 2003, Dornseif~\cite{Dornseif2003a} tested ISPs'\index{Internet service provider} implementation -of a controversial legal order to block web sites. +in a place you might not expect, the German\index{Germany} state of +North Rhein-Westphalia\index{North Rhein-Westphalia}. +Dornseif~\indexauthors{\cite{Dornseif2003a}} tested ISPs'\index{Internet service provider} implementation +of a controversial legal order to block web sites circa 2002. While there were many possible ways to implement the block, none were trivial to implement, nor free of overblocking side effects. The most popular implementation used DNS tampering,\index{DNS!poisoning} which is returning (or injecting) false responses to DNS requests -for the domain names of the blocked sites. +for the blocked sites. An in-depth survey of DNS tampering found a variety of implementations, some blocking more and some blocking less than required by the order. -This time period seems to be near the onset of DNS tampering in general; -Dong~\cite{Dong2002a} had reported it in China in late~2002. - -Clayton~\cite{Clayton2006b} in 2006 studied a ``hybrid'' blocking system, -CleanFeed\index{CleanFeed} by the British ISP\index{Internet service provider} BT\index{BT}, +This time period seems to mark the beginning of censorship by DNS tampering in general; +Dong~\indexauthors{\cite{Dong2002a}} reported it in China\index{China} in late~2002. + +Zittrain and Edelman~\indexauthors{\cite{Zittrain2003a}} +used open proxies\index{open proxy} to experimentally +analyze censorship in China\index{China} in late 2002. +They tested around 200,000 web sites and found +around 19,000 of them to be blocked. +There were multiple methods of censorship: +web server IP address blocking\index{blocking!by address}, +DNS\index{DNS} server IP address blocking\index{blocking!by address}, +DNS poisoning\index{DNS!poisoning}, +and keyword filtering\index{keyword filtering}. 
+ +Clayton~\indexauthors{\cite{Clayton2006b}} in 2006 studied a ``hybrid'' blocking system, +CleanFeed\index{CleanFeed} by the British\index{United Kingdom} ISP\index{Internet service provider} BT\index{BT}, that aimed for a better balance of costs and benefits: a ``fast path'' IP address and port matcher -acted as a prefilter for the ``slow path,'' a full HTTP proxy. +acted as a prefilter for the ``slow path,'' a full HTTP proxy\index{HTTP proxy}. The system, in use since 2004, -was designed to block access to any of a secret list of -pedophile web sites compiled by a third party. -The author identifies ways to circumvent or attack such a system: -use a proxy, use source routing to evade the blocking router, -obfuscate requested URLs, use an alternate IP address or port, -return false DNS\index{DNS} results to put third parties on the ``bad'' list. -They demonstrate that the two-level nature of the blocking system -unintentionally makes it an oracle -that can reveal the IP addresses of sites in the secret blocking list. - -In 2006, Clayton, Murdoch, and Watson~\cite{Clayton2006a} -further studied the technical aspects of the Great Firewall of China. +was designed to block access to any of a secret list of web sites. +The system was vulnerable to a number of evasions, +such as using a proxy, using an alternate IP address or port, +and obfuscating URLs\index{URL}. +The two-level nature of the blocking system +unintentionally made it an oracle +that could reveal the IP addresses of sites in the secret blocking list. + +In 2006, Clayton, Murdoch, and Watson~\indexauthors{\cite{Clayton2006a}} +further studied the technical aspects of the Great Firewall of China\index{Great Firewall of China}. They relied on an observation that the firewall was symmetric, treating incoming and outgoing traffic equally. By sending web requests from outside the firewall to a web server inside, they could provoke the same blocking behavior that someone on the inside would see. 
-They sent HTTP requests containing forbidden keywords (e.g., ``falun'') -caused the firewall to inject RST packets\index{RST (TCP flag)} +They sent HTTP\index{HTTP} requests containing forbidden keywords\index{keywords} +that caused the firewall to inject RST packets\index{RST (TCP flag)}\index{packet injection} towards both the client and server. -Simply ignoring RST packets (on both ends) +Simply ignoring RST\index{RST (TCP flag)} packets (on both ends) rendered the blocking mostly ineffective. -The injected packets had inconsistent TTLs and other anomalies +The injected packets had inconsistent TTLs\index{TTL} and other anomalies that enabled their identification. -Rudimentary countermeasures such as splitting keywords -across packets were also effective in avoiding blocking. -The authors of this paper bring up an important point -that would become a major theme of future censorship modeling: +Rudimentary countermeasures, such as splitting keywords +across packets, were also effective in avoiding blocking. +The authors brought up an important point +that would become a major theme of future censorship modeling\index{modeling}: censors are forced to trade blocking effectiveness against performance. In order to cope with high load at a reasonable costs, -censors may choose the architecture of a network monitor\index{network monitor} -or intrusion detection system, +censors may employ the ``on-path'' architecture of a +network monitor\index{network monitor} +or intrusion detection system\index{intrusion detection}; i.e., one that can passively monitor and inject packets, -but cannot delay or drop them. - -Nearly contemporary studies by Wolfgarten~\cite{Wolfgarten2006a} -and Tokachu~\cite{Tokachu2006a} found cases of -DNS tampering, search engine filtering, and RST injection\index{RST (TCP flag)}\index{packet injection+} -caused by keyword detection. 
-In 2007, Lowe, Winters, and Marcus~\cite{Lowe2007a} -did detailed experiments on DNS tampering in China.\index{DNS!poisoning} +but cannot delay or drop them\index{packet dropping}. + +Contemporaneous studies of the Great Firewall\index{Great Firewall of China} +by Wolfgarten~\indexauthors{\cite{Wolfgarten2006a}} +and Tokachu~\indexauthors{\cite{Tokachu2006a}} found cases of +DNS tampering\index{DNS!poisoning}, +search engine filtering, and RST injection\index{RST (TCP flag)}\index{packet injection+} +caused by keyword\index{keyword filtering} detection. +In 2007, Lowe, Winters, and Marcus~\indexauthors{\cite{Lowe2007a}} +did detailed experiments on DNS tampering in China\index{China}.\index{DNS!poisoning} They tested about 1,600 recursive DNS servers in China against a list of about 950 likely-censored domains. For about 400 domains, responses came back with bogus IP addresses, @@ -1736,38 +1733,41 @@ Canada\index{Canada}, China\index{China}, Hong Kong\index{Hong Kong}, and the U.S.\index{United States of America} -By manipulating TTLs\index{TTL}, the authors found that the false responses -were injected by an intermediate router: +By manipulating the IP time-to-live field\index{TTL}, +the authors found that the false responses +were injected by an intermediate router, +evidenced by the fact that the authentic response would be received as well, only later. A more comprehensive survey~\cite{Anonymous2014a} -of DNS tampering\index{DNS!poisoning} and injection occurred in 2014, +of DNS tampering\index{DNS!poisoning} occurred in 2014, giving remarkable insight into the internal structure of the censorship machines. -DNS injection happens only at border routers. -IP ID and TTL analysis show that each node -is a cluster of several hundred processes -that collectively inject censored responses. -They found 174 bogus IP addresses, more than previously documented. -They extracted a blacklist of about 15,000 keywords. - +DNS injection happened only at border routers. 
+IP ID and TTL\index{TTL} analysis showed that each node +was a cluster of several hundred processes +that collectively injected censored responses. +They found 174 bogus IP addresses, more than previously documented, +and extracted a blacklist of about 15,000 keywords. + +\index{Great Firewall of China|(} The Great Firewall, because of its unusual sophistication, has been an enduring object of study. Part of what makes it interesting is its many blocking modalities, both active and passive, proactive and reactive. -The ConceptDoppler project of Crandall et~al.~\cite{Crandall2007a} +The ConceptDoppler\index{ConceptDoppler} project of Crandall et~al.~\indexauthors{\cite{Crandall2007a}} measured keyword filtering by the Great Firewall -and showed how to discover new keywords automatically +and showed how to discover new keywords\index{keyword filtering} automatically by latent semantic analysis, using -the Chinese-language Wikipedia as a corpus. +the Chinese-language\index{Chinese language} Wikipedia\index{Wikipedia} as a corpus. They found limited statefulness in the firewall: -sending a naked HTTP request +sending a naked HTTP\index{HTTP} request without a preceding SYN\index{SYN (TCP flag)} resulted in no blocking. -In 2008 and 2009, Park and Crandall~\cite{Park2010a} -further tested keyword filtering of HTTP responses. +In 2008 and 2009, Park and Crandall~\indexauthors{\cite{Park2010a}} +further tested keyword filtering of HTTP\index{HTTP} responses. Injecting RST packets\index{RST (TCP flag)} into responses is more difficult than doing the same to requests, because of the greater uncertainty in predicting -TCP\index{TCP} sequence numbers +TCP\index{TCP!sequence numbers+} sequence numbers once a session is well underway. In fact, RST injection\index{RST (TCP flag)} into responses was hit or miss, succeeding only 51\% of the time, @@ -1776,176 +1776,213 @@ They also found inconsistencies in the statefulness of the firewall. 
Two of ten injection servers would react to a naked HTTP request; that it, one sent outside of an established TCP\index{TCP} connection. The remaining eight of ten required an established TCP connection. -Xu et~al.~\cite{Xu2011a} continued the theme of keyword filtering in 2011, +Xu et~al.~\indexauthors{\cite{Xu2011a}} continued the theme of keyword filtering in 2011, with the goal of -discovering where filters are located at the IP and AS levels. +discovering where filters are located at the +IP and autonomous system\index{autonomous system} levels. Most filtering is done at border networks -(autonomous systems with at least one non-Chinese peer). +(autonomous systems with at least one peer outside China). In their measurements, the firewall was fully stateful: blocking was never -triggered by an HTTP request outside an established TCP\index{TCP} connection. -Much filtering occurs +triggered by an HTTP\index{HTTP} request outside an established TCP\index{TCP} connection. +Much filtering occurred at smaller regional providers, rather than on the network backbone. +Anderson~\cite{Anderson2012splinternet} +gave a detailed description of the design of the Great Firewall +in 2012. +He described IP address blocking\index{blocking!by address} +by null routing, +RST\index{RST (TCP flag)} injection\index{packet injection}, +and DNS poisoning\index{DNS!poisoning}, +and documented cases of collateral damage affecting clients inside +and outside China. +\index{Great Firewall of China|)} + +Dainotti et~al.~\indexauthors{\cite{Dainotti2011a}} +reported on the total Internet shutdowns\index{shutdown} +that took place in Egypt\index{Egypt} and Libya\index{Libya} +in the early months of 2011. +They used multiple measurements to document +the outages as they occurred. +During the shutdowns, they measured a drop in scanning traffic +(mainly from the Conficker\index{Conficker} botnet\index{botnet}). 
+By comparing these different measurements, +they showed that the shutdown in Libya was accomplished +in more than one way, +both by altering network routes and by firewalls dropping packets\index{packet dropping}. -Winter and Lindskog~\cite{Winter2012a}, -and later Ensafi et~al.~\cite{Ensafi2015b} -did a formal investigation into active probing, +\index{Great Firewall of China|(} +Winter and Lindskog~\indexauthors{\cite{Winter2012a}}, +and later Ensafi et~al.~\indexauthors{\cite{Ensafi2015b}} +did a formal investigation into active probing\index{active probing}, a reported capability of the Great Firewall since around October 2011. -They focused on the firewall's probing of Tor +They focused on the firewall's probing of Tor\index{Tor!protocol} and its most common pluggable transports\index{pluggable transports}. +\index{Great Firewall of China|)} -Anderson~\cite{Anderson2013a-local} -documented network throttling in Iran, +\index{throttling|(} +Anderson~\indexauthors{\cite{Anderson2013a-local}} +documented network throttling in Iran\index{Iran}, which occurred over two major periods between 2011 and 2012. Throttling degrades network access without totally blocking it, and is harder to detect than blocking. Academic institutions were affected by throttling, but less so than other networks. -Aryan et~al.~\cite{Aryan2013a} -tested censorship in Iran +Aryan et~al.~\indexauthors{\cite{Aryan2013a}} +tested censorship in Iran\index{Iran} during the two months before the June 2013 presidential election. -They found multiple blocking methods: HTTP request keyword filtering, +They found multiple blocking methods: HTTP\index{HTTP} request keyword filtering\index{keyword filtering}, DNS tampering\index{DNS!poisoning}, and throttling. -The most usual method was HTTP request filtering. +The most usual method was HTTP request filtering; DNS tampering (directing to a blackhole IP address) -affected only three domains: -facebook.com, -youtube.com, and -plus.google.com. 
-SSH connections were throttled down to about 15\% +affected only the three domains +\nolinkurl{facebook.com}\index{facebook.com@\nolinkurl{facebook.com}}, +\nolinkurl{youtube.com}\index{youtube.com@\nolinkurl{youtube.com}}, and +\nolinkurl{plus.google.com}\index{plus.google.com@\nolinkurl{plus.google.com}}. +SSH\index{SSH} connections were throttled down to about 15\% of the link capacity, -while randomized protocols were throttled almost down to zero -60 seconds into a connection's lifetime. -Throttling seemed to be achieved by dropping packets, thereby -forcing TCP's\index{TCP} usual recovery. - -Khattak et~al.~\cite{Khattak2013a} -evaluated the Great Firewall from the perspective that it works like +while randomized protocols\index{look-like-nothing transport} +were throttled almost down to zero, +60~seconds into a connection's lifetime. +Throttling seemed to be achieved by dropping packets\index{packet dropping}, +which causes TCP\index{TCP} to slow down. +\index{throttling|)} + +Khattak et~al.~\indexauthors{\cite{Khattak2013a}} +evaluated the Great Firewall\index{Great Firewall of China} from the perspective that it works like an intrusion detection system\index{intrusion detection} or network monitor\index{network monitor}, -and applied existing technique for evading a monitor -the the problem of circumvention. +and applied existing techniques for evading a monitor +to the problem of circumvention. They looked particularly for ways to evade detection that are expensive for the censor to remedy. -They found that the firewall is stateful, +They found that the firewall was stateful, but only in the client-to-server direction. -The firewall is vulnerable to a variety of TCP-\index{TCP} and HTTP-based evasion -techniques, such as overlapping fragments, TTL-limited packets, -and URL encodings. 
+The firewall was vulnerable to a variety of TCP-\index{TCP} and HTTP-based\index{HTTP} evasion +techniques, such as overlapping fragments\index{fragmentation}, TTL-limited\index{TTL} packets, +and URL\index{URL} encodings. -Nabi~\cite{Nabi2013a} -investigated web censorship in Pakistan in 2013, using a publicly known +Nabi~\indexauthors{\cite{Nabi2013a}} +investigated web censorship in Pakistan\index{Pakistan} in 2013, using a publicly available list of banned web sites. -They tested on 5 different networks in Pakistan. Over half of the sites on the list were blocked by DNS tampering\index{DNS!poisoning}; -less than 2\% were additionally blocked by HTTP filtering -(an injected redirection before April 2013, -or a static block page after that). +less than 2\% were additionally blocked by HTTP\index{HTTP} filtering +(an injected redirect before April 2013, +or a static block page\index{block page} after that). They conducted a small survey to find the most -commonly used circumvention methods in Pakistan. -The most used method was public VPNs, at 45\% of respondents. - -Ensafi et~al.~\cite{Ensafi2015a} +commonly used circumvention methods; +the most common was public VPNs\index{VPN}, at 45\% of respondents. +Khattak et~al.~\indexauthors{\cite{Khattak2014a}} +looked at two censorship events that took place in Pakistan\index{Pakistan} +in 2011 and 2012. +Their analysis is special because unlike most studies of censorship, +theirs uses traffic traces taken directly from an ISP. +They observe that users quickly switched to TLS-based +circumvention following a block of YouTube\index{YouTube}. +The blocks had side effects beyond a loss of connectivity: +the ISP had to deal with more ciphertext than before, +and users turned to alternatives for the blocked sites. +Their survey found that the most common method of circumvention +was VPNs\index{VPN}. +Aceto and Pescapè~\indexauthors{\cite{Aceto2016a}} +revisited Pakistan\index{Pakistan} in 2016. 
+Their analysis of six months of active measurements in five ISPs +showed that blocking techniques differed across ISPs; +some used DNS poisoning\index{DNS!poisoning} and +others used HTTP\index{HTTP} filtering. +They did their own survey of commonly used circumvention technologies, +and again the winner was VPNs\index{VPN} with 51\% of respondents. + +Ensafi et~al.~\indexauthors{\cite{Ensafi2015a}} employed an intriguing technique to measure censorship -from many locations in China---a ``hybrid idle scan.'' +from many locations in China---a ``hybrid idle scan\index{hybrid idle scan}.'' The hybrid idle scan allows one to test TCP\index{TCP} connectivity between two Internet hosts, without needing to control either one. They selected roughly uniformly geographically distributed sites -in China from which to measure connectivity to -Tor relays, Tor directory authorities, +in China\index{China} from which to measure connectivity to +Tor\index{Tor} relays, Tor directory authorities\index{Tor!directory authorities}, and the web servers of popular Chinese web sites. There were frequent failures of the firewall resulting -in temporary connectivity, typically lasting in bursts of hours. +in temporary connectivity, typically occurring in bursts of hours. -In 2015, Marczak et~al.~\cite{Marczak2015a-local} +In 2015, Marczak et~al.~\indexauthors{\cite{Marczak2015a-local}} investigated an innovation in the capabilities -of the border routers of China, -an attack tool dubbed the ``Great Cannon.'' -The cannon was responsible for denial-of-service attacks -on Amazon CloudFront and GitHub\index{GitHub}. +of the border routers of China\index{China}, +an attack tool dubbed the Great Cannon\index{Great Cannon}. +The cannon was responsible for denial-of-service\index{denial of service} attacks +on Amazon CloudFront\index{Amazon CloudFront} and GitHub\index{GitHub}. 
The unwitting participants in the attack were -web browsers located outside of China, +web browsers\index{web browser} located \emph{outside} of China, who began their attack when the cannon injected -malicious JavaScript into certain HTTP responses -originating in China. -The new attack tool is noteworthy because it demonstrated +malicious JavaScript\index{JavaScript} into certain HTTP responses +originating inside of China. +The new attack tool was noteworthy because it demonstrated previously unseen in-path behavior, -such as packet dropping. +such as packet dropping\index{packet dropping}. A major aspect of censor modeling is that many censors use commercial firewall hardware. -A case in point is the analysis by -Chaabane et~al.~\cite{Chaabane2014a} -of 600 GB of leaked logs from Blue Coat proxies -used for censorship in Syria. +Dalek et~al.~\indexauthors{\cite{CitizenLab2013opakistan}}, +Dalek et~al.~\indexauthors{\cite{Dalek2013a-local}}, +and Marquis-Boire et~al.~\indexauthors{\cite{Marquis2013planet}} +documented the use of commercial firewalls made by +Blue Coat\index{Blue Coat}, McAfee\index{McAfee}, and Netsweeper\index{Netsweeper} +in a number of countries. +Chaabane et~al.~\indexauthors{\cite{Chaabane2014a}} +analyzed 600 GB of leaked logs from Blue Coat\index{Blue Coat} proxies +that were being used for censorship in Syria\index{Syria}. The logs cover 9 days in July and August 2011, -and contain an entry for every HTTP request. -The authors of the study found evidence of IP address blocking, -domain name blocking, and HTTP request keyword blocking, -and also of users circumventing censorship -by downloading circumvention software or using the Google cache. -All subdomains of .il, the top-level domain for Israel, +and contain an entry for every HTTP\index{HTTP} request. 
+The authors of the study found evidence of IP address blocking\index{blocking!by address}, +DNS\index{DNS} blocking, and HTTP\index{HTTP} request keyword blocking\index{keyword filtering}; +and also evidence of users circumventing censorship +by downloading circumvention software or using the cache feature of Google\index{Google} search. +All subdomains of .il\index{.il}, the top-level domain for Israel\index{Israel}, were blocked, as were many IP address ranges in Israel. Blocked URL keywords included ``proxy'', -``hotspotshield'', -``israel'', and -``ultrasurf'' -(resulting in collateral damage to the Google Toolbar -and Facebook Like button because they have ``proxy'' in HTTP requests). -Tor was only lightly censored---only one of several proxies +which resulted in collateral damage\index{collateral damage} to the Google Toolbar\index{Google} +and the Facebook like button\index{Facebook} because they included the string ``proxy'' in HTTP\index{HTTP} requests. +Tor\index{Tor!protocol} was only lightly censored: only one of several proxies blocked it, and only sporadically. +\subsection*{Generic measurement platforms} -\subsection*{Multiple-location studies} - -For a decade, the OpenNet Initiative produced reports +For a decade, the OpenNet Initiative\index{OpenNet Initiative} produced reports on Internet filtering and surveillance in dozens of countries, until it ceased operation in 2014. -For example, their 2005 report on Internet filtering in China~\cite{oni-china-2005} +For example, their 2005 report on Internet filtering in China\index{China}~\cite{oni-china-2005} studied the problem from many perspectives, political, technical, and legal. -They translated and interpreted Chinese laws relating to the Internet, -which provide strong legal justifications for filtering. -The laws regulate both Internet users and service providers, -including cybercafes. 
-They prohibit the transfer of information that is indecent, -subversive, false, criminal, or that reveals state secrets. -The OpenNet Initiative tested the extent of filtering -of web sites, search engines, blogs, and email. +They tested the extent of filtering +of web sites, search engines, blogs\index{blog}, and email\index{email}. They found a number of blocked web sites, some related to news and politics, and some on sensitive subjects -such as Tibet and Taiwan. -In some cases, entire sites (domains) were blocked; -in others, only specific pages within a larger site were blocked. -In a small number of cases, sites were accessible by -IP address but not by domain name. -There were cases of overblocking: apparently inadvertently blocked sites -that simply shared an IP address or URL keyword +such as Tibet\index{Tibet} and Taiwan\index{Taiwan}. +In some cases, entire domains were blocked\index{blocking!by address}; +in others, only specific URLs within the domain were blocked\index{URL!filtering}. +There were cases of overblocking\index{false positive}: apparently inadvertently blocked sites +that happened to share an IP address or URL keyword\index{keyword filtering} with an intentionally blocked site. -On seeing a prohibited keyword, the firewall blocked connections -by injecting a TCP\index{TCP} RST packet\index{RST (TCP flag)} to tear down the connection, +The firewall terminated connections +by injecting\index{packet injection} +a TCP\index{TCP} RST packet\index{RST (TCP flag)}, then injecting a zero-sized TCP window\index{TCP!window size}, which would prevent any communication with the same server for a short time. Using technical tricks, the authors inferred -that Chinese search engines index blocked sites +that Chinese search engines indexed blocked sites (perhaps having a special exemption from the general firewall policy), -but do not return them in search results. 
-% https://opennet.net/bulletins/005/ -The firewall blocks access searches for certain keywords on Google -as well as the Google Cache---but the latter could be worked around -by tweaking the format of the URL. -% https://opennet.net/bulletins/006/ -Censorship of blogs comprised keyword blocking -by domestic blogging services, -and blocking of external domains such as blogspot.com. -% https://opennet.net/bulletins/008/ -Email filtering is done by the email providers themselves, +but did not return them in search results~\cite{oni-bulletin-005}. +Censorship of blogs included keyword blocking\index{keyword filtering} +by domestic blogging\index{blog} services, +and blocking of external domains such as +\nolinkurl{blogspot.com}\index{blogspot.com@\nolinkurl{blogspot.com}}~\cite{oni-bulletin-008}. +Email\index{email} filtering was done by the email providers themselves, not by an independent network firewall. -Email providers seem to implement their filtering rules +Email providers seemed to implement their filtering rules independently and inconsistently: messages were blocked by some providers and not others. % More ONI? @@ -1956,78 +1993,62 @@ messages were blocked by some providers and not others. % \cite{oni-iran-2007} % \cite{oni-iran-2009} -Sfakianakis et~al.~\cite{Sfakianakis2011a} -built CensMon, a system for testing web censorship -using PlanetLab nodes as distributed measurement points. -They ran the system for for 14 days in 2011 across 33 countries, -testing about 5,000 unique URLs. -They found 193 blocked domain–country pairs, 176 of them in China. -CensMon reports the mechanism of blocking. -Across all nodes, it was -18.2\% DNS tampering,\index{DNS!poisoning} -33.3\% IP address blocking, and -48.5\% HTTP keyword filtering. -The system was not run on a continuing basis. 
-Verkamp and Gupta~\cite{Verkamp2012a} +Sfakianakis et~al.~\indexauthors{\cite{Sfakianakis2011a}} +built CensMon\index{CensMon}, a system for testing web censorship +using PlanetLab\index{PlanetLab}, a distributed network research platform. +They ran the system for 14~days in 2011 across 33~countries, +testing about 5,000 unique URLs\index{URL}. +They found 193 blocked domain--country pairs, 176 of them in China\index{China}. +CensMon\index{CensMon} was not run on a continuing basis. +Verkamp and Gupta~\indexauthors{\cite{Verkamp2012a}} did a separate study in 11 countries, -using a combination of PlanetLab nodes +using a combination of PlanetLab\index{PlanetLab} nodes and the computers of volunteers. -Censorship techniques vary across countries; -for example, some show overt block pages and others do not. -China was the only stateful censor of the 11. -% \cite{Mathrani2010a} - -Dainotti et~al.~\cite{Dainotti2011a} -reported on the total Internet shutdowns -that took place in Egypt and Libya -in the early months of 2011. -They used multiple measurements to document -the outages as they occurred: -BGP data, a large network telescope, and active traceroutes. -During outages, there was a drop in scanning traffic -(mainly from the Conficker botnet) to their telescope. -By comparing these different measurements, -they showed that the shutdown in Libya was accomplished -in more that one way, -both by altering network routes and by firewalls dropping packets. - - -\subsection*{Long-term measurement platforms} - -Just as in circumvention, in censorship measurement -a host of difficulties arise when running a scalable system -for a long time, that do not arise when doing a one-time operation. - -Dedicated measurement platforms such as -OONI~\cite{Filasto2012a} and ICLab~\cite{iclab} -provide regular measurements from many locations worldwide. -Even with these, there are challenges around -getting probes into difficult locations -and keeping them running. 
- -PlanetLab is a system that was not originally designed for censorship measurement, -that was later adapted for that purpose. -Another recent example is RIPE Atlas, +Censorship techniques varied across countries; +for example, some showed overt block pages\index{block page} and others did not. + +OONI\index{OONI}~\cite{Filasto2012a} and ICLab\index{ICLab}~\cite{iclab} +are dedicated censorship measurement platforms. +Razaghpanah et~al.~\indexauthors{\cite{Razaghpanah2016OONIICLab}} +provide a comparison of the two platforms. +They work by running regular network measurements +from the computers of volunteers or through VPNs\index{VPN}. +UBICA\index{UBICA}~\cite{Aceto2015a} is another system +based on volunteers running probes; +its authors used it to investigate several forms of censorship +in Italy\index{Italy}, +Pakistan\index{Pakistan}, +and South Korea\index{South Korea}. + +Anderson et~al.~\indexauthors{\cite{Anderson2014a-local}} +used RIPE Atlas\index{RIPE Atlas}, a globally distributed Internet -measurement network consisting of physical probes hosted by volunteers, +measurement network, +to examine two case studies of censorship: +Turkey's\index{Turkey} ban on social media sites in March 2014 and +Russia's\index{Russia} blocking of certain LiveJournal\index{LiveJournal}\index{social media} blogs\index{blog} in March 2014. Atlas allows 4 types of measurements: ping, traceroute, DNS resolution\index{DNS}, -and X.509 certificate fetching. -Anderson et~al.~\cite{Anderson2014a-local} -used Atlas to examine two case studies of censorship: -Turkey's ban on social media sites in March 2014 and -Russia's blocking of certain LiveJournal blogs in March 2014. -In Turkey, they +and X.509\index{X.509} certificate fetching. +In Turkey\index{Turkey}, they found at least six shifts in policy during two weeks of site blocking. 
They observed an escalation in blocking in Turkey: -the authorities first poisoned DNS\index{DNS!poisoning} for twitter.com, +the authorities first poisoned DNS\index{DNS!poisoning} for +\nolinkurl{twitter.com}\index{twitter.com@\nolinkurl{twitter.com}}, then blocked the IP addresses of the Google public DNS servers\index{DNS}, -then finally blocked Twitter's\index{Twitter} IP addresses directly. +then finally blocked Twitter's\index{Twitter}\index{social media} IP addresses directly. In Russia, they found ten unique bogus IP addresses used to poison DNS. -\todo[inline]{ -Pearce, Ensafi, et~al.\ ``Augur: Internet-Wide Detection of Connectivity Disruptions''~\cite{Pearce2017a}. -Pearce et~al.\ ``Global Measurement of DNS Manipulation''~\cite{Pearce2017b-local}. -} +Pearce, Ensafi, et~al.~\indexauthors{\cite{Pearce2017a}} made Augur\index{Augur}, +a scaled-up version of the hybrid idle scan of Ensafi et~al.~\indexauthors{\cite{Ensafi2015a}}, +designed for continuous, global measurement of disruptions of TCP\index{TCP} connectivity. +The basic tool is the ability to detect packet drops between two remote hosts; +but expanding it to a global scale poses a number of technical challenges. +Pearce et~al.~\indexauthors{\cite{Pearce2017b-local}} built Iris\index{Iris}, +a system to measure DNS\index{DNS} manipulation globally. +Iris uses open resolvers and evaluates measurements against the detection metrics +of consistency (answers from different locations should be the same or similar) +and independent verifiability (checking results against other sources of data +like TLS certificates\index{certificate}) in order to decide when they constitute manipulation. \section{The evaluation of circumvention systems} @@ -2043,19 +2064,21 @@ Without grounding in reality, researchers risk running an imaginary arms race\index{arms race} that evolves independently of the real one. 
-I~took part, -with Michael Carl Tschantz, Sadia Afroz, and Vern Paxson, -in a meta-study~\cite{Tschantz2016a-local}, +I\index{Fifield, David}~took part, +with Michael Carl Tschantz\index{Tschantz, Michael Carl}, +Sadia Afroz\index{Afroz, Sadia}, +and Vern Paxson\index{Paxson, Vern}, +in a meta-study~\cite{Tschantz2016a-local} of how circumvention systems are evaluated by their authors and designers, -and comparing those empirically determined censor models. +and comparing those to empirically determined censor models. This kind of work is rather different than the direct evaluations of circumvention tools that have happened before, for example those done by the Berkman Center~\cite{Berkman2011} and Freedom House~\cite{FreedomHouse2011} in 2011. Rather than testing tools against censors, we evaluated -how closely calibrated designers' own models were to +how closely aligned designers' own models were to models derived from actual observations of censors. This research was partly born out of @@ -2064,7 +2087,8 @@ in academic research on circumvention, which we felt placed undue emphasis on steganography and obfuscation of traffic streams, while not paying enough attention to -the perhaps more important problems of bridge distribution and rendezvous. +the perhaps more important problems of proxy distribution\index{proxy distribution problem} +and initial rendezvous\index{rendezvous} between client and proxy. We wanted to help bridge the gap by laying out a research agenda to align the incentives of researchers with those of circumventors. This work was built on extensive surveys @@ -2079,15 +2103,14 @@ is established (related to detection by content), while actually deployed systems cared more about how the connection is established initially (related to detection by address). +Designers tend to misperceive the censor's +weighting of false positives and false negatives---assuming +a whitelist rather than a blacklist, say. 
Real censors care greatly about the cost of running detection, and prefer cheap, passive, stateless solutions whenever possible. It is important to guard against these modes of detection before becoming too concerned with those that require -fast computation, packet flow blocking, or lots of state. -Designers tend to misperceive the censor's -weighting of false positives and false negatives---assuming -a whitelist rather than a blacklist, say---and -indeed it remains an open problem how to estimate these. +sophisticated computation, packet flow blocking, or lots of state. \chapter{Active probing} @@ -2241,12 +2264,12 @@ with important web services: the censor can tell that circumvention is taking place but cannot block the proxy without unacceptable collateral damage\index{collateral damage}. In Snowflake\index{Snowflake} (\autoref{chap:snowflake}), -proxies are web browsers running ordinary peer-to-peer protocols, +proxies are web browsers\index{web browser} running ordinary peer-to-peer protocols, authenticated using a per-connection shared secret. Even if a censor discovers one of Snowflake's proxies, it cannot verify that the proxy is running Snowflake or something else, without having first negotiated a shared secret -through Snowflake's broker\index{Snowflake!broker} mechanism.\index{Snowflake!rendezvous} +through Snowflake's broker\index{broker (snowflake)} mechanism.\index{rendezvous!of Snowflake} \section{History of active probing research} @@ -2306,7 +2329,7 @@ The obfs4\index{obfs4} transport (resistant to active probing) becomes available~\cite{tor-blog-tor-browser-45-released}. \\ 2015 August & -BreakWa11\index{BreakWa11} discovers an active-probing vulnerability in +BreakWa11\index{BreakWa11} finds an active-probing vulnerability in Shadowsocks\index{Shadowsocks}~\cites{github-shadowsocks-rss-issue-38}[\S 2]{BlessingStudio-why-do-shadowsocks-deprecate-ota}. \\ 2015 October & @@ -2315,6 +2338,11 @@ multi-modal experiments on active probing. 
\\ 2017 February & Shadowsocks\index{Shadowsocks} changes its protocol to better resist active probing~\cite{github-shadowsocks-org-issue-42}. +\\ +2017 May & +Wang et~al.~\indexauthors{\cite[\S 7.3]{Wang2017a}} find that bridges +that are discovered by active probing are blocked on the entire IP address, +not an individual port. \end{tabular} \caption{ Timeline of research on active probing. @@ -2402,7 +2430,7 @@ they developed a server-side tool, brdgrd~\cite{brdgrd}\index{brdgrd}, that rewrote the TCP window\index{TCP!window size} so that the client's handshake would be split across packets\index{fragmentation}. The tool sufficed, at the time, to prevent active probing, -but the authors reported that it stopped working in 2013\indexauthors{\cite[\S Software]{Winter2012a-webpage}}. +but stopped working in 2013~\indexauthors{\cite[\S Software]{Winter2012a-webpage}}. The obfs2\index{obfs2} pluggable transport, first available in February 2012~\cite{tor-blog-obfsproxy-next-step-censorship-arms-race}, @@ -2419,7 +2447,7 @@ but the firewall\index{Great Firewall of China} did not gain the ability to probe for it until August~2013~\cite[Figure~8]{Ensafi2015b}. Majkowski~\indexauthors{\cite{Majkowski-fun-with-the-great-firewall}} -documented a change in active-probing behavior +documented a change in the GFW between June and July~2013. In June, he reproduced the observations of Winter\index{Winter, Philipp} and Lindskog\index{Lindskog, Stefan}: @@ -2427,7 +2455,7 @@ pairs of TLS\index{TLS} probes, one from 202.108.181.70\index{202.108.181.70 (active prober)} and one from some other IP address. He also provided TLS fingerprints\index{TLS!fingerprinting} for the probers, -which were distinct from the fingerprints of ordinary Tor\index{Tor} clients. +which differed from those of ordinary Tor\index{Tor} clients. In July, he began to see pairs of probes with apparently random contents, like the garbage probes\index{garbage probes} Wilde\index{Wilde, Tim} described. 
@@ -2449,14 +2477,14 @@ in the Shadowsocks\index{Shadowsocks} protocol~\cites{github-shadowsocks-rss-iss The flaw had to do with a lack of integrity protection\index{integrity}, allowing a prober to introduce errors into ciphertext and watch the server's reaction. -As a stopgap, the Shadowsocks developers deployed protocol modifications -that proved to have separate vulnerabilities to probing. -They deployed another protocol change in +As a stopgap, the developers deployed a protocol change +that proved to have its own vulnerabilities to probing. +They deployed another protocol in February~2017, adding cryptographic integrity protection and fixing the problem~\cite{github-shadowsocks-org-issue-42}. Despite the long window of vulnerability, -there is no evidence that the Great Firewall\index{Great Firewall of China} -tried to active-probe Shadowsocks servers. +I~know of no evidence that the Great Firewall\index{Great Firewall of China} +tried to probe for Shadowsocks servers. Ensafi et~al. (including me)~\indexauthors{\cite{Ensafi2015b}} did the largest controlled study of active probing to date @@ -2479,6 +2507,15 @@ protocol implementations. Observations from this research project appear in the remaining sections of this chapter. +Wang et~al.~\indexauthors{\cite[\S 7.3]{Wang2017a}} +tried connecting to bridges from 11~networks in China\index{China}. +They found that connections from four of the networks +did not result in active probing, +while connections from the other seven did. +A bridge that was probed became blocked on all ports, +a change from the single-port blocking that had +been documented earlier. + \section{Types of probes} Our experiments confirmed the existence of known probe types @@ -2533,6 +2570,8 @@ any probes that looked random but were not obfs2. We unexpectedly found evidence of probe types other than Tor-related ones. 
One of these was an HTTPS request: +{ +\small \begin{verbatim} POST /vpnsvc/connect.cgi HTTP/1.1 Connection: Keep-Alive @@ -2541,6 +2580,7 @@ Content-Type: image/jpeg GIF89a... \end{verbatim} +} \index{HTTP} \index{HTTPS} \index{POST (HTTP method)} @@ -2548,14 +2588,14 @@ GIF89a... \index{Content-Length (HTTP header)} \index{Content-Type (HTTP header)} Both the path ``/vpnsvc/connect.cgi'', -and the body being a GIF\index{GIF} image despite having +and the body being a GIF image despite having a Content-Type\index{Content-Type (HTTP header)} -of ``image/jpeg''\index{JPEG}, +of ``image/jpeg'', are characteristic of the client handshake of the SoftEther VPN software\index{SoftEther VPN} that underlies the VPN Gate\index{VPN Gate} circumvention system~\cite{Nobori2014a}. \item[AppSpot] -This type of probe is an HTTPS\index{HTTPS} request: +This type of probe is also an HTTPS\index{HTTPS} request: { \small \begin{verbatim} @@ -2580,7 +2620,7 @@ User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like where the `\texttt{XX}' is a number that varies. The intent of this probe seems to be the discovery of servers that are capable of domain fronting\index{domain fronting} for Google\index{Google} services, -including Google App Engine\index{Google App Engine}, which runs at \nolinkurl{appspot.com}. +including Google App Engine\index{Google App Engine}, which runs at \nolinkurl{appspot.com}\index{appspot.com}. % \cite[Table~2(a)]{Anonymous2014a} has "appspot.com" as a top-10 DNS poisoning pattern. (See \autoref{chap:domain-fronting} for more on domain fronting.) At one time, there were simple proxies running at @@ -2592,6 +2632,8 @@ I~discovered it while re-analyzing my server logs in order to update \autoref{fig:active-probing-http}. 
It is a particular request that was sent over both HTTP\index{HTTP} and HTTPS\index{HTTPS}: +{ +\small \begin{verbatim} GET / HTTP/1.1 Accept-Encoding: identity Host: 69.164.193.231 Connection: close User-Agent: Python-urllib/2.7 \end{verbatim} +} \index{GET (HTTP method)} \index{Accept-Encoding (HTTP header)} \index{Host (HTTP header)} @@ -2611,7 +2654,7 @@ of active probe. The User-Agent ``Python-urllib/2.7'' appears many other places in my logs, not in an active probing context. -I~cannot guess what the purpose of this probe type may be, +I~cannot guess what this probe's purpose may be, except to observe that Nobori and Shinjo also caught a ``Python-urllib'' client scraping the VPN Gate\index{VPN Gate} server list~\cite[\S 6.3]{Nobori2014a}. @@ -2722,7 +2765,7 @@ representing There is one extreme outlier, the address 202.108.181.70\index{202.108.181.70 (active prober)}, which by itself accounted for 2\% of the probes. -(Even this large fraction stands in contrast to previous studies, +(Even this substantial fraction stands in contrast to previous studies, where that single IP address accounted for roughly half the probes~\cite[\S 4.5.1]{Winter2012a}.) Among the address ranges are ones belonging to residential ISPs\index{Internet service provider}. @@ -2833,25 +2876,38 @@ and the TLS fingerprint\index{TLS!fingerprinting} are inconsistent with Chrome. \chapter{Time delays in censors' reactions} \label{chap:proxy-probe} -Tor bridges are secret relays that help clients get around censorship. +Censors' inner workings are mysterious. +To the researcher hoping to understand them +they present only a hostile, black-box interface. +However, some of their externally visible behaviors +offer hints about their internal decision making. 
+In this chapter I~describe the results of an experiment +that is designed to shed light on the actions of censors; +namely, a test of how quickly they react to and block +a certain kind of Tor bridge\index{Tor!bridges!default}. + +\index{Tor!bridges|(} +Tor bridges are secret proxies that help clients get around censorship. The effectiveness of bridges depends on their secrecy---a censor -that learns a bridge's address can simply block its IP address. +that learns a bridge's address can simply block its IP address\index{blocking!by address}. Since the beginning, the designers of Tor's bridge system envisioned that users would learn of bridges through covert or social channels~\cite[\S 7]{tor-techreport-2006-11-001}, in order to prevent any one actor from learning about -and blocking a large number of bridges. +and blocking a large number of them. +\index{Tor!bridges|)} +\index{Tor!bridges!default|(} But as it turns out, most users do not use bridges in the way envisioned. Rather, most users who use bridges use one of a small number of \emph{default} bridges hardcoded -in a configuration file within Tor Browser. -(According to Matic et~al.~\cite[\S VII.C]{Matic2017a}, +in a configuration file\index{bridge configuration file} within Tor Browser. +(According to Matic et~al.~\indexauthors{\cite[\S VII.C]{Matic2017a}}, over 90\% of bridge users use a default bridge.) At a conceptual level, the notion of a ``default'' bridge -is a ridiculous contradiction. -Bridges are meant to be secret, +is a contradiction: +bridges are meant to be secret, not plainly listed in the source code. Any reasonable threat model would assume that default bridges are immediately blocked. @@ -2862,10 +2918,11 @@ why is it that censors do not take blocking steps that we find obvious? There must be some quality of censors' internal dynamics that we do not understand adequately. 
+\index{Tor!bridges!default|)} -The purpose of the present chapter is to begin -to peel back the veneer of censorship, -to gain insight into why they behave as they do---particularly +The purpose of this chapter is to begin +to go beneath the surface of censorship +for insight into why censors behave as they do---particularly when they behave contrary to expectations. We posit that censors, far from being unitary entities of focused purpose, @@ -2874,42 +2931,43 @@ with perhaps conflicting goals; this project is a small step towards better understanding what lies under the face that censors present. The main vehicle for the exploration of this subject -is the observation of default bridges (a specific -kind of proxy) to find out how quickly they are blocked +is the observation of default Tor bridges\index{Tor!bridges!default} +to find out how quickly they are blocked after they first become discoverable by a censor. I~took part in this project -along with Lynn Tsai and Qi Zhong; +along with Lynn Tsai\index{Tsai, Lynn} and Qi Zhong\index{Zhong, Qi}; the results in this chapter are an extension -of work Lynn and I~published in 2016~\cite{Fifield2016a-local}. +of work Lynn and I~published in~2016~\indexauthors{\cite{Fifield2016a-local}}. Through active measurements of default bridges from probe sites in China\index{China}, Iran\index{Iran}, and Kazakhstan\index{Kazakhstan}, we uncovered previously undocumented behaviors of censors that hint at how they operate at a deeper level. It was with a similar spirit that -Aase, Crandall, Díaz, Knockel, Ocaña Molinero, Saia, Wallach, and Zhu~\cite{Aase2012a} +Aase, Crandall, Díaz, Knockel, Ocaña Molinero, Saia, Wallach, and Zhu~\indexauthors{\cite{Aase2012a-local}} looked into case studies of censorship with a focus on understanding censors' -motivation, resources, and sensitivity to time. 
-They had ``assumed that censors are fully motivated to block content -and the censored are fully motivated to disseminate it.'' -But some of their observations challenged that assumption, +motivation, resources, and time sensitivity. +They ``had assumed that censors are fully motivated to block content\index{blocking} +and the censored are fully motivated to disseminate it,'' +but some of their observations challenged that assumption, with varied and seemingly undirected censorship hinting at behind-the-scenes resource limitations. -They describe an apparent ``intern effect,'' +They describe an apparent ``intern effect\index{intern effect},'' by which keyword lists seem to have been compiled by a bored and unmotivated worker, without much guidance. -Knockel et~al.~\cite{Knockel2017a} looked into -censorship of keywords in Chinese mobile games, +Knockel et~al.~\indexauthors{\cite{Knockel2017a}} looked into +censorship of keywords\index{keywords+} in Chinese\index{China} mobile games, finding that censorship enforcement in that context is similarly decentralized, different from the centralized control we commonly envision when thinking about censorship. -Zhu et~al.~\cite{Zhu2013a-local} studied the question +Zhu et~al.~\indexauthors{\cite{Zhu2013a-local}} studied the question of censor reaction time in a different context: -deletion of posts on the Chinese microblogging service Sina Weibo. +deletion of posts on the Chinese\index{China} microblogging\index{microblogging} +service Sina Weibo\index{Sina Weibo}\index{social media}. Through frequent polling, they were able to measure---down to the minute---the delay between when a user made a post @@ -2919,71 +2977,83 @@ and 30\% within 30~minutes---but there was a long tail of posts that survived several weeks before being deleted. The authors used their observations to make educated guesses about the inner workings of the censors. +\index{forum moderation} Posts on trending topics tended to be deleted more quickly. 
Posts made late at night had a longer average lifetime, seemingly reflecting workers arriving in the morning and clearing out a nightly backlog of posts. -King et~al.~\cite{King2012a} examined six months -of deleted posts on Chinese social networks. -The pattern of deletions seemed to give a view -into the goal of the censor: +King et~al.~\indexauthors{\cite{King2012a}} examined six months' worth +of deleted posts on Chinese\index{China} social networks\index{social media}. +The pattern of deletions seemed to reveal +the censor's motivation: not to prevent criticism of the government, as might be expected, but to forestall collective public action. -Nobori and Shinjo give a timeline~\cite[\S 6.3]{Nobori2014a} +Nobori and Shinjo give a timeline~\indexauthors{\cite[\S 6.3]{Nobori2014a}} of circumventor and censor actions and reactions -during the first month and a half of the deployment of VPN~Gate in China. -Within the first four days, the firewall had blocked +during the first month and a half of the deployment of VPN~Gate\index{VPN Gate} in China\index{China}. +Within the first four days, the firewall\index{Great Firewall of China} had blocked their main proxy distribution server, and begun scraping the proxy list. When they blocked the single scraping server, the firewall began scraping from multiple other locations within a day. After VPN~Gate deployed the countermeasure of mixing -high-collateral-damage servers into their proxy list, -the firewall stopped blocking proxies for two days, -after which it resumed blocking proxies, -after checking them first to see that they really were -VPN~Gate proxies. +high-collateral-damage\index{collateral damage} servers into their proxy list, +the firewall stopped blocking for two days, +then resumed again, +with an additional check that an IP address +really was a VPN~Gate proxy before blocking\index{blocking+}. 
-Wright et~al.~\cite{Wright2011a} motivated a desire +Wright et~al.~\indexauthors{\cite[\S 2]{Wright2011a}} motivated a desire for fine-grained censorship measurement by highlighting -limitations that would tend to prevent a censor from +limitations that tend to prevent a censor from begin equally effective everywhere in its controlled network. -Not only resource limitations, +Not only resource limitations\index{resources+}, but also administrative and logistical requirements, make it difficult to manage a system as complex as a national censorship apparatus. +\index{Tor!bridges!default|(} There has been no prior long-term study dedicated to measuring time delays in the blocking of default bridges. There have, however, been a couple of point measurements that put bounds on what blocking delays in the past must have been. -Tor Browser first shipped with obfs2 bridges on +Tor Browser\index{Tor Browser} first shipped with default +obfs2\index{obfs2} bridges on February~11, 2012~\cite{tor-blog-obfsproxy-next-step-censorship-arms-race}; -Winter and Lindskog tested them 41~days later~\cite[\S 5.1]{Winter2012a} +Winter and Lindskog tested them 41~days later~\indexauthors{\cite[\S 5.1]{Winter2012a}} and found all~13 of them blocked. -(The bridges then were blocked by RST injection\index{RST (TCP flag)}, -different than the timeouts we have seen more recently.) +(The bridges then were blocked by RST injection\index{RST (TCP flag)}\index{injection}, +a different blocking technique than the timeouts we have seen more recently.) In 2015 I~used public reports of blocking and non-blocking -of the first batch of default obfs4 bridges +of the first batch of default obfs4\index{obfs4} bridges to infer a blocking delay of not less than~15 -and not more than 76~days~\cite{tor-dev-censorship-lag}. - -\todo[inline]{ -we are used to making conservative assumptions -if an attacker gets code execution, it's game over -but what really \emph{does} happen when someone gets code execution? 
-similarly, it is prudent to assume that default bridges are immediately blocked -but what really \emph{does} happen? -} +and not more than 76~days~\indexauthors{\cite{tor-dev-censorship-lag}}. +\index{Tor!bridges!default|)} + +As security researchers, we are accustomed to making +conservative assumptions when building threat models\index{modeling}. +For example, we assume that when a computer is compromised, +it's game over: the attacker will cause the worst possible outcome +for the computer's owner. +But the actual effects of a compromise can vary +from grave to almost benign, +and it is an interesting question, +what really happens and how severe it is. +Similarly, it is prudent to assume while modeling\index{modeling} +that the disclosure of any secret bridge will +result in its immediate blocking by every censor everywhere. +But as that does not happen in practice, +it is an interesting question, +what really does happen, and why? \section{The experiment} Our experiment primarily involved frequent, -active measurements of the reachability of default bridges +active tests of the reachability of default bridges from probe sites in China\index{China}, Iran\index{Iran}, and Kazakhstan\index{Kazakhstan} (countries well known to censor the network), as well as a control site in the U.S.\index{United States of America} @@ -2997,60 +3067,62 @@ the time elapsed, and any error code. The error code allows us to distinguish between different kinds of failures such as ``timeout'' and ``connection refused.'' -The control site in the U.S.\index{United States of America}\ enables +The control site in the U.S.\index{United States of America}\ enables us to distinguish temporary bridge failures from actual blocking. 
-The script only tests whether it is possible to make a TCP\index{TCP} connection, -which is necessary but not sufficient to actually make a Tor connection +The script only tested whether it is possible to make a TCP\index{TCP} connection, +which is a necessary but not sufficient precondition +to actually establishing a Tor circuit\index{Tor!circuit+} through the bridge. -In Kazakhstan\index{Kazakhstan}, we additionally deployed measurements -that attempted to establish a full Tor connection, +In Kazakhstan\index{Kazakhstan}, we deployed an additional script +that attempted to establish a full Tor-in-obfs4\index{obfs4} connection, in order to better understand the different type of blocking we discovered there. The experiment was opportunistic in nature: we ran from China\index{China}, Iran\index{Iran}, and Kazakhstan\index{Kazakhstan} not only because -they are likely suspects for Tor blocking, -but because we happened to have access to a site from which +they are likely suspects for Tor\index{Tor!protocol} blocking\index{blocking}, +but because we happened to have access to a site in each from which we could run probes over some period of time. Therefore the measurements cover different dates in different countries. We began at a time when Tor\index{Tor Project} was building up its stock of default bridges. -We began monitoring the new bridges as they were added, -coordinating with Tor Browser\index{Tor Browser} developers to get advance notice -of them when possible. +We began monitoring each new bridge as it was added, +coordinating with the Tor Browser\index{Tor Browser} developers to get advance notice +of its addition when possible. Additionally we had the developers run certain more controlled experiments for us---such as adding a bridge to the source code but commenting it out---that are further detailed below. -We were only concerned with default bridges, not secret ones\index{Tor bridges}. 
+We were only concerned with default bridges, not secret ones\index{Tor!bridges!default}. Our goal was not to estimate the difficulty of the proxy discovery problem\index{proxy discovery problem}, -but to better understand how censors deal with what seems to be a trivial task. -We focused on bridges using the obfs4 pluggable transport~\cite{obfs4}\index{obfs4}, +but to better understand how censors deal with what should be an easy task. +We focused on bridges using the obfs4 pluggable transport~\cite{obfs4}\index{obfs4}\index{pluggable transports}, which not only is the most-used transport and the one marked ``recommended'' in the interface, but also has properties that help in our experiment. -The content obfuscation of obfs4 reduces the risk of its passive detection. -More importantly, it resists active probing attacks +The content obfuscation of obfs4 reduces the risk of its passive detection\index{detection!by content}. +More importantly, it resists active probing\index{active probing} attacks as described in \autoref{chap:active-probing}. We could not have done the experiment with obfs3\index{obfs3} bridges, -because whether default bridges or not, -they would be probed and blocked shortly after their first use. +because whether default or not, +active probing would cause them to be blocked +shortly after their first use. -Bridges are identified by a nickname and a port number\index{nickname (Tor bridges)}\index{Tor bridges!nicknames}. +Bridges are identified by a nickname and a port number\index{Tor!bridges!nickname}. The nickname is an arbitrary identifier, chosen by the bridge operator. So, for example, ``ndnop3:24215''\index{ndnop3 (Tor bridge)} is one bridge, -and ``ndnop3:10527''\index{ndnop3 (Tor bridge)} is another bridge on the same IP address. +and ``ndnop3:10527''\index{ndnop3 (Tor bridge)} is another on the same IP address. 
We pulled the list of bridges from Tor Browser\index{Tor Browser} -and Orbot, which is the port of Tor for Android\index{Android}. +and Orbot\index{Orbot}, which is the port of Tor for Android\index{Android}. Tor Browser and Orbot\index{Orbot} mostly shared bridges in common, though there were a few Orbot-only bridges. -A~list of bridges and other destinations we measured +A~list of the bridges and other destinations we measured appears in \autoref{tab:proxy-probe-destinations}. -Along with the new obfs4\index{obfs4} bridges, we tested some -existing bridges. +Along with the fresh bridges, we tested some +existing bridges for comparison purposes. \begin{table} \small @@ -3104,6 +3176,8 @@ ndnop4 & 27668 & (obfs4) \\ The bridges whose reachability we tested. Except for the already existing and never-published bridges, they were all introduced during the course of our experiment. +We also tested port~22 (SSH\index{SSH}) on hosts +that had it open. Each bridge is identified by a nickname (a label chosen by its operator) and a port. Each nickname represents a distinct IP address. @@ -3113,7 +3187,8 @@ Port numbers are in chronological order of release. \index{FTE} \index{Tor Browser} \index{Orbot} -\index{Tor bridge!default} +\index{Tor!bridges!default} +\index{Tor!bridges!nickname} \index{ndnop3 (Tor bridge)} \index{ndnop5 (Tor bridge)} \index{riemann (Tor bridge)} @@ -3133,26 +3208,27 @@ Port numbers are in chronological order of release. \label{tab:proxy-probe-destinations} \end{table} +\index{Tor!bridges!default|(} There are four stages in the process of deploying a new default bridge. -At the start, the bridge is secret, -perhaps only having been discussed on a private mailing list. +At the beginning, the bridge is secret, +perhaps having been discussed on a private mailing list. Each successive stage of deployment makes the bridge more public, increasing the number of places where a censor may look to discover it. 
The whole process takes a few days to a few weeks, -mostly depending on the release schedule. +mostly depending on Tor Browser's\index{Tor Browser} release schedule. \begin{description} \item[Ticket filed] -The process begins with the filing of a ticket in Tor's public issue tracker. +The process begins with the filing of a ticket in Tor's\index{Tor Project} public issue tracker. The ticket includes the bridge's IP address. A~censor that pays attention to the issue tracker could discover bridges as early as this stage. \item[Ticket merged] After review, the ticket is merged and the new bridge -is added to the source code of Tor Browser. +is added to the source code of Tor Browser\index{Tor Browser}. From there it will begin to be included in nightly builds. -A~censor that reads the bridge configuration file +A~censor that reads the bridge configuration file\index{bridge configuration file} from the source code repository, or downloads nightly builds, could discover bridges at this stage. @@ -3163,18 +3239,18 @@ Tor Browser developers send candidate builds to a public mailing list\index{tor-qa mailing list} to solicit quality assurance testing. A~censor that follows testing releases would -find ready-made executables with bridges embedded +find ready-made executables with embedded bridges at this stage. Occasionally the developers skip the testing period, such as in the case of an urgent security release. \item[Public release] After testing, the releases are made public -and announced on the Tor Blog\index{Tor Blog}. +and announced on the Tor Blog\index{Tor Blog}\index{blog}. A~censor could learn of bridges at this stage by reading the blog and downloading executables. This is also the stage at which the new bridges begin to have an appreciable number of users. -There are two release tracks of Tor Browser: stable and alpha. +There are two release tracks of Tor Browser: stable\index{Tor Browser!stable release} and alpha\index{Tor Browser!alpha release}. 
Alpha releases are distinguished by an `a' in their version number, for example 6.5a4. According to Tor Metrics~\cite{tor-metrics-webstats-tb}\index{Tor Metrics}, @@ -3186,10 +3262,11 @@ so that they would not become public except via the four stages described above. Specifically, we made sure the bridges did not appear in BridgeDB~\cite{BridgeDB}\index{BridgeDB}, the online database of secret bridges, -and that the bridges did not expose any transports other than obfs4. +and that the bridges did not expose any transports other than obfs4\index{obfs4}. We wanted to ensure that any blocking of bridges could only be the result of their status as default bridges, and not a side effect of some other detection system. +\index{Tor!bridges!default|)} \section{Results from China} @@ -3200,10 +3277,10 @@ We had access to probe sites in China for just over a year, from December 2015 to January 2017. Due to the difficulty of getting access to hosts in China, we used four different IP addresses -(all in the same autonomous system) +(all in the same autonomous system\index{autonomous system}) at different points in time. The times during which we had control of each IP address -partially overlap, but there is a 21-day gap in measurements +partially overlap, but there is a 21-day gap in the measurements during August 2016. Our observations in China turned up several @@ -3214,15 +3291,16 @@ which shows the timeline of reachability of every bridge, in context with dates related to tickets and releases. Circled references in the text (\cnref{a}, \cnref{b}, etc.) refer to marked points in the figure. -A~``batch'' of releases is a set that all -contain the same default bridges. +A~``batch'' is a set of Tor Browser releases +that all +contained the same default bridges. \begin{figure} \centering \includegraphics{figures/proxy-probe-timelines-china1} \caption{ -Tor Browser default bridge reachability -from a single autonomous system in China. 
+Default bridge reachability +from a site in China. Releases are grouped into batches according to the new bridges they contain. The thickness of lines indicates whether the measurements @@ -3260,24 +3338,21 @@ Before that date, blocking was port-specific and happened only after the ``public release'' stage. After, bridges began to be blocked on all ports simultaneously, and were blocked soon after the ``ticket merged'' stage. -We believe that this change reflects a shift in how the censor discovers bridges, -from running the finished software to see what addresses it connects to, -to extracting addresses from source code. +We believe that this change reflects a shift in how the censor discovered bridges, +a shift from running the finished software to see what addresses it accesses, +to extracting the addresses from source code. More details and evidence appear in the following subsections. \subsection{Per-port blocking} \label{sec:china-perport} -In the first few release batches, the censor blocked individual ports, -not an entire IP address. -This characteristic of the Great Firewall has been documented -as far back as 2006 by Clayton et~al.~\cite{Clayton2006a}, -and in 2012 by Winter and Lindskog~\cite{Winter2012a}. +In the first few release batches, the censor blocked individual ports\index{port+}, +not an entire IP address\index{IP address+}. For example, see point~\cnref{a} in \autoref{fig:proxy-probe-timelines-china1}: -after ndnop3:24215 was blocked, +after ndnop3:24215\index{ndnop3 (Tor bridge)} was blocked, we opened ndnop3:10527 on the same IP address. -The alternate port remained reachable +The alternate port\index{port} remained reachable until it, too, was blocked in the next release batch. We used this technique of rotating @@ -3285,9 +3360,9 @@ ports in several release batches. Per-port blocking is also evident in the continued reachability of non-bridge ports. -For example, many of the bridges had an SSH port open, -in addition to their obfs4 ports. 
-After riemann:443 (obfs4) was blocked (point~\cnref{c} in \autoref{fig:proxy-probe-timelines-china1}), +For example, many of the bridges had an SSH\index{SSH} port open, +in addition to their obfs4\index{obfs4} ports. +After riemann:443\index{riemann (Tor bridge)} (obfs4) was blocked (point~\cnref{c} in \autoref{fig:proxy-probe-timelines-china1}), riemann:22 (SSH) remained reachable for a further nine months, until it was finally blocked at point~\cnref{m}. Per-port blocking would give way to whole-IP blocking @@ -3304,14 +3379,14 @@ and blocked the bridges in an earlier stage. In the 5.5.5/6.0a5/6.0 batch, the censor even seems to have missed the 5.5.5 and 6.0a5 releases (point~\cnref{e} in \autoref{fig:proxy-probe-timelines-china1}), -only blocking after the 6.0 release, 36 days later. +only blocking\index{blocking} after the 6.0 release, 36 days later. This observation hints that, before October 2016 anyway, the censor was somehow extracting bridge addresses from the release packages themselves. -In Sections~\ref{sec:china-simultaneous} and~\ref{sec:china-different-times} +In subsections~\ref{sec:china-simultaneous} and~\ref{sec:china-different-times} we present more evidence that supports the hypothesis that the censor extracted bridge addresses -from public releases, +only from public releases, not reacting at any earlier phase. An evident change in blocking technique @@ -3327,24 +3402,26 @@ The changed technique is the subject of \autoref{sec:china-preemptive}. The first five blocking incidents were single events: when a batch contained more than one bridge, -all were blocked at the same time (within 20 minutes). +all were blocked at the same time; that is, +within one of our 20-minute probing periods. These incidents appear as crisp vertical columns of blocking icons in \autoref{fig:proxy-probe-timelines-china1}, for example at point~\cnref{c}. 
This fact supports the idea that the censor -discovered bridges by examining release packages directly, +discovered bridges by examining released executables directly, and did not, for example, detect bridges one by one by examining network traffic. The 6.0.5/6.5a3 batch is an exception to the pattern of simultaneous blocking. -In that batch, one bridge (LeifEricson:50000) was already blocked, +In that batch, one bridge (LeifEricson:50000\index{LeifEricson (Tor bridge)}) was already blocked, three were blocked simultaneously as in the previous batches, -but two others (GreenBelt:5881 and Azadi:4319) were temporarily unscathed. +but two others (GreenBelt:5881\index{GreenBelt (Tor bridge)} +and Azadi:4319\index{Azadi (Tor bridge)}) were temporarily unscathed. At the time, GreenBelt:5881 was experiencing a temporary outage---which could explain why it was not blocked---but Azadi:4319 was operational. -This specific case is discussed more in +This specific case is discussed further in \autoref{sec:china-different-times}. @@ -3366,7 +3443,7 @@ Recall from \autoref{sec:active-probing-infrastructure} that the firewall was even at that time capable of detecting and blocking \emph{secret} bridges within minutes. -Delays of days or weeks really stand out in contrast. +Delays of days or weeks stand out in contrast. \subsection{Inconsistent blocking and failures of blocking} @@ -3374,7 +3451,7 @@ Delays of days or weeks really stand out in contrast. There is a conspicuous on--off pattern in the reachability of certain bridges from China, -for example in ndnop3:24215 throughout February, March, and April 2016 +for example in ndnop3:24215\index{ndnop3 (Tor bridge)} throughout February, March, and April 2016 (point~\cnref{b} in \autoref{fig:proxy-probe-timelines-china1}). 
Although the censor no doubt intended to block the bridge fully, % 3,024 of 6,480 (47\%) @@ -3382,11 +3459,11 @@ Although the censor no doubt intended to block the bridge fully, of connection attempts were successful during that time. On closer inspection, we find that the pattern is roughly periodic with a period of 24~hours. -The pattern may come and go, for example in riemann:443 +The pattern may come and go, for example in riemann:443\index{riemann (Tor bridge)} before and after March~27, 2016. The predictable daily variation in reachability rates makes us think that, at least at the times under question, -the Great Firewall's effectiveness was dependent on load---varying +the Great Firewall's\index{Great Firewall of China} effectiveness was dependent on load---varying load at different times of day leads to varying bridge reachability. Beyond the temporary reachability of individual bridges, @@ -3396,7 +3473,7 @@ Point~\cnref{d} in \autoref{fig:proxy-probe-timelines-china1} marks such a failu All the bridges under test, including those that had already been blocked, became available between 10:00 and 18:00~UTC on March~27, 2016. Further evidence that these results indicate a failure of the firewall -come from a press report~\cite{scmp-gfw} that Google services---normally +come from a press report~\cite{scmp-gfw} that Google\index{Google} services---normally blocked in China---were also unexpectedly available on the same day, from about 15:30 to 17:15~UTC. % 2016-06-28 17:42:01,spline,109.105.109.147,13764,30.0200228691,False,None,timed out @@ -3405,19 +3482,21 @@ A similar pattern appears across all bridges for nine hours starting on June~28, 2016 at 17:40~UTC. -After the switch to whole-IP blocking, -there are more instances of spotty and inconsistent blocking, +After the switch to whole-IP\index{IP address} blocking\index{blocking!by address}, +there are further instances of spotty and inconsistent censorship, though of a different nature. 
Several cases are visible near point~\cnref{j} in \autoref{fig:proxy-probe-timelines-china1}. It is noteworthy that not all ports on a single host are affected equally. -For example, the blocking of GreenBelt is inconsistent on ports 5881 and 12166, +For example, the blocking of GreenBelt\index{GreenBelt (Tor bridge)} +is inconsistent on ports 5881 and 12166, but it is solidly blocked on ports 80, 443, 7013, and 60873. -Similarly, Mosaddegh's ports 1984 and 15937 are intermittently reachable, +Similarly, Mosaddegh's\index{Mosaddegh (Tor bridge)} +ports 1984 and 15937 are intermittently reachable, in the exact same pattern, while ports 80, 443, 2934, and 9332 remain blocked. -These observations lead us to suspect a two-tiered structure of the firewall: -one tier for per-port blocking and a separate one for whole-IP blocking. +These observations lead us to suspect a model of two-tiered blocking: +one tier for per-port blocking, and a separate tier for whole-IP blocking. If there were a temporary failure of the whole-IP tier, any port not specifically handled by the per-port tier would become reachable. @@ -3426,53 +3505,55 @@ any port not specifically handled by the per-port tier would become reachable. \label{sec:china-different-times} The 6.0.5/6.5a2 release batch was noteworthy in several ways. -Its six new bridges were all fresh ports on already used IP addresses. +Its six new bridges were all fresh ports on already-used IP addresses. For the first time, not all bridges were blocked simultaneously. -Only three of the bridges---Mosaddegh:2934, -MaBishomarim:2413, and JonbesheSabz:1894---were blocked in a way +Only three of the bridges---Mosaddegh:2934\index{Mosaddegh (Tor bridge)}, +MaBishomarim:2413\index{MaBishomarim (Tor bridge)}, +and JonbesheSabz:1894\index{JonbesheSabz (Tor bridge)}---were blocked in a way consistent with previous release batches. 
-Of the other three, +Of the other three: \begin{itemize} -\item LeifEricson:50000 +\item LeifEricson:50000\index{LeifEricson (Tor bridge)} had been blocked since we began measuring it. The LeifEricson IP address is one of the oldest in the browser. We suspect the entire IP address had been blocked at some point. We will have more to say about LeifEricson in \autoref{sec:china-allports}. -\item GreenBelt:5881 (point~\cnref{f}) +\item GreenBelt:5881\index{GreenBelt (Tor bridge)} (point~\cnref{f}) was offline at the time when other bridges in the batch were blocked. We confirmed this fact by talking with the bridge operator and through control measurements: the narrow band in \autoref{fig:proxy-probe-timelines-china1} shows that -while connection attempts were timing out not only from China, but also from the U.S.\index{United States of America} +connection attempts were timing out not only from China, but also from the \index{United States of America}U.S. The bridge became reachable again from China as soon as it came back online. -\item Azadi:4319 (point~\cnref{g}), +\item Azadi:4319\index{Azadi (Tor bridge)} (point~\cnref{g}), in contrast, was fully operational at the time of the other bridges' blocking, and the censor nevertheless failed to block it. \end{itemize} -We take from the failure to block GreenBelt:5881 and Azadi:5881 +We take from the failure to block GreenBelt:5881\index{GreenBelt (Tor bridge)} +and Azadi:4319\index{Azadi (Tor bridge)} that the censor, as late as September 2016, was most likely \emph{not} discovering bridges -by inspecting the bridge configuration file in the source code, +by inspecting the bridge configuration file\index{bridge configuration file} in the source code, because if it had been, it would not have missed two of the bridges in the list. 
-Rather, we suspect that the censor used some kind of network analysis---perhaps -running a release of Tor Browser in a black-box fashion, +Rather, we suspect that the censor used some kind of network-level analysis---perhaps +running a release of Tor Browser\index{Tor Browser} in a black-box fashion, and making a record of all addresses it connected to. -This would explain why GreenBelt:5881 was not blocked +This would explain why GreenBelt:5881\index{GreenBelt (Tor bridge)} was not blocked (it couldn't be connected to while the censor was harvesting bridge addresses) -and could also explain why Azadi:4319 was not blocked -(Tor does not try every bridge simultaneously, +and could also explain why Azadi:4319\index{Azadi (Tor bridge)} was not blocked +(Tor\index{Tor} does not try every bridge simultaneously, so it simply may not have tried to connect to Azadi:4319 in the time the censors allotted for the test). It is consistent with the observation that bridges were not blocked before a release: the censor's discovery process needed a runnable executable. -Azadi:4319 remained unblocked even after an additional port +Azadi:4319\index{Azadi (Tor bridge)} remained unblocked even after an additional port on the same host was blocked in the next release batch. This tidbit will enable us, in the next section, to fairly narrowly locate the onset of bridge discovery -based on parsing the bridge configuration file +based on parsing the bridge configuration file\index{bridge configuration file} in October 2016. @@ -3482,19 +3563,19 @@ in October 2016. The 6.0.6/6.5a4 release batch marked two major changes in the censor's behavior: \begin{enumerate} \item For the first time, newly added bridges were blocked \emph{before} a release. -(Not counting LeifEricson, an old bridge which we had never been able to reach from China.) +(Not counting LeifEricson\index{LeifEricson (Tor bridge)}, an old bridge which we had never been able to reach from China.) 
\item For the first time, new blocks affected more than one port. -(Again not counting LeifEricson.) +(Again not counting LeifEricson\index{LeifEricson (Tor bridge)}.) \end{enumerate} The 6.0.6/6.5a4 batch contained eight new bridges. Six were new ports on previously used IP addresses -(including LeifEricson:50001, +(including LeifEricson:50001\index{LeifEricson (Tor bridge)}, which we expected to be already blocked, % our email thread with LeifEricson operator is on 04 Oct 2016 but included for completeness). -The other two---Lisbeth:443 and NX01:443---were fresh IP addresses. +The other two---Lisbeth:443\index{Lisbeth (Tor bridge)} and NX01:443\index{NX01 (Tor bridge)}---were fresh IP addresses. However one of the new bridges, NX01:443, had a twist: -we left it commented out in the bridge configuration file, thus: +we left it commented out in the bridge configuration file\index{bridge configuration file}, thus: \begin{quote} \small \begin{verbatim} @@ -3505,31 +3586,31 @@ pref(..., "obfs4 192.95.36.142:443 ..."); \end{quote} Six of the bridges---all -but the exceptional LeifEricson:50000 and NX01:443---were +but the exceptional LeifEricson:50000\index{LeifEricson (Tor bridge)} and NX01:443\index{NX01 (Tor bridge)}---were blocked, not quite simultaneously, but within 13 hours of each other (see point~\cnref{h} in \autoref{fig:proxy-probe-timelines-china1}). -The blocks happened 14 days (or 22 days in the case of Lisbeth:443 and NX01:443) +The blocks happened 14~days (or 22~days in the case of Lisbeth:443\index{Lisbeth (Tor bridge)} and NX01:443\index{NX01 (Tor bridge)}) after ticket merge, -and 27 days before the next public release. +and 27~days before the next public release. 
We hypothesize that this blocking event indicates a change in the censor's technique, -and that in October 2016 the Great Firewall began +and that in October 2016 the Great Firewall\index{Great Firewall of China} began to discover bridge addresses either by examining newly filed tickets, -or by inspecting the bridge configuration file in the source code. -A first piece of evidence for the hypothesis is, -of course, that the bridges were blocked at a time -when they were present in the bridge configuration file, +or by inspecting the bridge configuration file\index{bridge configuration file} in the source code. +A first piece of evidence for the hypothesis is +that the bridges were blocked at a time +when they were present in the bridge configuration file\index{bridge configuration file}, but had not yet appeared in a release. -The presence of the never-before-seen Lisbeth:443 +The presence of the never-before-seen Lisbeth:443\index{Lisbeth (Tor bridge)} in the blocked set removes the possibility that the censor spontaneously decided to block additional ports on IP addresses it already knew about, as does the continued reachability of certain blocked bridges on further additional ports. A second piece of evidence comes from a careful scrutiny of the -timelines of the Azadi:4319 and Azadi:6041 bridges. +timelines of the Azadi:4319\index{Azadi (Tor bridge)} and Azadi:6041 bridges. As noted in \autoref{sec:china-different-times}, Azadi:4316 had unexpectedly been left unblocked in the previous release batch, and it remained so, even after Azadi:6041 was blocked in this batch. @@ -3544,14 +3625,14 @@ and it remained so, even after Azadi:6041 was blocked in this batch. \end{center} The same ticket that removed Azadi:4319 on October~6 also added Azadi:6041. 
On October~20 when the bridges were blocked, -Azadi:4319 was gone from the bridge configuration file, +Azadi:4319 was gone from the bridge configuration file\index{bridge configuration file}, having been replaced by Azadi:6041. -It appears that the yet-unused Azadi:6041 was blocked -merely because it appeared in the bridge configuration file, +It appears that the yet-unused Azadi:6041\index{Azadi (Tor bridge)} was blocked +merely because it appeared in the bridge configuration file\index{bridge configuration file}, even though it would have been more beneficial to the censor to instead block the existing Azadi:4319, which was still in active use. -The Azadi timeline enables us to locate fairly narrowly the change in bridge discovery techniques. +The Azadi\index{Azadi (Tor bridge)} timeline enables us to locate fairly narrowly the change in bridge discovery techniques. It must have happened during the two weeks between October~6 and October~20, 2016. It cannot have happened before October~6, because at that time Azadi:4319 was still listed, which would have gotten it blocked. @@ -3559,8 +3640,8 @@ And it cannot have happened after October~20, because that is when bridges liste were first blocked. A third piece of evidence supporting the hypothesis that the censor -began to discover bridges through the bridge configuration file -is its treatment of the commented-out bridge NX01:443. +began to discover bridges through the bridge configuration file\index{bridge configuration file} +is its treatment of the commented-out bridge NX01:443\index{NX01 (Tor bridge)}. The bridge was commented out in the 6.0.6/6.5a4 batch, in which it remained unblocked, and uncommented in the following 6.0.8/6.5a6 batch. 
@@ -3576,33 +3657,47 @@ to have become active \subsection{The onset of whole-IP blocking} \label{sec:china-allports} -The blocking event on October~20 2016 was noteworthy not only because it occurred before a release, +The blocking event of October~20, 2016 was noteworthy not only because it occurred before a release, but also because it affected more than one port on some bridges. See point~\cnref{h} in \autoref{fig:proxy-probe-timelines-china1}. -When GreenBelt:7013 was blocked, +When GreenBelt:7013\index{GreenBelt (Tor bridge)} was blocked, so were GreenBelt:5881 (which had escaped blocking in the previous batch) and GreenBelt:12166 (which was awaiting deployment in the next batch). -Similarly, when MaBishomarim:7920 and JonbesheSabz:4148 were blocked, -so were the Orbot-reserved MaBishomarim:1984 and JonbesheSabz:1984 (point~\cnref{k}), +Similarly, when MaBishomarim:7920\index{MaBishomarim (Tor bridge)} and JonbesheSabz:4148\index{JonbesheSabz (Tor bridge)} were blocked, +so were the Orbot-reserved\index{Orbot} MaBishomarim:1984 and JonbesheSabz:1984 (point~\cnref{k}), ending an eight-month unblocked streak. -The blocking of Mosaddegh:9332 and Azadi:6041 also affected other ports, +The blocking of Mosaddegh:9332\index{Mosaddegh (Tor bridge)} +and Azadi:6041\index{Azadi (Tor bridge)} also affected other ports, though after a delay of some days. We do not have an explanation for why some multiple-port blocks took effect faster than others. -The SSH port riemann:22 was blocked at about the same time (point~\cnref{m}), -10~months after the corresponding obfs4 port riemann:443 had been blocked; +The SSH\index{SSH} port riemann:22\index{riemann (Tor bridge)} was blocked at about the same time (point~\cnref{m}), +10~months after the corresponding obfs4\index{obfs4} port riemann:443 had been blocked; there had been no changes to the riemann host in all that time. 
-We suspected that the Great Firewall might employ a threshold scheme: +We suspected that the Great Firewall\index{Great Firewall of China} might employ a threshold scheme: once a certain number of individual ports on a particular IP address have been blocked, go ahead and block the entire IP address. -But riemann with its single obfs4 port is a counterexample to that idea. +But riemann\index{riemann (Tor bridge)} with its single obfs4 port is a counterexample to that idea. + +The Great Firewall\index{Great Firewall of China} has been repeatedly +documented to block individual ports (or small ranges of ports), +for example +in 2006 by Clayton et~al.~\indexauthors{\cite[\S 6.1]{Clayton2006a}}, +in 2012 by Winter and Lindskog~\indexauthors{\cite[\S 4.1]{Winter2012a}}, +and in 2015 by Ensafi et~al.~\indexauthors{\cite[\S 4.2]{Ensafi2015b}}. +The onset of all-ports blocking is therefore somewhat surprising. +Worth noting, though, is that Wang et~al.~\indexauthors{\cite[\S 7.3]{Wang2017a}}, +in another test of active probing in May 2017, % "May 2017": emailed the authors to ask +also found that newly probed bridges became blocked on all ports. +The change we saw in October 2016 may therefore be a sign +of a more general change in tactics. This was the first time we saw blocking of multiple ports on bridges that had been introduced during our measurements. -LeifEricson may be an example of the same phenomenon happening in the past, +LeifEricson\index{LeifEricson (Tor bridge)} may be an example of the same phenomenon happening in the past, before we even began our experiment. The host LeifEricson had, since February 2014, been running bridges on multiple ports, -and obfs4 on port 41213 since October 2014.
% https://gitweb.torproject.org/builders/tor-browser-bundle.git/commit/?id=3279db32f147479af225d7106949e8dddd360dbb % https://gitweb.torproject.org/builders/tor-browser-bundle.git/commit/?id=bb6389fbe7aa9539c4dce2aba0659e61ae8a376a LeifEricson:41213 remained blocked @@ -3614,16 +3709,16 @@ when we began testing them on % 2016-08-30 17:43:25,muskie,83.212.101.3,50000,30.0267419815,False,None,timed out August~30, 2016, they were all already blocked. To confirm, on October~4 we asked the operator privately to open additional, randomly selected ports, -and they too were blocked, as was the SSH port~22. +and they too were blocked, as was the SSH\index{SSH} port~22. In \autoref{sec:china-failures}, we observed that ports that had been caught up in whole-IP blocking exhibited different patterns of intermittent reachability after blocking, than did those ports that had been blocked individually. -We suspected that a two-tiered system left certain ports double-blocked---blocked +We suspected that a two-tiered system made certain ports double-blocked---blocked both by port and by IP address---which would make their blocking robust to a failure of one of the tiers. -The same pattern seems to happen with LeifEricson. +The same pattern seems to happen with LeifEricson\index{LeifEricson (Tor bridge)}. The newly opened ports 50000, 50001, and 50002 share brief periods of reachability in September and October 2016, but port 41213 during the same time remained solidly down. @@ -3632,13 +3727,15 @@ but port 41213 during the same time remained solidly down. \subsection{No discovery of Orbot bridges} \label{sec:china-orbot} -Orbot, the Android version of Tor, +\index{Orbot|(} + +Orbot, the version of Tor\index{Tor} for Android\index{Android}, also includes default bridges. -It has its own bridge configuration file, -similar to Tor Browser's but in a different format. 
+It has its own bridge configuration file\index{bridge configuration file}, +similar to Tor Browser's\index{Tor Browser}, but in a different format. Most of Orbot's bridges are borrowed from Tor Browser, -so when a bridge gets blocked, it ends up being blocked for both Orbot -and Tor Browser users. +so when a bridge gets blocked, it is blocked for users of both Orbot +and Tor Browser. There were, however, a few bridges that were used only by Orbot (see the ``Orbot bridges'' batch in \autoref{fig:proxy-probe-timelines-china1}). @@ -3654,14 +3751,17 @@ were the result of temporary misconfigurations, not blocking. They were unreachable during those outages from the control site as well.) These results show that whatever mechanism the censor had -for discovering and blocking default Tor Browsers, -it had not even that much for discovering and blocking Orbot bridges. +for discovering and blocking the default bridges of Tor Browser\index{Tor Browser}, +it lacked for discovering and blocking those of Orbot. Again we have a case of our assumptions not matching reality---blocking that should be easy to do, and yet is not done. -A lesson to take from all this is that there is a benefit to +A~lesson is that there is a benefit to some degree of compartmentalization between sets of default bridges. -Even though they are all in theory easy to discover, -in practice the censor may not have built the necessary automation. +Even though they are all, in theory, equally easy to discover, +in practice the censor has to build separate automation +for each set. + +\index{Orbot|)} \subsection{Continued blocking of established bridges} @@ -3677,8 +3777,8 @@ As expected, they were already blocked at the beginning, and remained so \label{sec:china-unused} As a control measure, we reserved a bridge in secret. 
-ndnop4:27668 (see point~\cnref{n} in \autoref{fig:proxy-probe-timelines-china1}) -was not published, neither in Tor Browser's bridge configuration file, +ndnop4:27668\index{ndnop4 (Tor bridge)} (see point~\cnref{n} in \autoref{fig:proxy-probe-timelines-china1}) +was not published, neither in Tor Browser's\index{Tor Browser} bridge configuration file\index{bridge configuration file}, nor in BridgeDB\index{BridgeDB}. As expected, it was never blocked. @@ -3691,14 +3791,14 @@ As expected, it was never blocked. \index{Iran|(} We had a probe site in Iran from December 2015 to June 2016, -a virtual private server which a personal contact could only -provide for a limited time. +a virtual private server, which a personal contact could only +provide for us for a limited time. \begin{figure} \centering \includegraphics{figures/proxy-probe-timelines-iran} \caption{ -Tor Browser default bridge reachability from Iran. +Default bridge reachability from a site in Iran. We found no evidence of blocking of default bridges in Iran. What connection failures there were, were also seen from our control site.
It seems that Iran has overlooked the blocking of default bridges. -Tor Metrics shows thousands of simultaneous bridge users +Tor Metrics\index{Tor!Metrics} shows thousands of simultaneous bridge users in Iran since 2014~\cite{tor-metrics-userstats-bridge-country-ir}, so it is unlikely that the bridges were simply blocked in a way that our probing script could not detect. -However, in Kazakhstan\index{Kazakhstan} we found exactly that situation, +However, in Kazakhstan\index{Kazakhstan} we did find such a situation, with bridges being effectively blocked despite the firewall allowing TCP\index{TCP} connections to them. @@ -3748,23 +3847,26 @@ despite the firewall allowing TCP\index{TCP} connections to them. We had a single probe site in Kazakhstan between December 2016 and May 2017. -It was a VPN (virtual private network) node, -with IP address 185.120.77.110. -It was in AS~203087, which belongs to GoHost.kz\index{GoHost.kz}, +It was a VPN\index{VPN} node +with IP address 185.120.77.110\index{185.120.77.110 (Kazakh VPN node)}. +It was in AS~203087\index{autonomous system!AS 203087}, which belongs to GoHost.kz\index{GoHost.kz}, a Kazakh hosting provider. -The flakiness of the VPN left us with two extended +The flaky VPN connection +left us with two extended gaps in measurements. \begin{figure}[p] \centering \includegraphics{figures/proxy-probe-timelines-kazakhstan} \caption{ -Tor Browser default bridge reachability from Kazakhstan. +Default bridge reachability from a site in Kazakhstan. Judging by TCP\index{TCP} reachability alone, -it would seem that the only disagreement with control---and -the only blocked bridge---is LeifEricson:41213, one of the oldest bridges. -However, actually trying to establish a Tor connection -through the obfs4 channel reveals that bridges actually are blocked.
+it would seem that there is no disagreement with the control site---and +therefore no blocked bridges. +However, the more intensive experiment of +\autoref{fig:proxy-probe-bridgetest-kazakhstan}, below, +reveals that despite being reachable at the TCP layer, +most of the bridges were in fact effectively blocked. } \index{Kazakhstan} \index{ndnop3 (Tor bridge)} @@ -3773,11 +3875,8 @@ \index{GreenBelt (Tor bridge)} \index{Lisbeth (Tor bridge)} \index{NX01 (Tor bridge)} -\index{LeifEricson (Tor bridge)} \index{cymrubridge31 (Tor bridge)} \index{cymrubridge33 (Tor bridge)} -\index{fdctorbridge01 (Tor bridge)} -\index{ndnop4 (Tor bridge)} \label{fig:proxy-probe-timelines-kazakhstan} \end{figure} @@ -3785,55 +3884,91 @@ \centering \includegraphics{figures/proxy-probe-bridgetest-kazakhstan} \caption{ -Tor connection progress in the U.S.\index{United States of America}\ and Kazakhstan. -These measurements show that even though bridges accepted TCP\index{TCP} connections, -the firewall usually caused them to stall before a Tor circuit -could be fully constructed. +Default bridge bootstrap progress from a site in Kazakhstan. +In contrast to \autoref{fig:proxy-probe-timelines-kazakhstan}, above, +this experiment built a full obfs4 connection and Tor circuit, +revealing blocking beyond the TCP\index{TCP} handshake. +Tor reports its connection progress as a percentage; +so here, ``success'' is on a continuum from 0\% to~100\%, +as is the degree of agreement with the control site. The first three batches were blocked since before we started measuring; -the next two were blocked while we were watching; +the next two were blocked in January, and the last was not blocked.
} -\index{United States of America} \index{Kazakhstan} +\index{Tor!bootstrapping} \index{ndnop3 (Tor bridge)} \index{ndnop5 (Tor bridge)} \index{Mosaddegh (Tor bridge)} \index{GreenBelt (Tor bridge)} \index{Lisbeth (Tor bridge)} \index{NX01 (Tor bridge)} -\index{LeifEricson (Tor bridge)} \index{cymrubridge31 (Tor bridge)} \index{cymrubridge33 (Tor bridge)} -\index{fdctorbridge01 (Tor bridge)} -\index{ndnop4 (Tor bridge)} \label{fig:proxy-probe-bridgetest-kazakhstan} \end{figure} -The bridge blocking in Kazakhstan was of a different nature +The bridge blocking in Kazakhstan had a different nature than that which we observed in China\index{China}. -At a TCP\index{TCP} reachability level, the only blocked bridge was -LeifEricson:41213---in \autoref{fig:proxy-probe-timelines-kazakhstan} -it is the only one whose measurements disagree with controls. -This, however, disagreed with reports of blocking of Tor -and pluggable transports since June 2016~\cite[\S obfs blocking]{kazakhstan-wiki}. -The reports stated that the connection would stall -(no packets received from the bridge) -a short time after the TCP\index{TCP} handshake. - -We deployed an additional probe script in Kazakhstan. -This one did not only try to establish a TCP\index{TCP} connection, -but also build a full Tor-in-obfs4 connection and build a circuit. -\autoref{fig:proxy-probe-bridgetest-kazakhstan} shows the results. -Tor reports its connection progress as a percentage; -connections to blocked bridges would usually fail at 25\%. -The bridges in the first three release batches were blocked -before we started measurements in December 2015. +Refer to \autoref{fig:proxy-probe-timelines-kazakhstan}: +every measurement agreed with the control site, +with the sole exception of LeifEricson:41213\index{LeifEricson (Tor bridge)} (not shown), +which was blocked as it had been in China. 
+However there had been reports of the blocking of Tor\index{Tor!protocol} +and pluggable transports\index{pluggable transports} since June 2016~\cite[\S obfs blocking]{kazakhstan-wiki}. +The reports stated that the TCP\index{TCP} handshake would succeed, +but the connection would stall +(with no packets received from the bridge) +a short time after the connection was underway. + +\index{Tor!bootstrapping|(} +We deployed an additional probing script in Kazakhstan. +This one tried not only to make a TCP\index{TCP} connection, +but also establish a full obfs4\index{obfs4} connection and build a Tor circuit\index{Tor!circuit}. +Tor reports its connection progress as a ``bootstrap'' percentage: +progression from 0\% to 100\% involves first making an obfs4 connection, +then downloading directory information and the consensus, +and finally building a circuit\index{Tor!circuit}. +\autoref{fig:proxy-probe-bridgetest-kazakhstan} shows the results of the tests. +What we found was consistent with reports: +despite being reachable at the TCP\index{TCP} layer, +some bridges would fail bootstrapping at~10\% +(e.g., Mosaddegh:80\index{Mosaddegh (Tor bridge)} and GreenBelt:80\index{GreenBelt (Tor bridge)}) or~25\% +(e.g., Mosaddegh:443 and GreenBelt:443). +For three of the bridges +(Mosaddegh:9332\index{Mosaddegh (Tor bridge)}, +Lisbeth:443\index{Lisbeth (Tor bridge)}, +and NX01:443\index{NX01 (Tor bridge)}) +we caught the approximate moment of blocking. +Initially they bootstrapped to 100\% and agreed with the control, +but later they reached only 25\% and disagreed with the control. +Incidentally, these results suggest that Kazakhstan, too, +blocks on a per-port basis, because for a time +Mosaddegh:80 and Mosaddegh:443 were blocked while +Mosaddegh:9332 was unblocked. +Two more bridges +(cymrubridge31:80\index{cymrubridge31 (Tor bridge)} and +cymrubridge33:80\index{cymrubridge33 (Tor bridge)}) +remained unblocked. 
+ +ndnop3:10527\index{ndnop3 (Tor bridge)} and ndnop5:13764\index{ndnop5 (Tor bridge)}, in the 5.5/6.0a1 batch, +are a special case. +Their varying bootstrap percentages were caused +by a misconfiguration on the bridge itself +(a file descriptor limit\index{file descriptor limit} +was set too low). +Even from the control site in the U.S.\index{United States of America}, +connections would fail to bootstrap to 100\% about 35\% of the time. +Still, it appears that both bridges were also blocked in Kazakhstan, +because from the control site the bootstrap percentage would oscillate +between 10\% and 100\%; +while from Kazakhstan it would oscillate between 10\% and 25\%. +\index{Tor!bootstrapping|)} + The bridges in the 6.0.6/6.5a4 and 6.0.8/6.5a6 batches -were blocked on or around January~26, 2017, -evidenced by the fact that they usually progressed to 100\% -before that date, and only to 25\% after. -The blocking date comes either 71~or 43~days -after public release, depending on which release you compare to. +were blocked on or around January~26, 2017. +This sets the blocking delay at either 71~or 43~days +after public release, depending on which release you compare against. \index{Kazakhstan|)} @@ -3880,7 +4015,7 @@ of the server you're trying to access normally appears in three places that are visible to the censor: \begin{itemize} \item the DNS query\index{DNS} -\item the client's TLS\index{TLS} Server Name Indication (SNI) extension~\cite[\S 3]{rfc6066}\index{Server Name Indication (SNI)} +\item the client's TLS\index{TLS} Server Name Indication (SNI) extension~\cite[\S 3]{rfc6066}\index{SNI} \item the server's TLS\index{TLS} certificate\index{certificate}~\cite[\S 7.4.2]{rfc5246}\index{common name (X.509)} \end{itemize} and in one place that is not visible to the censor, @@ -3918,7 +4053,7 @@ not the HTTP\index{HTTP} layer.
\label{fig:domain-fronting} \end{figure} -The SNI extension\index{Server Name Indication (SNI)} +The SNI extension\index{SNI} and the Host header\index{Host (HTTP header)} serve similar purposes. They both enable virtual hosting\index{virtual hosting}, which is when one server handles requests for multiple domains. @@ -3937,7 +4072,7 @@ the client cannot send the Host header\index{Host (HTTP header)} until the TLS\index{TLS} handshake is complete, and the server cannot complete the TLS handshake without knowing which certificate to send. -The SNI extension\index{Server Name Indication (SNI)} +The SNI extension\index{SNI} resolves the deadlock by sending the domain name in plaintext in the TLS layer. Domain fronting takes advantage of decoupling @@ -4028,7 +4163,7 @@ As far as I~have been able to find out, the first implementation of domain fronting was in GoAgent\index{GoAgent}, a circumvention system, circa 2012. GoAgent employed a variant of fronting where -the SNI\index{Server Name Indication (SNI)} +the SNI\index{SNI} is omitted, rather than being faked. % GoAgent 2.0 began sending HTTPS requests: % b4ab1f83f57b91eda34ae1743021fbb60ecd2f60 is the first bad commit @@ -4040,7 +4175,7 @@ is omitted, rather than being faked. Earlier in 2012, Bryce Boe wrote a blog post~\indexauthors{\cite{Boe2012a}} outlining how to use Google App Engine\index{Google App Engine} as a proxy, -and suggested that sending a false SNI\index{Server Name Indication (SNI)} +and suggested that sending a false SNI\index{SNI} could bypass SNI whitelisting\index{whitelisting}. Even farther back, in 2004, when HTTPS\index{HTTPS} and CDNs\index{content delivery network} were less common, @@ -4112,7 +4247,7 @@ Tan et~al.~\indexauthors{\cite{Tan2015a}} measured the Kullback--Leibler divergence\index{Kullback--Leibler divergence} between the distributions of packet size and packet timing\index{packet size and timing} in different protocols. 
-(The paper is written in Chinese\index{Chinese} +(The paper is written in Chinese\index{Chinese language} and my understanding of it is based on an imperfect translation.) Wang et~al.~\indexauthors{\cite{Wang2015a}} @@ -4125,12 +4260,6 @@ and tested their false-classification\index{false positives}\index{false negativ against real traffic traces. -% \section{Fronting-capable web services} -% \label{sec:fronting-services} -% -% \dragons - - \section{A pluggable transport for Tor} \label{sec:meek-impl} @@ -4166,7 +4295,7 @@ hides the bridge's address, which is presumably blocked\index{blocking!by address}. } \index{Tor bridge} -\index{Server Name Indication (SNI)} +\index{SNI} \index{Host (HTTP header)} \index{meek} \label{fig:meek} @@ -4285,7 +4414,7 @@ through passive observation. For example, the Great Firewall\index{Great Firewall of China} used Tor's\index{Tor} TLS fingerprint for detection as early as 2011~\cite{tor-trac-4744}. For this reason, meek\index{meek} strives to make its TLS fingerprint -look like that of a browser. +look like that of a browser\index{web browser}. It does this by relaying its HTTPS\index{HTTPS} requests through a local headless browser (which is completely separate from the browser that the user interacts with). @@ -4368,15 +4497,17 @@ cost to run it. \subsection*{2013: Precursors; prototypes} -The prehistory of meek begins in 2013 with flash proxy\index{flash proxy}~\cite{Fifield2012a}. -Flash proxy clients need a secure rendezvous, +The prehistory of meek begins in 2013 with flash proxy\index{flash proxy}~\cite{Fifield2012a-local}, +a circumvention system built around +web browser\index{web browser}--based proxies. +Flash proxy clients need a secure rendezvous\index{rendezvous!of flash proxy}, a way to register their address to a central facilitator\index{flash proxy!facilitator}, so that flash proxies may connect back to them. 
Initially there were only two means of registration: -flashproxy-reg-http\index{flashproxy-reg-http}, +flashproxy-reg-http\index{flash proxy!flashproxy-reg-http}, which sent client registrations as HTTP\index{HTTP} requests; -and flashproxy-reg-email\index{flashproxy-reg-email}, +and flashproxy-reg-email\index{flash proxy!flashproxy-reg-email}, which sent client registrations to a distinguished email\index{email} address. We knew that flashproxy-reg-http was easily blockable; flashproxy-reg-email had good blocking resistance @@ -4387,30 +4518,30 @@ showed me an example of using domain fronting---though we didn't have a name for it then---to access a simple HTML-rewriting proxy\index{HTML-rewriting proxy} based on Google App Engine\index{Google App Engine}. -I~eventually realized that the same trick would work for flash proxy rendezvous\index{flash proxy!rendezvous}. +I~eventually realized that the same trick would work for flash proxy rendezvous\index{rendezvous!of flash proxy}. I~proposed a design~\cite{tor-trac-8860} in May 2013 and within a month Arlo Breault\index{Breault, Arlo} had written -flashproxy-reg-appspot\index{flashproxy-reg-appspot}, -which worked just like flashproxy-reg-http\index{flashproxy-reg-http}, +flashproxy-reg-appspot\index{flash proxy!flashproxy-reg-appspot}, +which worked just like flashproxy-reg-http\index{flash proxy!flashproxy-reg-http}, except that it fronted through \nolinkurl{www.google.com}\index{www.google.com@\nolinkurl{www.google.com}} rather than contacting the registration server directly. The fronting-based registration became flash proxy's\index{flash proxy} preferred registration method, being faster and simpler than the email-based\index{email} one. -The development of domain fronting, from a simple rendezvous technique, -into a full-fledged bidirectional transport, seems slow in retrospect. 
+The development of domain fronting, from a simple rendezvous\index{rendezvous} technique +to a full-fledged bidirectional transport, seems slow in retrospect. All the pieces were there; it was a matter of putting them together. I~did not immediately appreciate the potential of domain fronting when I~first saw it. -Even after the introduction of flashproxy-reg-appspot\index{flashproxy-reg-appspot}, +Even after the introduction of flashproxy-reg-appspot\index{flash proxy!flashproxy-reg-appspot}, months passed before the beginning of meek\index{meek}. -The whole idea behind flash proxy rendezvous\index{flash proxy!rendezvous} +The whole idea behind flash proxy rendezvous\index{rendezvous!of flash proxy} is that the registration channel can be of low quality---unidirectional, low-bandwidth, and high-latency---because it is only used to bootstrap into a more capable channel (WebSocket\index{WebSocket}, in flash proxy's case). Email\index{email} fits this model well: not good for a general-purpose channel, -but just good enough for rendezvous. +but just good enough for rendezvous\index{rendezvous}. The fronting-based HTTP\index{HTTP} channel, however, was more capable than needed for rendezvous, being bidirectional and reasonably high-performance. @@ -4438,7 +4569,7 @@ I~was self-conscious that the idea at the core of the system, domain fronting was a simple one and easy to implement. Not wanting to oversell it, I~settled on the name ``meek,'' -in lower case for extra meekness. +in small letters for extra meekness. I~lost time in the premature optimization of meek's network performance. I~was thinking about the request--response nature of HTTP\index{HTTP}, @@ -4472,7 +4603,7 @@ In the post, I~linked to the source code, described the protocol, and explained how to try it, using an App Engine\index{Google App Engine} instance I~set up shortly before. 
-At this time there was no web browser TLS camouflage\index{TLS!fingerprinting}, +At this time there was no web browser\index{web browser} TLS camouflage\index{TLS!fingerprinting}, and only App Engine was supported. I~was not yet using the term ``domain fronting.'' The big ideas of the title were as follows: @@ -4663,14 +4794,14 @@ Shortly thereafter, GreatFire\index{GreatFire}, an anticensorship organization t was mentioned in the article, experienced a new type of denial-of-service\index{denial of service} attack~\cite{greatfire-we-are-under-attack}, caused by a Chinese\index{China} network attack system -later dubbed the Great Cannon\index{Great Cannon}~\cite{Marczak2015a-local}. +later known as the Great Cannon\index{Great Cannon}~\cite{Marczak2015a-local}. They blamed the attack on the attention brought by the news article. As further fallout, Cloudflare\index{Cloudflare}, a CDN\index{content delivery network} which Lantern\index{Lantern} used for fronting and whose CEO was quoted in the article, stopped supporting domain fronting~\cite{PrinceCloudflareHackerNews}, -by beginning to enforce a match between the SNI\index{Server Name Indication (SNI)} +by beginning to enforce a match between the SNI\index{SNI} and the Host header\index{Host (HTTP header)} Since its first deployment, the Azure\index{Microsoft Azure}\index{meek-azure} backend @@ -4767,7 +4898,7 @@ used Kullback--Leibler divergence\index{Kullback--Leibler divergence} to quantify the differences between protocols, with respect to packet size and interarrival time distributions\index{packet size and timing}. -Their paper is written in Chinese\index{Chinese}; +Their paper is written in Chinese\index{Chinese language}; I~read it in machine translation. Wang et~al.~\indexauthors{\cite{Wang2015a}} published a more comprehensive report @@ -4858,7 +4989,7 @@ about what happened. Some botnet\index{botnet} had apparently been misusing meek for command and control\index{command and control} purposes. 
Its operators had not even bothered to set up their own App Engine\index{Google App Engine} project; -they were freeriding on the service we had been operating for the public. +they were free-riding on the service we had been operating for the public. Although we may have been able to reinstate the meek-google\index{meek-google} service, seeing as the suspension was the result of someone else's actions, not ours, with the existing uncertainty around the terms of service\index{terms of service} @@ -4870,7 +5001,7 @@ misusing meek-google\index{meek-google}, but an organized political hacker group known as Cozy Bear\index{Cozy Bear} or APT29. Matthew Dunwoody presented observations to that effect -in a FireEye blog post~\indexauthors{\cite{fireeye-apt29_domain_frontin}} +in a FireEye\index{FireEye} blog post~\indexauthors{\cite{fireeye-apt29_domain_frontin}} in March 2017. The malware would install a backdoor that operated over a Tor onion service\index{onion service}, and used meek for camouflage. @@ -4880,14 +5011,14 @@ but I~was not aware of them until the blog post. The year 2016 brought the first reports of efforts to block meek. These efforts all had in common that they used TLS fingerprinting\index{TLS!fingerprinting}\index{blocking!by content} -in conjunction with SNI\index{Server Name Indication (SNI)} inspection\index{blocking!by address}. +in conjunction with SNI\index{SNI} inspection\index{blocking!by address}. In May, a Tor user reported that Cyberoam\index{Cyberoam}, a firewall company, had released an update that enabled detection\index{detection} and blocking\index{blocking} of meek, among other Tor pluggable transports\index{pluggable transports}~\cite{tor-dev-cyberoam}. 
Through experiments we determined that the firewall was detecting meek whenever it saw a combination of two features: a specific client TLS fingerprint, -and an SNI\index{Server Name Indication (SNI)} containing any of our three front domains\index{front domain}: +and an SNI\index{SNI} containing any of our three front domains\index{front domain}: \nolinkurl{www.google.com}\index{www.google.com@\nolinkurl{www.google.com}}, \nolinkurl{a0.awsstatic.com}\index{a0.awsstatic.com@\nolinkurl{a0.awsstatic.com}}, or \nolinkurl{ajax.aspnetcdn.com}\index{ajax.aspnetcdn.com@\nolinkurl{ajax.aspnetcdn.com}}~\cite{traffic-obf-cyberoam}. @@ -4898,9 +5029,9 @@ by the firewall to limit collateral damage\index{collateral damage}: it did not block those domains for all clients, but only for the subset having a particular TLS fingerprint\index{TLS!fingerprinting}. I~admit that I~had not considered the possibility -of using TLS\index{TLS} and SNI\index{Server Name Indication (SNI)} together to make a more precise classifier. +of using TLS\index{TLS} and SNI\index{SNI} together to make a more precise classifier. We had known since the beginning of the possibility of TLS fingerprinting, -which is why we took the trouble to implement browser-based TLS camouflage. +which is why we took the trouble to implement browser\index{web browser}-based TLS camouflage. The camouflage was performing as intended: even an ordinary Firefox~38\index{Firefox web browser} (the basis of Tor Browser\index{Tor Browser}, and what meek camouflaged itself as) @@ -4918,7 +5049,7 @@ by a FortiGuard firewall\index{FortiGuard}~\cite{traffic-obf-fortiguard} from Tor user Kanwaljeet Singh Channey\index{Channey, Kanwaljeet Singh}. The situation was virtually the same as in the Cyberoam\index{Cyberoam} case: the firewall would block connections having a specific TLS fingerprint\index{TLS!fingerprint}\index{blocking!by content} -and a specific SNI\index{Server Name Indication (SNI)}\index{blocking!by address}. 
+and a specific SNI\index{SNI}\index{blocking!by address}. This time, the TLS fingerprint was that of Firefox~45\index{Firefox web browser} (which by then Tor Browser\index{Tor Browser} had upgraded to); and the specific SNIs were two, not three, omitting @@ -4960,8 +5091,8 @@ pending a migration to new infrastructure. The Brazil\index{Brazil} count would remain low until rising again in June 2017. In September 2016, I~began mentoring Katherine Li\index{Li, Katherine} -in writing a program GAEuploader\index{GAEuploader}~\cite{LiGAEuploader}, -to simplify and automate the process of +in writing GAEuploader\index{GAEuploader}~\cite{LiGAEuploader}, +a program to simplify and automate the process of setting up domain fronting. The program automatically uploads the necessary code to Google App Engine\index{Google App Engine}, @@ -5071,8 +5202,8 @@ roughly doubled during that time. Snowflake is a new circumvention system currently under development. It is based on peer-to-peer connections -through lightweight, ephemeral proxies -that run in web browsers. +through ephemeral proxies +that run in web browsers\index{web browser}. Snowflake proxies are lightweight: activating one is as easy as browsing to a web page and shutting one down only requires closing the browser tab. @@ -5086,51 +5217,55 @@ If the censor manages to block the IP address of one proxy, there is little harm, because many other temporary proxies are ready to take its place. -Snowflake~\cite{snowflake-wiki} is the spiritual successor -to flash proxy~\cite{Fifield2012a-local}, -a system that similarly used browser-based proxies. 
-Flash proxy, with obfs2 and obfs3, was one of the first three pluggable -transports for Tor~\cite{tor-blog-combined-flash-proxy-pyobfsproxy-browser-bundles}, +Snowflake~\cite{snowflake-wiki,snowflake-technical} is the spiritual successor +to flash proxy\index{flash proxy}~\cite{Fifield2012a-local}, +a system that similarly used browser-based\index{web browser} proxies, +written in JavaScript\index{JavaScript}. +Flash proxy, with obfs2\index{obfs2} and obfs3\index{obfs3}, was one of the first three +pluggable transports\index{pluggable transports} +for Tor~\cite{tor-blog-combined-flash-proxy-pyobfsproxy-browser-bundles}, but since its introduction in 2013 it never had many users~\cite{tor-metrics-userstats-bridge-transport-websocket}. I~believe that its lack of adoption was a result mainly -of its incompatibility with NAT (network address translation): +of its incompatibility with NAT (network address translation)\index{network address translation}: its use of the TCP-based\index{TCP} WebSocket\index{WebSocket} protocol~\cite{rfc6455} -required clients to follow complicated port forwarding instructions~\cite{flashproxyhowto-wiki}. -For that reason flash proxy was deprecated in 2016~\cite{tor-trac-17428}. +required clients to follow complicated port forwarding instructions\index{usability}~\cite{flashproxyhowto-wiki}. +For that reason, flash proxy was deprecated in 2016~\cite{tor-trac-17428}. -Snowflake keeps flash proxy's basic idea of in-browser proxies, +Snowflake keeps the basic idea of in-browser proxies, but replaces WebSocket\index{WebSocket} with WebRTC~\cite{draft-ietf-rtcweb-overview}\index{WebRTC}, a suite of protocols for peer-to-peer communications. Importantly, WebRTC uses UDP\index{UDP} for communication, -and includes facilities for NAT traversal, +and includes facilities for NAT\index{network address translation} traversal, allowing most clients to use it without manual configuration. 
-WebRTC mandatorily encrypts its channels, +WebRTC mandatorily encrypts\index{encryption} its channels, which as a side effect obscures any keywords or byte patterns -in the tunneled traffic. -(While leaving open the possibility of detecting +in the tunneled\index{tunneling} traffic. +(Still leaving open the possibility of detecting\index{detection!by content} the use of WebRTC\index{WebRTC} itself---see \autoref{sec:webrtc-fingerprinting}.) -Aside from flash proxy, -the most similar existing design was a former version of uProxy~\cite{uproxy}. +Aside from flash proxy\index{flash proxy}, +the most similar existing design was a former version of uProxy\index{uProxy}~\cite{uproxy} +(an upcoming revision will work differently). uProxy required clients to know a confederate outside the censor's network who could run a proxy. The client would connect through the proxy using WebRTC\index{WebRTC}; the proxy would then directly fetch -the client's requested URLs. -Snowflake centralizes the proxy discovery process, +the client's requested URLs\index{URL}. +Snowflake centralizes the proxy discovery process\index{proxy discovery problem}, removing the requirement to arrange one's own proxy outside the firewall. Snowflake proxies are merely dumb pipes to a more capable proxy, allowing them to carry traffic other than web traffic, and preventing them from spying on the client's traffic. -prior coordination with a friend before connecting. -The name ``Snowflake'' comes from one of WebRTC's\index{WebRTC} subprotocols, -called ICE (Interactive Connectivity Establishment)~\cite{rfc5245}, +The name Snowflake comes from one of WebRTC's\index{WebRTC} subprotocols, +ICE\index{ICE} (Interactive Connectivity Establishment)~\cite{rfc5245}, and from the temporary proxies, which resemble snowflakes in their impermanence and uniqueness. +\pagebreak[4] + Snowflake now exists in an experimental alpha release, incorporated into Tor Browser\index{Tor Browser}.
My main collaborators on the Snowflake project are @@ -5148,7 +5283,15 @@ Hooman Mohajeri Moghaddam\index{Moghaddam, Hooman Mohajeri}. \includegraphics{figures/snowflake} \caption{ Schematic of Snowflake. +See \autoref{fig:snowflake-rendezvous} for an elaboration +on Steps~1, 2, and~3. +\todo[inline]{missing bridge} } +\index{broker (Snowflake)} +\index{offer (Snowflake)} +\index{answer (Snowflake)} +\index{WebRTC} +\index{domain fronting!as Snowflake rendezvous} \label{fig:snowflake} \end{figure} @@ -5156,57 +5299,56 @@ There are three main components of the Snowflake system. Refer to \autoref{fig:snowflake}. \begin{itemize} \item -many \emph{snowflake proxies} (``snowflakes'' for short), -which communicate with clients using WebRTC\index{WebRTC} and forward +many \emph{snowflake proxies}, +which communicate with clients over WebRTC\index{WebRTC} and forward their traffic to the bridge \item many \emph{clients}, responsible for -initially requesting service and then routing traffic -though snowflakes as they arrive +initially requesting service and then establishing +peer-to-peer connections with snowflake proxies \item -the \emph{broker}, an online database that serves -to match clients with snowflakes +a \emph{broker}\index{broker (Snowflake)|textbf}, an online database that serves +to match clients with snowflake proxies \item -the \emph{bridge} +a \emph{bridge} (so called to distinguish it from the snowflake proxies), a full-featured proxy capable of connecting to any destination \end{itemize} The architecture of the system is influenced -by the requirement that proxies run in a browser, +by the requirement that proxies run in a browser\index{web browser}, and the nature of WebRTC\index{WebRTC} connection establishment, which uses a bidirectional handshake. -In our implementation, the bridge is really a Tor bridge. 
-Even though a Tor circuit consists of multiple hops, -that fact is abstracted away from the client's perspective; -the Snowflake design does not inherently depend on Tor. - -A Snowflake connection happens in multiple steps -(refer to \autoref{fig:snowflake}). -In the first part, called \emph{rendezvous}, +In our implementation, the bridge is really a Tor bridge\index{Tor!bridges}. +Even though a Tor circuit\index{Tor!circuit} consists of multiple hops, +that fact is abstracted away from the Tor client's perspective; +Snowflake does not inherently depend on Tor\index{Tor}. + +A Snowflake connection happens in multiple steps. +In the first phase, called \emph{rendezvous}\index{rendezvous!of Snowflake}, the client and snowflake exchange information necessary for a WebRTC\index{WebRTC} connection. \begin{enumerate} \item The client registers its need for service by sending -a message to the broker. -The message, called an \emph{offer}~\cite{rfc3264}, +a message to the broker\index{broker (Snowflake)}. +The message, called an \emph{offer}\index{offer (Snowflake)}~\cite{rfc3264}, contains the client's IP address and other metadata needed to establish a WebRTC\index{WebRTC} connection. How the client sends its offer is further explained below. \item -At some point, a snowflake comes online -and polls the broker. -The broker hands the client's offer to the proxy, -which sends back its \emph{answer}~\cite{rfc3264}, +At some point, a snowflake proxy comes online +and polls the broker\index{broker (Snowflake)}. +The broker hands the client's offer to the snowflake proxy, +which sends back its \emph{answer}\index{answer (Snowflake)}~\cite{rfc3264}, containing its IP address and other connection metadata the client will need to know. \item -The broker sends back to the client +The broker\index{broker (Snowflake)} sends back to the client the snowflake's answer message. \end{enumerate} At this point rendezvous is finished. 
-The snowflake has the client's offer, -and the client has the snowflake's answer, +The snowflake has the client's offer\index{offer (Snowflake)}, +and the client has the snowflake's answer\index{answer (Snowflake)}, so they have all the information needed to establish a WebRTC\index{WebRTC} connection to each other. \begin{enumerate}[resume] @@ -5223,39 +5365,41 @@ between client and bridge until it is terminated. The client's communication with the bridge is encrypted and authenticated end-to-end through the WebRTC\index{WebRTC} tunnel, -so the proxy may not interfere with it. +so the proxy cannot interfere with it. When the snowflake proxy terminates, the client may request a new one. Various optimizations are possible, such as having the client maintain a pool of proxies -so as to bridge gaps in connectivity, +in order to bridge gaps in connectivity, but we have not implemented and tested them sufficiently to state their effects. +\index{rendezvous!of Snowflake|(} The rendezvous phase bears further explanation. Steps~1, 2, and~3 actually happen synchronously, -using interleaved HTTP requests and responses. -See \autoref{fig:snowflake-rendezvous}. -The client's single request uses domain fronting\index{domain fronting!as rendezvous for Snowflake}\index{Snowflake!rendezvous} -and those of the snowflakes are direct. -In Step~1, the client sends an request containing its offer. -The broker holds the connection open but does not immediately respond. -In Step~2, a snowflake makes a polling request +using interleaved HTTP\index{HTTP} requests and responses: +see \autoref{fig:snowflake-rendezvous}. +The client's single request uses domain fronting\index{domain fronting!as rendezvous for Snowflake}\index{rendezvous!of Snowflake}, +but the requests of the snowflake proxies are direct. +In Step~1, the client sends a request containing its offer\index{offer (Snowflake)}. +The broker\index{broker (Snowflake)} holds the connection open but does not immediately respond. 
+In Step~2, a snowflake proxy makes a polling request (``do you have any clients for me?'') -and the broker responds with the client's offer. -The snowflake composes its answer and sends it back -to the broker in a second HTTP request -(linked to the first by a random token). -In Step~3, the broker finally responds to the client's -initial request by passing on the snowflake's answer. +and the broker responds with the client's offer\index{offer (Snowflake)}. +The snowflake composes its answer\index{answer (Snowflake)} and sends it back +to the broker in a second HTTP\index{HTTP} request +(linked to the first by a random\index{randomness} token). +In Step~3, the broker\index{broker (Snowflake)} finally responds to the client's +initial request by passing on the snowflake proxy's answer. From the client's point of view, it has sent -a single request (containing an offer) -and received a single response (containing an answer). +a single request (containing an offer\index{offer (Snowflake)}) +and received a single response (containing an answer\index{answer (Snowflake)}). If no proxy arrives within a time threshold of the client sending its offer, the broker replies with an error message instead. -We learned from the experience of running flash proxy -that it is not difficult to archive a proxy arrival rate of -several per second, so timeouts should be exceptional. +We learned from the experience of running flash proxy\index{flash proxy} +that it is not difficult to achieve a proxy arrival rate of +several per second, so timeouts ought to be exceptional. +\index{rendezvous!of Snowflake|)} \begin{figure} \centering @@ -5272,18 +5416,22 @@ to send back its answer. 
\index{POST (HTTP method)} \index{HTTP} \index{200 (HTTP status code)} -\index{domain fronting!as rendezvous for Snowflake} -\index{Snowflake!rendezvous} +\index{domain fronting!as Snowflake rendezvous} +\index{rendezvous!of Snowflake} +\index{broker (Snowflake)} +\index{offer (Snowflake)} +\index{answer (Snowflake)} \label{fig:snowflake-rendezvous} \end{figure} -One may ask, if the domain-fronted rendezvous channel +One may ask, if the domain-fronted\index{domain fronting} +rendezvous\index{rendezvous} channel is bidirectional and already assumed to be difficult to block, -why doesn't it suffice for circumvention on its own? +doesn't it suffice for circumvention on its own? The answer is that it does suffice---that's the idea behind meek (\autoref{sec:meek-history}). The disadvantage of building a system -exclusively on domain fronting is high monetary cost +exclusively on domain fronting, though, is high monetary cost\index{domain fronting!cost of} (see \autoref{tab:meek-costs} on page~\pageref{tab:meek-costs}). Snowflake offloads the bulk of data transfer onto WebRTC\index{WebRTC}, and uses expensive domain fronting only for rendezvous. @@ -5292,7 +5440,7 @@ There are two reasons why the snowflake proxies forward client traffic to a separate bridge, rather than connecting directly to the client's desired destination. -The first is generality: a browser-based proxy +The first is generality: a browser\index{web browser}-based proxy can only do the things a browser can do; it can fetch web pages but cannot, for example, open sockets to arbitrary destinations. @@ -5300,19 +5448,19 @@ The second is privacy: the proxies are operated by untrusted, potentially malicious strangers. If they were to exit client traffic directly, -they could tamper with it; -furthermore a malicious \emph{client} -would be able to cause a well-meaning proxy +they would be able to tamper with it. 
+Furthermore, a malicious \emph{client} +could cause a well-meaning proxy to connect to suspicious destinations, potentially getting its operator in trouble. -This is essentially -untrusted messenger delivery~\cite{Feamster2003a}, +This ``many proxies, one bridge'' model is essentially +untrusted messenger delivery\index{untrusted messenger delivery}~\indexauthors{\cite{Feamster2003a}}, proposed by Feamster et~al.\ in~2003. WebRTC\index{WebRTC} offers two features that are necessary for Snowflake: \begin{enumerate*} -\item it is supported in web browsers, and -\item it deals with NAT. +\item it is supported in web browsers\index{web browser}, and +\item it deals with NAT\index{network address translation}. \end{enumerate*} In other respects, though, WebRTC\index{WebRTC} is a nuisance. Its close coupling with browser code makes it difficult @@ -5325,8 +5473,8 @@ which is useful for some of its intended use cases, but which we would prefer not to have to deal with. Working within a browser environment limits our flexibility, because we cannot access the network directly, -but only at arm's length through some or other~API. -This has implications for detection by content, +but only at arm's length through some~API. +This has implications for detection by content\index{detection!by content}, as discussed in the next section. @@ -5336,17 +5484,17 @@ as discussed in the next section. \index{WebRTC!fingerprinting|(} Snowflake primarily tackles the problem of -detection by address. +detection by address\index{detection!by address}. The pool of temporary proxies changes too quickly -for a censor to keep up with -(at least that's the idea). +for a censor to keep up with---or at +least that's the idea. Equally important, though, -is the problem of detection by content. +is the problem of detection by content\index{detection!by content}. If Snowflake's protocol has an easily detectable ``tell,'' then it could be blocked despite its address diversity. 
-Just as with meek we were concerned about -TLS fingerprinting (\autoref{sec:meek-impl}), +Just as with meek\index{meek} we were concerned about +TLS fingerprinting\index{TLS!fingerprinting} (\autoref{sec:meek-impl}), with Snowflake we are concerned with WebRTC\index{WebRTC} fingerprinting. @@ -5359,7 +5507,8 @@ And that alone is not enough---it also must be that the censor is reluctant to block those other uses of WebRTC\index{WebRTC}. -Mia Gil~Epner and I~began an investigation into +Mia Gil~Epner\index{Gil Epner, Mia}\index{Fifield, David} +and I~began an investigation into the potential fingerprintability of WebRTC~\cite{FifieldGilEpnerWebRTC,snowflake-fingerprinting-wiki}\index{WebRTC}. While preliminary, we were able to find many @@ -5381,48 +5530,48 @@ the peer-to-peer connection. WebRTC offers no standard way to do signaling~\cite[\S 3]{draft-ietf-rtcweb-overview}; it is left up to implementers. For example, some implementations do signaling via -XMPP, an instant messaging protocol. -Snowflake does signaling through the broker, -during the rendezvous phase. +XMPP\index{XMPP}\index{instant messaging}, an instant messaging protocol. +Snowflake does signaling through the broker\index{broker (Snowflake)}, +during the rendezvous\index{rendezvous!of Snowflake} phase. \item[ICE] -ICE (Interactive Connectivity Establishment)~\cite{rfc5245} +ICE\index{ICE} (Interactive Connectivity Establishment)~\cite{rfc5245} is a combination of two protocols. -STUN (Session Traversal Utilities for NAT)~\cite{rfc5389} +STUN\index{STUN} (Session Traversal Utilities for NAT)~\cite{rfc5389} helps hosts open and maintain a binding -in a NAT table. -TURN (Traversal Using Relays around NAT)~\cite{rfc5766} -is a way to proxying through a third party, +in a NAT\index{network address translation} table. 
+TURN\index{TURN} (Traversal Using Relays around NAT)~\cite{rfc5766} +is a way of proxying through a third party when the end hosts' NAT configurations are such that they cannot communicate directly. In STUN, both client and server messages have a number of optional attributes, -including one called SOFTWARE +including one called SOFTWARE\index{SOFTWARE (STUN attribute)} that directly specifies the implementation. -Furthermore, the very choice of what STUN and TURN server +Furthermore, the very choice of which STUN and TURN servers to use is a choice made by the client. \item[Media and data channels] -WebRTC\index{WebRTC} offers media channels (used for audio and video) +WebRTC\index{WebRTC}\index{WebRTC!media versus data channels} offers media channels (used for audio and video) as well as two kinds of data channels (stream-oriented reliable and datagram-oriented unreliable). -All channels are encrypted, +All channels are encrypted\index{encryption}, however they are encrypted differently according to their type. Media channels use -SRTP (Secure Real-time Transport Protocol)~\cite{rfc3711} +SRTP\index{SRTP} (Secure Real-time Transport Protocol)~\cite{rfc3711} and data channels use -DTLS (Datagram TLS)~\cite{rfc6347}. +DTLS\index{DTLS} (Datagram TLS)~\cite{rfc6347}. Even though the contents of both are encrypted, an observer can easily distinguish\index{distinguishability} a media channel from a data channel\index{WebRTC!media channel versus data channel}. Applications that use media channels have options for doing key exchange: some borrow the DTLS handshake in a process called -DTLS-SRTP~\cite{rfc5764} +DTLS-SRTP\index{DTLS-SRTP}~\cite{rfc5764} and some use -SRTP with Security Descriptions (SDES)~\cite{rfc4568}. +SRTP with Security Descriptions (SDES\index{SDES})~\cite{rfc4568}. Snowflake uses reliable data channels. \item[DTLS] -DTLS, as with TLS, +DTLS, as with TLS\index{TLS}, offers a wealth of fingerprintable features. 
Some of the most salient are the protocol version, extensions, @@ -5431,74 +5580,75 @@ and values in the server's certificate. \end{description} Snowflake uses a WebRTC\index{WebRTC} library extracted -from the Chromium web browser, +from the Chromium web browser\index{Chromium web browser}, which mitigates some potential dead-parrot distinguishers~\cite{Houmansadr2013b}\index{dead-parrot attacks}. -But the protocol remains complicated +But WebRTC remains complicated and its behavior on the network -depends on more than the WebRTC library in use. +depends on more than just what library is in use. We conducted a survey of some WebRTC-using\index{WebRTC} applications in order to get an idea of the implementation choices being made in practice. -We tested three applications that use media channels, +We tested three applications that use media channels\index{WebRTC!media versus data channels}, all chat services: -Google Hangouts (\url{https://hangouts.google.com}), -Facebook Messenger (\url{https://www.messenger.com}), -and OpenTokRTC (\url{https://opentokrtc.com/}). +Google Hangouts (\url{https://hangouts.google.com})\index{Google Hangouts}, +Facebook Messenger (\url{https://www.messenger.com})\index{Facebook Messenger}, +and OpenTokRTC (\url{https://opentokrtc.com/})\index{OpenTokRTC}. We also tested two applications that use data channels: Snowflake itself and -Sharefest (\url{https://github.com/Peer5/ShareFest}), +Sharefest (\url{https://github.com/Peer5/ShareFest})\index{Sharefest}, a now-defunct file sharing service. Naturally, the network fingerprints of all five applications -were distinguishable at some level. -Snowflake, by default, uses a Google-operated STUN server, +were distinguishable\index{distinguishability} at some level. +Snowflake, by default, uses a Google\index{Google}-operated STUN\index{STUN} server, which may be a good choice because so do Hangouts and Sharefest. All applications other than Hangouts -used DTLS for key exchange. 
+used DTLS\index{DTLS} for key exchange. While the client portions differed, the server certificate was more promising, in all cases having a Common Name\index{common name (X.509)} of ``WebRTC'' and a validity of 30~days. +\index{DTLS!fingerprinting|(} Finally, we wrote a script~\cite{github-DTLS-fingerprint} to detect and fingerprint DTLS handshakes. -While DTLS does not -Vern Paxson ran it for us on a day's worth of traffic -from Lawrence Berkeley National Lab. -The script turned up only seven handshakes, +Running the script on a day's worth of traffic +from Lawrence Berkeley National Laboratory\index{Lawrence Berkeley National Laboratory} +turned up only seven handshakes, having three distinct fingerprints. While it is difficult to generalize from one measurement at one site, these results suggest that WebRTC\index{WebRTC} use---at least the forms that use DTLS---is not common. -We guessed that Google Hangouts would be the +We guessed that Google Hangouts\index{Google Hangouts} would be the main source of WebRTC connections; however our script would not have found Hangouts connections because Hangouts does not use DTLS. +\index{DTLS!fingerprinting|)} \index{WebRTC!fingerprinting|)} \index{Snowflake|)} -\chapter{Don't call it a conclusion} +% \chapter{Don't call it a conclusion} -\dragons +% \dragons -Computer security is already on shaky ground -even when we are dealing with trustworthy endpoints. -How much harder it is when users' -own computers must be counted among the threats they face. -People already have an adversarial relationship -with the hostile apps on their phones -free software +% Computer security is already on shaky ground +% even when we are dealing with trustworthy endpoints. +% How much harder it is when users' +% own computers must be counted among the threats they face. 
+% People already have an adversarial relationship +% with the hostile apps on their phones +% free software -Let us strive, therefore, -to control the pace, -and spend whatever time remains in the race -winning, not losing. +% Let us strive, therefore, +% to control the pace, +% and spend whatever time remains in the race +% winning, not losing. % Probably the circumstances of the world change % and make irrelevant this field of study. @@ -5507,7 +5657,14 @@ winning, not losing. \backmatter -\defbibnote{bibnote}{\todo[inline]{Note about archived URLs.}} +\defbibnote{bibnote}{ +I~strive to provide a URL\index{URL} with references whenever possible. +I~archived a copy of each URL at the +Internet Archive\index{Internet Archive} +on or about December 14, 2017. +If a link is broken, look for an archived version at +\url{https://web.archive.org/}. +\todo[inline]{Archive the URLs.}} \printbibliography[heading=bibintoc,prenote=bibnote] \clearpage