From c9e403585ba4bcd76ddcb67e586ad83e9065fa26 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jun 2016 14:56:14 -0500 Subject: [PATCH] switching from host limits to domain limits, which apply in aggregate to the host and subdomains --- setup.py | 2 +- tests/test_warcprox.py | 115 +++++++++++++++++++++++++++++------------ warcprox/__init__.py | 66 +++++++++++++++++++++++ warcprox/stats.py | 25 +++++---- warcprox/warcproxy.py | 58 ++------------------- 5 files changed, 164 insertions(+), 102 deletions(-) diff --git a/setup.py b/setup.py index d23fcea..d3428f6 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev17', + version='2.0.dev18', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index edc44ec..76901d1 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -42,6 +42,7 @@ import pprint import traceback import signal from collections import Counter +import socket try: import http.server as http_server @@ -65,6 +66,33 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) +# monkey patch dns lookup so we can test domain inheritance on localhost +orig_getaddrinfo = socket.getaddrinfo +orig_gethostbyname = socket.gethostbyname +orig_socket_connect = socket.socket.connect + +def _getaddrinfo(host, port, family=0, type=0, proto=0, flags=0): + if host.endswith('.localhost'): + return orig_getaddrinfo('localhost', port, family, type, proto, flags) + else: + return orig_getaddrinfo(host, port, family, type, proto, flags) + +def _gethostbyname(host): + if host.endswith('.localhost'): + return orig_gethostbyname('localhost') + else: + return 
orig_gethostbyname(host) + +def _socket_connect(self, address): + if address[0].endswith('.localhost'): + return orig_socket_connect(self, ('localhost', address[1])) + else: + return orig_socket_connect(self, address) + +socket.gethostbyname = _gethostbyname +socket.getaddrinfo = _getaddrinfo +socket.socket.connect = _socket_connect + def dump_state(signum=None, frame=None): pp = pprint.PrettyPrinter(indent=4) state_strs = [] @@ -373,6 +401,13 @@ def test_httpds_no_proxy(http_daemon, https_daemon): assert response.headers['warcprox-test-header'] == 'c!' assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + # ensure monkey-patched dns resolution is working + url = 'https://foo.bar.localhost:{}/c/d'.format(https_daemon.server_port) + response = requests.get(url, verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'c!' + assert response.content == b'I am the warcprox test payload! dddddddddd!\n' + def _poll_playback_until(playback_proxies, url, status, timeout_sec): start = time.time() # check playback (warc writing is asynchronous, give it up to 10 sec) @@ -840,15 +875,16 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} -def test_host_doc_soft_limit( +def test_domain_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { - "stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]}, - "soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10}, + "stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]}, + "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} - url = 
'http://localhost:{}/o/p'.format(http_daemon.server_port) + # (1) + url = 'http://foo.localhost:{}/o/p'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -861,8 +897,8 @@ def test_host_doc_soft_limit( time.sleep(0.5) time.sleep(0.5) - # make sure stats from different host don't count - url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port) + # make sure stats from different domain don't count + url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port) for i in range(10): response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) @@ -877,9 +913,19 @@ def test_host_doc_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) - # same host but different scheme and port -- host limit still applies - url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) - for i in range(8): + # (2) same host but different scheme and port: domain limit applies + # + url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! 
pppppppppp!\n' + + # (3-9) different subdomain: host limit applies + url = 'https://baz.foo.localhost:{}/o/p'.format(https_daemon.server_port) + for i in range(7): response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -894,6 +940,7 @@ def test_host_doc_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) + # (10) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -908,19 +955,19 @@ def test_host_doc_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) - # back to http, and this is the 11th request - url = 'http://localhost:{}/o/p'.format(http_daemon.server_port) + # (11) back to http, and this is the 11th request + url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit 
test_host_doc_limit_bucket:localhost/total/urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" - # make sure limit doesn't get applied to a different host - url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port) + # make sure limit doesn't get applied to a different domain + url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -929,39 +976,39 @@ def test_host_doc_soft_limit( assert response.content == b'I am the warcprox test payload! pppppppppp!\n' # https also blocked - url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) + url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit 
test_host_doc_limit_bucket:localhost/total/urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" # same host, different capitalization still blocked - url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port) + url = 'https://HEHEHE.fOO.lOcALhoST:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n" -def test_host_data_soft_limit( +def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { - "stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]}, + "stats": {"buckets": 
[{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]}, # response is 135 bytes, so 3rd novel url should be disallowed - "soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200}, + "soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} - url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) + url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -976,7 +1023,7 @@ def test_host_data_soft_limit( time.sleep(2.0) # duplicate, does not count toward limit - url = 'https://localhost:{}/y/z'.format(https_daemon.server_port) + url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -992,7 +1039,7 @@ def test_host_data_soft_limit( time.sleep(2.0) # novel, pushes stats over the limit - url = 'https://localhost:{}/z/~'.format(https_daemon.server_port) + url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1008,7 +1055,7 @@ def test_host_data_soft_limit( time.sleep(2.0) # make sure limit doesn't get applied to a different host - url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port) + url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -1016,27 +1063,27 @@ def test_host_data_soft_limit( assert response.content == b'I am the warcprox test payload! 
~~~~~~~~~~!\n' # blocked because we're over the limit now - url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) + url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" # https also blocked - url = 'https://localhost:{}/w/x'.format(https_daemon.server_port) + url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': 
{'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 1eeb9a4..89d8e4e 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -57,6 +57,72 @@ class RequestBlockedByRule(Exception): def __str__(self): return "%s: %s" % (self.__class__.__name__, self.msg) +class Url: + ''' + Utility class + ''' + def __init__(self, url): + self.url = url + self._surt = None + self._host = None + + @property + def surt(self): + if not self._surt: + import surt + hurl = surt.handyurl.parse(self.url) + surt.GoogleURLCanonicalizer.canonicalize(hurl) + hurl.query = None + hurl.hash = None + self._surt = hurl.getURLString(surt=True, trailing_comma=True) + return self._surt + + @property + def host(self): + if not self._host: + import surt + self._host = surt.handyurl.parse(self.url).host + return self._host + + def 
matches_ip_or_domain(self, ip_or_domain): + return host_matches_ip_or_domain(self.host, ip_or_domain) + +def normalize_host(host): + # normalize host (punycode and lowercase) + return host.encode('idna').decode('ascii').lower() + +def host_matches_ip_or_domain(host, ip_or_domain): + ''' + Returns true if + - ip_or_domain is an ip address and host is the same ip address + - ip_or_domain is a domain and host is the same domain + - ip_or_domain is a domain and host is a subdomain of it + ''' + _host = normalize_host(host) + _ip_or_domain = normalize_host(ip_or_domain) + + if _ip_or_domain == _host: + return True + + # if either _ip_or_domain or host are ip addresses, and they're not + # identical (previous check), not a match + try: + ipaddress.ip_address(_ip_or_domain) + return False + except: + pass + try: + ipaddress.ip_address(_host) + return False + except: + pass + + # if we get here, we're looking at two hostnames + domain_parts = _ip_or_domain.split(".") + host_parts = _host.split(".") + + return host_parts[-len(domain_parts):] == domain_parts + # logging level more fine-grained than logging.DEBUG==10 TRACE = 5 diff --git a/warcprox/stats.py b/warcprox/stats.py index 8d5b324..9fd892d 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -113,15 +113,15 @@ class StatsDb: definition can either be a string, which signifies the name of the bucket, or a dict. If a dict it is expected to have at least an item with key 'bucket' whose value is the name of the bucket. The other - currently recognized item is 'tally-host-stats', which if true, - instructs warcprox to additionally tally substats of the given bucket - by host. Host stats are stored in the stats table under the key - '{parent-bucket}:{host}'. + currently recognized item is 'tally-domains', which if supplied should + be a list of domains. This instructs warcprox to additionally tally + substats of the given bucket by domain. 
Domain stats are stored in the + stats table under the key '{parent-bucket}:{domain(normalized)}'. Example Warcprox-Meta header (a real one will likely have other sections besides 'stats'): - Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-host-stats':true}]}} + Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}} ''' buckets = ["__all__"] if (recorded_url.warcprox_meta @@ -135,14 +135,13 @@ class StatsDb: 'warcprox-meta header %s', bucket) continue buckets.append(bucket['bucket']) - # XXX maybe host has been computed elsewhere and can be - # cached somewhere, but maybe the performance gain would be - # negligible - if bucket.get('tally-host-stats'): - buckets.append('%s:%s' % ( - bucket['bucket'], - surt.handyurl.parse(recorded_url.url.decode( - 'utf-8')).host)) + if bucket.get('tally-domains'): + url = warcprox.Url(recorded_url.url.decode('utf-8')) + for domain in bucket['tally-domains']: + if url.matches_ip_or_domain(domain): + buckets.append('%s:%s' % ( + bucket['bucket'], + warcprox.normalize_host(domain))) else: buckets.append(bucket) else: diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 38e39ae..ab1a5b7 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -48,57 +48,6 @@ import resource import ipaddress import surt -class Url: - def __init__(self, url): - self.url = url - self._surt = None - self._host = None - - @property - def surt(self): - if not self._surt: - hurl = surt.handyurl.parse(self.url) - surt.GoogleURLCanonicalizer.canonicalize(hurl) - hurl.query = None - hurl.hash = None - self._surt = hurl.getURLString(surt=True, trailing_comma=True) - return self._surt - - @property - def host(self): - if not self._host: - self._host = surt.handyurl.parse(self.url).host - return self._host - - def matches_ip_or_domain(self, ip_or_domain): - """Returns true if - - ip_or_domain is an ip address and self.host is the same ip address - - 
ip_or_domain is a domain and self.host is the same domain - - ip_or_domain is a domain and self.host is a subdomain of it - """ - if ip_or_domain == self.host: - return True - - # if either ip_or_domain or self.host are ip addresses, and they're not - # identical (previous check), not a match - try: - ipaddress.ip_address(ip_or_domain) - return False - except: - pass - try: - ipaddress.ip_address(self.host) - return False - except: - pass - - # if we get here, we're looking at two hostnames - # XXX do we need to handle case of one punycoded idn, other not? - domain_parts = ip_or_domain.split(".") - host_parts = self.host.split(".") - - return host_parts[-len(domain_parts):] == domain_parts - class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' XXX add more information. @@ -118,7 +67,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but # there's no obvious common dependency where this code should go... TBD def _scope_rule_applies(self, rule): - u = Url(self.url) + u = warcprox.Url(self.url) if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]): return False @@ -179,10 +128,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) # if limit_key looks like 'job1:foo.com/total/urls' then we only want - # to apply this rule if the requested url is on host foo.com + # to apply this rule if the requested url is within domain bucket0_fields = bucket0.split(':') if len(bucket0_fields) == 2: - if self.hostname.lower() != bucket0_fields[1].lower(): + if not warcprox.host_matches_ip_or_domain( + self.hostname.lower(), bucket0_fields[1].lower()): return # else host matches, go ahead and enforce the limit value = self.server.stats_db.value(bucket0, bucket1, bucket2)