From a59871e17b71f043d9739cebe43e9c968fe4a557 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 29 Jun 2016 15:54:40 -0500 Subject: [PATCH] idn support, at least for domain limits (getting a segfault in tests on mac however, let's see what happens on travis-ci) --- setup.py | 2 +- tests/test_warcprox.py | 24 +++++++++++++----------- warcprox/__init__.py | 4 +++- warcprox/mitmproxy.py | 19 ++++++++----------- warcprox/warcproxy.py | 17 ++++++++++++----- 5 files changed, 37 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index d3428f6..215caf4 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev18', + version='2.0.dev19', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 76901d1..8e021e3 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# vim: set fileencoding=utf-8: ''' tests/test_warcprox.py - automated tests for warcprox @@ -1001,14 +1002,15 @@ def test_domain_doc_soft_limit( def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): + # using idn request_meta = { - "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]}, + "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['🎵zZ.LOCALhost']}]}, # response is 135 bytes, so 3rd novel url should be disallowed - "soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200}, + "soft-limits": {"test_domain_data_limit_bucket:🎵ZZ.localhost/new/wire_bytes":200}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} - url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port) + url = 'http://🎵Zz.localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 @@ -1023,7 +1025,7 @@ def test_domain_data_soft_limit( time.sleep(2.0) # duplicate, does not count toward limit - url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port) + url = 'https://baz.🎵zz.localhost:{}/y/z'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1039,7 +1041,7 @@ def test_domain_data_soft_limit( time.sleep(2.0) # novel, pushes stats over the limit - url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port) + url = 'https://muh.XN--Zz-B862a.locALHOst:{}/z/~'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) @@ -1063,27 +1065,27 @@ def test_domain_data_soft_limit( assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n' # blocked because we're over the limit now - url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port) + url = 'http://lOl.wHut.🎵ZZ.lOcALHOst:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" # https also blocked - url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port) + url = 'https://xn--zz-b862ah.loCAlhost:{}/w/x'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) assert response.status_code == 430 assert response.reason == "Reached soft limit" - expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}} + expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 89d8e4e..45b38b2 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -121,7 +121,9 @@ def host_matches_ip_or_domain(host, ip_or_domain): domain_parts = _ip_or_domain.split(".") host_parts = _host.split(".") - return host_parts[-len(domain_parts):] == domain_parts + result = host_parts[-len(domain_parts):] == domain_parts + return result + # logging level more fine-grained than logging.DEBUG==10 TRACE = 5 diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c6c75b9..85960ec 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -184,24 +184,21 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): def _determine_host_port(self): # Get hostname and port to connect to if self.is_connect: - self.hostname, self.port = self.path.split(':') + host, self.port = self.path.split(':') else: self.url = self.path u = urllib_parse.urlparse(self.url) if u.scheme != 'http': - raise Exception('unable to parse request "{}" as a proxy request'.format(self.requestline)) - self.hostname = u.hostname + raise Exception( + 'unable to parse request %s as a proxy request' % ( + repr(self.requestline))) + host = u.hostname self.port = u.port or 80 self.path = urllib_parse.urlunparse( urllib_parse.ParseResult( - scheme='', - netloc='', - params=u.params, - path=u.path or '/', - query=u.query, - fragment=u.fragment - ) - ) + scheme='', netloc='', params=u.params, path=u.path or '/', + query=u.query, fragment=u.fragment)) + self.hostname = warcprox.normalize_host(host) def _connect_to_remote_server(self): # Connect to destination diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index ab1a5b7..0dc736e 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -126,19 +126,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): def _enforce_limit(self, limit_key, limit_value, soft=False): bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + _limit_key = limit_key # if limit_key looks like 'job1:foo.com/total/urls' then we only want # to apply this rule if the requested url is within domain bucket0_fields = bucket0.split(':') if len(bucket0_fields) == 2: + self.logger.info( + 'checking %s:%s', repr(limit_key), repr(limit_value)) if not warcprox.host_matches_ip_or_domain( - self.hostname.lower(), bucket0_fields[1].lower()): + self.hostname, bucket0_fields[1]): return # else host matches, go ahead and enforce the limit + bucket0 = '%s:%s' % ( + bucket0_fields[0], + warcprox.normalize_host(bucket0_fields[1])) + _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % ( - "soft limit" if soft else "limit", limit_key, + "soft limit" if soft else "limit", _limit_key, limit_value)).encode("utf-8") if soft: self.send_response(430, "Reached soft limit") @@ -151,9 +158,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): "stats": {bucket0:self.server.stats_db.value(bucket0)} } if soft: - response_meta["reached-soft-limit"] = {limit_key:limit_value} + response_meta["reached-soft-limit"] = {_limit_key:limit_value} else: - response_meta["reached-limit"] = {limit_key:limit_value} + response_meta["reached-limit"] = {_limit_key:limit_value} self.send_header( "Warcprox-Meta", json.dumps(response_meta, separators=(",",":"))) @@ -166,7 +173,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], 430 if soft else 420, self.command, self.url, "soft limit" if soft else "limit", - limit_key, limit_value)) + _limit_key, limit_value)) def _enforce_limits(self, warcprox_meta): """