mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Add IDN support, at least for domain limits (the tests currently segfault on macOS, however; let's see what happens on Travis CI)
This commit is contained in:
parent
c9e403585b
commit
a59871e17b
2
setup.py
2
setup.py
@ -51,7 +51,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.0.dev18',
|
version='2.0.dev19',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# vim: set fileencoding=utf-8:
|
||||||
'''
|
'''
|
||||||
tests/test_warcprox.py - automated tests for warcprox
|
tests/test_warcprox.py - automated tests for warcprox
|
||||||
|
|
||||||
@ -1001,14 +1002,15 @@ def test_domain_doc_soft_limit(
|
|||||||
|
|
||||||
def test_domain_data_soft_limit(
|
def test_domain_data_soft_limit(
|
||||||
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||||
|
# using idn
|
||||||
request_meta = {
|
request_meta = {
|
||||||
"stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]},
|
"stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['🎵zZ.LOCALhost']}]},
|
||||||
# response is 135 bytes, so 3rd novel url should be disallowed
|
# response is 135 bytes, so 3rd novel url should be disallowed
|
||||||
"soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200},
|
"soft-limits": {"test_domain_data_limit_bucket:🎵ZZ.localhost/new/wire_bytes":200},
|
||||||
}
|
}
|
||||||
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
||||||
|
|
||||||
url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port)
|
url = 'http://🎵Zz.localhost:{}/y/z'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
@ -1023,7 +1025,7 @@ def test_domain_data_soft_limit(
|
|||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
# duplicate, does not count toward limit
|
# duplicate, does not count toward limit
|
||||||
url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port)
|
url = 'https://baz.🎵zz.localhost:{}/y/z'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
@ -1039,7 +1041,7 @@ def test_domain_data_soft_limit(
|
|||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
# novel, pushes stats over the limit
|
# novel, pushes stats over the limit
|
||||||
url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port)
|
url = 'https://muh.XN--Zz-B862a.locALHOst:{}/z/~'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
@ -1063,27 +1065,27 @@ def test_domain_data_soft_limit(
|
|||||||
assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
|
assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
|
||||||
|
|
||||||
# blocked because we're over the limit now
|
# blocked because we're over the limit now
|
||||||
url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port)
|
url = 'http://lOl.wHut.🎵ZZ.lOcALHOst:{}/y/z'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
assert response.status_code == 430
|
assert response.status_code == 430
|
||||||
assert response.reason == "Reached soft limit"
|
assert response.reason == "Reached soft limit"
|
||||||
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
|
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}}
|
||||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n"
|
||||||
|
|
||||||
# https also blocked
|
# https also blocked
|
||||||
url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port)
|
url = 'https://xn--zz-b862ah.loCAlhost:{}/w/x'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
assert response.status_code == 430
|
assert response.status_code == 430
|
||||||
assert response.reason == "Reached soft limit"
|
assert response.reason == "Reached soft limit"
|
||||||
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
|
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:xn--zz-b862a.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:xn--zz-b862a.localhost'}}}
|
||||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:xn--zz-b862a.localhost/new/wire_bytes=200\n"
|
||||||
|
|
||||||
# XXX this test relies on a tor proxy running at localhost:9050 with a working
|
# XXX this test relies on a tor proxy running at localhost:9050 with a working
|
||||||
# connection to the internet, and relies on a third party site (facebook) being
|
# connection to the internet, and relies on a third party site (facebook) being
|
||||||
|
@ -121,7 +121,9 @@ def host_matches_ip_or_domain(host, ip_or_domain):
|
|||||||
domain_parts = _ip_or_domain.split(".")
|
domain_parts = _ip_or_domain.split(".")
|
||||||
host_parts = _host.split(".")
|
host_parts = _host.split(".")
|
||||||
|
|
||||||
return host_parts[-len(domain_parts):] == domain_parts
|
result = host_parts[-len(domain_parts):] == domain_parts
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
# logging level more fine-grained than logging.DEBUG==10
|
# logging level more fine-grained than logging.DEBUG==10
|
||||||
TRACE = 5
|
TRACE = 5
|
||||||
|
@ -184,24 +184,21 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
def _determine_host_port(self):
|
def _determine_host_port(self):
|
||||||
# Get hostname and port to connect to
|
# Get hostname and port to connect to
|
||||||
if self.is_connect:
|
if self.is_connect:
|
||||||
self.hostname, self.port = self.path.split(':')
|
host, self.port = self.path.split(':')
|
||||||
else:
|
else:
|
||||||
self.url = self.path
|
self.url = self.path
|
||||||
u = urllib_parse.urlparse(self.url)
|
u = urllib_parse.urlparse(self.url)
|
||||||
if u.scheme != 'http':
|
if u.scheme != 'http':
|
||||||
raise Exception('unable to parse request "{}" as a proxy request'.format(self.requestline))
|
raise Exception(
|
||||||
self.hostname = u.hostname
|
'unable to parse request %s as a proxy request' % (
|
||||||
|
repr(self.requestline)))
|
||||||
|
host = u.hostname
|
||||||
self.port = u.port or 80
|
self.port = u.port or 80
|
||||||
self.path = urllib_parse.urlunparse(
|
self.path = urllib_parse.urlunparse(
|
||||||
urllib_parse.ParseResult(
|
urllib_parse.ParseResult(
|
||||||
scheme='',
|
scheme='', netloc='', params=u.params, path=u.path or '/',
|
||||||
netloc='',
|
query=u.query, fragment=u.fragment))
|
||||||
params=u.params,
|
self.hostname = warcprox.normalize_host(host)
|
||||||
path=u.path or '/',
|
|
||||||
query=u.query,
|
|
||||||
fragment=u.fragment
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def _connect_to_remote_server(self):
|
def _connect_to_remote_server(self):
|
||||||
# Connect to destination
|
# Connect to destination
|
||||||
|
@ -126,19 +126,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
|
|
||||||
def _enforce_limit(self, limit_key, limit_value, soft=False):
|
def _enforce_limit(self, limit_key, limit_value, soft=False):
|
||||||
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
||||||
|
_limit_key = limit_key
|
||||||
|
|
||||||
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
|
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
|
||||||
# to apply this rule if the requested url is within domain
|
# to apply this rule if the requested url is within domain
|
||||||
bucket0_fields = bucket0.split(':')
|
bucket0_fields = bucket0.split(':')
|
||||||
if len(bucket0_fields) == 2:
|
if len(bucket0_fields) == 2:
|
||||||
|
self.logger.info(
|
||||||
|
'checking %s:%s', repr(limit_key), repr(limit_value))
|
||||||
if not warcprox.host_matches_ip_or_domain(
|
if not warcprox.host_matches_ip_or_domain(
|
||||||
self.hostname.lower(), bucket0_fields[1].lower()):
|
self.hostname, bucket0_fields[1]):
|
||||||
return # else host matches, go ahead and enforce the limit
|
return # else host matches, go ahead and enforce the limit
|
||||||
|
bucket0 = '%s:%s' % (
|
||||||
|
bucket0_fields[0],
|
||||||
|
warcprox.normalize_host(bucket0_fields[1]))
|
||||||
|
_limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
|
||||||
|
|
||||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||||
if value and value >= limit_value:
|
if value and value >= limit_value:
|
||||||
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
|
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
|
||||||
"soft limit" if soft else "limit", limit_key,
|
"soft limit" if soft else "limit", _limit_key,
|
||||||
limit_value)).encode("utf-8")
|
limit_value)).encode("utf-8")
|
||||||
if soft:
|
if soft:
|
||||||
self.send_response(430, "Reached soft limit")
|
self.send_response(430, "Reached soft limit")
|
||||||
@ -151,9 +158,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
"stats": {bucket0:self.server.stats_db.value(bucket0)}
|
"stats": {bucket0:self.server.stats_db.value(bucket0)}
|
||||||
}
|
}
|
||||||
if soft:
|
if soft:
|
||||||
response_meta["reached-soft-limit"] = {limit_key:limit_value}
|
response_meta["reached-soft-limit"] = {_limit_key:limit_value}
|
||||||
else:
|
else:
|
||||||
response_meta["reached-limit"] = {limit_key:limit_value}
|
response_meta["reached-limit"] = {_limit_key:limit_value}
|
||||||
self.send_header(
|
self.send_header(
|
||||||
"Warcprox-Meta",
|
"Warcprox-Meta",
|
||||||
json.dumps(response_meta, separators=(",",":")))
|
json.dumps(response_meta, separators=(",",":")))
|
||||||
@ -166,7 +173,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
self.client_address[0], 430 if soft else 420,
|
self.client_address[0], 430 if soft else 420,
|
||||||
self.command, self.url,
|
self.command, self.url,
|
||||||
"soft limit" if soft else "limit",
|
"soft limit" if soft else "limit",
|
||||||
limit_key, limit_value))
|
_limit_key, limit_value))
|
||||||
|
|
||||||
def _enforce_limits(self, warcprox_meta):
|
def _enforce_limits(self, warcprox_meta):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user