switching from host limits to domain limits, which apply in aggregate to the host and subdomains

This commit is contained in:
Noah Levitt 2016-06-29 14:56:14 -05:00
parent 2c8b194090
commit c9e403585b
5 changed files with 164 additions and 102 deletions

View File

@ -51,7 +51,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.0.dev17',
version='2.0.dev18',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -42,6 +42,7 @@ import pprint
import traceback
import signal
from collections import Counter
import socket
try:
import http.server as http_server
@ -65,6 +66,33 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
# monkey patch dns lookup so we can test domain inheritance on localhost
orig_getaddrinfo = socket.getaddrinfo
orig_gethostbyname = socket.gethostbyname
orig_socket_connect = socket.socket.connect

def _fake_host(host):
    # treat every *.localhost name as an alias for plain localhost
    return 'localhost' if host.endswith('.localhost') else host

def _getaddrinfo(host, port, family=0, type=0, proto=0, flags=0):
    return orig_getaddrinfo(_fake_host(host), port, family, type, proto, flags)

def _gethostbyname(host):
    return orig_gethostbyname(_fake_host(host))

def _socket_connect(self, address):
    if address[0].endswith('.localhost'):
        address = ('localhost', address[1])
    return orig_socket_connect(self, address)

socket.gethostbyname = _gethostbyname
socket.getaddrinfo = _getaddrinfo
socket.socket.connect = _socket_connect
def dump_state(signum=None, frame=None):
pp = pprint.PrettyPrinter(indent=4)
state_strs = []
@ -373,6 +401,13 @@ def test_httpds_no_proxy(http_daemon, https_daemon):
assert response.headers['warcprox-test-header'] == 'c!'
assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
# ensure monkey-patched dns resolution is working
url = 'https://foo.bar.localhost:{}/c/d'.format(https_daemon.server_port)
response = requests.get(url, verify=False)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'c!'
assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
def _poll_playback_until(playback_proxies, url, status, timeout_sec):
start = time.time()
# check playback (warc writing is asynchronous, give it up to 10 sec)
@ -840,15 +875,16 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:")
assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]}
def test_host_doc_soft_limit(
def test_domain_doc_soft_limit(
http_daemon, https_daemon, warcprox_, archiving_proxies):
request_meta = {
"stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]},
"soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10},
"stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]},
"soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
}
headers = {"Warcprox-Meta": json.dumps(request_meta)}
url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
# (1)
url = 'http://foo.localhost:{}/o/p'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
@ -861,8 +897,8 @@ def test_host_doc_soft_limit(
time.sleep(0.5)
time.sleep(0.5)
# make sure stats from different host don't count
url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port)
# make sure stats from different domain don't count
url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port)
for i in range(10):
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
@ -877,9 +913,19 @@ def test_host_doc_soft_limit(
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0)
# same host but different scheme and port -- host limit still applies
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
for i in range(8):
# (2) same host but different scheme and port: domain limit applies
#
url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'o!'
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
# (3-9) different subdomain: domain limit applies
url = 'https://baz.foo.localhost:{}/o/p'.format(https_daemon.server_port)
for i in range(7):
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
@ -894,6 +940,7 @@ def test_host_doc_soft_limit(
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0)
# (10)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
@ -908,19 +955,19 @@ def test_host_doc_soft_limit(
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0)
# back to http, and this is the 11th request
url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
# (11) back to http, and this is the 11th request
url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 430
assert response.reason == "Reached soft limit"
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
# make sure limit doesn't get applied to a different host
url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port)
# make sure limit doesn't get applied to a different domain
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
@ -929,39 +976,39 @@ def test_host_doc_soft_limit(
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
# https also blocked
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
assert response.status_code == 430
assert response.reason == "Reached soft limit"
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
# same domain, different capitalization still blocked
url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port)
url = 'https://HEHEHE.fOO.lOcALhoST:{}/o/p'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
assert response.status_code == 430
assert response.reason == "Reached soft limit"
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
def test_host_data_soft_limit(
def test_domain_data_soft_limit(
http_daemon, https_daemon, warcprox_, archiving_proxies):
request_meta = {
"stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]},
"stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]},
# response is 135 bytes, so 3rd novel url should be disallowed
"soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200},
"soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200},
}
headers = {"Warcprox-Meta": json.dumps(request_meta)}
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
@ -976,7 +1023,7 @@ def test_host_data_soft_limit(
time.sleep(2.0)
# duplicate, does not count toward limit
url = 'https://localhost:{}/y/z'.format(https_daemon.server_port)
url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
@ -992,7 +1039,7 @@ def test_host_data_soft_limit(
time.sleep(2.0)
# novel, pushes stats over the limit
url = 'https://localhost:{}/z/~'.format(https_daemon.server_port)
url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
@ -1008,7 +1055,7 @@ def test_host_data_soft_limit(
time.sleep(2.0)
# make sure limit doesn't get applied to a different host
url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port)
url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
@ -1016,27 +1063,27 @@ def test_host_data_soft_limit(
assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
# blocked because we're over the limit now
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 430
assert response.reason == "Reached soft limit"
expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
# https also blocked
url = 'https://localhost:{}/w/x'.format(https_daemon.server_port)
url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
assert response.status_code == 430
assert response.reason == "Reached soft limit"
expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
# XXX this test relies on a tor proxy running at localhost:9050 with a working
# connection to the internet, and relies on a third party site (facebook) being

View File

@ -57,6 +57,72 @@ class RequestBlockedByRule(Exception):
def __str__(self):
return "%s: %s" % (self.__class__.__name__, self.msg)
class Url:
    '''
    Lazy wrapper around a url string.

    Exposes the url's canonicalized surt form and its host, each computed
    on first access and cached for subsequent accesses.
    '''
    def __init__(self, url):
        self.url = url
        self._surt = None
        self._host = None

    @property
    def surt(self):
        if self._surt is None:
            import surt
            handy = surt.handyurl.parse(self.url)
            surt.GoogleURLCanonicalizer.canonicalize(handy)
            handy.query = None
            handy.hash = None
            self._surt = handy.getURLString(surt=True, trailing_comma=True)
        return self._surt

    @property
    def host(self):
        if self._host is None:
            import surt
            self._host = surt.handyurl.parse(self.url).host
        return self._host

    def matches_ip_or_domain(self, ip_or_domain):
        '''True if self.host is ip_or_domain, or is a subdomain of it.'''
        return host_matches_ip_or_domain(self.host, ip_or_domain)
def normalize_host(host):
    '''
    Normalize a hostname for comparison: punycode-encode (idna codec,
    which also lowercases via nameprep) and lowercase. Ip address strings
    pass through the idna codec unchanged.
    '''
    return host.encode('idna').decode('ascii').lower()

def host_matches_ip_or_domain(host, ip_or_domain):
    '''
    Returns true if
    - ip_or_domain is an ip address and host is the same ip address
    - ip_or_domain is a domain and host is the same domain
    - ip_or_domain is a domain and host is a subdomain of it

    Both arguments are normalized (punycode, lowercase) before comparison.
    '''
    _host = normalize_host(host)
    _ip_or_domain = normalize_host(ip_or_domain)

    if _ip_or_domain == _host:
        return True

    # if either _ip_or_domain or _host is an ip address, and they're not
    # identical (previous check), not a match; ip_address() raises
    # ValueError for anything that isn't a valid ip address, so catch
    # only that (a bare except would swallow KeyboardInterrupt etc.)
    try:
        ipaddress.ip_address(_ip_or_domain)
        return False
    except ValueError:
        pass

    try:
        ipaddress.ip_address(_host)
        return False
    except ValueError:
        pass

    # if we get here, we're looking at two hostnames; match if host's
    # trailing dot-separated labels equal the domain's labels
    domain_parts = _ip_or_domain.split(".")
    host_parts = _host.split(".")
    return host_parts[-len(domain_parts):] == domain_parts
# logging level more fine-grained than logging.DEBUG==10
TRACE = 5

View File

@ -113,15 +113,15 @@ class StatsDb:
definition can either be a string, which signifies the name of the
bucket, or a dict. If a dict it is expected to have at least an item
with key 'bucket' whose value is the name of the bucket. The other
currently recognized item is 'tally-host-stats', which if true,
instructs warcprox to additionally tally substats of the given bucket
by host. Host stats are stored in the stats table under the key
'{parent-bucket}:{host}'.
currently recognized item is 'tally-domains', which if supplied should
be a list of domains. This instructs warcprox to additionally tally
substats of the given bucket by domain. Domain stats are stored in the
stats table under the key '{parent-bucket}:{domain(normalized)}'.
Example Warcprox-Meta header (a real one will likely have other
sections besides 'stats'):
Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-host-stats':true}]}}
Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
'''
buckets = ["__all__"]
if (recorded_url.warcprox_meta
@ -135,14 +135,13 @@ class StatsDb:
'warcprox-meta header %s', bucket)
continue
buckets.append(bucket['bucket'])
# XXX maybe host has been computed elsewhere and can be
# cached somewhere, but maybe the performance gain would be
# negligible
if bucket.get('tally-host-stats'):
buckets.append('%s:%s' % (
bucket['bucket'],
surt.handyurl.parse(recorded_url.url.decode(
'utf-8')).host))
if bucket.get('tally-domains'):
url = warcprox.Url(recorded_url.url.decode('utf-8'))
for domain in bucket['tally-domains']:
if url.matches_ip_or_domain(domain):
buckets.append('%s:%s' % (
bucket['bucket'],
warcprox.normalize_host(domain)))
else:
buckets.append(bucket)
else:

View File

@ -48,57 +48,6 @@ import resource
import ipaddress
import surt
class Url:
def __init__(self, url):
self.url = url
self._surt = None
self._host = None
@property
def surt(self):
if not self._surt:
hurl = surt.handyurl.parse(self.url)
surt.GoogleURLCanonicalizer.canonicalize(hurl)
hurl.query = None
hurl.hash = None
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
return self._surt
@property
def host(self):
if not self._host:
self._host = surt.handyurl.parse(self.url).host
return self._host
def matches_ip_or_domain(self, ip_or_domain):
"""Returns true if
- ip_or_domain is an ip address and self.host is the same ip address
- ip_or_domain is a domain and self.host is the same domain
- ip_or_domain is a domain and self.host is a subdomain of it
"""
if ip_or_domain == self.host:
return True
# if either ip_or_domain or self.host are ip addresses, and they're not
# identical (previous check), not a match
try:
ipaddress.ip_address(ip_or_domain)
return False
except:
pass
try:
ipaddress.ip_address(self.host)
return False
except:
pass
# if we get here, we're looking at two hostnames
# XXX do we need to handle case of one punycoded idn, other not?
domain_parts = ip_or_domain.split(".")
host_parts = self.host.split(".")
return host_parts[-len(domain_parts):] == domain_parts
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
'''
XXX add more information.
@ -118,7 +67,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
# XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
# there's no obvious common dependency where this code should go... TBD
def _scope_rule_applies(self, rule):
u = Url(self.url)
u = warcprox.Url(self.url)
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
return False
@ -179,10 +128,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
# to apply this rule if the requested url is on host foo.com
# to apply this rule if the requested url is within domain
bucket0_fields = bucket0.split(':')
if len(bucket0_fields) == 2:
if self.hostname.lower() != bucket0_fields[1].lower():
if not warcprox.host_matches_ip_or_domain(
self.hostname.lower(), bucket0_fields[1].lower()):
return # else host matches, go ahead and enforce the limit
value = self.server.stats_db.value(bucket0, bucket1, bucket2)