switching from host limits to domain limits, which apply in aggregate to the host and subdomains

Noah Levitt 2016-06-29 14:56:14 -05:00
parent 2c8b194090
commit c9e403585b
5 changed files with 164 additions and 102 deletions
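In short: where request metadata previously set 'tally-host-stats' to tally per-host substats, it now lists 'tally-domains', and a soft limit keyed on a domain is enforced for that domain together with all of its subdomains. A minimal sketch of a new-style Warcprox-Meta header, with bucket and limit names borrowed from the tests in this commit:

    Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket",
                                           "tally-domains": ["foo.localhost"]}]},
                    "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}}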

diff --git a/setup.py b/setup.py

@@ -51,7 +51,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.0.dev17',
+        version='2.0.dev18',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py

@@ -42,6 +42,7 @@ import pprint
 import traceback
 import signal
 from collections import Counter
+import socket
 
 try:
     import http.server as http_server
@@ -65,6 +66,33 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
 
+# monkey patch dns lookup so we can test domain inheritance on localhost
+orig_getaddrinfo = socket.getaddrinfo
+orig_gethostbyname = socket.gethostbyname
+orig_socket_connect = socket.socket.connect
+
+def _getaddrinfo(host, port, family=0, type=0, proto=0, flags=0):
+    if host.endswith('.localhost'):
+        return orig_getaddrinfo('localhost', port, family, type, proto, flags)
+    else:
+        return orig_getaddrinfo(host, port, family, type, proto, flags)
+
+def _gethostbyname(host):
+    if host.endswith('.localhost'):
+        return orig_gethostbyname('localhost')
+    else:
+        return orig_gethostbyname(host)
+
+def _socket_connect(self, address):
+    if address[0].endswith('.localhost'):
+        return orig_socket_connect(self, ('localhost', address[1]))
+    else:
+        return orig_socket_connect(self, address)
+
+socket.gethostbyname = _gethostbyname
+socket.getaddrinfo = _getaddrinfo
+socket.socket.connect = _socket_connect
+
 def dump_state(signum=None, frame=None):
     pp = pprint.PrettyPrinter(indent=4)
     state_strs = []
@@ -373,6 +401,13 @@ def test_httpds_no_proxy(http_daemon, https_daemon):
     assert response.headers['warcprox-test-header'] == 'c!'
     assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
 
+    # ensure monkey-patched dns resolution is working
+    url = 'https://foo.bar.localhost:{}/c/d'.format(https_daemon.server_port)
+    response = requests.get(url, verify=False)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'c!'
+    assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
+
 def _poll_playback_until(playback_proxies, url, status, timeout_sec):
     start = time.time()
     # check playback (warc writing is asynchronous, give it up to 10 sec)
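What the monkey patches buy us, sketched as a hypothetical interpreter session (the patches must already be installed; the exact IP returned depends on the local resolver):

    >>> import socket
    >>> socket.gethostbyname('foo.bar.localhost')  # any *.localhost resolves as 'localhost'
    '127.0.0.1'
    >>> socket.gethostbyname('localhost')          # non-matching names fall through unchanged
    '127.0.0.1'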
@@ -840,15 +875,16 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
     assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:")
     assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]}
 
-def test_host_doc_soft_limit(
+def test_domain_doc_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     request_meta = {
-        "stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]},
-        "soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10},
+        "stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]},
+        "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
     }
     headers = {"Warcprox-Meta": json.dumps(request_meta)}
 
-    url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
+    # (1)
+    url = 'http://foo.localhost:{}/o/p'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
@@ -861,8 +897,8 @@ def test_host_doc_soft_limit(
         time.sleep(0.5)
     time.sleep(0.5)
 
-    # make sure stats from different host don't count
-    url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port)
+    # make sure stats from different domain don't count
+    url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port)
     for i in range(10):
         response = requests.get(
                 url, proxies=archiving_proxies, headers=headers, stream=True)
@@ -877,9 +913,19 @@ def test_host_doc_soft_limit(
     # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
     time.sleep(2.0)
 
-    # same host but different scheme and port -- host limit still applies
-    url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
-    for i in range(8):
+    # (2) same host but different scheme and port: domain limit applies
+    #
+    url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port)
+    response = requests.get(
+            url, proxies=archiving_proxies, headers=headers, stream=True,
+            verify=False)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'o!'
+    assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
+
+    # (3-9) different subdomain: host limit applies
+    url = 'https://baz.foo.localhost:{}/o/p'.format(https_daemon.server_port)
+    for i in range(7):
         response = requests.get(
                 url, proxies=archiving_proxies, headers=headers, stream=True,
                 verify=False)
@@ -894,6 +940,7 @@ def test_host_doc_soft_limit(
     # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
     time.sleep(2.0)
 
+    # (10)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -908,19 +955,19 @@ def test_host_doc_soft_limit(
     # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
     time.sleep(2.0)
 
-    # back to http, and this is the 11th request
-    url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
+    # (11) back to http, and this is the 11th request
+    url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
 
-    # make sure limit doesn't get applied to a different host
-    url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port)
+    # make sure limit doesn't get applied to a different domain
+    url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -929,39 +976,39 @@ def test_host_doc_soft_limit(
     assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
 
     # https also blocked
-    url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
+    url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
    assert response.status_code == 430
    assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
 
     # same host, different capitalization still blocked
-    url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port)
+    url = 'https://HEHEHE.fOO.lOcALhoST:{}/o/p'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
 
-def test_host_data_soft_limit(
+def test_domain_data_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     request_meta = {
-        "stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]},
+        "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]},
         # response is 135 bytes, so 3rd novel url should be disallowed
-        "soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200},
+        "soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200},
     }
     headers = {"Warcprox-Meta": json.dumps(request_meta)}
 
-    url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
+    url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
@@ -976,7 +1023,7 @@ def test_host_data_soft_limit(
     time.sleep(2.0)
 
     # duplicate, does not count toward limit
-    url = 'https://localhost:{}/y/z'.format(https_daemon.server_port)
+    url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -992,7 +1039,7 @@ def test_host_data_soft_limit(
     time.sleep(2.0)
 
     # novel, pushes stats over the limit
-    url = 'https://localhost:{}/z/~'.format(https_daemon.server_port)
+    url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -1008,7 +1055,7 @@ def test_host_data_soft_limit(
     time.sleep(2.0)
 
     # make sure limit doesn't get applied to a different host
-    url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port)
+    url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
@@ -1016,27 +1063,27 @@ def test_host_data_soft_limit(
     assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
 
     # blocked because we're over the limit now
-    url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
+    url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
 
     # https also blocked
-    url = 'https://localhost:{}/w/x'.format(https_daemon.server_port)
+    url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
 
 # XXX this test relies on a tor proxy running at localhost:9050 with a working
 # connection to the internet, and relies on a third party site (facebook) being
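The arithmetic behind test_domain_data_soft_limit, spelled out (every number appears in the assertions above): each test response is 135 wire bytes, and the soft limit is 200 bytes of new, i.e. non-duplicate, content.

    135 * 1  # 1st novel url: new/wire_bytes = 135, still under the 200 limit
    135 * 2  # 2nd novel url: new/wire_bytes = 270, over the limit
    # the duplicate url tallies its 135 bytes under 'revisit', not 'new', which
    # is why stats at block time are new={270 bytes, 2 urls},
    # revisit={135 bytes, 1 url}, total={405 bytes, 3 urls}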

diff --git a/warcprox/__init__.py b/warcprox/__init__.py

@@ -57,6 +57,72 @@ class RequestBlockedByRule(Exception):
     def __str__(self):
         return "%s: %s" % (self.__class__.__name__, self.msg)
 
+class Url:
+    '''
+    Utility class
+    '''
+    def __init__(self, url):
+        self.url = url
+        self._surt = None
+        self._host = None
+
+    @property
+    def surt(self):
+        if not self._surt:
+            import surt
+            hurl = surt.handyurl.parse(self.url)
+            surt.GoogleURLCanonicalizer.canonicalize(hurl)
+            hurl.query = None
+            hurl.hash = None
+            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+        return self._surt
+
+    @property
+    def host(self):
+        if not self._host:
+            import surt
+            self._host = surt.handyurl.parse(self.url).host
+        return self._host
+
+    def matches_ip_or_domain(self, ip_or_domain):
+        return host_matches_ip_or_domain(self.host, ip_or_domain)
+
+def normalize_host(host):
+    # normalize host (punycode and lowercase)
+    return host.encode('idna').decode('ascii').lower()
+
+def host_matches_ip_or_domain(host, ip_or_domain):
+    '''
+    Returns true if
+        - ip_or_domain is an ip address and host is the same ip address
+        - ip_or_domain is a domain and host is the same domain
+        - ip_or_domain is a domain and host is a subdomain of it
+    '''
+    _host = normalize_host(host)
+    _ip_or_domain = normalize_host(ip_or_domain)
+
+    if _ip_or_domain == _host:
+        return True
+
+    # if either _ip_or_domain or host are ip addresses, and they're not
+    # identical (previous check), not a match
+    try:
+        ipaddress.ip_address(_ip_or_domain)
+        return False
+    except:
+        pass
+    try:
+        ipaddress.ip_address(_host)
+        return False
+    except:
+        pass
+
+    # if we get here, we're looking at two hostnames
+    domain_parts = _ip_or_domain.split(".")
+    host_parts = _host.split(".")
+
+    return host_parts[-len(domain_parts):] == domain_parts
+
 # logging level more fine-grained than logging.DEBUG==10
 TRACE = 5
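A few illustrative calls to the new helper; the results follow from its docstring and the label-wise suffix comparison above (hostnames invented for the example):

    warcprox.host_matches_ip_or_domain('foo.localhost', 'foo.localhost')      # True: same domain
    warcprox.host_matches_ip_or_domain('baz.foo.localhost', 'foo.localhost')  # True: subdomain
    warcprox.host_matches_ip_or_domain('bazfoo.localhost', 'foo.localhost')   # False: labels must match, not just the string suffix
    warcprox.host_matches_ip_or_domain('192.168.10.20', '192.168.10.20')      # True: identical ip address
    warcprox.host_matches_ip_or_domain('192.168.10.20', 'localhost')          # False: an ip never matches a domain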

diff --git a/warcprox/stats.py b/warcprox/stats.py

@@ -113,15 +113,15 @@ class StatsDb:
         definition can either be a string, which signifies the name of the
         bucket, or a dict. If a dict it is expected to have at least an item
         with key 'bucket' whose value is the name of the bucket. The other
-        currently recognized item is 'tally-host-stats', which if true,
-        instructs warcprox to additionally tally substats of the given bucket
-        by host. Host stats are stored in the stats table under the key
-        '{parent-bucket}:{host}'.
+        currently recognized item is 'tally-domains', which if supplied should
+        be a list of domains. This instructs warcprox to additionally tally
+        substats of the given bucket by domain. Domain stats are stored in the
+        stats table under the key '{parent-bucket}:{domain(normalized)}'.
 
         Example Warcprox-Meta header (a real one will likely have other
         sections besides 'stats'):
 
-        Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-host-stats':true}]}}
+        Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
         '''
         buckets = ["__all__"]
         if (recorded_url.warcprox_meta
buckets = ["__all__"] buckets = ["__all__"]
if (recorded_url.warcprox_meta if (recorded_url.warcprox_meta
@@ -135,14 +133,13 @@ class StatsDb:
                                 'warcprox-meta header %s', bucket)
                         continue
                     buckets.append(bucket['bucket'])
-                    # XXX maybe host has been computed elsewhere and can be
-                    # cached somewhere, but maybe the performance gain would be
-                    # negligible
-                    if bucket.get('tally-host-stats'):
-                        buckets.append('%s:%s' % (
-                            bucket['bucket'],
-                            surt.handyurl.parse(recorded_url.url.decode(
-                                'utf-8')).host))
+                    if bucket.get('tally-domains'):
+                        url = warcprox.Url(recorded_url.url.decode('utf-8'))
+                        for domain in bucket['tally-domains']:
+                            if url.matches_ip_or_domain(domain):
+                                buckets.append('%s:%s' % (
+                                    bucket['bucket'],
+                                    warcprox.normalize_host(domain)))
                 else:
                     buckets.append(bucket)
         else:
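To make the new tallying concrete, a hypothetical walk-through using the docstring's example header: for a capture of http://news.foo.bar.com/ with buckets ['bucket1', {'bucket': 'bucket2', 'tally-domains': ['foo.bar.com', '192.168.10.20']}], the loop above yields:

    buckets == ['__all__', 'bucket1', 'bucket2', 'bucket2:foo.bar.com']
    # news.foo.bar.com is a subdomain of foo.bar.com, so that substat is
    # tallied; '192.168.10.20' doesn't match the host, so no substat for it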

diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py

@@ -48,57 +48,6 @@ import resource
 import ipaddress
 import surt
 
-class Url:
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        """Returns true if
-            - ip_or_domain is an ip address and self.host is the same ip address
-            - ip_or_domain is a domain and self.host is the same domain
-            - ip_or_domain is a domain and self.host is a subdomain of it
-        """
-        if ip_or_domain == self.host:
-            return True
-
-        # if either ip_or_domain or self.host are ip addresses, and they're not
-        # identical (previous check), not a match
-        try:
-            ipaddress.ip_address(ip_or_domain)
-            return False
-        except:
-            pass
-        try:
-            ipaddress.ip_address(self.host)
-            return False
-        except:
-            pass
-
-        # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
-
-        return host_parts[-len(domain_parts):] == domain_parts
-
 class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     '''
     XXX add more information.
@@ -118,7 +67,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
     # there's no obvious common dependency where this code should go... TBD
     def _scope_rule_applies(self, rule):
-        u = Url(self.url)
+        u = warcprox.Url(self.url)
 
         if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
             return False
@@ -179,10 +128,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
 
         # if limit_key looks like 'job1:foo.com/total/urls' then we only want
-        # to apply this rule if the requested url is on host foo.com
+        # to apply this rule if the requested url is within domain foo.com
         bucket0_fields = bucket0.split(':')
         if len(bucket0_fields) == 2:
-            if self.hostname.lower() != bucket0_fields[1].lower():
+            if not warcprox.host_matches_ip_or_domain(
+                    self.hostname.lower(), bucket0_fields[1].lower()):
                 return  # else host matches, go ahead and enforce the limit
 
         value = self.server.stats_db.value(bucket0, bucket1, bucket2)
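Finally, how a scoped limit key is picked apart by the code above, worked through on the value from its own comment:

    limit_key = 'job1:foo.com/total/urls'
    bucket0, bucket1, bucket2 = limit_key.rsplit('/', 2)
    # bucket0 == 'job1:foo.com', bucket1 == 'total', bucket2 == 'urls'
    bucket0_fields = bucket0.split(':')  # ['job1', 'foo.com']: two fields, so the limit is scoped
    # before this commit the limit applied only to the exact host foo.com;
    # now host_matches_ip_or_domain() also matches subdomains like www.foo.com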