Mirror of https://github.com/internetarchive/warcprox.git (synced 2025-01-18 13:22:09 +01:00)
switching from host limits to domain limits, which apply in aggregate to the host and subdomains
This commit is contained in:
parent 2c8b194090
commit c9e403585b
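
In terms of the Warcprox-Meta request header, this change replaces the per-bucket 'tally-host-stats' flag with a 'tally-domains' list, and lets a soft-limit key name a domain that also covers its subdomains. A minimal sketch of the new usage, reusing the bucket name, domain, and limit that the updated tests below exercise (the proxy address, ports, and target URL are only illustrative):

    # Hypothetical client-side usage; warcprox itself only looks at the header.
    import json
    import requests

    request_meta = {
        # stats for foo.localhost and any of its subdomains accrue to the
        # bucket key 'test_domain_doc_limit_bucket:foo.localhost'
        "stats": {"buckets": [
            {"bucket": "test_domain_doc_limit_bucket",
             "tally-domains": ["foo.localhost"]}]},
        # ...and the soft limit references that same per-domain key
        "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10},
    }
    headers = {"Warcprox-Meta": json.dumps(request_meta)}

    # assumes a warcprox instance is listening on localhost:8000
    proxies = {"http": "http://localhost:8000", "https": "http://localhost:8000"}
    response = requests.get(
            "http://baz.foo.localhost:8080/", headers=headers, proxies=proxies)
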
setup.py
@@ -51,7 +51,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.0.dev17',
+        version='2.0.dev18',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
tests/test_warcprox.py
@@ -42,6 +42,7 @@ import pprint
 import traceback
 import signal
 from collections import Counter
+import socket
 
 try:
     import http.server as http_server
@@ -65,6 +66,33 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
 
+# monkey patch dns lookup so we can test domain inheritance on localhost
+orig_getaddrinfo = socket.getaddrinfo
+orig_gethostbyname = socket.gethostbyname
+orig_socket_connect = socket.socket.connect
+
+def _getaddrinfo(host, port, family=0, type=0, proto=0, flags=0):
+    if host.endswith('.localhost'):
+        return orig_getaddrinfo('localhost', port, family, type, proto, flags)
+    else:
+        return orig_getaddrinfo(host, port, family, type, proto, flags)
+
+def _gethostbyname(host):
+    if host.endswith('.localhost'):
+        return orig_gethostbyname('localhost')
+    else:
+        return orig_gethostbyname(host)
+
+def _socket_connect(self, address):
+    if address[0].endswith('.localhost'):
+        return orig_socket_connect(self, ('localhost', address[1]))
+    else:
+        return orig_socket_connect(self, address)
+
+socket.gethostbyname = _gethostbyname
+socket.getaddrinfo = _getaddrinfo
+socket.socket.connect = _socket_connect
+
 def dump_state(signum=None, frame=None):
     pp = pprint.PrettyPrinter(indent=4)
     state_strs = []
@@ -373,6 +401,13 @@ def test_httpds_no_proxy(http_daemon, https_daemon):
     assert response.headers['warcprox-test-header'] == 'c!'
     assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
 
+    # ensure monkey-patched dns resolution is working
+    url = 'https://foo.bar.localhost:{}/c/d'.format(https_daemon.server_port)
+    response = requests.get(url, verify=False)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'c!'
+    assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
+
 def _poll_playback_until(playback_proxies, url, status, timeout_sec):
     start = time.time()
     # check playback (warc writing is asynchronous, give it up to 10 sec)
@@ -840,15 +875,16 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
     assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:")
     assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]}
 
-def test_host_doc_soft_limit(
+def test_domain_doc_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     request_meta = {
-        "stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]},
-        "soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10},
+        "stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]},
+        "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
     }
     headers = {"Warcprox-Meta": json.dumps(request_meta)}
 
-    url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
+    # (1)
+    url = 'http://foo.localhost:{}/o/p'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
@@ -861,8 +897,8 @@ def test_host_doc_soft_limit(
         time.sleep(0.5)
     time.sleep(0.5)
 
-    # make sure stats from different host don't count
-    url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port)
+    # make sure stats from different domain don't count
+    url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port)
     for i in range(10):
         response = requests.get(
                 url, proxies=archiving_proxies, headers=headers, stream=True)
@@ -877,9 +913,19 @@ def test_host_doc_soft_limit(
     # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
     time.sleep(2.0)
 
-    # same host but different scheme and port -- host limit still applies
-    url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
-    for i in range(8):
+    # (2) same host but different scheme and port: domain limit applies
+    #
+    url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port)
+    response = requests.get(
+            url, proxies=archiving_proxies, headers=headers, stream=True,
+            verify=False)
+    assert response.status_code == 200
+    assert response.headers['warcprox-test-header'] == 'o!'
+    assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
+
+    # (3-9) different subdomain: domain limit applies
+    url = 'https://baz.foo.localhost:{}/o/p'.format(https_daemon.server_port)
+    for i in range(7):
         response = requests.get(
                 url, proxies=archiving_proxies, headers=headers, stream=True,
                 verify=False)
@@ -894,6 +940,7 @@ def test_host_doc_soft_limit(
     # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
     time.sleep(2.0)
 
+    # (10)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -908,19 +955,19 @@ def test_host_doc_soft_limit(
     # rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
     time.sleep(2.0)
 
-    # back to http, and this is the 11th request
-    url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
+    # (11) back to http, and this is the 11th request
+    url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
 
-    # make sure limit doesn't get applied to a different host
-    url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port)
+    # make sure limit doesn't get applied to a different domain
+    url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -929,39 +976,39 @@ def test_host_doc_soft_limit(
     assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
 
     # https also blocked
-    url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
+    url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
 
     # same host, different capitalization still blocked
-    url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port)
+    url = 'https://HEHEHE.fOO.lOcALhoST:{}/o/p'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
 
-def test_host_data_soft_limit(
+def test_domain_data_soft_limit(
         http_daemon, https_daemon, warcprox_, archiving_proxies):
     request_meta = {
-        "stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]},
+        "stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]},
         # response is 135 bytes, so 3rd novel url should be disallowed
-        "soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200},
+        "soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200},
     }
     headers = {"Warcprox-Meta": json.dumps(request_meta)}
 
-    url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
+    url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
@@ -976,7 +1023,7 @@ def test_host_data_soft_limit(
     time.sleep(2.0)
 
     # duplicate, does not count toward limit
-    url = 'https://localhost:{}/y/z'.format(https_daemon.server_port)
+    url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port)
    response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -992,7 +1039,7 @@ def test_host_data_soft_limit(
     time.sleep(2.0)
 
     # novel, pushes stats over the limit
-    url = 'https://localhost:{}/z/~'.format(https_daemon.server_port)
+    url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
@@ -1008,7 +1055,7 @@ def test_host_data_soft_limit(
     time.sleep(2.0)
 
     # make sure limit doesn't get applied to a different host
-    url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port)
+    url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 200
@@ -1016,27 +1063,27 @@ def test_host_data_soft_limit(
     assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
 
     # blocked because we're over the limit now
-    url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
+    url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
 
     # https also blocked
-    url = 'https://localhost:{}/w/x'.format(https_daemon.server_port)
+    url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port)
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True,
             verify=False)
     assert response.status_code == 430
     assert response.reason == "Reached soft limit"
-    expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
+    expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
     assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
     assert response.headers["content-type"] == "text/plain;charset=utf-8"
-    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
+    assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
 
 # XXX this test relies on a tor proxy running at localhost:9050 with a working
 # connection to the internet, and relies on a third party site (facebook) being
warcprox/__init__.py
@@ -57,6 +57,72 @@ class RequestBlockedByRule(Exception):
     def __str__(self):
         return "%s: %s" % (self.__class__.__name__, self.msg)
 
+class Url:
+    '''
+    Utility class
+    '''
+    def __init__(self, url):
+        self.url = url
+        self._surt = None
+        self._host = None
+
+    @property
+    def surt(self):
+        if not self._surt:
+            import surt
+            hurl = surt.handyurl.parse(self.url)
+            surt.GoogleURLCanonicalizer.canonicalize(hurl)
+            hurl.query = None
+            hurl.hash = None
+            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+        return self._surt
+
+    @property
+    def host(self):
+        if not self._host:
+            import surt
+            self._host = surt.handyurl.parse(self.url).host
+        return self._host
+
+    def matches_ip_or_domain(self, ip_or_domain):
+        return host_matches_ip_or_domain(self.host, ip_or_domain)
+
+def normalize_host(host):
+    # normalize host (punycode and lowercase)
+    return host.encode('idna').decode('ascii').lower()
+
+def host_matches_ip_or_domain(host, ip_or_domain):
+    '''
+    Returns true if
+     - ip_or_domain is an ip address and host is the same ip address
+     - ip_or_domain is a domain and host is the same domain
+     - ip_or_domain is a domain and host is a subdomain of it
+    '''
+    _host = normalize_host(host)
+    _ip_or_domain = normalize_host(ip_or_domain)
+
+    if _ip_or_domain == _host:
+        return True
+
+    # if either _ip_or_domain or host are ip addresses, and they're not
+    # identical (previous check), not a match
+    try:
+        ipaddress.ip_address(_ip_or_domain)
+        return False
+    except:
+        pass
+    try:
+        ipaddress.ip_address(_host)
+        return False
+    except:
+        pass
+
+    # if we get here, we're looking at two hostnames
+    domain_parts = _ip_or_domain.split(".")
+    host_parts = _host.split(".")
+
+    return host_parts[-len(domain_parts):] == domain_parts
+
 # logging level more fine-grained than logging.DEBUG==10
 TRACE = 5
 
warcprox/stats.py
@@ -113,15 +113,15 @@ class StatsDb:
         definition can either be a string, which signifies the name of the
         bucket, or a dict. If a dict it is expected to have at least an item
         with key 'bucket' whose value is the name of the bucket. The other
-        currently recognized item is 'tally-host-stats', which if true,
-        instructs warcprox to additionally tally substats of the given bucket
-        by host. Host stats are stored in the stats table under the key
-        '{parent-bucket}:{host}'.
+        currently recognized item is 'tally-domains', which if supplied should
+        be a list of domains. This instructs warcprox to additionally tally
+        substats of the given bucket by domain. Host stats are stored in the
+        stats table under the key '{parent-bucket}:{domain(normalized)}'.
 
         Example Warcprox-Meta header (a real one will likely have other
         sections besides 'stats'):
 
-        Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-host-stats':true}]}}
+        Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
         '''
         buckets = ["__all__"]
         if (recorded_url.warcprox_meta
@@ -135,14 +135,13 @@ class StatsDb:
                                 'warcprox-meta header %s', bucket)
                         continue
                     buckets.append(bucket['bucket'])
-                    # XXX maybe host has been computed elsewhere and can be
-                    # cached somewhere, but maybe the performance gain would be
-                    # negligible
-                    if bucket.get('tally-host-stats'):
-                        buckets.append('%s:%s' % (
-                            bucket['bucket'],
-                            surt.handyurl.parse(recorded_url.url.decode(
-                                'utf-8')).host))
+                    if bucket.get('tally-domains'):
+                        url = warcprox.Url(recorded_url.url.decode('utf-8'))
+                        for domain in bucket['tally-domains']:
+                            if url.matches_ip_or_domain(domain):
+                                buckets.append('%s:%s' % (
+                                    bucket['bucket'],
+                                    warcprox.normalize_host(domain)))
                 else:
                     buckets.append(bucket)
         else:
warcprox/warcproxy.py
@@ -48,57 +48,6 @@ import resource
 import ipaddress
 import surt
 
-class Url:
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        """Returns true if
-        - ip_or_domain is an ip address and self.host is the same ip address
-        - ip_or_domain is a domain and self.host is the same domain
-        - ip_or_domain is a domain and self.host is a subdomain of it
-        """
-        if ip_or_domain == self.host:
-            return True
-
-        # if either ip_or_domain or self.host are ip addresses, and they're not
-        # identical (previous check), not a match
-        try:
-            ipaddress.ip_address(ip_or_domain)
-            return False
-        except:
-            pass
-        try:
-            ipaddress.ip_address(self.host)
-            return False
-        except:
-            pass
-
-        # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
-
-        return host_parts[-len(domain_parts):] == domain_parts
-
 class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     '''
     XXX add more information.
@@ -118,7 +67,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
     # there's no obvious common dependency where this code should go... TBD
     def _scope_rule_applies(self, rule):
-        u = Url(self.url)
+        u = warcprox.Url(self.url)
 
         if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
             return False
@@ -179,10 +128,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
         bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
 
         # if limit_key looks like 'job1:foo.com/total/urls' then we only want
-        # to apply this rule if the requested url is on host foo.com
+        # to apply this rule if the requested url is within domain
        bucket0_fields = bucket0.split(':')
         if len(bucket0_fields) == 2:
-            if self.hostname.lower() != bucket0_fields[1].lower():
+            if not warcprox.host_matches_ip_or_domain(
+                    self.hostname.lower(), bucket0_fields[1].lower()):
                 return # else host matches, go ahead and enforce the limit
 
         value = self.server.stats_db.value(bucket0, bucket1, bucket2)