mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
switching from host limits to domain limits, which apply in aggregate to the host and subdomains
This commit is contained in:
parent
2c8b194090
commit
c9e403585b
2
setup.py
2
setup.py
@ -51,7 +51,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.0.dev17',
|
version='2.0.dev18',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -42,6 +42,7 @@ import pprint
|
|||||||
import traceback
|
import traceback
|
||||||
import signal
|
import signal
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
import socket
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import http.server as http_server
|
import http.server as http_server
|
||||||
@ -65,6 +66,33 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
|||||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
||||||
|
|
||||||
|
# monkey patch dns lookup so we can test domain inheritance on localhost
|
||||||
|
orig_getaddrinfo = socket.getaddrinfo
|
||||||
|
orig_gethostbyname = socket.gethostbyname
|
||||||
|
orig_socket_connect = socket.socket.connect
|
||||||
|
|
||||||
|
def _getaddrinfo(host, port, family=0, type=0, proto=0, flags=0):
|
||||||
|
if host.endswith('.localhost'):
|
||||||
|
return orig_getaddrinfo('localhost', port, family, type, proto, flags)
|
||||||
|
else:
|
||||||
|
return orig_getaddrinfo(host, port, family, type, proto, flags)
|
||||||
|
|
||||||
|
def _gethostbyname(host):
|
||||||
|
if host.endswith('.localhost'):
|
||||||
|
return orig_gethostbyname('localhost')
|
||||||
|
else:
|
||||||
|
return orig_gethostbyname(host)
|
||||||
|
|
||||||
|
def _socket_connect(self, address):
|
||||||
|
if address[0].endswith('.localhost'):
|
||||||
|
return orig_socket_connect(self, ('localhost', address[1]))
|
||||||
|
else:
|
||||||
|
return orig_socket_connect(self, address)
|
||||||
|
|
||||||
|
socket.gethostbyname = _gethostbyname
|
||||||
|
socket.getaddrinfo = _getaddrinfo
|
||||||
|
socket.socket.connect = _socket_connect
|
||||||
|
|
||||||
def dump_state(signum=None, frame=None):
|
def dump_state(signum=None, frame=None):
|
||||||
pp = pprint.PrettyPrinter(indent=4)
|
pp = pprint.PrettyPrinter(indent=4)
|
||||||
state_strs = []
|
state_strs = []
|
||||||
@ -373,6 +401,13 @@ def test_httpds_no_proxy(http_daemon, https_daemon):
|
|||||||
assert response.headers['warcprox-test-header'] == 'c!'
|
assert response.headers['warcprox-test-header'] == 'c!'
|
||||||
assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
|
assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
|
||||||
|
|
||||||
|
# ensure monkey-patched dns resolution is working
|
||||||
|
url = 'https://foo.bar.localhost:{}/c/d'.format(https_daemon.server_port)
|
||||||
|
response = requests.get(url, verify=False)
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.headers['warcprox-test-header'] == 'c!'
|
||||||
|
assert response.content == b'I am the warcprox test payload! dddddddddd!\n'
|
||||||
|
|
||||||
def _poll_playback_until(playback_proxies, url, status, timeout_sec):
|
def _poll_playback_until(playback_proxies, url, status, timeout_sec):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
# check playback (warc writing is asynchronous, give it up to 10 sec)
|
# check playback (warc writing is asynchronous, give it up to 10 sec)
|
||||||
@ -840,15 +875,16 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
|
|||||||
assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:")
|
assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:")
|
||||||
assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]}
|
assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]}
|
||||||
|
|
||||||
def test_host_doc_soft_limit(
|
def test_domain_doc_soft_limit(
|
||||||
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||||
request_meta = {
|
request_meta = {
|
||||||
"stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]},
|
"stats": {"buckets": [{"bucket":"test_domain_doc_limit_bucket","tally-domains":["foo.localhost"]}]},
|
||||||
"soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10},
|
"soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls":10},
|
||||||
}
|
}
|
||||||
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
||||||
|
|
||||||
url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
|
# (1)
|
||||||
|
url = 'http://foo.localhost:{}/o/p'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
@ -861,8 +897,8 @@ def test_host_doc_soft_limit(
|
|||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
# make sure stats from different host don't count
|
# make sure stats from different domain don't count
|
||||||
url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port)
|
url = 'http://bar.localhost:{}/o/p'.format(http_daemon.server_port)
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
@ -877,9 +913,19 @@ def test_host_doc_soft_limit(
|
|||||||
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
# same host but different scheme and port -- host limit still applies
|
# (2) same host but different scheme and port: domain limit applies
|
||||||
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
#
|
||||||
for i in range(8):
|
url = 'https://foo.localhost:{}/o/p'.format(https_daemon.server_port)
|
||||||
|
response = requests.get(
|
||||||
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
|
verify=False)
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.headers['warcprox-test-header'] == 'o!'
|
||||||
|
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
|
||||||
|
|
||||||
|
# (3-9) different subdomain: domain limit applies
|
||||||
|
url = 'https://baz.foo.localhost:{}/o/p'.format(https_daemon.server_port)
|
||||||
|
for i in range(7):
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
@ -894,6 +940,7 @@ def test_host_doc_soft_limit(
|
|||||||
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
|
# (10)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
@ -908,19 +955,19 @@ def test_host_doc_soft_limit(
|
|||||||
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
# back to http, and this is the 11th request
|
# (11) back to http, and this is the 11th request
|
||||||
url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
|
url = 'http://zuh.foo.localhost:{}/o/p'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
assert response.status_code == 430
|
assert response.status_code == 430
|
||||||
assert response.reason == "Reached soft limit"
|
assert response.reason == "Reached soft limit"
|
||||||
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
|
||||||
|
|
||||||
# make sure limit doesn't get applied to a different host
|
# make sure limit doesn't get applied to a different domain
|
||||||
url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port)
|
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
@ -929,39 +976,39 @@ def test_host_doc_soft_limit(
|
|||||||
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
|
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
|
||||||
|
|
||||||
# https also blocked
|
# https also blocked
|
||||||
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
url = 'https://zuh.foo.localhost:{}/o/p'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
assert response.status_code == 430
|
assert response.status_code == 430
|
||||||
assert response.reason == "Reached soft limit"
|
assert response.reason == "Reached soft limit"
|
||||||
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
|
||||||
|
|
||||||
# same host, different capitalization still blocked
|
# same host, different capitalization still blocked
|
||||||
url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port)
|
url = 'https://HEHEHE.fOO.lOcALhoST:{}/o/p'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
assert response.status_code == 430
|
assert response.status_code == 430
|
||||||
assert response.reason == "Reached soft limit"
|
assert response.reason == "Reached soft limit"
|
||||||
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
expected_response_meta = {'reached-soft-limit': {'test_domain_doc_limit_bucket:foo.localhost/total/urls': 10}, 'stats': {'test_domain_doc_limit_bucket:foo.localhost': {'bucket': 'test_domain_doc_limit_bucket:foo.localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_doc_limit_bucket:foo.localhost/total/urls=10\n"
|
||||||
|
|
||||||
def test_host_data_soft_limit(
|
def test_domain_data_soft_limit(
|
||||||
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||||
request_meta = {
|
request_meta = {
|
||||||
"stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]},
|
"stats": {"buckets": [{"bucket":"test_domain_data_limit_bucket","tally-domains":['foo.LOCALhost']}]},
|
||||||
# response is 135 bytes, so 3rd novel url should be disallowed
|
# response is 135 bytes, so 3rd novel url should be disallowed
|
||||||
"soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200},
|
"soft-limits": {"test_domain_data_limit_bucket:foo.localhost/new/wire_bytes":200},
|
||||||
}
|
}
|
||||||
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
||||||
|
|
||||||
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
|
url = 'http://foo.localhost:{}/y/z'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
@ -976,7 +1023,7 @@ def test_host_data_soft_limit(
|
|||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
# duplicate, does not count toward limit
|
# duplicate, does not count toward limit
|
||||||
url = 'https://localhost:{}/y/z'.format(https_daemon.server_port)
|
url = 'https://baz.foo.localhost:{}/y/z'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
@ -992,7 +1039,7 @@ def test_host_data_soft_limit(
|
|||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
# novel, pushes stats over the limit
|
# novel, pushes stats over the limit
|
||||||
url = 'https://localhost:{}/z/~'.format(https_daemon.server_port)
|
url = 'https://muh.foo.localhost:{}/z/~'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
@ -1008,7 +1055,7 @@ def test_host_data_soft_limit(
|
|||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
# make sure limit doesn't get applied to a different host
|
# make sure limit doesn't get applied to a different host
|
||||||
url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port)
|
url = 'http://baz.localhost:{}/z/~'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
@ -1016,27 +1063,27 @@ def test_host_data_soft_limit(
|
|||||||
assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
|
assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
|
||||||
|
|
||||||
# blocked because we're over the limit now
|
# blocked because we're over the limit now
|
||||||
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
|
url = 'http://lOl.wHut.fOo.lOcALHOst:{}/y/z'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
assert response.status_code == 430
|
assert response.status_code == 430
|
||||||
assert response.reason == "Reached soft limit"
|
assert response.reason == "Reached soft limit"
|
||||||
expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
|
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
|
||||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
|
||||||
|
|
||||||
# https also blocked
|
# https also blocked
|
||||||
url = 'https://localhost:{}/w/x'.format(https_daemon.server_port)
|
url = 'https://foo.localhost:{}/w/x'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
verify=False)
|
verify=False)
|
||||||
assert response.status_code == 430
|
assert response.status_code == 430
|
||||||
assert response.reason == "Reached soft limit"
|
assert response.reason == "Reached soft limit"
|
||||||
expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
|
expected_response_meta = {'reached-soft-limit': {'test_domain_data_limit_bucket:foo.localhost/new/wire_bytes': 200}, 'stats': {'test_domain_data_limit_bucket:foo.localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_domain_data_limit_bucket:foo.localhost'}}}
|
||||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_domain_data_limit_bucket:foo.localhost/new/wire_bytes=200\n"
|
||||||
|
|
||||||
# XXX this test relies on a tor proxy running at localhost:9050 with a working
|
# XXX this test relies on a tor proxy running at localhost:9050 with a working
|
||||||
# connection to the internet, and relies on a third party site (facebook) being
|
# connection to the internet, and relies on a third party site (facebook) being
|
||||||
|
@ -57,6 +57,72 @@ class RequestBlockedByRule(Exception):
|
|||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "%s: %s" % (self.__class__.__name__, self.msg)
|
return "%s: %s" % (self.__class__.__name__, self.msg)
|
||||||
|
|
||||||
|
class Url:
|
||||||
|
'''
|
||||||
|
Utility class wrapping a url string; lazily computes its surt and host.
|
||||||
|
'''
|
||||||
|
def __init__(self, url):
|
||||||
|
self.url = url
|
||||||
|
self._surt = None
|
||||||
|
self._host = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def surt(self):
|
||||||
|
if not self._surt:
|
||||||
|
import surt
|
||||||
|
hurl = surt.handyurl.parse(self.url)
|
||||||
|
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
||||||
|
hurl.query = None
|
||||||
|
hurl.hash = None
|
||||||
|
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
|
||||||
|
return self._surt
|
||||||
|
|
||||||
|
@property
|
||||||
|
def host(self):
|
||||||
|
if not self._host:
|
||||||
|
import surt
|
||||||
|
self._host = surt.handyurl.parse(self.url).host
|
||||||
|
return self._host
|
||||||
|
|
||||||
|
def matches_ip_or_domain(self, ip_or_domain):
|
||||||
|
return host_matches_ip_or_domain(self.host, ip_or_domain)
|
||||||
|
|
||||||
|
def normalize_host(host):
|
||||||
|
# normalize host (punycode and lowercase)
|
||||||
|
return host.encode('idna').decode('ascii').lower()
|
||||||
|
|
||||||
|
def host_matches_ip_or_domain(host, ip_or_domain):
|
||||||
|
'''
|
||||||
|
Returns true if
|
||||||
|
- ip_or_domain is an ip address and host is the same ip address
|
||||||
|
- ip_or_domain is a domain and host is the same domain
|
||||||
|
- ip_or_domain is a domain and host is a subdomain of it
|
||||||
|
'''
|
||||||
|
_host = normalize_host(host)
|
||||||
|
_ip_or_domain = normalize_host(ip_or_domain)
|
||||||
|
|
||||||
|
if _ip_or_domain == _host:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# if either _ip_or_domain or _host is an ip address, and they're not
|
||||||
|
# identical (previous check), not a match
|
||||||
|
try:
|
||||||
|
ipaddress.ip_address(_ip_or_domain)
|
||||||
|
return False
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
ipaddress.ip_address(_host)
|
||||||
|
return False
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# if we get here, we're looking at two hostnames
|
||||||
|
domain_parts = _ip_or_domain.split(".")
|
||||||
|
host_parts = _host.split(".")
|
||||||
|
|
||||||
|
return host_parts[-len(domain_parts):] == domain_parts
|
||||||
|
|
||||||
# logging level more fine-grained than logging.DEBUG==10
|
# logging level more fine-grained than logging.DEBUG==10
|
||||||
TRACE = 5
|
TRACE = 5
|
||||||
|
|
||||||
|
@ -113,15 +113,15 @@ class StatsDb:
|
|||||||
definition can either be a string, which signifies the name of the
|
definition can either be a string, which signifies the name of the
|
||||||
bucket, or a dict. If a dict it is expected to have at least an item
|
bucket, or a dict. If a dict it is expected to have at least an item
|
||||||
with key 'bucket' whose value is the name of the bucket. The other
|
with key 'bucket' whose value is the name of the bucket. The other
|
||||||
currently recognized item is 'tally-host-stats', which if true,
|
currently recognized item is 'tally-domains', which if supplied should
|
||||||
instructs warcprox to additionally tally substats of the given bucket
|
be a list of domains. This instructs warcprox to additionally tally
|
||||||
by host. Host stats are stored in the stats table under the key
|
substats of the given bucket by domain. Domain stats are stored in the
|
||||||
'{parent-bucket}:{host}'.
|
stats table under the key '{parent-bucket}:{domain(normalized)}'.
|
||||||
|
|
||||||
Example Warcprox-Meta header (a real one will likely have other
|
Example Warcprox-Meta header (a real one will likely have other
|
||||||
sections besides 'stats'):
|
sections besides 'stats'):
|
||||||
|
|
||||||
Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-host-stats':true}]}}
|
Warcprox-Meta: {'stats':{'buckets':['bucket1',{'bucket':'bucket2','tally-domains':['foo.bar.com','192.168.10.20']}]}}
|
||||||
'''
|
'''
|
||||||
buckets = ["__all__"]
|
buckets = ["__all__"]
|
||||||
if (recorded_url.warcprox_meta
|
if (recorded_url.warcprox_meta
|
||||||
@ -135,14 +135,13 @@ class StatsDb:
|
|||||||
'warcprox-meta header %s', bucket)
|
'warcprox-meta header %s', bucket)
|
||||||
continue
|
continue
|
||||||
buckets.append(bucket['bucket'])
|
buckets.append(bucket['bucket'])
|
||||||
# XXX maybe host has been computed elsewhere and can be
|
if bucket.get('tally-domains'):
|
||||||
# cached somewhere, but maybe the performance gain would be
|
url = warcprox.Url(recorded_url.url.decode('utf-8'))
|
||||||
# negligible
|
for domain in bucket['tally-domains']:
|
||||||
if bucket.get('tally-host-stats'):
|
if url.matches_ip_or_domain(domain):
|
||||||
buckets.append('%s:%s' % (
|
buckets.append('%s:%s' % (
|
||||||
bucket['bucket'],
|
bucket['bucket'],
|
||||||
surt.handyurl.parse(recorded_url.url.decode(
|
warcprox.normalize_host(domain)))
|
||||||
'utf-8')).host))
|
|
||||||
else:
|
else:
|
||||||
buckets.append(bucket)
|
buckets.append(bucket)
|
||||||
else:
|
else:
|
||||||
|
@ -48,57 +48,6 @@ import resource
|
|||||||
import ipaddress
|
import ipaddress
|
||||||
import surt
|
import surt
|
||||||
|
|
||||||
class Url:
|
|
||||||
def __init__(self, url):
|
|
||||||
self.url = url
|
|
||||||
self._surt = None
|
|
||||||
self._host = None
|
|
||||||
|
|
||||||
@property
|
|
||||||
def surt(self):
|
|
||||||
if not self._surt:
|
|
||||||
hurl = surt.handyurl.parse(self.url)
|
|
||||||
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
|
||||||
hurl.query = None
|
|
||||||
hurl.hash = None
|
|
||||||
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
|
|
||||||
return self._surt
|
|
||||||
|
|
||||||
@property
|
|
||||||
def host(self):
|
|
||||||
if not self._host:
|
|
||||||
self._host = surt.handyurl.parse(self.url).host
|
|
||||||
return self._host
|
|
||||||
|
|
||||||
def matches_ip_or_domain(self, ip_or_domain):
|
|
||||||
"""Returns true if
|
|
||||||
- ip_or_domain is an ip address and self.host is the same ip address
|
|
||||||
- ip_or_domain is a domain and self.host is the same domain
|
|
||||||
- ip_or_domain is a domain and self.host is a subdomain of it
|
|
||||||
"""
|
|
||||||
if ip_or_domain == self.host:
|
|
||||||
return True
|
|
||||||
|
|
||||||
# if either ip_or_domain or self.host are ip addresses, and they're not
|
|
||||||
# identical (previous check), not a match
|
|
||||||
try:
|
|
||||||
ipaddress.ip_address(ip_or_domain)
|
|
||||||
return False
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
ipaddress.ip_address(self.host)
|
|
||||||
return False
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# if we get here, we're looking at two hostnames
|
|
||||||
# XXX do we need to handle case of one punycoded idn, other not?
|
|
||||||
domain_parts = ip_or_domain.split(".")
|
|
||||||
host_parts = self.host.split(".")
|
|
||||||
|
|
||||||
return host_parts[-len(domain_parts):] == domain_parts
|
|
||||||
|
|
||||||
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||||
'''
|
'''
|
||||||
XXX add more information.
|
XXX add more information.
|
||||||
@ -118,7 +67,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
# XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
|
# XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
|
||||||
# there's no obvious common dependency where this code should go... TBD
|
# there's no obvious common dependency where this code should go... TBD
|
||||||
def _scope_rule_applies(self, rule):
|
def _scope_rule_applies(self, rule):
|
||||||
u = Url(self.url)
|
u = warcprox.Url(self.url)
|
||||||
|
|
||||||
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
|
if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
|
||||||
return False
|
return False
|
||||||
@ -179,10 +128,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
||||||
|
|
||||||
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
|
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
|
||||||
# to apply this rule if the requested url is on host foo.com
|
# to apply this rule if the requested url is within domain foo.com
|
||||||
bucket0_fields = bucket0.split(':')
|
bucket0_fields = bucket0.split(':')
|
||||||
if len(bucket0_fields) == 2:
|
if len(bucket0_fields) == 2:
|
||||||
if self.hostname.lower() != bucket0_fields[1].lower():
|
if not warcprox.host_matches_ip_or_domain(
|
||||||
|
self.hostname.lower(), bucket0_fields[1].lower()):
|
||||||
return # else host matches, go ahead and enforce the limit
|
return # else host matches, go ahead and enforce the limit
|
||||||
|
|
||||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user