Apply host-specific limits only to requests for the matching host

This commit is contained in:
Noah Levitt 2016-06-28 15:53:29 -05:00
parent 04c4b63f03
commit 2c8b194090
3 changed files with 54 additions and 1 deletions

View File

@ -51,7 +51,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.0.dev16',
version='2.0.dev17',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -861,6 +861,22 @@ def test_host_doc_soft_limit(
time.sleep(0.5)
time.sleep(0.5)
# make sure stats from different host don't count
url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port)
for i in range(10):
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'o!'
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
# wait for writer thread to process
time.sleep(0.5)
while not warcprox_.warc_writer_thread.idle:
time.sleep(0.5)
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0)
# same host but different scheme and port -- host limit still applies
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
for i in range(8):
@ -903,6 +919,15 @@ def test_host_doc_soft_limit(
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
# make sure limit doesn't get applied to a different host
url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'o!'
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
# https also blocked
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
response = requests.get(
@ -915,6 +940,18 @@ def test_host_doc_soft_limit(
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
# same host, different capitalization still blocked
url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True,
verify=False)
assert response.status_code == 430
assert response.reason == "Reached soft limit"
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
def test_host_data_soft_limit(
http_daemon, https_daemon, warcprox_, archiving_proxies):
request_meta = {
@ -970,6 +1007,14 @@ def test_host_data_soft_limit(
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
time.sleep(2.0)
# make sure limit doesn't get applied to a different host
url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port)
response = requests.get(
url, proxies=archiving_proxies, headers=headers, stream=True)
assert response.status_code == 200
assert response.headers['warcprox-test-header'] == 'z!'
assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
# blocked because we're over the limit now
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
response = requests.get(

View File

@ -177,6 +177,14 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
def _enforce_limit(self, limit_key, limit_value, soft=False):
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
# to apply this rule if the requested url is on host foo.com
bucket0_fields = bucket0.split(':')
if len(bucket0_fields) == 2:
if self.hostname.lower() != bucket0_fields[1].lower():
return # else host matches, go ahead and enforce the limit
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
if value and value >= limit_value:
body = ("request rejected by warcprox: reached %s %s=%s\n" % (