mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
really only apply host limits to the host
This commit is contained in:
parent
04c4b63f03
commit
2c8b194090
2
setup.py
2
setup.py
@ -51,7 +51,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.0.dev16',
|
version='2.0.dev17',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -861,6 +861,22 @@ def test_host_doc_soft_limit(
|
|||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
# make sure stats from different host don't count
|
||||||
|
url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port)
|
||||||
|
for i in range(10):
|
||||||
|
response = requests.get(
|
||||||
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.headers['warcprox-test-header'] == 'o!'
|
||||||
|
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
|
||||||
|
|
||||||
|
# wait for writer thread to process
|
||||||
|
time.sleep(0.5)
|
||||||
|
while not warcprox_.warc_writer_thread.idle:
|
||||||
|
time.sleep(0.5)
|
||||||
|
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
||||||
|
time.sleep(2.0)
|
||||||
|
|
||||||
# same host but different scheme and port -- host limit still applies
|
# same host but different scheme and port -- host limit still applies
|
||||||
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
||||||
for i in range(8):
|
for i in range(8):
|
||||||
@ -903,6 +919,15 @@ def test_host_doc_soft_limit(
|
|||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
||||||
|
|
||||||
|
# make sure limit doesn't get applied to a different host
|
||||||
|
url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port)
|
||||||
|
response = requests.get(
|
||||||
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
|
verify=False)
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.headers['warcprox-test-header'] == 'o!'
|
||||||
|
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
|
||||||
|
|
||||||
# https also blocked
|
# https also blocked
|
||||||
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
@ -915,6 +940,18 @@ def test_host_doc_soft_limit(
|
|||||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
||||||
|
|
||||||
|
# same host, different capitalization still blocked
|
||||||
|
url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port)
|
||||||
|
response = requests.get(
|
||||||
|
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||||
|
verify=False)
|
||||||
|
assert response.status_code == 430
|
||||||
|
assert response.reason == "Reached soft limit"
|
||||||
|
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||||
|
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||||
|
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||||
|
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
||||||
|
|
||||||
def test_host_data_soft_limit(
|
def test_host_data_soft_limit(
|
||||||
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||||
request_meta = {
|
request_meta = {
|
||||||
@ -970,6 +1007,14 @@ def test_host_data_soft_limit(
|
|||||||
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
# rethinkdb stats db update cycle is 2 seconds (at the moment anyway)
|
||||||
time.sleep(2.0)
|
time.sleep(2.0)
|
||||||
|
|
||||||
|
# make sure limit doesn't get applied to a different host
|
||||||
|
url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port)
|
||||||
|
response = requests.get(
|
||||||
|
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.headers['warcprox-test-header'] == 'z!'
|
||||||
|
assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n'
|
||||||
|
|
||||||
# blocked because we're over the limit now
|
# blocked because we're over the limit now
|
||||||
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
|
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
|
@ -177,6 +177,14 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
|
|
||||||
def _enforce_limit(self, limit_key, limit_value, soft=False):
|
def _enforce_limit(self, limit_key, limit_value, soft=False):
|
||||||
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
||||||
|
|
||||||
|
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
|
||||||
|
# to apply this rule if the requested url is on host foo.com
|
||||||
|
bucket0_fields = bucket0.split(':')
|
||||||
|
if len(bucket0_fields) == 2:
|
||||||
|
if self.hostname.lower() != bucket0_fields[1].lower():
|
||||||
|
return # else host matches, go ahead and enforce the limit
|
||||||
|
|
||||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||||
if value and value >= limit_value:
|
if value and value >= limit_value:
|
||||||
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
|
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
|
||||||
|
Loading…
x
Reference in New Issue
Block a user