From 2c8b1940900bcc2c3dfc0b41c44e20539d0bc980 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 28 Jun 2016 15:53:29 -0500 Subject: [PATCH] really only apply host limits to the host --- setup.py | 2 +- tests/test_warcprox.py | 45 ++++++++++++++++++++++++++++++++++++++++++ warcprox/warcproxy.py | 8 ++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e6b35e4..d23fcea 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev16', + version='2.0.dev17', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index e6c17a0..edc44ec 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -861,6 +861,22 @@ def test_host_doc_soft_limit( time.sleep(0.5) time.sleep(0.5) + # make sure stats from different host don't count + url = 'http://127.0.0.1:{}/o/p'.format(http_daemon.server_port) + for i in range(10): + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + + # wait for writer thread to process + time.sleep(0.5) + while not warcprox_.warc_writer_thread.idle: + time.sleep(0.5) + # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) + time.sleep(2.0) + # same host but different scheme and port -- host limit still applies url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) for i in range(8): @@ -903,6 +919,15 @@ def test_host_doc_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + # make sure limit doesn't get applied to a different host + url = 'https://127.0.0.1:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'o!' + assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + # https also blocked url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( @@ -915,6 +940,18 @@ def test_host_doc_soft_limit( assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + # same host, different capitalization still blocked + url = 'https://lOcALhoST:{}/o/p'.format(https_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True, + verify=False) + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta + assert response.headers["content-type"] == "text/plain;charset=utf-8" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" + def test_host_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { @@ -970,6 +1007,14 @@ def test_host_data_soft_limit( # rethinkdb stats db update cycle is 2 seconds (at the moment anyway) time.sleep(2.0) + # make sure limit doesn't get applied to a different host + url = 'http://127.0.0.1:{}/z/~'.format(http_daemon.server_port) + response = requests.get( + url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['warcprox-test-header'] == 'z!' + assert response.content == b'I am the warcprox test payload! ~~~~~~~~~~!\n' + # blocked because we're over the limit now url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 9966a14..38e39ae 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -177,6 +177,14 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): def _enforce_limit(self, limit_key, limit_value, soft=False): bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + + # if limit_key looks like 'job1:foo.com/total/urls' then we only want + # to apply this rule if the requested url is on host foo.com + bucket0_fields = bucket0.split(':') + if len(bucket0_fields) == 2: + if self.hostname.lower() != bucket0_fields[1].lower(): + return # else host matches, go ahead and enforce the limit + value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % (