diff --git a/setup.py b/setup.py index 9b760f5..6fc4875 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.0.dev13', + version='2.0.dev14', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index ebca589..8793f80 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -564,7 +564,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie def test_limits(http_daemon, warcprox_, archiving_proxies): url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) - request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket.total.urls":10}} + request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}} headers = {"Warcprox-Meta": json.dumps(request_meta)} response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) @@ -593,10 +593,10 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 420 assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_limits_bucket.total.urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}} + expected_response_meta = {'reached-limit': {'test_limits_bucket/total/urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket.total.urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port) @@ -840,11 +840,11 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:") assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]} -def test_host_doc_limit( +def test_host_doc_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { "stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]}, - "limits": {"test_host_doc_limit_bucket:localhost.total.urls":10}, + "soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} @@ -896,31 +896,31 @@ def test_host_doc_limit( url = 'http://localhost:{}/o/p'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" # https also blocked url = 'https://localhost:{}/o/p'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n" -def test_host_data_limit( +def test_host_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): request_meta = { "stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]}, # response is 135 bytes, so 3rd novel url should be disallowed - "limits": {"test_host_data_limit_bucket:localhost.new.wire_bytes":200}, + "soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200}, } headers = {"Warcprox-Meta": json.dumps(request_meta)} @@ -974,24 +974,24 @@ def test_host_data_limit( url = 'http://localhost:{}/y/z'.format(http_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" # https also blocked url = 'https://localhost:{}/w/x'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) - assert response.status_code == 420 - assert response.reason == "Reached limit" - expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} + assert response.status_code == 430 + assert response.reason == "Reached soft limit" + expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}} assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta assert response.headers["content-type"] == "text/plain;charset=utf-8" - assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n" + assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n" # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 3950e4e..c6c75b9 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -319,7 +319,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.logger.info("%s: %s", repr(self.requestline), e) return except Exception as e: - self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e)) + self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e), exc_info=True) self.send_error(500, str(e)) return diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5ffe83d..882fec9 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -175,42 +175,55 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], self.command, self.url, rule)) + def _enforce_limit(self, limit_key, limit_value, soft=False): + bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + value = self.server.stats_db.value(bucket0, bucket1, bucket2) + if value and value >= limit_value: + body = ("request rejected by warcprox: reached %s %s=%s\n" % ( + "soft limit" if soft else "limit", limit_key, + limit_value)).encode("utf-8") + if soft: + self.send_response(430, "Reached soft limit") + else: + self.send_response(420, "Reached limit") + self.send_header("Content-Type", "text/plain;charset=utf-8") + self.send_header("Connection", "close") + self.send_header("Content-Length", len(body)) + response_meta = { + "stats": {bucket0:self.server.stats_db.value(bucket0)} + } + if soft: + response_meta["reached-soft-limit"] = {limit_key:limit_value} + else: + response_meta["reached-limit"] = {limit_key:limit_value} + self.send_header( + "Warcprox-Meta", + json.dumps(response_meta, separators=(",",":"))) + self.end_headers() + if self.command != "HEAD": + self.wfile.write(body) + self.connection.close() + raise warcprox.RequestBlockedByRule( + "%s %s %s %s -- reached %s %s=%s" % ( + self.client_address[0], 430 if soft else 420, + self.command, self.url, + "soft limit" if soft else "limit", + limit_key, limit_value)) + def _enforce_limits(self, warcprox_meta): """ - Sends a 420 response and raises warcprox.RequestBlockedByRule if a - limit specified in warcprox_meta is reached. + Sends a 420 (hard limit) or 430 (soft limit) response and raises + warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is + reached. """ if warcprox_meta and "limits" in warcprox_meta: for item in warcprox_meta["limits"].items(): - key, limit = item - bucket0, bucket1, bucket2 = key.rsplit(".", 2) - value = self.server.stats_db.value(bucket0, bucket1, bucket2) - self.logger.debug( - "warcprox_meta['limits']=%s stats['%s']=%s " - "recorded_url_q.qsize()=%s", warcprox_meta['limits'], - key, value, self.server.recorded_url_q.qsize()) - if value and value >= limit: - body = ("request rejected by warcprox: reached limit " - "%s=%s\n" % (key, limit)).encode("utf-8") - self.send_response(420, "Reached limit") - self.send_header("Content-Type", "text/plain;charset=utf-8") - self.send_header("Connection", "close") - self.send_header("Content-Length", len(body)) - response_meta = { - "reached-limit": {key:limit}, - "stats": {bucket0:self.server.stats_db.value(bucket0)} - } - self.send_header( - "Warcprox-Meta", - json.dumps(response_meta, separators=(",",":"))) - self.end_headers() - if self.command != "HEAD": - self.wfile.write(body) - self.connection.close() - raise warcprox.RequestBlockedByRule( - "%s 420 %s %s -- reached limit %s=%s" % ( - self.client_address[0], self.command, - self.url, key, limit)) + limit_key, limit_value = item + self._enforce_limit(limit_key, limit_value, soft=False) + if warcprox_meta and "soft-limits" in warcprox_meta: + for item in warcprox_meta["soft-limits"].items(): + limit_key, limit_value = item + self._enforce_limit(limit_key, limit_value, soft=True) def _connect_to_remote_server(self): '''