mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Support "soft limits", which produce a different response code (430) than regular (hard) limits, which produce a 420.
This commit is contained in:
parent
9df2ce0fbe
commit
320df0565e
2
setup.py
2
setup.py
@ -51,7 +51,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.0.dev13',
|
||||
version='2.0.dev14',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -564,7 +564,7 @@ def test_dedup_https(https_daemon, warcprox_, archiving_proxies, playback_proxie
|
||||
|
||||
def test_limits(http_daemon, warcprox_, archiving_proxies):
|
||||
url = 'http://localhost:{}/i/j'.format(http_daemon.server_port)
|
||||
request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket.total.urls":10}}
|
||||
request_meta = {"stats":{"buckets":["test_limits_bucket"]},"limits":{"test_limits_bucket/total/urls":10}}
|
||||
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
||||
|
||||
response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||
@ -593,10 +593,10 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
|
||||
response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||
assert response.status_code == 420
|
||||
assert response.reason == "Reached limit"
|
||||
expected_response_meta = {'reached-limit': {'test_limits_bucket.total.urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}}
|
||||
expected_response_meta = {'reached-limit': {'test_limits_bucket/total/urls': 10}, 'stats': {'test_limits_bucket': {'bucket': 'test_limits_bucket', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'total': {'wire_bytes': 1350, 'urls': 10}, 'new': {'wire_bytes': 135, 'urls': 1}}}}
|
||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket.total.urls=10\n"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"
|
||||
|
||||
def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
|
||||
url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)
|
||||
@ -840,11 +840,11 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||
assert response.content.startswith(b"request rejected by warcprox: blocked by rule found in Warcprox-Meta header:")
|
||||
assert json.loads(response.headers['warcprox-meta']) == {"blocked-by-rule":rules[3]}
|
||||
|
||||
def test_host_doc_limit(
|
||||
def test_host_doc_soft_limit(
|
||||
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||
request_meta = {
|
||||
"stats": {"buckets": [{"bucket":"test_host_doc_limit_bucket","tally-host-stats":True}]},
|
||||
"limits": {"test_host_doc_limit_bucket:localhost.total.urls":10},
|
||||
"soft-limits": {"test_host_doc_limit_bucket:localhost/total/urls":10},
|
||||
}
|
||||
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
||||
|
||||
@ -896,31 +896,31 @@ def test_host_doc_limit(
|
||||
url = 'http://localhost:{}/o/p'.format(http_daemon.server_port)
|
||||
response = requests.get(
|
||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||
assert response.status_code == 420
|
||||
assert response.reason == "Reached limit"
|
||||
expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||
assert response.status_code == 430
|
||||
assert response.reason == "Reached soft limit"
|
||||
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
||||
|
||||
# https also blocked
|
||||
url = 'https://localhost:{}/o/p'.format(https_daemon.server_port)
|
||||
response = requests.get(
|
||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||
verify=False)
|
||||
assert response.status_code == 420
|
||||
assert response.reason == "Reached limit"
|
||||
expected_response_meta = {'reached-limit': {'test_host_doc_limit_bucket:localhost.total.urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||
assert response.status_code == 430
|
||||
assert response.reason == "Reached soft limit"
|
||||
expected_response_meta = {'reached-soft-limit': {'test_host_doc_limit_bucket:localhost/total/urls': 10}, 'stats': {'test_host_doc_limit_bucket:localhost': {'bucket': 'test_host_doc_limit_bucket:localhost', 'revisit': {'wire_bytes': 1215, 'urls': 9}, 'new': {'wire_bytes': 135, 'urls': 1}, 'total': {'wire_bytes': 1350, 'urls': 10}}}}
|
||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached limit test_host_doc_limit_bucket:localhost.total.urls=10\n"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_doc_limit_bucket:localhost/total/urls=10\n"
|
||||
|
||||
def test_host_data_limit(
|
||||
def test_host_data_soft_limit(
|
||||
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||
request_meta = {
|
||||
"stats": {"buckets": [{"bucket":"test_host_data_limit_bucket","tally-host-stats":True}]},
|
||||
# response is 135 bytes, so 3rd novel url should be disallowed
|
||||
"limits": {"test_host_data_limit_bucket:localhost.new.wire_bytes":200},
|
||||
"soft-limits": {"test_host_data_limit_bucket:localhost/new/wire_bytes":200},
|
||||
}
|
||||
headers = {"Warcprox-Meta": json.dumps(request_meta)}
|
||||
|
||||
@ -974,24 +974,24 @@ def test_host_data_limit(
|
||||
url = 'http://localhost:{}/y/z'.format(http_daemon.server_port)
|
||||
response = requests.get(
|
||||
url, proxies=archiving_proxies, headers=headers, stream=True)
|
||||
assert response.status_code == 420
|
||||
assert response.reason == "Reached limit"
|
||||
expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
|
||||
assert response.status_code == 430
|
||||
assert response.reason == "Reached soft limit"
|
||||
expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
|
||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
|
||||
|
||||
# https also blocked
|
||||
url = 'https://localhost:{}/w/x'.format(https_daemon.server_port)
|
||||
response = requests.get(
|
||||
url, proxies=archiving_proxies, headers=headers, stream=True,
|
||||
verify=False)
|
||||
assert response.status_code == 420
|
||||
assert response.reason == "Reached limit"
|
||||
expected_response_meta = {'reached-limit': {'test_host_data_limit_bucket:localhost.new.wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
|
||||
assert response.status_code == 430
|
||||
assert response.reason == "Reached soft limit"
|
||||
expected_response_meta = {'reached-soft-limit': {'test_host_data_limit_bucket:localhost/new/wire_bytes': 200}, 'stats': {'test_host_data_limit_bucket:localhost': {'total': {'wire_bytes': 405, 'urls': 3}, 'revisit': {'wire_bytes': 135, 'urls': 1}, 'new': {'wire_bytes': 270, 'urls': 2}, 'bucket': 'test_host_data_limit_bucket:localhost'}}}
|
||||
assert json.loads(response.headers["warcprox-meta"]) == expected_response_meta
|
||||
assert response.headers["content-type"] == "text/plain;charset=utf-8"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached limit test_host_data_limit_bucket:localhost.new.wire_bytes=200\n"
|
||||
assert response.raw.data == b"request rejected by warcprox: reached soft limit test_host_data_limit_bucket:localhost/new/wire_bytes=200\n"
|
||||
|
||||
# XXX this test relies on a tor proxy running at localhost:9050 with a working
|
||||
# connection to the internet, and relies on a third party site (facebook) being
|
||||
|
@ -319,7 +319,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self.logger.info("%s: %s", repr(self.requestline), e)
|
||||
return
|
||||
except Exception as e:
|
||||
self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e))
|
||||
self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e), exc_info=True)
|
||||
self.send_error(500, str(e))
|
||||
return
|
||||
|
||||
|
@ -175,42 +175,55 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
self.client_address[0], self.command,
|
||||
self.url, rule))
|
||||
|
||||
def _enforce_limit(self, limit_key, limit_value, soft=False):
|
||||
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||
if value and value >= limit_value:
|
||||
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
|
||||
"soft limit" if soft else "limit", limit_key,
|
||||
limit_value)).encode("utf-8")
|
||||
if soft:
|
||||
self.send_response(430, "Reached soft limit")
|
||||
else:
|
||||
self.send_response(420, "Reached limit")
|
||||
self.send_header("Content-Type", "text/plain;charset=utf-8")
|
||||
self.send_header("Connection", "close")
|
||||
self.send_header("Content-Length", len(body))
|
||||
response_meta = {
|
||||
"stats": {bucket0:self.server.stats_db.value(bucket0)}
|
||||
}
|
||||
if soft:
|
||||
response_meta["reached-soft-limit"] = {limit_key:limit_value}
|
||||
else:
|
||||
response_meta["reached-limit"] = {limit_key:limit_value}
|
||||
self.send_header(
|
||||
"Warcprox-Meta",
|
||||
json.dumps(response_meta, separators=(",",":")))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
self.connection.close()
|
||||
raise warcprox.RequestBlockedByRule(
|
||||
"%s %s %s %s -- reached %s %s=%s" % (
|
||||
self.client_address[0], 430 if soft else 420,
|
||||
self.command, self.url,
|
||||
"soft limit" if soft else "limit",
|
||||
limit_key, limit_value))
|
||||
|
||||
def _enforce_limits(self, warcprox_meta):
|
||||
"""
|
||||
Sends a 420 response and raises warcprox.RequestBlockedByRule if a
|
||||
limit specified in warcprox_meta is reached.
|
||||
Sends a 420 (hard limit) or 430 (soft limit) response and raises
|
||||
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
|
||||
reached.
|
||||
"""
|
||||
if warcprox_meta and "limits" in warcprox_meta:
|
||||
for item in warcprox_meta["limits"].items():
|
||||
key, limit = item
|
||||
bucket0, bucket1, bucket2 = key.rsplit(".", 2)
|
||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||
self.logger.debug(
|
||||
"warcprox_meta['limits']=%s stats['%s']=%s "
|
||||
"recorded_url_q.qsize()=%s", warcprox_meta['limits'],
|
||||
key, value, self.server.recorded_url_q.qsize())
|
||||
if value and value >= limit:
|
||||
body = ("request rejected by warcprox: reached limit "
|
||||
"%s=%s\n" % (key, limit)).encode("utf-8")
|
||||
self.send_response(420, "Reached limit")
|
||||
self.send_header("Content-Type", "text/plain;charset=utf-8")
|
||||
self.send_header("Connection", "close")
|
||||
self.send_header("Content-Length", len(body))
|
||||
response_meta = {
|
||||
"reached-limit": {key:limit},
|
||||
"stats": {bucket0:self.server.stats_db.value(bucket0)}
|
||||
}
|
||||
self.send_header(
|
||||
"Warcprox-Meta",
|
||||
json.dumps(response_meta, separators=(",",":")))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
self.connection.close()
|
||||
raise warcprox.RequestBlockedByRule(
|
||||
"%s 420 %s %s -- reached limit %s=%s" % (
|
||||
self.client_address[0], self.command,
|
||||
self.url, key, limit))
|
||||
limit_key, limit_value = item
|
||||
self._enforce_limit(limit_key, limit_value, soft=False)
|
||||
if warcprox_meta and "soft-limits" in warcprox_meta:
|
||||
for item in warcprox_meta["soft-limits"].items():
|
||||
limit_key, limit_value = item
|
||||
self._enforce_limit(limit_key, limit_value, soft=True)
|
||||
|
||||
def _connect_to_remote_server(self):
|
||||
'''
|
||||
|
Loading…
x
Reference in New Issue
Block a user