mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fix bug in limits enforcement
enforce limit only if url is in stats bucket that limit applies to!
This commit is contained in:
parent
07dc978f09
commit
d9e0ed31f2
@ -1158,6 +1158,9 @@ def test_domain_doc_soft_limit(
|
||||
assert response.headers['warcprox-test-header'] == 'o!'
|
||||
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
|
||||
|
||||
# wait for postfetch chain
|
||||
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22)
|
||||
|
||||
def test_domain_data_soft_limit(
|
||||
http_daemon, https_daemon, warcprox_, archiving_proxies):
|
||||
urls_before = warcprox_.proxy.running_stats.urls
|
||||
@ -1265,6 +1268,9 @@ def test_domain_data_soft_limit(
|
||||
assert response.headers['warcprox-test-header'] == 'y!'
|
||||
assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n'
|
||||
|
||||
# wait for postfetch chain
|
||||
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
|
||||
|
||||
# XXX this test relies on a tor proxy running at localhost:9050 with a working
|
||||
# connection to the internet, and relies on a third party site (facebook) being
|
||||
# up and behaving a certain way
|
||||
|
@ -53,6 +53,53 @@ def _empty_bucket(bucket):
|
||||
},
|
||||
}
|
||||
|
||||
def unravel_buckets(url, warcprox_meta):
|
||||
'''
|
||||
Unravels bucket definitions in Warcprox-Meta header. Each bucket
|
||||
definition can either be a string, which signifies the name of the
|
||||
bucket, or a dict. If a dict it is expected to have at least an item
|
||||
with key 'bucket' whose value is the name of the bucket. The other
|
||||
currently recognized item is 'tally-domains', which if supplied should
|
||||
be a list of domains. This instructs warcprox to additionally tally
|
||||
substats of the given bucket by domain. Host stats are stored in the
|
||||
stats table under the key '{parent-bucket}:{domain(normalized)}'.
|
||||
|
||||
Returns:
|
||||
list of strings
|
||||
|
||||
Example Warcprox-Meta header (a real one will likely have other
|
||||
sections besides 'stats'):
|
||||
|
||||
Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
|
||||
|
||||
In this case the return value would be
|
||||
["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
|
||||
'''
|
||||
buckets = ["__all__"]
|
||||
if (warcprox_meta and "stats" in warcprox_meta
|
||||
and "buckets" in warcprox_meta["stats"]):
|
||||
for bucket in warcprox_meta["stats"]["buckets"]:
|
||||
if isinstance(bucket, dict):
|
||||
if not 'bucket' in bucket:
|
||||
self.logger.warn(
|
||||
'ignoring invalid stats bucket in '
|
||||
'warcprox-meta header %s', bucket)
|
||||
continue
|
||||
buckets.append(bucket['bucket'])
|
||||
if bucket.get('tally-domains'):
|
||||
canon_url = urlcanon.semantic(url)
|
||||
for domain in bucket['tally-domains']:
|
||||
domain = urlcanon.normalize_host(domain).decode('ascii')
|
||||
if urlcanon.url_matches_domain(canon_url, domain):
|
||||
buckets.append(
|
||||
'%s:%s' % (bucket['bucket'], domain))
|
||||
else:
|
||||
buckets.append(bucket)
|
||||
else:
|
||||
buckets.append("__unspecified__")
|
||||
|
||||
return buckets
|
||||
|
||||
class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
|
||||
logger = logging.getLogger("warcprox.stats.StatsProcessor")
|
||||
|
||||
@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
|
||||
return None
|
||||
|
||||
def buckets(self, recorded_url):
|
||||
'''
|
||||
Unravels bucket definitions in Warcprox-Meta header. Each bucket
|
||||
definition can either be a string, which signifies the name of the
|
||||
bucket, or a dict. If a dict it is expected to have at least an item
|
||||
with key 'bucket' whose value is the name of the bucket. The other
|
||||
currently recognized item is 'tally-domains', which if supplied should
|
||||
be a list of domains. This instructs warcprox to additionally tally
|
||||
substats of the given bucket by domain. Host stats are stored in the
|
||||
stats table under the key '{parent-bucket}:{domain(normalized)}'.
|
||||
|
||||
Example Warcprox-Meta header (a real one will likely have other
|
||||
sections besides 'stats'):
|
||||
|
||||
Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
|
||||
'''
|
||||
buckets = ["__all__"]
|
||||
if (recorded_url.warcprox_meta
|
||||
and "stats" in recorded_url.warcprox_meta
|
||||
and "buckets" in recorded_url.warcprox_meta["stats"]):
|
||||
for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
|
||||
if isinstance(bucket, dict):
|
||||
if not 'bucket' in bucket:
|
||||
self.logger.warn(
|
||||
'ignoring invalid stats bucket in '
|
||||
'warcprox-meta header %s', bucket)
|
||||
continue
|
||||
buckets.append(bucket['bucket'])
|
||||
if bucket.get('tally-domains'):
|
||||
url = urlcanon.semantic(recorded_url.url)
|
||||
for domain in bucket['tally-domains']:
|
||||
domain = urlcanon.normalize_host(domain).decode('ascii')
|
||||
if urlcanon.url_matches_domain(url, domain):
|
||||
buckets.append(
|
||||
'%s:%s' % (bucket['bucket'], domain))
|
||||
else:
|
||||
buckets.append(bucket)
|
||||
else:
|
||||
buckets.append("__unspecified__")
|
||||
|
||||
return buckets
|
||||
return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta)
|
||||
|
||||
class RethinkStatsProcessor(StatsProcessor):
|
||||
logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor")
|
||||
|
@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
self.client_address[0], self.command,
|
||||
self.url, rule))
|
||||
|
||||
def _enforce_limit(self, limit_key, limit_value, soft=False):
|
||||
def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
|
||||
if not self.server.stats_db:
|
||||
return
|
||||
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
||||
_limit_key = limit_key
|
||||
|
||||
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
|
||||
# to apply this rule if the requested url is within domain
|
||||
bucket0_fields = bucket0.split(':')
|
||||
if len(bucket0_fields) == 2:
|
||||
domain = urlcanon.normalize_host(bucket0_fields[1])
|
||||
if not urlcanon.host_matches_domain(self.hostname, domain):
|
||||
return # else host matches, go ahead and enforce the limit
|
||||
bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
|
||||
_limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
|
||||
# parse limit key
|
||||
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
|
||||
# normalize domain if part of bucket
|
||||
if ":" in bucket0:
|
||||
b, raw_domain = bucket0.split(":", 1)
|
||||
domain = urlcanon.normalize_host(raw_domain).decode("ascii")
|
||||
bucket0 = "%s:%s" % (b, domain)
|
||||
limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)
|
||||
|
||||
if not bucket0 in buckets:
|
||||
return
|
||||
|
||||
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
|
||||
if value and limit_value and limit_value > 0 and value >= limit_value:
|
||||
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
|
||||
"soft limit" if soft else "limit", _limit_key,
|
||||
"soft limit" if soft else "limit", limit_key,
|
||||
limit_value)).encode("utf-8")
|
||||
if soft:
|
||||
self.send_response(430, "Reached soft limit")
|
||||
@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
"stats": {bucket0:self.server.stats_db.value(bucket0)}
|
||||
}
|
||||
if soft:
|
||||
response_meta["reached-soft-limit"] = {_limit_key:limit_value}
|
||||
response_meta["reached-soft-limit"] = {limit_key:limit_value}
|
||||
else:
|
||||
response_meta["reached-limit"] = {_limit_key:limit_value}
|
||||
response_meta["reached-limit"] = {limit_key:limit_value}
|
||||
self.send_header(
|
||||
"Warcprox-Meta",
|
||||
json.dumps(response_meta, separators=(",",":")))
|
||||
"Warcprox-Meta", json.dumps(response_meta, separators=",:"))
|
||||
self.end_headers()
|
||||
if self.command != "HEAD":
|
||||
self.wfile.write(body)
|
||||
@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
self.client_address[0], 430 if soft else 420,
|
||||
self.command, self.url,
|
||||
"soft limit" if soft else "limit",
|
||||
_limit_key, limit_value))
|
||||
limit_key, limit_value))
|
||||
|
||||
def _enforce_limits(self, warcprox_meta):
|
||||
"""
|
||||
@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
|
||||
reached.
|
||||
"""
|
||||
buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta)
|
||||
if warcprox_meta and "limits" in warcprox_meta:
|
||||
for item in warcprox_meta["limits"].items():
|
||||
limit_key, limit_value = item
|
||||
self._enforce_limit(limit_key, limit_value, soft=False)
|
||||
self._enforce_limit(buckets, limit_key, limit_value, soft=False)
|
||||
if warcprox_meta and "soft-limits" in warcprox_meta:
|
||||
for item in warcprox_meta["soft-limits"].items():
|
||||
limit_key, limit_value = item
|
||||
self._enforce_limit(limit_key, limit_value, soft=True)
|
||||
self._enforce_limit(buckets, limit_key, limit_value, soft=True)
|
||||
|
||||
def _security_check(self, warcprox_meta):
|
||||
'''
|
||||
|
Loading…
x
Reference in New Issue
Block a user