fix bug in limits enforcement

enforce limit only if url is in stats bucket that limit applies to!
This commit is contained in:
Noah Levitt 2018-05-29 12:18:51 -07:00
parent 07dc978f09
commit d9e0ed31f2
3 changed files with 74 additions and 60 deletions

View File

@ -1158,6 +1158,9 @@ def test_domain_doc_soft_limit(
assert response.headers['warcprox-test-header'] == 'o!'
assert response.content == b'I am the warcprox test payload! pppppppppp!\n'
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22)
def test_domain_data_soft_limit(
http_daemon, https_daemon, warcprox_, archiving_proxies):
urls_before = warcprox_.proxy.running_stats.urls
@ -1265,6 +1268,9 @@ def test_domain_data_soft_limit(
assert response.headers['warcprox-test-header'] == 'y!'
assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n'
# wait for postfetch chain
wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5)
# XXX this test relies on a tor proxy running at localhost:9050 with a working
# connection to the internet, and relies on a third party site (facebook) being
# up and behaving a certain way

View File

@ -53,6 +53,53 @@ def _empty_bucket(bucket):
},
}
def unravel_buckets(url, warcprox_meta):
'''
Unravels bucket definitions in Warcprox-Meta header. Each bucket
definition can either be a string, which signifies the name of the
bucket, or a dict. If a dict it is expected to have at least an item
with key 'bucket' whose value is the name of the bucket. The other
currently recognized item is 'tally-domains', which if supplied should
be a list of domains. This instructs warcprox to additionally tally
substats of the given bucket by domain. Host stats are stored in the
stats table under the key '{parent-bucket}:{domain(normalized)}'.
Returns:
list of strings
Example Warcprox-Meta header (a real one will likely have other
sections besides 'stats'):
Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
In this case the return value would be
["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
'''
buckets = ["__all__"]
if (warcprox_meta and "stats" in warcprox_meta
and "buckets" in warcprox_meta["stats"]):
for bucket in warcprox_meta["stats"]["buckets"]:
if isinstance(bucket, dict):
if not 'bucket' in bucket:
self.logger.warn(
'ignoring invalid stats bucket in '
'warcprox-meta header %s', bucket)
continue
buckets.append(bucket['bucket'])
if bucket.get('tally-domains'):
canon_url = urlcanon.semantic(url)
for domain in bucket['tally-domains']:
domain = urlcanon.normalize_host(domain).decode('ascii')
if urlcanon.url_matches_domain(canon_url, domain):
buckets.append(
'%s:%s' % (bucket['bucket'], domain))
else:
buckets.append(bucket)
else:
buckets.append("__unspecified__")
return buckets
class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
logger = logging.getLogger("warcprox.stats.StatsProcessor")
@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
return None
def buckets(self, recorded_url):
'''
Unravels bucket definitions in Warcprox-Meta header. Each bucket
definition can either be a string, which signifies the name of the
bucket, or a dict. If a dict it is expected to have at least an item
with key 'bucket' whose value is the name of the bucket. The other
currently recognized item is 'tally-domains', which if supplied should
be a list of domains. This instructs warcprox to additionally tally
substats of the given bucket by domain. Host stats are stored in the
stats table under the key '{parent-bucket}:{domain(normalized)}'.
Example Warcprox-Meta header (a real one will likely have other
sections besides 'stats'):
Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}}
'''
buckets = ["__all__"]
if (recorded_url.warcprox_meta
and "stats" in recorded_url.warcprox_meta
and "buckets" in recorded_url.warcprox_meta["stats"]):
for bucket in recorded_url.warcprox_meta["stats"]["buckets"]:
if isinstance(bucket, dict):
if not 'bucket' in bucket:
self.logger.warn(
'ignoring invalid stats bucket in '
'warcprox-meta header %s', bucket)
continue
buckets.append(bucket['bucket'])
if bucket.get('tally-domains'):
url = urlcanon.semantic(recorded_url.url)
for domain in bucket['tally-domains']:
domain = urlcanon.normalize_host(domain).decode('ascii')
if urlcanon.url_matches_domain(url, domain):
buckets.append(
'%s:%s' % (bucket['bucket'], domain))
else:
buckets.append(bucket)
else:
buckets.append("__unspecified__")
return buckets
return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta)
class RethinkStatsProcessor(StatsProcessor):
logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor")

View File

@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
self.client_address[0], self.command,
self.url, rule))
def _enforce_limit(self, limit_key, limit_value, soft=False):
def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
if not self.server.stats_db:
return
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
_limit_key = limit_key
# if limit_key looks like 'job1:foo.com/total/urls' then we only want
# to apply this rule if the requested url is within domain
bucket0_fields = bucket0.split(':')
if len(bucket0_fields) == 2:
domain = urlcanon.normalize_host(bucket0_fields[1])
if not urlcanon.host_matches_domain(self.hostname, domain):
return # else host matches, go ahead and enforce the limit
bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
_limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
# parse limit key
bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
# normalize domain if part of bucket
if ":" in bucket0:
b, raw_domain = bucket0.split(":", 1)
domain = urlcanon.normalize_host(raw_domain).decode("ascii")
bucket0 = "%s:%s" % (b, domain)
limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)
if not bucket0 in buckets:
return
value = self.server.stats_db.value(bucket0, bucket1, bucket2)
if value and limit_value and limit_value > 0 and value >= limit_value:
body = ("request rejected by warcprox: reached %s %s=%s\n" % (
"soft limit" if soft else "limit", _limit_key,
"soft limit" if soft else "limit", limit_key,
limit_value)).encode("utf-8")
if soft:
self.send_response(430, "Reached soft limit")
@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
"stats": {bucket0:self.server.stats_db.value(bucket0)}
}
if soft:
response_meta["reached-soft-limit"] = {_limit_key:limit_value}
response_meta["reached-soft-limit"] = {limit_key:limit_value}
else:
response_meta["reached-limit"] = {_limit_key:limit_value}
response_meta["reached-limit"] = {limit_key:limit_value}
self.send_header(
"Warcprox-Meta",
json.dumps(response_meta, separators=(",",":")))
"Warcprox-Meta", json.dumps(response_meta, separators=",:"))
self.end_headers()
if self.command != "HEAD":
self.wfile.write(body)
@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
self.client_address[0], 430 if soft else 420,
self.command, self.url,
"soft limit" if soft else "limit",
_limit_key, limit_value))
limit_key, limit_value))
def _enforce_limits(self, warcprox_meta):
"""
@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is
reached.
"""
buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta)
if warcprox_meta and "limits" in warcprox_meta:
for item in warcprox_meta["limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=False)
self._enforce_limit(buckets, limit_key, limit_value, soft=False)
if warcprox_meta and "soft-limits" in warcprox_meta:
for item in warcprox_meta["soft-limits"].items():
limit_key, limit_value = item
self._enforce_limit(limit_key, limit_value, soft=True)
self._enforce_limit(buckets, limit_key, limit_value, soft=True)
def _security_check(self, warcprox_meta):
'''