diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0e60319..0deecc6 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1158,6 +1158,9 @@ def test_domain_doc_soft_limit( assert response.headers['warcprox-test-header'] == 'o!' assert response.content == b'I am the warcprox test payload! pppppppppp!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 22) + def test_domain_data_soft_limit( http_daemon, https_daemon, warcprox_, archiving_proxies): urls_before = warcprox_.proxy.running_stats.urls @@ -1265,6 +1268,9 @@ def test_domain_data_soft_limit( assert response.headers['warcprox-test-header'] == 'y!' assert response.content == b'I am the warcprox test payload! zzzzzzzzzz!\n' + # wait for postfetch chain + wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 5) + # XXX this test relies on a tor proxy running at localhost:9050 with a working # connection to the internet, and relies on a third party site (facebook) being # up and behaving a certain way diff --git a/warcprox/stats.py b/warcprox/stats.py index 4de5fef..85539e2 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -53,6 +53,53 @@ def _empty_bucket(bucket): }, } +def unravel_buckets(url, warcprox_meta): + ''' + Unravels bucket definitions in Warcprox-Meta header. Each bucket + definition can either be a string, which signifies the name of the + bucket, or a dict. If a dict it is expected to have at least an item + with key 'bucket' whose value is the name of the bucket. The other + currently recognized item is 'tally-domains', which if supplied should + be a list of domains. This instructs warcprox to additionally tally + substats of the given bucket by domain. Host stats are stored in the + stats table under the key '{parent-bucket}:{domain(normalized)}'. + + Returns: + list of strings + + Example Warcprox-Meta header (a real one will likely have other + sections besides 'stats'): + + Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} + + In this case the return value would be + ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"] + ''' + buckets = ["__all__"] + if (warcprox_meta and "stats" in warcprox_meta + and "buckets" in warcprox_meta["stats"]): + for bucket in warcprox_meta["stats"]["buckets"]: + if isinstance(bucket, dict): + if not 'bucket' in bucket: + self.logger.warn( + 'ignoring invalid stats bucket in ' + 'warcprox-meta header %s', bucket) + continue + buckets.append(bucket['bucket']) + if bucket.get('tally-domains'): + canon_url = urlcanon.semantic(url) + for domain in bucket['tally-domains']: + domain = urlcanon.normalize_host(domain).decode('ascii') + if urlcanon.url_matches_domain(canon_url, domain): + buckets.append( + '%s:%s' % (bucket['bucket'], domain)) + else: + buckets.append(bucket) + else: + buckets.append("__unspecified__") + + return buckets + class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): logger = logging.getLogger("warcprox.stats.StatsProcessor") @@ -153,46 +200,7 @@ class StatsProcessor(warcprox.BaseBatchPostfetchProcessor): return None def buckets(self, recorded_url): - ''' - Unravels bucket definitions in Warcprox-Meta header. Each bucket - definition can either be a string, which signifies the name of the - bucket, or a dict. If a dict it is expected to have at least an item - with key 'bucket' whose value is the name of the bucket. The other - currently recognized item is 'tally-domains', which if supplied should - be a list of domains. This instructs warcprox to additionally tally - substats of the given bucket by domain. Host stats are stored in the - stats table under the key '{parent-bucket}:{domain(normalized)}'. - - Example Warcprox-Meta header (a real one will likely have other - sections besides 'stats'): - - Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"}]}} - ''' - buckets = ["__all__"] - if (recorded_url.warcprox_meta - and "stats" in recorded_url.warcprox_meta - and "buckets" in recorded_url.warcprox_meta["stats"]): - for bucket in recorded_url.warcprox_meta["stats"]["buckets"]: - if isinstance(bucket, dict): - if not 'bucket' in bucket: - self.logger.warn( - 'ignoring invalid stats bucket in ' - 'warcprox-meta header %s', bucket) - continue - buckets.append(bucket['bucket']) - if bucket.get('tally-domains'): - url = urlcanon.semantic(recorded_url.url) - for domain in bucket['tally-domains']: - domain = urlcanon.normalize_host(domain).decode('ascii') - if urlcanon.url_matches_domain(url, domain): - buckets.append( - '%s:%s' % (bucket['bucket'], domain)) - else: - buckets.append(bucket) - else: - buckets.append("__unspecified__") - - return buckets + return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta) class RethinkStatsProcessor(StatsProcessor): logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor") diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2050807..417f450 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -92,26 +92,26 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], self.command, self.url, rule)) - def _enforce_limit(self, limit_key, limit_value, soft=False): + def _enforce_limit(self, buckets, limit_key, limit_value, soft=False): if not self.server.stats_db: return - bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) - _limit_key = limit_key - # if limit_key looks like 'job1:foo.com/total/urls' then we only want - # to apply this rule if the requested url is within domain - bucket0_fields = bucket0.split(':') - if len(bucket0_fields) == 2: - domain = urlcanon.normalize_host(bucket0_fields[1]) - if not urlcanon.host_matches_domain(self.hostname, domain): - return # else host matches, go ahead and enforce the limit - bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii')) - _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) + # parse limit key + bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2) + # normalize domain if part of bucket + if ":" in bucket0: + b, raw_domain = bucket0.split(":", 1) + domain = urlcanon.normalize_host(raw_domain).decode("ascii") + bucket0 = "%s:%s" % (b, domain) + limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2) + + if not bucket0 in buckets: + return value = self.server.stats_db.value(bucket0, bucket1, bucket2) if value and limit_value and limit_value > 0 and value >= limit_value: body = ("request rejected by warcprox: reached %s %s=%s\n" % ( - "soft limit" if soft else "limit", _limit_key, + "soft limit" if soft else "limit", limit_key, limit_value)).encode("utf-8") if soft: self.send_response(430, "Reached soft limit") @@ -124,12 +124,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): "stats": {bucket0:self.server.stats_db.value(bucket0)} } if soft: - response_meta["reached-soft-limit"] = {_limit_key:limit_value} + response_meta["reached-soft-limit"] = {limit_key:limit_value} else: - response_meta["reached-limit"] = {_limit_key:limit_value} + response_meta["reached-limit"] = {limit_key:limit_value} self.send_header( - "Warcprox-Meta", - json.dumps(response_meta, separators=(",",":"))) + "Warcprox-Meta", json.dumps(response_meta, separators=",:")) self.end_headers() if self.command != "HEAD": self.wfile.write(body) @@ -139,7 +138,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.client_address[0], 430 if soft else 420, self.command, self.url, "soft limit" if soft else "limit", - _limit_key, limit_value)) + limit_key, limit_value)) def _enforce_limits(self, warcprox_meta): """ @@ -147,14 +146,15 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox.RequestBlockedByRule if a limit specified in warcprox_meta is reached. """ + buckets = warcprox.stats.unravel_buckets(self.url, warcprox_meta) if warcprox_meta and "limits" in warcprox_meta: for item in warcprox_meta["limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=False) + self._enforce_limit(buckets, limit_key, limit_value, soft=False) if warcprox_meta and "soft-limits" in warcprox_meta: for item in warcprox_meta["soft-limits"].items(): limit_key, limit_value = item - self._enforce_limit(limit_key, limit_value, soft=True) + self._enforce_limit(buckets, limit_key, limit_value, soft=True) def _security_check(self, warcprox_meta): '''