diff --git a/setup.py b/setup.py index e4f1fd6..f1ffd48 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ deps = [ 'certauth>=1.1.0', 'warctools', 'kafka-python>=1.0.1', - 'surt>=0.3b4', + 'urlcanon>=0.1.dev16', 'doublethink>=0.2.0.dev69', 'PySocks', ] @@ -51,7 +51,7 @@ except: setuptools.setup( name='warcprox', - version='2.1b1.dev54', + version='2.1b1.dev55', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0ba3c8b..e6312f7 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -837,7 +837,6 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies): # blocked by SURT_MATCH url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port) - # logging.info("%s => %s", repr(url), repr(warcprox.warcproxy.Url(url).surt)) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 403 diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 70a8601..1eeb9a4 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -57,76 +57,6 @@ class RequestBlockedByRule(Exception): def __str__(self): return "%s: %s" % (self.__class__.__name__, self.msg) -class Url: - ''' - Utility class - ''' - def __init__(self, url): - self.url = url - self._surt = None - self._host = None - - @property - def surt(self): - if not self._surt: - import surt - hurl = surt.handyurl.parse(self.url) - surt.GoogleURLCanonicalizer.canonicalize(hurl) - hurl.query = None - hurl.hash = None - self._surt = hurl.getURLString(surt=True, trailing_comma=True) - return self._surt - - @property - def host(self): - if not self._host: - import surt - self._host = surt.handyurl.parse(self.url).host - return self._host - - def matches_ip_or_domain(self, ip_or_domain): - return host_matches_ip_or_domain(self.host, ip_or_domain) - -def normalize_host(host): - # normalize host (punycode and lowercase) - return host.encode('idna').decode('ascii').lower() - -def host_matches_ip_or_domain(host, ip_or_domain): - ''' - Returns true if - - ip_or_domain is an ip address and host is the same ip address - - ip_or_domain is a domain and host is the same domain - - ip_or_domain is a domain and host is a subdomain of it - ''' - if not host: - return False - _host = normalize_host(host) - _ip_or_domain = normalize_host(ip_or_domain) - - if _ip_or_domain == _host: - return True - - # if either _ip_or_domain or host are ip addresses, and they're not - # identical (previous check), not a match - try: - ipaddress.ip_address(_ip_or_domain) - return False - except: - pass - try: - ipaddress.ip_address(_host) - return False - except: - pass - - # if we get here, we're looking at two hostnames - domain_parts = _ip_or_domain.split(".") - host_parts = _host.split(".") - - result = host_parts[-len(domain_parts):] == domain_parts - return result - - # logging level more fine-grained than logging.DEBUG==10 TRACE = 5 diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index e32bdf7..69876c2 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -29,7 +29,7 @@ from hanzo import warctools import random import warcprox import base64 -import surt +import urlcanon import os import hashlib import threading @@ -159,8 +159,7 @@ class RethinkCaptures: else: bucket = "__unspecified__" - canon_surt = surt.surt(recorded_url.url.decode("utf-8"), - trailing_comma=True, host_massage=False, with_scheme=True) + canon_surt = urlcanon.semantic(recorded_url.url).decode("utf-8") entry = { # id only specified for rethinkdb partitioning diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 675fb87..470e952 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -54,6 +54,7 @@ except ImportError: import SocketServer as socketserver import resource import concurrent.futures +import urlcanon class ProxyingRecorder(object): """ @@ -204,15 +205,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): urllib_parse.ParseResult( scheme='', netloc='', params=u.params, path=u.path or '/', query=u.query, fragment=u.fragment)) - self.hostname = warcprox.normalize_host(host) + self.hostname = urlcanon.normalize_host(host).decode('ascii') def _connect_to_remote_server(self): # Connect to destination - if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'): - self.logger.info("using tor socks proxy at %s:%s to connect to %s", + if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): + self.logger.info( + "using tor socks proxy at %s:%s to connect to %s", self.onion_tor_socks_proxy_host, - self.onion_tor_socks_proxy_port or 1080, - self.hostname) + self.onion_tor_socks_proxy_port or 1080, self.hostname) self._remote_server_sock = socks.socksocket() self._remote_server_sock.set_proxy( socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, @@ -247,8 +248,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): return self._remote_server_sock def _transition_to_ssl(self): - self.request = self.connection = ssl.wrap_socket(self.connection, - server_side=True, certfile=self.server.ca.cert_for_host(self.hostname)) + self.request = self.connection = ssl.wrap_socket( + self.connection, server_side=True, + certfile=self.server.ca.cert_for_host(self.hostname)) def do_CONNECT(self): ''' diff --git a/warcprox/stats.py b/warcprox/stats.py index db1884d..4baf939 100644 --- a/warcprox/stats.py +++ b/warcprox/stats.py @@ -30,7 +30,7 @@ import warcprox import threading import rethinkdb as r import datetime -import surt +import urlcanon def _empty_bucket(bucket): return { @@ -136,12 +136,12 @@ class StatsDb: continue buckets.append(bucket['bucket']) if bucket.get('tally-domains'): - url = warcprox.Url(recorded_url.url.decode('utf-8')) + url = urlcanon.semantic(recorded_url.url) for domain in bucket['tally-domains']: - if url.matches_ip_or_domain(domain): - buckets.append('%s:%s' % ( - bucket['bucket'], - warcprox.normalize_host(domain))) + domain = urlcanon.normalize_host(domain).decode('ascii') + if urlcanon.url_matches_domain(url, domain): + buckets.append( + '%s:%s' % (bucket['bucket'], domain)) else: buckets.append(bucket) else: diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 43b67f1..bc99bd7 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -44,7 +44,7 @@ from certauth.certauth import CertificateAuthority import warcprox import datetime import ipaddress -import surt +import urlcanon class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -62,45 +62,16 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # self.server is WarcProxy logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler") - # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but - # there's no obvious common dependency where this code should go... TBD - def _scope_rule_applies(self, rule): - u = warcprox.Url(self.url) - - if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]): - return False - if "url_match" in rule: - if rule["url_match"] == "STRING_MATCH": - return u.url.find(rule["value"]) >= 0 - elif rule["url_match"] == "REGEX_MATCH": - try: - return re.fullmatch(rule["value"], u.url) - except Exception as e: - self.logger.warn( - "caught exception matching against regex %s: %s", - rule["value"], e) - return False - elif rule["url_match"] == "SURT_MATCH": - return u.surt.startswith(rule["value"]) - else: - self.logger.warn("invalid rule.url_match=%s", rule.url_match) - return False - else: - if "domain" in rule: - # we already know that it matches from earlier check - return True - else: - self.logger.warn("unable to make sense of scope rule %s", rule) - return False - def _enforce_blocks(self, warcprox_meta): """ Sends a 403 response and raises warcprox.RequestBlockedByRule if the url is blocked by a rule in warcprox_meta. """ + url = urlcanon.semantic(self.url) if warcprox_meta and "blocks" in warcprox_meta: for rule in warcprox_meta["blocks"]: - if self._scope_rule_applies(rule): + block_rule = urlcanon.MatchRule(**rule) + if block_rule.applies(url): body = ("request rejected by warcprox: blocked by " "rule found in Warcprox-Meta header: %s" % rule).encode("utf-8") @@ -130,12 +101,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # to apply this rule if the requested url is within domain bucket0_fields = bucket0.split(':') if len(bucket0_fields) == 2: - if not warcprox.host_matches_ip_or_domain( - self.hostname, bucket0_fields[1]): + domain = urlcanon.normalize_host(bucket0_fields[1]) + if not urlcanon.host_matches_domain(self.hostname, domain): return # else host matches, go ahead and enforce the limit - bucket0 = '%s:%s' % ( - bucket0_fields[0], - warcprox.normalize_host(bucket0_fields[1])) + bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii')) _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2) value = self.server.stats_db.value(bucket0, bucket1, bucket2)