use urlcanon library for canonicalization, surtification, scope match rules

Noah Levitt 2017-03-15 09:33:50 -07:00
parent f30160d8ee
commit f1d07ad921
7 changed files with 26 additions and 127 deletions

View File

@@ -40,7 +40,7 @@ deps = [
         'certauth>=1.1.0',
         'warctools',
         'kafka-python>=1.0.1',
-        'surt>=0.3b4',
+        'urlcanon>=0.1.dev16',
         'doublethink>=0.2.0.dev69',
         'PySocks',
         ]
@@ -51,7 +51,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.1b1.dev54',
+        version='2.1b1.dev55',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

View File

@@ -837,7 +837,6 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
 
     # blocked by SURT_MATCH
    url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port)
-    # logging.info("%s => %s", repr(url), repr(warcprox.warcproxy.Url(url).surt))
    response = requests.get(
            url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 403

View File

@@ -57,76 +57,6 @@ class RequestBlockedByRule(Exception):
     def __str__(self):
         return "%s: %s" % (self.__class__.__name__, self.msg)
 
-class Url:
-    '''
-    Utility class
-    '''
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            import surt
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            import surt
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        return host_matches_ip_or_domain(self.host, ip_or_domain)
-
-def normalize_host(host):
-    # normalize host (punycode and lowercase)
-    return host.encode('idna').decode('ascii').lower()
-
-def host_matches_ip_or_domain(host, ip_or_domain):
-    '''
-    Returns true if
-        - ip_or_domain is an ip address and host is the same ip address
-        - ip_or_domain is a domain and host is the same domain
-        - ip_or_domain is a domain and host is a subdomain of it
-    '''
-    if not host:
-        return False
-    _host = normalize_host(host)
-    _ip_or_domain = normalize_host(ip_or_domain)
-    if _ip_or_domain == _host:
-        return True
-    # if either _ip_or_domain or host are ip addresses, and they're not
-    # identical (previous check), not a match
-    try:
-        ipaddress.ip_address(_ip_or_domain)
-        return False
-    except:
-        pass
-    try:
-        ipaddress.ip_address(_host)
-        return False
-    except:
-        pass
-    # if we get here, we're looking at two hostnames
-    domain_parts = _ip_or_domain.split(".")
-    host_parts = _host.split(".")
-    result = host_parts[-len(domain_parts):] == domain_parts
-    return result
 
 # logging level more fine-grained than logging.DEBUG==10
 TRACE = 5
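
For reference, the helpers removed above map onto urlcanon calls that the rest of this commit switches to. A minimal sketch, assuming urlcanon>=0.1.dev16 as pinned in the deps change above; the sample url and the results in the comments are illustrative expectations, not verified output:

    import urlcanon

    # replaces Url.surt / the GoogleURLCanonicalizer-based canonicalization
    url = urlcanon.semantic('http://www.Example.COM/foo//bar?a=1')

    # replaces normalize_host(): lowercased host, returned as bytes
    domain = urlcanon.normalize_host('EXAMPLE.com')           # b'example.com'

    # replaces host_matches_ip_or_domain() / Url.matches_ip_or_domain():
    # the same domain, or a subdomain of it, counts as a match
    urlcanon.host_matches_domain('www.example.com', domain)   # expected True
    urlcanon.url_matches_domain(url, domain.decode('ascii'))  # expected True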

View File

@@ -29,7 +29,7 @@ from hanzo import warctools
 import random
 import warcprox
 import base64
-import surt
+import urlcanon
 import os
 import hashlib
 import threading
@@ -159,8 +159,7 @@ class RethinkCaptures:
         else:
             bucket = "__unspecified__"
 
-        canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
-                trailing_comma=True, host_massage=False, with_scheme=True)
+        canon_surt = urlcanon.semantic(recorded_url.url).decode("utf-8")
 
         entry = {
             # id only specified for rethinkdb partitioning
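
The single urlcanon.semantic() call above replaces the surt.surt() call along with its keyword options, and it accepts the raw bytes url without an explicit decode. A minimal before/after sketch; the sample url is made up, and the stored value simply mirrors the replacement line in the diff:

    import urlcanon

    raw_url = b'http://EXAMPLE.com:80/a/../b'   # recorded_url.url is bytes in warcprox

    # old: surt.surt(raw_url.decode('utf-8'), trailing_comma=True,
    #               host_massage=False, with_scheme=True)
    canon_surt = urlcanon.semantic(raw_url).decode('utf-8')   # value stored in the entry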

View File

@@ -54,6 +54,7 @@ except ImportError:
     import SocketServer as socketserver
 import resource
 import concurrent.futures
+import urlcanon
 
 class ProxyingRecorder(object):
     """
@@ -204,15 +205,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 urllib_parse.ParseResult(
                     scheme='', netloc='', params=u.params, path=u.path or '/',
                     query=u.query, fragment=u.fragment))
-        self.hostname = warcprox.normalize_host(host)
+        self.hostname = urlcanon.normalize_host(host).decode('ascii')
 
     def _connect_to_remote_server(self):
         # Connect to destination
-        if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
-            self.logger.info("using tor socks proxy at %s:%s to connect to %s",
+        if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'):
+            self.logger.info(
+                    "using tor socks proxy at %s:%s to connect to %s",
                     self.onion_tor_socks_proxy_host,
-                    self.onion_tor_socks_proxy_port or 1080,
-                    self.hostname)
+                    self.onion_tor_socks_proxy_port or 1080, self.hostname)
             self._remote_server_sock = socks.socksocket()
             self._remote_server_sock.set_proxy(
                     socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
@@ -247,8 +248,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         return self._remote_server_sock
 
     def _transition_to_ssl(self):
-        self.request = self.connection = ssl.wrap_socket(self.connection,
-                server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
+        self.request = self.connection = ssl.wrap_socket(
+            self.connection, server_side=True,
+            certfile=self.server.ca.cert_for_host(self.hostname))
 
     def do_CONNECT(self):
         '''
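
A small sketch of the hostname handling above: urlcanon.normalize_host() returns the lowercased host as bytes, which is why the handler decodes it to ascii and why the .onion check no longer needs .lower(). The example host is made up:

    import urlcanon

    hostname = urlcanon.normalize_host('Some-Host.Example.ONION').decode('ascii')
    hostname.endswith('.onion')   # expected True, no .lower() needed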

View File

@@ -30,7 +30,7 @@ import warcprox
 import threading
 import rethinkdb as r
 import datetime
-import surt
+import urlcanon
 
 def _empty_bucket(bucket):
     return {
@@ -136,12 +136,12 @@ class StatsDb:
                         continue
                     buckets.append(bucket['bucket'])
                     if bucket.get('tally-domains'):
-                        url = warcprox.Url(recorded_url.url.decode('utf-8'))
+                        url = urlcanon.semantic(recorded_url.url)
                        for domain in bucket['tally-domains']:
-                            if url.matches_ip_or_domain(domain):
-                                buckets.append('%s:%s' % (
-                                    bucket['bucket'],
-                                    warcprox.normalize_host(domain)))
+                            domain = urlcanon.normalize_host(domain).decode('ascii')
+                            if urlcanon.url_matches_domain(url, domain):
+                                buckets.append(
+                                        '%s:%s' % (bucket['bucket'], domain))
                 else:
                     buckets.append(bucket)
         else:
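
A sketch of how a 'tally-domains' bucket expands with the calls above; the bucket definition and url are made up for illustration:

    import urlcanon

    bucket = {'bucket': 'job1', 'tally-domains': ['Stats.Example.COM']}
    url = urlcanon.semantic(b'https://www.stats.example.com/some/page')

    buckets = [bucket['bucket']]
    for domain in bucket['tally-domains']:
        domain = urlcanon.normalize_host(domain).decode('ascii')
        if urlcanon.url_matches_domain(url, domain):
            buckets.append('%s:%s' % (bucket['bucket'], domain))

    # expected result: buckets == ['job1', 'job1:stats.example.com']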

View File

@@ -44,7 +44,7 @@ from certauth.certauth import CertificateAuthority
 import warcprox
 import datetime
 import ipaddress
-import surt
+import urlcanon
 
 class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     '''
@@ -62,45 +62,16 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     # self.server is WarcProxy
     logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
 
-    # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
-    # there's no obvious common dependency where this code should go... TBD
-    def _scope_rule_applies(self, rule):
-        u = warcprox.Url(self.url)
-
-        if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
-            return False
-        if "url_match" in rule:
-            if rule["url_match"] == "STRING_MATCH":
-                return u.url.find(rule["value"]) >= 0
-            elif rule["url_match"] == "REGEX_MATCH":
-                try:
-                    return re.fullmatch(rule["value"], u.url)
-                except Exception as e:
-                    self.logger.warn(
-                            "caught exception matching against regex %s: %s",
-                            rule["value"], e)
-                    return False
-            elif rule["url_match"] == "SURT_MATCH":
-                return u.surt.startswith(rule["value"])
-            else:
-                self.logger.warn("invalid rule.url_match=%s", rule.url_match)
-                return False
-        else:
-            if "domain" in rule:
-                # we already know that it matches from earlier check
-                return True
-            else:
-                self.logger.warn("unable to make sense of scope rule %s", rule)
-                return False
-
     def _enforce_blocks(self, warcprox_meta):
         """
         Sends a 403 response and raises warcprox.RequestBlockedByRule if the
         url is blocked by a rule in warcprox_meta.
         """
+        url = urlcanon.semantic(self.url)
         if warcprox_meta and "blocks" in warcprox_meta:
             for rule in warcprox_meta["blocks"]:
-                if self._scope_rule_applies(rule):
+                block_rule = urlcanon.MatchRule(**rule)
+                if block_rule.applies(url):
                     body = ("request rejected by warcprox: blocked by "
                             "rule found in Warcprox-Meta header: %s"
                             % rule).encode("utf-8")
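
The block-rule check above hands each rule dict from the Warcprox-Meta 'blocks' list straight to urlcanon.MatchRule, whose applies() takes over the SURT_MATCH / STRING_MATCH / REGEX_MATCH / domain logic that the deleted _scope_rule_applies() implemented by hand. A rough sketch with a made-up rule in the same shape the tests use; the rule keys shown are only those exercised here, not an exhaustive list:

    import urlcanon

    url = urlcanon.semantic('http://localhost:8000/fuh/guh')
    rule = {'url_match': 'SURT_MATCH', 'value': 'http://(localhost:8000,)/fuh/'}

    block_rule = urlcanon.MatchRule(**rule)
    if block_rule.applies(url):
        # warcprox would respond 403 and raise RequestBlockedByRule here
        pass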
@@ -130,12 +101,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 # to apply this rule if the requested url is within domain
                 bucket0_fields = bucket0.split(':')
                 if len(bucket0_fields) == 2:
-                    if not warcprox.host_matches_ip_or_domain(
-                            self.hostname, bucket0_fields[1]):
+                    domain = urlcanon.normalize_host(bucket0_fields[1])
+                    if not urlcanon.host_matches_domain(self.hostname, domain):
                         return # else host matches, go ahead and enforce the limit
-                    bucket0 = '%s:%s' % (
-                            bucket0_fields[0],
-                            warcprox.normalize_host(bucket0_fields[1]))
+                    bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
                 _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
 
                 value = self.server.stats_db.value(bucket0, bucket1, bucket2)
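
A sketch of the domain-scoped limit check above: a 'name:domain' bucket only applies when the request host is that domain or a subdomain of it, per the semantics of the helpers this commit replaces. The wrapper function and sample values are hypothetical:

    import urlcanon

    def limit_applies_to_host(bucket0, hostname):
        # mirrors the check above for buckets of the form 'name:domain'
        bucket0_fields = bucket0.split(':')
        if len(bucket0_fields) == 2:
            domain = urlcanon.normalize_host(bucket0_fields[1])
            return bool(urlcanon.host_matches_domain(hostname, domain))
        return True

    limit_applies_to_host('job1:Example.COM', 'foo.example.com')   # expected True
    limit_applies_to_host('job1:Example.COM', 'example.org')       # expected False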