use urlcanon library for canonicalization, surtification, scope match rules

Noah Levitt 2017-03-15 09:33:50 -07:00
parent f30160d8ee
commit f1d07ad921
7 changed files with 26 additions and 127 deletions

View File

@@ -40,7 +40,7 @@ deps = [
         'certauth>=1.1.0',
         'warctools',
         'kafka-python>=1.0.1',
-        'surt>=0.3b4',
+        'urlcanon>=0.1.dev16',
         'doublethink>=0.2.0.dev69',
         'PySocks',
         ]
@@ -51,7 +51,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.1b1.dev54',
+        version='2.1b1.dev55',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

View File

@@ -837,7 +837,6 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
 
     # blocked by SURT_MATCH
    url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port)
-    # logging.info("%s => %s", repr(url), repr(warcprox.warcproxy.Url(url).surt))
    response = requests.get(
            url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 403

View File

@@ -57,76 +57,6 @@ class RequestBlockedByRule(Exception):
     def __str__(self):
         return "%s: %s" % (self.__class__.__name__, self.msg)
 
-class Url:
-    '''
-    Utility class
-    '''
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            import surt
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            import surt
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        return host_matches_ip_or_domain(self.host, ip_or_domain)
-
-def normalize_host(host):
-    # normalize host (punycode and lowercase)
-    return host.encode('idna').decode('ascii').lower()
-
-def host_matches_ip_or_domain(host, ip_or_domain):
-    '''
-    Returns true if
-        - ip_or_domain is an ip address and host is the same ip address
-        - ip_or_domain is a domain and host is the same domain
-        - ip_or_domain is a domain and host is a subdomain of it
-    '''
-    if not host:
-        return False
-    _host = normalize_host(host)
-    _ip_or_domain = normalize_host(ip_or_domain)
-    if _ip_or_domain == _host:
-        return True
-    # if either _ip_or_domain or host are ip addresses, and they're not
-    # identical (previous check), not a match
-    try:
-        ipaddress.ip_address(_ip_or_domain)
-        return False
-    except:
-        pass
-    try:
-        ipaddress.ip_address(_host)
-        return False
-    except:
-        pass
-    # if we get here, we're looking at two hostnames
-    domain_parts = _ip_or_domain.split(".")
-    host_parts = _host.split(".")
-    result = host_parts[-len(domain_parts):] == domain_parts
-    return result
 
 # logging level more fine-grained than logging.DEBUG==10
 TRACE = 5
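
For reference, the helpers removed above map onto urlcanon calls that the rest of this commit switches to. A minimal sketch, assuming urlcanon>=0.1.dev16 as pinned in the deps change above; the sample url and the results in the comments are illustrative expectations, not verified output:

    import urlcanon

    # replaces Url.surt / the GoogleURLCanonicalizer-based canonicalization
    url = urlcanon.semantic('http://www.Example.COM/foo//bar?a=1')

    # replaces normalize_host(): lowercased host, returned as bytes
    domain = urlcanon.normalize_host('EXAMPLE.com')           # b'example.com'

    # replaces host_matches_ip_or_domain() / Url.matches_ip_or_domain():
    # the same domain, or a subdomain of it, counts as a match
    urlcanon.host_matches_domain('www.example.com', domain)   # expected True
    urlcanon.url_matches_domain(url, domain.decode('ascii'))  # expected True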

View File

@@ -29,7 +29,7 @@ from hanzo import warctools
 import random
 import warcprox
 import base64
-import surt
+import urlcanon
 import os
 import hashlib
 import threading
@@ -159,8 +159,7 @@ class RethinkCaptures:
         else:
             bucket = "__unspecified__"
 
-        canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
-                trailing_comma=True, host_massage=False, with_scheme=True)
+        canon_surt = urlcanon.semantic(recorded_url.url).decode("utf-8")
 
         entry = {
             # id only specified for rethinkdb partitioning
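
The single urlcanon.semantic() call above replaces the surt.surt() call along with its keyword options, and it accepts the raw bytes url without an explicit decode. A minimal before/after sketch; the sample url is made up, and the stored value simply mirrors the replacement line in the diff:

    import urlcanon

    raw_url = b'http://EXAMPLE.com:80/a/../b'   # recorded_url.url is bytes in warcprox

    # old: surt.surt(raw_url.decode('utf-8'), trailing_comma=True,
    #               host_massage=False, with_scheme=True)
    canon_surt = urlcanon.semantic(raw_url).decode('utf-8')   # value stored in the entry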

View File

@@ -54,6 +54,7 @@ except ImportError:
     import SocketServer as socketserver
 import resource
 import concurrent.futures
+import urlcanon
 
 class ProxyingRecorder(object):
     """
@@ -204,15 +205,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 urllib_parse.ParseResult(
                     scheme='', netloc='', params=u.params, path=u.path or '/',
                     query=u.query, fragment=u.fragment))
-        self.hostname = warcprox.normalize_host(host)
+        self.hostname = urlcanon.normalize_host(host).decode('ascii')
 
     def _connect_to_remote_server(self):
         # Connect to destination
-        if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
-            self.logger.info("using tor socks proxy at %s:%s to connect to %s",
+        if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'):
+            self.logger.info(
+                    "using tor socks proxy at %s:%s to connect to %s",
                     self.onion_tor_socks_proxy_host,
-                    self.onion_tor_socks_proxy_port or 1080,
-                    self.hostname)
+                    self.onion_tor_socks_proxy_port or 1080, self.hostname)
             self._remote_server_sock = socks.socksocket()
             self._remote_server_sock.set_proxy(
                     socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
@@ -247,8 +248,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         return self._remote_server_sock
 
     def _transition_to_ssl(self):
-        self.request = self.connection = ssl.wrap_socket(self.connection,
-                server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
+        self.request = self.connection = ssl.wrap_socket(
+            self.connection, server_side=True,
+            certfile=self.server.ca.cert_for_host(self.hostname))
 
     def do_CONNECT(self):
         '''
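
A small sketch of the hostname handling above: urlcanon.normalize_host() returns the lowercased host as bytes, which is why the handler decodes it to ascii and why the .onion check no longer needs .lower(). The example host is made up:

    import urlcanon

    hostname = urlcanon.normalize_host('Some-Host.Example.ONION').decode('ascii')
    hostname.endswith('.onion')   # expected True, no .lower() needed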

View File

@@ -30,7 +30,7 @@ import warcprox
 import threading
 import rethinkdb as r
 import datetime
-import surt
+import urlcanon
 
 def _empty_bucket(bucket):
     return {
@@ -136,12 +136,12 @@ class StatsDb:
                         continue
                     buckets.append(bucket['bucket'])
                     if bucket.get('tally-domains'):
-                        url = warcprox.Url(recorded_url.url.decode('utf-8'))
+                        url = urlcanon.semantic(recorded_url.url)
                        for domain in bucket['tally-domains']:
-                            if url.matches_ip_or_domain(domain):
-                                buckets.append('%s:%s' % (
-                                    bucket['bucket'],
-                                    warcprox.normalize_host(domain)))
+                            domain = urlcanon.normalize_host(domain).decode('ascii')
+                            if urlcanon.url_matches_domain(url, domain):
+                                buckets.append(
+                                        '%s:%s' % (bucket['bucket'], domain))
                 else:
                     buckets.append(bucket)
         else:
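
A sketch of how a 'tally-domains' bucket expands with the calls above; the bucket definition and url are made up for illustration:

    import urlcanon

    bucket = {'bucket': 'job1', 'tally-domains': ['Stats.Example.COM']}
    url = urlcanon.semantic(b'https://www.stats.example.com/some/page')

    buckets = [bucket['bucket']]
    for domain in bucket['tally-domains']:
        domain = urlcanon.normalize_host(domain).decode('ascii')
        if urlcanon.url_matches_domain(url, domain):
            buckets.append('%s:%s' % (bucket['bucket'], domain))

    # expected result: buckets == ['job1', 'job1:stats.example.com']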

View File

@@ -44,7 +44,7 @@ from certauth.certauth import CertificateAuthority
 import warcprox
 import datetime
 import ipaddress
-import surt
+import urlcanon
 
 class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     '''
@@ -62,45 +62,16 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     # self.server is WarcProxy
     logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
 
-    # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
-    # there's no obvious common dependency where this code should go... TBD
-    def _scope_rule_applies(self, rule):
-        u = warcprox.Url(self.url)
-
-        if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
-            return False
-        if "url_match" in rule:
-            if rule["url_match"] == "STRING_MATCH":
-                return u.url.find(rule["value"]) >= 0
-            elif rule["url_match"] == "REGEX_MATCH":
-                try:
-                    return re.fullmatch(rule["value"], u.url)
-                except Exception as e:
-                    self.logger.warn(
-                            "caught exception matching against regex %s: %s",
-                            rule["value"], e)
-                    return False
-            elif rule["url_match"] == "SURT_MATCH":
-                return u.surt.startswith(rule["value"])
-            else:
-                self.logger.warn("invalid rule.url_match=%s", rule.url_match)
-                return False
-        else:
-            if "domain" in rule:
-                # we already know that it matches from earlier check
-                return True
-            else:
-                self.logger.warn("unable to make sense of scope rule %s", rule)
-                return False
-
     def _enforce_blocks(self, warcprox_meta):
         """
         Sends a 403 response and raises warcprox.RequestBlockedByRule if the
         url is blocked by a rule in warcprox_meta.
         """
+        url = urlcanon.semantic(self.url)
         if warcprox_meta and "blocks" in warcprox_meta:
             for rule in warcprox_meta["blocks"]:
-                if self._scope_rule_applies(rule):
+                block_rule = urlcanon.MatchRule(**rule)
+                if block_rule.applies(url):
                     body = ("request rejected by warcprox: blocked by "
                             "rule found in Warcprox-Meta header: %s"
                             % rule).encode("utf-8")
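
The block-rule check above hands each rule dict from the Warcprox-Meta 'blocks' list straight to urlcanon.MatchRule, whose applies() takes over the SURT_MATCH / STRING_MATCH / REGEX_MATCH / domain logic that the deleted _scope_rule_applies() implemented by hand. A rough sketch with a made-up rule in the same shape the tests use; the rule keys shown are only those exercised here, not an exhaustive list:

    import urlcanon

    url = urlcanon.semantic('http://localhost:8000/fuh/guh')
    rule = {'url_match': 'SURT_MATCH', 'value': 'http://(localhost:8000,)/fuh/'}

    block_rule = urlcanon.MatchRule(**rule)
    if block_rule.applies(url):
        # warcprox would respond 403 and raise RequestBlockedByRule here
        pass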
@@ -130,12 +101,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 # to apply this rule if the requested url is within domain
                 bucket0_fields = bucket0.split(':')
                 if len(bucket0_fields) == 2:
-                    if not warcprox.host_matches_ip_or_domain(
-                            self.hostname, bucket0_fields[1]):
+                    domain = urlcanon.normalize_host(bucket0_fields[1])
+                    if not urlcanon.host_matches_domain(self.hostname, domain):
                         return # else host matches, go ahead and enforce the limit
-                    bucket0 = '%s:%s' % (
-                            bucket0_fields[0],
-                            warcprox.normalize_host(bucket0_fields[1]))
+                    bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
                 _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
 
                 value = self.server.stats_db.value(bucket0, bucket1, bucket2)
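
A sketch of the domain-scoped limit check above: a 'name:domain' bucket only applies when the request host is that domain or a subdomain of it, per the semantics of the helpers this commit replaces. The wrapper function and sample values are hypothetical:

    import urlcanon

    def limit_applies_to_host(bucket0, hostname):
        # mirrors the check above for buckets of the form 'name:domain'
        bucket0_fields = bucket0.split(':')
        if len(bucket0_fields) == 2:
            domain = urlcanon.normalize_host(bucket0_fields[1])
            return bool(urlcanon.host_matches_domain(hostname, domain))
        return True

    limit_applies_to_host('job1:Example.COM', 'foo.example.com')   # expected True
    limit_applies_to_host('job1:Example.COM', 'example.org')       # expected False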