use urlcanon library for canonicalization, surtification, scope match rules

Noah Levitt 2017-03-15 09:33:50 -07:00
parent f30160d8ee
commit f1d07ad921
7 changed files with 26 additions and 127 deletions
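
The surt library calls that previously produced canonical URLs and SURTs are replaced by urlcanon in the files below. As a minimal sketch of the basic urlcanon workflow, assuming urlcanon>=0.1.dev16 as pinned in setup.py; urlcanon.parse_url() and .ssurt() come from the library's documented interface rather than from this diff, and the example URL is made up:

    import urlcanon

    # parse once, then canonicalize in place with one of the stock canonicalizers;
    # the hunks below also pass a raw str/bytes url straight to urlcanon.semantic()
    parsed = urlcanon.parse_url('http://EXAMPLE.com:80/foo/../bar')
    urlcanon.semantic(parsed)

    print(parsed)           # canonicalized url text
    print(parsed.ssurt())   # SSURT bytes, usable as a sort/prefix key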

setup.py

@@ -40,7 +40,7 @@ deps = [
         'certauth>=1.1.0',
         'warctools',
         'kafka-python>=1.0.1',
-        'surt>=0.3b4',
+        'urlcanon>=0.1.dev16',
         'doublethink>=0.2.0.dev69',
         'PySocks',
 ]
@@ -51,7 +51,7 @@ except:
 setuptools.setup(
         name='warcprox',
-        version='2.1b1.dev54',
+        version='2.1b1.dev55',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

tests/test_warcprox.py

@@ -837,7 +837,6 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
     # blocked by SURT_MATCH
     url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port)
-    # logging.info("%s => %s", repr(url), repr(warcprox.warcproxy.Url(url).surt))
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 403

warcprox/__init__.py

@@ -57,76 +57,6 @@ class RequestBlockedByRule(Exception):
     def __str__(self):
         return "%s: %s" % (self.__class__.__name__, self.msg)
 
-class Url:
-    '''
-    Utility class
-    '''
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            import surt
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            import surt
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        return host_matches_ip_or_domain(self.host, ip_or_domain)
-
-def normalize_host(host):
-    # normalize host (punycode and lowercase)
-    return host.encode('idna').decode('ascii').lower()
-
-def host_matches_ip_or_domain(host, ip_or_domain):
-    '''
-    Returns true if
-    - ip_or_domain is an ip address and host is the same ip address
-    - ip_or_domain is a domain and host is the same domain
-    - ip_or_domain is a domain and host is a subdomain of it
-    '''
-    if not host:
-        return False
-
-    _host = normalize_host(host)
-    _ip_or_domain = normalize_host(ip_or_domain)
-
-    if _ip_or_domain == _host:
-        return True
-
-    # if either _ip_or_domain or host are ip addresses, and they're not
-    # identical (previous check), not a match
-    try:
-        ipaddress.ip_address(_ip_or_domain)
-        return False
-    except:
-        pass
-    try:
-        ipaddress.ip_address(_host)
-        return False
-    except:
-        pass
-
-    # if we get here, we're looking at two hostnames
-    domain_parts = _ip_or_domain.split(".")
-    host_parts = _host.split(".")
-    result = host_parts[-len(domain_parts):] == domain_parts
-    return result
-
 # logging level more fine-grained than logging.DEBUG==10
 TRACE = 5
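
The helpers removed above (Url, normalize_host, host_matches_ip_or_domain) are replaced in the remaining files by urlcanon's module-level functions. A minimal sketch of the host-matching replacement, assuming the urlcanon version pinned in setup.py; the hostnames are made-up examples:

    import urlcanon

    # punycode + lowercase, analogous to the removed normalize_host();
    # urlcanon returns bytes, hence the .decode('ascii') in later hunks
    host = urlcanon.normalize_host('WWW.EXAMPLE.COM').decode('ascii')

    # true when host equals the domain or is one of its subdomains, matching
    # the semantics of the removed host_matches_ip_or_domain()
    print(urlcanon.host_matches_domain(host, 'example.com'))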

warcprox/bigtable.py

@@ -29,7 +29,7 @@ from hanzo import warctools
 import random
 import warcprox
 import base64
-import surt
+import urlcanon
 import os
 import hashlib
 import threading
@@ -159,8 +159,7 @@ class RethinkCaptures:
         else:
             bucket = "__unspecified__"

-        canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
-                trailing_comma=True, host_massage=False, with_scheme=True)
+        canon_surt = urlcanon.semantic(recorded_url.url).decode("utf-8")

         entry = {
             # id only specified for rethinkdb partitioning

warcprox/mitmproxy.py

@@ -54,6 +54,7 @@ except ImportError:
     import SocketServer as socketserver
 import resource
 import concurrent.futures
+import urlcanon

 class ProxyingRecorder(object):
     """
@@ -204,15 +205,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 urllib_parse.ParseResult(
                     scheme='', netloc='', params=u.params, path=u.path or '/',
                     query=u.query, fragment=u.fragment))
-        self.hostname = warcprox.normalize_host(host)
+        self.hostname = urlcanon.normalize_host(host).decode('ascii')

     def _connect_to_remote_server(self):
         # Connect to destination
-        if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
-            self.logger.info("using tor socks proxy at %s:%s to connect to %s",
+        if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'):
+            self.logger.info(
+                    "using tor socks proxy at %s:%s to connect to %s",
                     self.onion_tor_socks_proxy_host,
-                    self.onion_tor_socks_proxy_port or 1080,
-                    self.hostname)
+                    self.onion_tor_socks_proxy_port or 1080, self.hostname)
             self._remote_server_sock = socks.socksocket()
             self._remote_server_sock.set_proxy(
                     socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
@@ -247,8 +248,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         return self._remote_server_sock

     def _transition_to_ssl(self):
-        self.request = self.connection = ssl.wrap_socket(self.connection,
-                server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
+        self.request = self.connection = ssl.wrap_socket(
+                self.connection, server_side=True,
+                certfile=self.server.ca.cert_for_host(self.hostname))

     def do_CONNECT(self):
         '''

warcprox/stats.py

@@ -30,7 +30,7 @@ import warcprox
 import threading
 import rethinkdb as r
 import datetime
-import surt
+import urlcanon

 def _empty_bucket(bucket):
     return {
@@ -136,12 +136,12 @@ class StatsDb:
                         continue
                     buckets.append(bucket['bucket'])
                     if bucket.get('tally-domains'):
-                        url = warcprox.Url(recorded_url.url.decode('utf-8'))
+                        url = urlcanon.semantic(recorded_url.url)
                         for domain in bucket['tally-domains']:
-                            if url.matches_ip_or_domain(domain):
-                                buckets.append('%s:%s' % (
-                                        bucket['bucket'],
-                                        warcprox.normalize_host(domain)))
+                            domain = urlcanon.normalize_host(domain).decode('ascii')
+                            if urlcanon.url_matches_domain(url, domain):
+                                buckets.append(
+                                        '%s:%s' % (bucket['bucket'], domain))
                 else:
                     buckets.append(bucket)
         else:
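
In the tally-domains branch above, matching now runs against the canonicalized URL rather than a bare hostname. A small sketch of that call pattern, again assuming the pinned urlcanon version; the URL, domain, and bucket name are made-up examples:

    import urlcanon

    url = urlcanon.semantic(b'https://stats.example.com/fuh/guh')
    domain = urlcanon.normalize_host('Example.COM').decode('ascii')

    # stats.example.com falls under example.com, so a bucket configured with
    # 'tally-domains': ['example.com'] would also get a per-domain entry
    if urlcanon.url_matches_domain(url, domain):
        print('%s:%s' % ('my-bucket', domain))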

warcprox/warcproxy.py

@@ -44,7 +44,7 @@ from certauth.certauth import CertificateAuthority
 import warcprox
 import datetime
 import ipaddress
-import surt
+import urlcanon

 class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     '''
@@ -62,45 +62,16 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     # self.server is WarcProxy
     logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")

-    # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
-    # there's no obvious common dependency where this code should go... TBD
-    def _scope_rule_applies(self, rule):
-        u = warcprox.Url(self.url)
-
-        if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
-            return False
-        if "url_match" in rule:
-            if rule["url_match"] == "STRING_MATCH":
-                return u.url.find(rule["value"]) >= 0
-            elif rule["url_match"] == "REGEX_MATCH":
-                try:
-                    return re.fullmatch(rule["value"], u.url)
-                except Exception as e:
-                    self.logger.warn(
-                            "caught exception matching against regex %s: %s",
-                            rule["value"], e)
-                    return False
-            elif rule["url_match"] == "SURT_MATCH":
-                return u.surt.startswith(rule["value"])
-            else:
-                self.logger.warn("invalid rule.url_match=%s", rule.url_match)
-                return False
-        else:
-            if "domain" in rule:
-                # we already know that it matches from earlier check
-                return True
-            else:
-                self.logger.warn("unable to make sense of scope rule %s", rule)
-                return False
-
     def _enforce_blocks(self, warcprox_meta):
         """
         Sends a 403 response and raises warcprox.RequestBlockedByRule if the
         url is blocked by a rule in warcprox_meta.
         """
+        url = urlcanon.semantic(self.url)
         if warcprox_meta and "blocks" in warcprox_meta:
             for rule in warcprox_meta["blocks"]:
-                if self._scope_rule_applies(rule):
+                block_rule = urlcanon.MatchRule(**rule)
+                if block_rule.applies(url):
                     body = ("request rejected by warcprox: blocked by "
                             "rule found in Warcprox-Meta header: %s"
                             % rule).encode("utf-8")
@@ -130,12 +101,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 # to apply this rule if the requested url is within domain
                 bucket0_fields = bucket0.split(':')
                 if len(bucket0_fields) == 2:
-                    if not warcprox.host_matches_ip_or_domain(
-                            self.hostname, bucket0_fields[1]):
+                    domain = urlcanon.normalize_host(bucket0_fields[1])
+                    if not urlcanon.host_matches_domain(self.hostname, domain):
                         return # else host matches, go ahead and enforce the limit
-                    bucket0 = '%s:%s' % (
-                            bucket0_fields[0],
-                            warcprox.normalize_host(bucket0_fields[1]))
+                    bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
                 _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)

                 value = self.server.stats_db.value(bucket0, bucket1, bucket2)
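
The _enforce_blocks() hunk above delegates rule evaluation to urlcanon.MatchRule. A minimal sketch of how a block rule from a Warcprox-Meta header might be evaluated, assuming MatchRule accepts both keyword-style rules and the url_match/value dicts that warcprox passes through **rule; the URL and rule values are made-up examples:

    import urlcanon

    url = urlcanon.semantic('http://foo.example.com/fuh/guh')

    # keyword-style rule: applies to example.com and its subdomains
    print(urlcanon.MatchRule(domain='example.com').applies(url))

    # Warcprox-Meta style rule dict, as found in warcprox_meta['blocks']
    rule = {'url_match': 'STRING_MATCH', 'value': '/fuh/'}
    print(urlcanon.MatchRule(**rule).applies(url))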