Mirror of https://github.com/internetarchive/warcprox.git, synced 2025-01-18 13:22:09 +01:00
use urlcanon library for canonicalization, surtification, scope match rules
commit f1d07ad921
parent f30160d8ee
setup.py (4 changed lines)
@@ -40,7 +40,7 @@ deps = [
     'certauth>=1.1.0',
     'warctools',
     'kafka-python>=1.0.1',
-    'surt>=0.3b4',
+    'urlcanon>=0.1.dev16',
     'doublethink>=0.2.0.dev69',
     'PySocks',
 ]
@@ -51,7 +51,7 @@ except:
 
 setuptools.setup(
     name='warcprox',
-    version='2.1b1.dev54',
+    version='2.1b1.dev55',
     description='WARC writing MITM HTTP/S proxy',
     url='https://github.com/internetarchive/warcprox',
     author='Noah Levitt',
@@ -837,7 +837,6 @@ def test_block_rules(http_daemon, https_daemon, warcprox_, archiving_proxies):
 
     # blocked by SURT_MATCH
     url = 'http://localhost:{}/fuh/guh'.format(http_daemon.server_port)
-    # logging.info("%s => %s", repr(url), repr(warcprox.warcproxy.Url(url).surt))
     response = requests.get(
             url, proxies=archiving_proxies, headers=headers, stream=True)
     assert response.status_code == 403
@@ -57,76 +57,6 @@ class RequestBlockedByRule(Exception):
     def __str__(self):
         return "%s: %s" % (self.__class__.__name__, self.msg)
 
-class Url:
-    '''
-    Utility class
-    '''
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            import surt
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            import surt
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        return host_matches_ip_or_domain(self.host, ip_or_domain)
-
-def normalize_host(host):
-    # normalize host (punycode and lowercase)
-    return host.encode('idna').decode('ascii').lower()
-
-def host_matches_ip_or_domain(host, ip_or_domain):
-    '''
-    Returns true if
-    - ip_or_domain is an ip address and host is the same ip address
-    - ip_or_domain is a domain and host is the same domain
-    - ip_or_domain is a domain and host is a subdomain of it
-    '''
-    if not host:
-        return False
-    _host = normalize_host(host)
-    _ip_or_domain = normalize_host(ip_or_domain)
-
-    if _ip_or_domain == _host:
-        return True
-
-    # if either _ip_or_domain or host are ip addresses, and they're not
-    # identical (previous check), not a match
-    try:
-        ipaddress.ip_address(_ip_or_domain)
-        return False
-    except:
-        pass
-    try:
-        ipaddress.ip_address(_host)
-        return False
-    except:
-        pass
-
-    # if we get here, we're looking at two hostnames
-    domain_parts = _ip_or_domain.split(".")
-    host_parts = _host.split(".")
-
-    result = host_parts[-len(domain_parts):] == domain_parts
-    return result
-
-
 # logging level more fine-grained than logging.DEBUG==10
 TRACE = 5
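The helpers deleted above (Url, normalize_host, host_matches_ip_or_domain) are replaced by urlcanon calls that appear in the later hunks of this commit. A minimal sketch of that replacement API as the commit uses it, assuming urlcanon>=0.1.dev16 as pinned in setup.py; the host and URL values here are made up:

    import urlcanon

    # urlcanon.normalize_host() returns bytes, hence the .decode('ascii')
    # calls in the mitmproxy and stats hunks below (hypothetical host)
    host = urlcanon.normalize_host('Sub.EXAMPLE.com').decode('ascii')

    # urlcanon.semantic() canonicalizes a url (str or bytes input, as this
    # commit passes both), and its result feeds the module-level matchers
    url = urlcanon.semantic('http://sub.example.com/a?b=c')

    # fills the role the deleted host_matches_ip_or_domain() played for stats
    if urlcanon.url_matches_domain(url, 'example.com'):
        print('url falls under example.com')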
@@ -29,7 +29,7 @@ from hanzo import warctools
 import random
 import warcprox
 import base64
-import surt
+import urlcanon
 import os
 import hashlib
 import threading
@@ -159,8 +159,7 @@ class RethinkCaptures:
         else:
             bucket = "__unspecified__"
 
-        canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
-                trailing_comma=True, host_massage=False, with_scheme=True)
+        canon_surt = urlcanon.semantic(recorded_url.url).decode("utf-8")
 
         entry = {
             # id only specified for rethinkdb partitioning
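For comparison, a rough sketch of the call shape being replaced here, using the same keyword options as the removed lines; the URL is made up and the surt library must still be installed to run it:

    import surt

    url = 'https://www.Example.COM/path?q=1'   # hypothetical
    old_key = surt.surt(
            url, trailing_comma=True, host_massage=False, with_scheme=True)

The new one-liner above feeds recorded_url.url (bytes) straight to urlcanon.semantic() and decodes the result once, with no intermediate decode of the raw URL.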
@@ -54,6 +54,7 @@ except ImportError:
     import SocketServer as socketserver
 import resource
 import concurrent.futures
+import urlcanon
 
 class ProxyingRecorder(object):
     """
@@ -204,15 +205,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                 urllib_parse.ParseResult(
                     scheme='', netloc='', params=u.params, path=u.path or '/',
                     query=u.query, fragment=u.fragment))
-        self.hostname = warcprox.normalize_host(host)
+        self.hostname = urlcanon.normalize_host(host).decode('ascii')
 
     def _connect_to_remote_server(self):
         # Connect to destination
-        if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'):
-            self.logger.info("using tor socks proxy at %s:%s to connect to %s",
+        if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'):
+            self.logger.info(
+                    "using tor socks proxy at %s:%s to connect to %s",
                     self.onion_tor_socks_proxy_host,
-                    self.onion_tor_socks_proxy_port or 1080,
-                    self.hostname)
+                    self.onion_tor_socks_proxy_port or 1080, self.hostname)
             self._remote_server_sock = socks.socksocket()
             self._remote_server_sock.set_proxy(
                     socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
@@ -247,8 +248,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
         return self._remote_server_sock
 
     def _transition_to_ssl(self):
-        self.request = self.connection = ssl.wrap_socket(self.connection,
-                server_side=True, certfile=self.server.ca.cert_for_host(self.hostname))
+        self.request = self.connection = ssl.wrap_socket(
+                self.connection, server_side=True,
+                certfile=self.server.ca.cert_for_host(self.hostname))
 
     def do_CONNECT(self):
         '''
@@ -30,7 +30,7 @@ import warcprox
 import threading
 import rethinkdb as r
 import datetime
-import surt
+import urlcanon
 
 def _empty_bucket(bucket):
     return {
@@ -136,12 +136,12 @@ class StatsDb:
                        continue
                    buckets.append(bucket['bucket'])
                    if bucket.get('tally-domains'):
-                        url = warcprox.Url(recorded_url.url.decode('utf-8'))
+                        url = urlcanon.semantic(recorded_url.url)
                        for domain in bucket['tally-domains']:
-                            if url.matches_ip_or_domain(domain):
-                                buckets.append('%s:%s' % (
-                                    bucket['bucket'],
-                                    warcprox.normalize_host(domain)))
+                            domain = urlcanon.normalize_host(domain).decode('ascii')
+                            if urlcanon.url_matches_domain(url, domain):
+                                buckets.append(
+                                        '%s:%s' % (bucket['bucket'], domain))
                else:
                    buckets.append(bucket)
        else:
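A self-contained sketch of the new tally-domains path, mirroring the calls in this hunk; the bucket and captured URL are made up:

    import urlcanon

    # hypothetical Warcprox-Meta stats bucket and captured url (bytes, like
    # recorded_url.url)
    bucket = {'bucket': 'job1-stats', 'tally-domains': ['Foo.EXAMPLE.com']}
    captured = b'http://foo.example.com/some/page'

    buckets = [bucket['bucket']]
    url = urlcanon.semantic(captured)
    for domain in bucket['tally-domains']:
        domain = urlcanon.normalize_host(domain).decode('ascii')
        if urlcanon.url_matches_domain(url, domain):
            # tally under a per-domain sub-bucket, e.g. 'job1-stats:<domain>'
            buckets.append('%s:%s' % (bucket['bucket'], domain))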
@@ -44,7 +44,7 @@ from certauth.certauth import CertificateAuthority
 import warcprox
 import datetime
 import ipaddress
-import surt
+import urlcanon
 
 class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     '''
@@ -62,45 +62,16 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
     # self.server is WarcProxy
     logger = logging.getLogger("warcprox.warcprox.WarcProxyHandler")
 
-    # XXX nearly identical to brozzler.site.Site._scope_rule_applies() but
-    # there's no obvious common dependency where this code should go... TBD
-    def _scope_rule_applies(self, rule):
-        u = warcprox.Url(self.url)
-
-        if "domain" in rule and not u.matches_ip_or_domain(rule["domain"]):
-            return False
-        if "url_match" in rule:
-            if rule["url_match"] == "STRING_MATCH":
-                return u.url.find(rule["value"]) >= 0
-            elif rule["url_match"] == "REGEX_MATCH":
-                try:
-                    return re.fullmatch(rule["value"], u.url)
-                except Exception as e:
-                    self.logger.warn(
-                            "caught exception matching against regex %s: %s",
-                            rule["value"], e)
-                    return False
-            elif rule["url_match"] == "SURT_MATCH":
-                return u.surt.startswith(rule["value"])
-            else:
-                self.logger.warn("invalid rule.url_match=%s", rule.url_match)
-                return False
-        else:
-            if "domain" in rule:
-                # we already know that it matches from earlier check
-                return True
-            else:
-                self.logger.warn("unable to make sense of scope rule %s", rule)
-                return False
-
     def _enforce_blocks(self, warcprox_meta):
         """
         Sends a 403 response and raises warcprox.RequestBlockedByRule if the
         url is blocked by a rule in warcprox_meta.
         """
+        url = urlcanon.semantic(self.url)
         if warcprox_meta and "blocks" in warcprox_meta:
             for rule in warcprox_meta["blocks"]:
-                if self._scope_rule_applies(rule):
+                block_rule = urlcanon.MatchRule(**rule)
+                if block_rule.applies(url):
                     body = ("request rejected by warcprox: blocked by "
                             "rule found in Warcprox-Meta header: %s"
                             % rule).encode("utf-8")
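A sketch of how a Warcprox-Meta block rule flows through the new code; the rule keys (domain, url_match, value) are the ones the deleted _scope_rule_applies() handled, and the SURT value is made up to resemble the /fuh/ block exercised in the test hunk above:

    import urlcanon

    # hypothetical block rules as they might arrive in a Warcprox-Meta header
    rules = [
        {'url_match': 'SURT_MATCH', 'value': 'http://(localhost,)/fuh/'},
        {'domain': 'bad.example.com'},
    ]
    url = urlcanon.semantic('http://localhost/fuh/guh')

    for rule in rules:
        block_rule = urlcanon.MatchRule(**rule)
        if block_rule.applies(url):
            # _enforce_blocks() responds 403 and raises RequestBlockedByRule
            print('blocked by rule %s' % rule)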
@@ -130,12 +101,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
                 # to apply this rule if the requested url is within domain
                 bucket0_fields = bucket0.split(':')
                 if len(bucket0_fields) == 2:
-                    if not warcprox.host_matches_ip_or_domain(
-                            self.hostname, bucket0_fields[1]):
+                    domain = urlcanon.normalize_host(bucket0_fields[1])
+                    if not urlcanon.host_matches_domain(self.hostname, domain):
                         return # else host matches, go ahead and enforce the limit
-                    bucket0 = '%s:%s' % (
-                        bucket0_fields[0],
-                        warcprox.normalize_host(bucket0_fields[1]))
+                    bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
                 _limit_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)
 
                 value = self.server.stats_db.value(bucket0, bucket1, bucket2)
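And a matching sketch for the limits path, with a hypothetical 'bucket:domain' limit key and requested host; it mirrors the str/bytes mix used above (self.hostname is a str, urlcanon.normalize_host() returns bytes):

    import urlcanon

    bucket0 = 'job1:EXAMPLE.com'     # hypothetical 'bucket:domain' limit bucket
    hostname = 'sub.example.com'     # hypothetical requested host

    bucket0_fields = bucket0.split(':')
    if len(bucket0_fields) == 2:
        domain = urlcanon.normalize_host(bucket0_fields[1])
        if urlcanon.host_matches_domain(hostname, domain):
            # the limit applies; rebuild the key with the normalized domain
            bucket0 = '%s:%s' % (bucket0_fields[0], domain.decode('ascii'))
            print('enforce limit for bucket', bucket0)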