1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-16 00:24:48 +01:00

cdx: add domain-specific rules at cdx layer for custom canonicalization!

and 'fuzzy' matching when not found
handled via cdxdomainspecific.py
BaseCDXServer contains a canonicalizer object and a fuzzy query
canonicalizer abstracted to separate class (in canonicalize.py)
clean up cdx related exceptions
default rules read from cdx/rules.yaml
filename configurable via 'domain_specific_rules' setting in config.yaml
fix typo in pywb/rewrite
This commit is contained in:
Ilya Kreymer 2014-02-18 14:47:48 -08:00
parent ab95524b7b
commit a09dec4b3e
13 changed files with 375 additions and 131 deletions

@ -92,4 +92,8 @@ static_routes:
enable_http_proxy: true
# enable cdx server api for querying cdx directly (experimental)
#enable_cdx_api: false
enable_cdx_api: true
# custom rules for domain specific matching
# set to false to disable
#domain_specific_rules: rules.yaml

74
pywb/cdx/canonicalize.py Normal file

@ -0,0 +1,74 @@
""" Standard url-canonicalzation, surt and non-surt
"""
import surt
from cdxobject import CDXException
#=================================================================
class UrlCanonicalizer(object):
    """Callable wrapper around ``canonicalize`` with a fixed ordering mode.

    The ``surt_ordered`` flag is stored at construction time and forwarded
    on every call, so an instance can be passed around as a plain
    ``url -> key`` function.
    """

    def __init__(self, surt_ordered=True):
        # Remember which key form (surt or non-surt) this instance produces.
        self.surt_ordered = surt_ordered

    def __call__(self, url):
        """Return the canonicalized key for ``url``."""
        return canonicalize(url, self.surt_ordered)
#=================================================================
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize ``url`` by converting it to surt form.

    The surt conversion is what performs the canonicalization, so when
    ``surt_ordered`` is false the resulting surt is converted back to
    url form with ``unsurt``.

    Raises CDXException if the surt conversion fails for any reason.

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'
    """
    try:
        surt_key = surt.surt(url)
    except Exception:
        # Any failure in the surt library is reported as an invalid url.
        raise CDXException('Invalid Url: ' + url)

    # Non-surt mode: undo only the host reordering of the surt.
    return surt_key if surt_ordered else unsurt(surt_key)
#=================================================================
def unsurt(surt):
    """
    Convert a surt-form key back to ``host/path`` form.

    The text before the first ``)/`` is treated as a comma-separated,
    reversed host; it is re-reversed and joined with dots, and everything
    from the ``/`` onward is appended unchanged. If no ``)/`` is present
    the input is returned as-is.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
    """
    try:
        split_at = surt.index(')/')
    except ValueError:
        # May not be a valid surt
        return surt

    labels = surt[:split_at].split(',')
    # split_at + 1 skips the ')' but keeps the '/' that starts the path.
    return '.'.join(reversed(labels)) + surt[split_at + 1:]
# When run directly as a script, execute the doctests embedded in the
# docstrings of this module.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

@ -0,0 +1,125 @@
import yaml
import re
import logging
import pkgutil
from canonicalize import unsurt, UrlCanonicalizer
#=================================================================
def load_domain_specific_cdx_rules(filename, surt_ordered):
    """Load the domain-specific rules file and build the rule handlers.

    ``filename`` is read as package data relative to this package and
    parsed as YAML. Its ``canon_rules`` and ``fuzzy_lookup_rules`` sections
    are each turned into a list of StartsWithRule objects.

    Returns a ``(canon, fuzzy)`` tuple: a CustomUrlCanonicalizer and a
    FuzzyQuery, either of which is None when its section has no rules.
    """
    raw_rules = pkgutil.get_data(__package__, filename)
    # NOTE(review): yaml.load can construct arbitrary objects; consider
    # yaml.safe_load if this file can ever come from an untrusted source.
    config = yaml.load(raw_rules)

    # Load Canonicalizer Rules
    canon_rules = StartsWithRule.load_rules(config.get('canon_rules'),
                                            surt_ordered)
    canon = (CustomUrlCanonicalizer(canon_rules, surt_ordered)
             if canon_rules else None)

    # Load Fuzzy Lookup Rules
    fuzzy_rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
                                            surt_ordered)
    fuzzy = FuzzyQuery(fuzzy_rules) if fuzzy_rules else None

    logging.debug('CANON: ' + str(canon))
    logging.debug('FUZZY: ' + str(fuzzy))
    return (canon, fuzzy)
#=================================================================
class CustomUrlCanonicalizer(UrlCanonicalizer):
    """UrlCanonicalizer that applies rewrite rules to the standard key.

    Each rule supplies ``starts`` (key prefixes), ``regex`` (a compiled
    pattern) and ``replace`` (an expansion template, possibly empty).
    """

    def __init__(self, rules, surt_ordered=True):
        super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
        self.rules = rules

    def __call__(self, url):
        """Return the rewritten key from the first rule that applies.

        A rule applies when the standard key starts with one of its
        prefixes, its regex matches at the start of the key, and it has a
        non-empty ``replace`` template. If no rule applies, the standard
        key is returned unchanged.
        """
        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)

        for rule in self.rules:
            if any(urlkey.startswith(prefix) for prefix in rule.starts):
                found = rule.regex.match(urlkey)
                if found and rule.replace:
                    return found.expand(rule.replace)

        return urlkey
#=================================================================
class FuzzyQuery:
    """Rewrites query params into a broader prefix query based on rules.

    Each rule supplies ``starts`` (key prefixes) and ``regex`` (a compiled
    pattern that is searched anywhere in the key).
    """

    def __init__(self, rules):
        self.rules = rules

    def __call__(self, params):
        """Turn ``params`` into a fuzzy query, or return None if no rule fits.

        Reads ``params['key']`` and ``params['url']``. For the first rule
        whose prefix and regex both match the key, ``params`` is modified
        in place and returned:

        * if the regex has exactly one group, ``filter`` is set to an
          exact-match urlkey filter on that group's text
        * ``url`` is cut off just after its first '?' (when one is found
          past position 0)
        * ``matchType`` becomes 'prefix' and ``key`` is reset to None
        """
        urlkey = params['key']
        url = params['url']

        for candidate in self.rules:
            if not any(urlkey.startswith(prefix)
                       for prefix in candidate.starts):
                continue

            found = candidate.regex.search(urlkey)
            if found:
                if len(found.groups()) == 1:
                    params['filter'] = '=urlkey:' + found.group(1)
                # Only the first matching rule is used.
                break
        else:
            # Loop finished without a match: nothing fuzzy to try.
            return None

        query_start = url.find('?')
        if query_start > 0:
            # Keep the '?' so the prefix query stays on this exact path.
            params['url'] = url[:query_start + 1]

        params['matchType'] = 'prefix'
        params['key'] = None
        return params
#=================================================================
class StartsWithRule:
    """A single domain-specific rule loaded from the rules config.

    Attributes:
        starts: list of key prefixes the rule applies to
        regex: compiled pattern built from the rule's ``matches`` entry
        replace: optional expansion template (None when not configured)
    """

    def __init__(self, config, surt_ordered=True):
        """Build the rule from one config mapping.

        ``config`` must provide ``matches`` and may provide ``startswith``
        (a single prefix or a list of prefixes) and ``replace``.
        ``surt_ordered`` is accepted for interface compatibility; the
        conversion to non-surt form is done separately via ``unsurt()``.
        """
        self.starts = config.get('startswith')
        # Normalize a single prefix into a one-element list.
        if not isinstance(self.starts, list):
            self.starts = [self.starts]

        self.regex = re.compile(config.get('matches'))
        self.replace = config.get('replace')

    def unsurt(self):
        """Convert this rule's prefixes, pattern and template to non-surt form.

        The module-level ``unsurt`` helper works on strings, so the
        compiled regex is converted through its source pattern and then
        recompiled, and an unset ``replace`` is left as None.
        """
        self.starts = [unsurt(prefix) for prefix in self.starts]
        # self.regex is a compiled pattern object, not a string.
        self.regex = re.compile(unsurt(self.regex.pattern))
        if self.replace:
            self.replace = unsurt(self.replace)

    @staticmethod
    def load_rules(rules_config, surt_ordered=True):
        """Return a list of StartsWithRule built from ``rules_config``.

        An empty or missing config yields an empty list. When
        ``surt_ordered`` is false every rule is converted to non-surt form.
        A real list is always returned so callers can truth-test and
        iterate it more than once.
        """
        if not rules_config:
            return []

        rules = [StartsWithRule(rule_config) for rule_config in rules_config]

        if not surt_ordered:
            for rule in rules:
                rule.unsurt()

        return rules

@ -2,6 +2,24 @@ from collections import OrderedDict
import itertools
#=================================================================
class CDXException(Exception):
    """Base exception for cdx errors; reports an HTTP 400 status line."""

    def status(self):
        """Return the HTTP status line associated with this error."""
        return '400 Bad Request'
#=================================================================
class CaptureNotFoundException(CDXException):
    """CDXException variant that reports an HTTP 404 status line."""

    def status(self):
        """Return the HTTP status line associated with this error."""
        return '404 Not Found'
#=================================================================
class AccessException(CDXException):
    """CDXException variant that reports an HTTP 403 status line."""

    def status(self):
        """Return the HTTP status line associated with this error."""
        return '403 Access Denied'
#=================================================================
class CDXObject(OrderedDict):
CDX_FORMATS = [

@ -1,82 +1,103 @@
import surt
from cdxops import cdx_load
from canonicalize import UrlCanonicalizer
import itertools
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules
from itertools import chain
import logging
import os
import urlparse
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
from cdxobject import CDXObject
#=================================================================
class BaseCDXServer(object):
    """Shared base for cdx servers: canonicalizer plus optional fuzzy query.

    Subclasses implement ``load_cdx`` and pass their result iterator
    through ``_check_cdx_iter``.
    """

    def __init__(self, url_canon=None, fuzzy_query=None):
        # Fall back to the standard canonicalizer when none is supplied.
        self.url_canon = url_canon if url_canon else UrlCanonicalizer()
        self.fuzzy_query = fuzzy_query

    def _check_cdx_iter(self, cdx_iter, params):
        """ Check cdx iter semantics

        If the iterator yields at least one item, an equivalent iterator
        is returned. If it is empty and a fuzzy query is configured and
        ``params['allow_fuzzy']`` is set, the params are rewritten by the
        fuzzy query and ``load_cdx`` is called once more with fuzzy
        matching disabled. Otherwise CaptureNotFoundException is raised.
        """
        peeked = self.peek_iter(cdx_iter)
        if peeked:
            return peeked

        url = params['url']

        fuzzy_allowed = self.fuzzy_query and params.get('allow_fuzzy')
        if fuzzy_allowed:
            if 'key' not in params:
                params['key'] = self.url_canon(url)

            fuzzy_params = self.fuzzy_query(params)
            if fuzzy_params:
                # Disable fuzzy matching on the retry so it happens once.
                fuzzy_params['allow_fuzzy'] = False
                return self.load_cdx(**fuzzy_params)

        raise CaptureNotFoundException('No Captures found for: ' + url)

    def load_cdx(self, **params):
        """Run a cdx query; must be provided by subclasses."""
        raise NotImplementedError('Implement in subclass')

    @staticmethod
    def peek_iter(iterable):
        """Return None if ``iterable`` is exhausted, else an equal iterator.

        The first item is consumed to test for emptiness and then chained
        back in front of the remaining items.
        """
        exhausted = object()
        head = next(iterable, exhausted)
        if head is exhausted:
            return None

        return chain([head], iterable)
#=================================================================
class CDXException(Exception):
def status(self):
return '400 Bad Request'
#=================================================================
class AccessException(CDXException):
def status(self):
return '403 Bad Request'
#=================================================================
class CDXServer(object):
class CDXServer(BaseCDXServer):
"""
Top-level cdx server object which maintains a list of cdx sources,
responds to queries and dispatches to the cdx ops for processing
"""
def __init__(self, paths, surt_ordered=True):
def __init__(self, paths, url_canon=None, fuzzy_query=None):
super(CDXServer, self).__init__(url_canon, fuzzy_query)
self.sources = create_cdx_sources(paths)
self.surt_ordered = surt_ordered
def load_cdx(self, **params):
# if key not set, assume 'url' is set and needs canonicalization
if not params.get('key'):
params['key'] = self._canonicalize(params)
convert_old_style_params(params)
return cdx_load(self.sources, params)
def _canonicalize(self, params):
"""
Canonicalize url and convert to surt
If no surt-mode, convert back to url form
as surt conversion is currently part of canonicalization
"""
try:
url = params['url']
except KeyError:
msg = 'A url= param must be specified to query the cdx server'
raise CDXException(msg)
try:
key = surt.surt(url)
except Exception as e:
raise CDXException('Invalid Url: ' + url)
params['key'] = self.url_canon(url)
# if not surt, unsurt the surt to get canonicalized non-surt url
if not self.surt_ordered:
key = unsurt(key)
convert_old_style_params(params)
return key
cdx_iter = cdx_load(self.sources, params)
return self._check_cdx_iter(cdx_iter, params)
def __str__(self):
return 'CDX server serving from ' + str(self.sources)
#=================================================================
class RemoteCDXServer(object):
class RemoteCDXServer(BaseCDXServer):
"""
A special cdx server that uses a single RemoteCDXSource
It simply proxies the query params to the remote source
and performs no local processing/filtering
"""
def __init__(self, source):
def __init__(self, source, url_canon=None, fuzzy_query=None):
super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
if isinstance(source, RemoteCDXSource):
self.source = source
elif (isinstance(source, str) and
@ -87,18 +108,19 @@ class RemoteCDXServer(object):
def load_cdx(self, **params):
remote_iter = self.source.load_cdx(params)
# if need raw, convert to raw format here
if params.get('output') == 'raw':
return (CDXObject(cdx) for cdx in remote_iter)
else:
return remote_iter
remote_iter = (CDXObject(cdx) for cdx in remote_iter)
return self._check_cdx_iter(remote_iter, params)
def __str__(self):
    """Return a short description naming the remote source."""
    # This server keeps its single remote source in ``self.source`` (the
    # attribute assigned in __init__ and used by load_cdx); it has no
    # ``sources`` list, so indexing ``self.sources[0]`` would raise
    # AttributeError.
    return 'Remote CDX server serving from ' + str(self.source)
#=================================================================
def create_cdx_server(config):
def create_cdx_server(config, ds_rules_file=None):
if hasattr(config, 'get'):
paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True)
@ -108,11 +130,22 @@ def create_cdx_server(config):
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if ds_rules_file:
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
surt_ordered)
else:
canon, fuzzy = None, None
if not canon:
canon = UrlCanonicalizer(surt_ordered)
if (isinstance(paths, str) and
any(paths.startswith(x) for x in ['http://', 'https://'])):
return RemoteCDXServer(paths)
server_cls = RemoteCDXServer
else:
return CDXServer(paths)
server_cls = CDXServer
return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)
#=================================================================
@ -170,13 +203,17 @@ def convert_old_style_params(params):
"""
Convert old-style CDX Server param semantics
"""
collapse_time = params.get('collapseTime')
if collapse_time:
params['collapse_time'] = collapse_time
param = params.get('collapseTime')
if param:
params['collapse_time'] = param
resolve_revisits = params.get('resolveRevisits')
if resolve_revisits:
params['resolve_revisits'] = resolve_revisits
param = params.get('matchType')
if param:
params['match_type'] = param
param = params.get('resolveRevisits')
if param:
params['resolve_revisits'] = param
if params.get('sort') == 'reverse':
params['reverse'] = True
@ -204,38 +241,3 @@ def extract_params_from_wsgi_env(env):
params[name] = val[0]
return params
#=================================================================
def unsurt(surt):
"""
# Simple surt
>>> unsurt('com,example)/')
'example.com)/'
# Broken surt
>>> unsurt('com,example)')
'com,example)'
# Long surt
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
index.html?a=b?c=)/')
'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
"""
try:
index = surt.index(')/')
parts = surt[0:index].split(',')
parts.reverse()
host = '.'.join(parts)
host += surt[index:]
return host
except ValueError:
# May not be a valid surt
return surt
if __name__ == "__main__":
import doctest
doctest.testmod()

24
pywb/cdx/rules.yaml Normal file

@ -0,0 +1,24 @@
fuzzy_lookup_rules:
- startswith: 'com,twitter)/i/profiles/show/'
matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
- startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
matches: '([^/]+(?:\.css|\.js))'
# matches all urls
- startswith: ''
matches: '[&?](?:_|uncache)=[\d]+[&]?'
canon_rules:
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'

@ -25,6 +25,8 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
# No matching results
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
Traceback (most recent call last):
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
# Filter cdx (default: regex)

@ -1,13 +1,7 @@
import urllib
import urllib2
from wbexceptions import NotFoundException
from itertools import chain
from pprint import pprint
from pywb.cdx.cdxserver import create_cdx_server, CDXException
from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.cdxserver import create_cdx_server
#=================================================================
class IndexReader(object):
@ -18,8 +12,8 @@ class IndexReader(object):
Creates an appropriate query based on wbrequest type info
"""
def __init__(self, config):
self.cdx_server = create_cdx_server(config)
def __init__(self, config, ds_rules_file=None):
self.cdx_server = create_cdx_server(config, ds_rules_file)
def load_for_request(self, wbrequest):
wburl = wbrequest.wb_url
@ -29,19 +23,14 @@ class IndexReader(object):
# add any custom filter from the request
if wbrequest.query_filter:
params['filter'] = wbrequest.query_filter
params['filter'].extend(wbrequest.query_filter)
if wbrequest.custom_params:
params.update(wbrequest.custom_params)
params['url'] = wburl.url
params['allow_fuzzy'] = True
cdxlines = self.load_cdx(output='raw', **params)
cdxlines = self.peek_iter(cdxlines)
if cdxlines is None:
raise NotFoundException('No Captures found for: ' + wburl.url)
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
return cdxlines
@ -54,7 +43,7 @@ class IndexReader(object):
return {
wburl.QUERY:
{'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
{'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit},
wburl.URL_QUERY:
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
@ -62,21 +51,12 @@ class IndexReader(object):
},
wburl.REPLAY:
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
{'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
# BUG: resolveRevisits currently doesn't work for this type of query
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
# but may be an issue in proxy mode
wburl.LATEST_REPLAY:
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
{'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True}
}[wburl.type]
@staticmethod
def peek_iter(iterable):
try:
first = next(iterable)
except StopIteration:
return None
return chain([first], iterable)

@ -21,6 +21,8 @@ DEFAULTS = {
'error_html': 'ui/error.html',
'static_routes': {'static/default': 'static/'},
'domain_specific_rules': 'rules.yaml',
}
class DictChain:
@ -30,7 +32,7 @@ class DictChain:
def get(self, key, default_val=None):
for d in self.dicts:
val = d.get(key)
if val:
if val is not None:
return val
return default_val
@ -52,11 +54,13 @@ def pywb_config_manual(passed_config = {}):
for name, value in collections.iteritems():
if isinstance(value, str):
route_config = config
cdx_server = IndexReader(value)
cdx_config = value
else:
route_config = DictChain(value, config)
cdx_server = IndexReader(route_config)
cdx_config = route_config
ds_rules = route_config.get('domain_specific_rules', None)
cdx_server = IndexReader(cdx_config, ds_rules)
wb_handler = config_utils.create_wb_handler(
cdx_server = cdx_server,
@ -118,7 +122,8 @@ def pywb_config(config_file = None):
if not config_file:
config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)
config = yaml.load(open(config_file))
with open(config_file) as fh:
config = yaml.load(fh)
return pywb_config_manual(config)

@ -54,8 +54,7 @@ class RewriteContent:
# =========================================================================
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, 'gzip')
stream = BufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset:
encoding = rewritten_headers.charset

@ -1,14 +1,15 @@
class WbException(Exception):
pass
class NotFoundException(WbException):
def status(_):
def status(self):
return '404 Not Found'
# Exceptions that effect a specific capture and result in a retry
class CaptureException(WbException):
def status(_):
def status(self):
return '500 Internal Server Error'
class InternalRedirect(WbException):

@ -93,3 +93,6 @@ enable_cdx_api: true
# optional reporter callback func
# if set, called with request and cdx object
reporter_func: pywb.run-tests.print_reporter
# custom rules for domain specific matching
#domain_specific_rules: rules.yaml

@ -50,6 +50,13 @@ class TestWb:
# 1 Capture (filtered) + header
assert len(resp.html.find_all('tr')) == 2
def test_calendar_query_fuzzy_match(self):
    """Calendar query with a ``_=`` param still lists the captures."""
    # fuzzy match removing _= according to standard rules.yaml
    query_path = ('/pywb/*/http://www.iana.org/_css/2013.1/screen.css'
                  '?_=3141592653')
    resp = self.testapp.get(query_path)
    self._assert_basic_html(resp)

    # 17 Captures + header
    table_rows = resp.html.find_all('tr')
    assert len(table_rows) == 18
def test_cdx_query(self):
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
self._assert_basic_text(resp)