mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdx: add domain-specific rules at cdx layer for custom canonicalization!
and 'fuzzy' matching when not found; handled via cdxdomainspecific.py. BaseCDXServer contains a canonicalizer object and a fuzzy query; canonicalizer abstracted to a separate class (in canonicalize.py); clean up cdx-related exceptions; default rules read from cdx/rules.yaml, filename configurable via 'domain_specific_rules' setting in config.yaml; fix typo in pywb/rewrite
This commit is contained in:
parent
ab95524b7b
commit
a09dec4b3e
@ -92,4 +92,8 @@ static_routes:
|
||||
enable_http_proxy: true
|
||||
|
||||
# enable cdx server api for querying cdx directly (experimental)
|
||||
#enable_cdx_api: false
|
||||
enable_cdx_api: true
|
||||
|
||||
# custom rules for domain specific matching
|
||||
# set to false to disable
|
||||
#domain_specific_rules: rules.yaml
|
||||
|
74
pywb/cdx/canonicalize.py
Normal file
74
pywb/cdx/canonicalize.py
Normal file
@ -0,0 +1,74 @@
|
||||
""" Standard url-canonicalzation, surt and non-surt
|
||||
"""
|
||||
|
||||
import surt
|
||||
from cdxobject import CDXException
|
||||
|
||||
|
||||
#=================================================================
|
||||
class UrlCanonicalizer(object):
    """
    Callable wrapper around canonicalize() that remembers whether
    keys should be produced in surt order.

    Instances are called with a url and return the canonicalized key.
    """

    def __init__(self, surt_ordered=True):
        # True: keys are returned in surt form
        # False: keys are converted back to a plain url form
        self.surt_ordered = surt_ordered

    def __call__(self, url):
        """
        Return the canonicalized key for url, honoring self.surt_ordered
        """
        return canonicalize(url, self.surt_ordered)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    Raises CDXException if the url can not be converted to a surt.

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'
    """
    try:
        key = surt.surt(url)
    except Exception:
        # any failure in the surt conversion is reported as a bad url;
        # %-formatting keeps the message buildable even when url is not
        # a string (plain concatenation would raise TypeError here and
        # hide the CDXException)
        raise CDXException('Invalid Url: %s' % (url,))

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
        key = unsurt(key)

    return key
|
||||
|
||||
|
||||
#=================================================================
|
||||
def unsurt(surt):
    """
    Convert a surt key back to plain 'host/path' form.

    The comma-separated host labels before the first ')/' are reversed
    and joined with '.', the ')' is dropped, and the rest of the key
    (starting at '/') is appended unchanged.
    A string that contains no ')/' is returned as-is.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
    """
    try:
        index = surt.index(')/')
    except ValueError:
        # May not be a valid surt
        return surt

    parts = surt[0:index].split(',')
    parts.reverse()

    # index + 1 skips the ')' but keeps the '/' that starts the path
    return '.'.join(parts) + surt[index + 1:]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
125
pywb/cdx/cdxdomainspecific.py
Normal file
125
pywb/cdx/cdxdomainspecific.py
Normal file
@ -0,0 +1,125 @@
|
||||
import yaml
|
||||
import re
|
||||
import logging
|
||||
import pkgutil
|
||||
|
||||
from canonicalize import unsurt, UrlCanonicalizer
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_domain_specific_cdx_rules(filename, surt_ordered):
    """
    Load the domain-specific rules file (package data, yaml) and build
    the custom canonicalizer and fuzzy query objects from it.

    filename is resolved relative to this package via pkgutil.get_data.
    surt_ordered is passed on to the rule loader and the canonicalizer.

    Returns a (canon, fuzzy) tuple:
      canon -- CustomUrlCanonicalizer built from 'canon_rules',
               or None if there are no such rules
      fuzzy -- FuzzyQuery built from 'fuzzy_lookup_rules',
               or None if there are no such rules
    """
    fh = pkgutil.get_data(__package__, filename)

    # safe_load: the rules file is plain data, no need to allow
    # arbitrary object construction. An empty file parses to None,
    # so fall back to an empty mapping.
    config = yaml.safe_load(fh) or {}

    # Load Canonicalizer Rules
    rules = StartsWithRule.load_rules(config.get('canon_rules'),
                                      surt_ordered)

    if rules:
        canon = CustomUrlCanonicalizer(rules, surt_ordered)
    else:
        canon = None

    # Load Fuzzy Lookup Rules
    rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
                                      surt_ordered)

    if rules:
        fuzzy = FuzzyQuery(rules)
    else:
        fuzzy = None

    logging.debug('CANON: %s', canon)
    logging.debug('FUZZY: %s', fuzzy)
    return (canon, fuzzy)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CustomUrlCanonicalizer(UrlCanonicalizer):
    """
    Canonicalizer that applies domain-specific rewrite rules on top
    of the standard canonicalization.

    Each rule is expected to provide:
      starts  -- list of key prefixes the rule applies to
      regex   -- compiled pattern matched against the key
      replace -- expansion template for the match (may be empty)
    """

    def __init__(self, rules, surt_ordered=True):
        super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
        self.rules = rules

    def __call__(self, url):
        """
        Return the standard key for url, rewritten by the first rule
        that matches and has a replacement; otherwise the standard key.
        """
        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)

        for rule in self.rules:
            # cheap prefix test first, regex only for candidate rules
            if not any(urlkey.startswith(x) for x in rule.starts):
                continue

            m = rule.regex.match(urlkey)
            if not m:
                continue

            # a matching rule without a replacement does not stop
            # the search, later rules are still tried
            if rule.replace:
                return m.expand(rule.replace)

        return urlkey
|
||||
|
||||
|
||||
#=================================================================
|
||||
class FuzzyQuery(object):
    """
    Rewrites cdx query params into a broader 'fuzzy' prefix query
    when one of the configured rules matches the query key.

    Each rule is expected to provide:
      starts -- list of key prefixes the rule applies to
      regex  -- compiled pattern searched for in the key
    """

    def __init__(self, rules):
        self.rules = rules

    def __call__(self, params):
        """
        params must contain 'key' (canonicalized key) and 'url'.

        Returns None if no rule matches. Otherwise params is updated
        in place and returned:
          - 'filter' is set to '=urlkey:<group 1>' when the matching
            regex has exactly one group
          - 'url' is cut off right after the first '?', if any
          - 'matchType' is set to 'prefix'
          - 'key' is reset to None
        """
        urlkey = params['key']
        url = params['url']

        for rule in self.rules:
            if not any(urlkey.startswith(x) for x in rule.starts):
                continue

            m = rule.regex.search(urlkey)
            if not m:
                continue

            if len(m.groups()) == 1:
                params['filter'] = '=urlkey:' + m.group(1)

            # first matching rule wins
            break
        else:
            # no rule matched
            return None

        inx = url.find('?')
        if inx > 0:
            # keep the '?' so the prefix query covers all query strings
            params['url'] = url[:inx + 1]

        params['matchType'] = 'prefix'
        # key no longer corresponds to the shortened url
        params['key'] = None
        return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
class StartsWithRule(object):
    """
    A single domain-specific rule read from the rules config.

    Attributes:
      starts  -- list of key prefixes the rule applies to
                 ('startswith' in the config, a string or a list)
      regex   -- compiled pattern from 'matches'
      replace -- optional replacement template from 'replace'
    """

    def __init__(self, config, surt_ordered=True):
        # surt_ordered is accepted for interface compatibility only,
        # conversion to non-surt form is done explicitly via unsurt()
        starts = config.get('startswith')
        if not isinstance(starts, list):
            starts = [starts]
        self.starts = starts

        self.regex = re.compile(config.get('matches'))
        self.replace = config.get('replace')

    def unsurt(self):
        """
        Convert the rule in place from surt form to non-surt form.
        """
        # must convert to non-surt form
        self.starts = [unsurt(x) for x in self.starts]

        # unsurt() works on strings only: convert the pattern text
        # and recompile instead of passing the compiled regex
        self.regex = re.compile(unsurt(self.regex.pattern))

        # replace is optional and may be None
        if self.replace:
            self.replace = unsurt(self.replace)

    @staticmethod
    def load_rules(rules_config, surt_ordered=True):
        """
        Build a list of StartsWithRule from a list of rule configs.

        Returns an empty list if rules_config is empty or None.
        If surt_ordered is false, each rule is converted to
        non-surt form.
        """
        if not rules_config:
            return []

        # a real list, so that callers can test it for emptiness
        rules = [StartsWithRule(config) for config in rules_config]

        if not surt_ordered:
            for rule in rules:
                rule.unsurt()

        return rules
|
@ -2,6 +2,24 @@ from collections import OrderedDict
|
||||
import itertools
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXException(Exception):
    """
    Base class for cdx related errors.
    status() returns the http status line reported for the error.
    """
    def status(self):
        return '400 Bad Request'


#=================================================================
class CaptureNotFoundException(CDXException):
    """
    Raised when a cdx query has no matching captures
    """
    def status(self):
        return '404 Not Found'


#=================================================================
class AccessException(CDXException):
    """
    Raised when access to the requested resource is denied
    """
    def status(self):
        return '403 Access Denied'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXObject(OrderedDict):
|
||||
CDX_FORMATS = [
|
||||
|
@ -1,82 +1,103 @@
|
||||
import surt
|
||||
from cdxops import cdx_load
|
||||
from canonicalize import UrlCanonicalizer
|
||||
|
||||
import itertools
|
||||
from cdxops import cdx_load
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
||||
from cdxdomainspecific import load_domain_specific_cdx_rules
|
||||
|
||||
from itertools import chain
|
||||
import logging
|
||||
import os
|
||||
import urlparse
|
||||
|
||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||
from cdxobject import CDXObject
|
||||
|
||||
#=================================================================
|
||||
class BaseCDXServer(object):
    """
    Common base for cdx servers: holds the url canonicalizer and the
    optional fuzzy query object, and implements the shared
    'no results' handling. Subclasses implement load_cdx().
    """

    def __init__(self, url_canon=None, fuzzy_query=None):
        # default to the standard canonicalizer if none is passed in
        self.url_canon = url_canon if url_canon else UrlCanonicalizer()
        self.fuzzy_query = fuzzy_query

    def _check_cdx_iter(self, cdx_iter, params):
        """ Check cdx iter semantics
        If iter is empty (no matches), check if fuzzy matching
        is allowed, and try it -- otherwise,
        throw CaptureNotFoundException
        """
        cdx_iter = self.peek_iter(cdx_iter)

        if cdx_iter is not None:
            return cdx_iter

        url = params['url']

        if self.fuzzy_query and params.get('allow_fuzzy'):
            if 'key' not in params:
                params['key'] = self.url_canon(url)

            params = self.fuzzy_query(params)
            if params:
                # fuzzy matching is tried only once per query
                params['allow_fuzzy'] = False
                return self.load_cdx(**params)

        msg = 'No Captures found for: ' + url
        raise CaptureNotFoundException(msg)

    def load_cdx(self, **params):
        raise NotImplementedError('Implement in subclass')

    @staticmethod
    def peek_iter(iterable):
        """
        Return None if iterable yields nothing, otherwise an iterator
        that yields all of its items, including the first one.
        """
        # accept any iterable, not only iterators
        iterator = iter(iterable)

        try:
            first = next(iterator)
        except StopIteration:
            return None

        return chain([first], iterator)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXException(Exception):
|
||||
def status(self):
|
||||
return '400 Bad Request'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class AccessException(CDXException):
|
||||
def status(self):
|
||||
return '403 Bad Request'
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXServer(object):
|
||||
class CDXServer(BaseCDXServer):
|
||||
"""
|
||||
Top-level cdx server object which maintains a list of cdx sources,
|
||||
responds to queries and dispatches to the cdx ops for processing
|
||||
"""
|
||||
|
||||
def __init__(self, paths, surt_ordered=True):
|
||||
def __init__(self, paths, url_canon=None, fuzzy_query=None):
|
||||
super(CDXServer, self).__init__(url_canon, fuzzy_query)
|
||||
self.sources = create_cdx_sources(paths)
|
||||
self.surt_ordered = surt_ordered
|
||||
|
||||
def load_cdx(self, **params):
|
||||
# if key not set, assume 'url' is set and needs canonicalization
|
||||
if not params.get('key'):
|
||||
params['key'] = self._canonicalize(params)
|
||||
|
||||
convert_old_style_params(params)
|
||||
|
||||
return cdx_load(self.sources, params)
|
||||
|
||||
def _canonicalize(self, params):
|
||||
"""
|
||||
Canonicalize url and convert to surt
|
||||
If no surt-mode, convert back to url form
|
||||
as surt conversion is currently part of canonicalization
|
||||
"""
|
||||
try:
|
||||
url = params['url']
|
||||
except KeyError:
|
||||
msg = 'A url= param must be specified to query the cdx server'
|
||||
raise CDXException(msg)
|
||||
|
||||
try:
|
||||
key = surt.surt(url)
|
||||
except Exception as e:
|
||||
raise CDXException('Invalid Url: ' + url)
|
||||
params['key'] = self.url_canon(url)
|
||||
|
||||
# if not surt, unsurt the surt to get canonicalized non-surt url
|
||||
if not self.surt_ordered:
|
||||
key = unsurt(key)
|
||||
convert_old_style_params(params)
|
||||
|
||||
return key
|
||||
cdx_iter = cdx_load(self.sources, params)
|
||||
|
||||
return self._check_cdx_iter(cdx_iter, params)
|
||||
|
||||
def __str__(self):
|
||||
return 'CDX server serving from ' + str(self.sources)
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RemoteCDXServer(object):
|
||||
class RemoteCDXServer(BaseCDXServer):
|
||||
"""
|
||||
A special cdx server that uses a single RemoteCDXSource
|
||||
It simply proxies the query params to the remote source
|
||||
and performs no local processing/filtering
|
||||
"""
|
||||
def __init__(self, source):
|
||||
def __init__(self, source, url_canon=None, fuzzy_query=None):
|
||||
super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
|
||||
|
||||
if isinstance(source, RemoteCDXSource):
|
||||
self.source = source
|
||||
elif (isinstance(source, str) and
|
||||
@ -87,18 +108,19 @@ class RemoteCDXServer(object):
|
||||
|
||||
def load_cdx(self, **params):
|
||||
remote_iter = self.source.load_cdx(params)
|
||||
|
||||
# if need raw, convert to raw format here
|
||||
if params.get('output') == 'raw':
|
||||
return (CDXObject(cdx) for cdx in remote_iter)
|
||||
else:
|
||||
return remote_iter
|
||||
remote_iter = (CDXObject(cdx) for cdx in remote_iter)
|
||||
|
||||
return self._check_cdx_iter(remote_iter, params)
|
||||
|
||||
def __str__(self):
|
||||
return 'Remote CDX server serving from ' + str(self.sources[0])
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_server(config):
|
||||
def create_cdx_server(config, ds_rules_file=None):
|
||||
if hasattr(config, 'get'):
|
||||
paths = config.get('index_paths')
|
||||
surt_ordered = config.get('surt_ordered', True)
|
||||
@ -108,11 +130,22 @@ def create_cdx_server(config):
|
||||
|
||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||
|
||||
if ds_rules_file:
|
||||
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
|
||||
surt_ordered)
|
||||
else:
|
||||
canon, fuzzy = None, None
|
||||
|
||||
if not canon:
|
||||
canon = UrlCanonicalizer(surt_ordered)
|
||||
|
||||
if (isinstance(paths, str) and
|
||||
any(paths.startswith(x) for x in ['http://', 'https://'])):
|
||||
return RemoteCDXServer(paths)
|
||||
server_cls = RemoteCDXServer
|
||||
else:
|
||||
return CDXServer(paths)
|
||||
server_cls = CDXServer
|
||||
|
||||
return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -170,13 +203,17 @@ def convert_old_style_params(params):
|
||||
"""
|
||||
Convert old-style CDX Server param semantics
|
||||
"""
|
||||
collapse_time = params.get('collapseTime')
|
||||
if collapse_time:
|
||||
params['collapse_time'] = collapse_time
|
||||
param = params.get('collapseTime')
|
||||
if param:
|
||||
params['collapse_time'] = param
|
||||
|
||||
resolve_revisits = params.get('resolveRevisits')
|
||||
if resolve_revisits:
|
||||
params['resolve_revisits'] = resolve_revisits
|
||||
param = params.get('matchType')
|
||||
if param:
|
||||
params['match_type'] = param
|
||||
|
||||
param = params.get('resolveRevisits')
|
||||
if param:
|
||||
params['resolve_revisits'] = param
|
||||
|
||||
if params.get('sort') == 'reverse':
|
||||
params['reverse'] = True
|
||||
@ -204,38 +241,3 @@ def extract_params_from_wsgi_env(env):
|
||||
params[name] = val[0]
|
||||
|
||||
return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
def unsurt(surt):
|
||||
"""
|
||||
# Simple surt
|
||||
>>> unsurt('com,example)/')
|
||||
'example.com)/'
|
||||
|
||||
# Broken surt
|
||||
>>> unsurt('com,example)')
|
||||
'com,example)'
|
||||
|
||||
# Long surt
|
||||
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
|
||||
index.html?a=b?c=)/')
|
||||
'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
|
||||
"""
|
||||
|
||||
try:
|
||||
index = surt.index(')/')
|
||||
parts = surt[0:index].split(',')
|
||||
parts.reverse()
|
||||
host = '.'.join(parts)
|
||||
host += surt[index:]
|
||||
return host
|
||||
|
||||
except ValueError:
|
||||
# May not be a valid surt
|
||||
return surt
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
24
pywb/cdx/rules.yaml
Normal file
24
pywb/cdx/rules.yaml
Normal file
@ -0,0 +1,24 @@
|
||||
|
||||
fuzzy_lookup_rules:
|
||||
- startswith: 'com,twitter)/i/profiles/show/'
|
||||
matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
|
||||
|
||||
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
||||
matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
|
||||
|
||||
- startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
|
||||
matches: '([^/]+(?:\.css|\.js))'
|
||||
|
||||
# matches all urls
|
||||
- startswith: ''
|
||||
matches: '[&?](?:_|uncache)=[\d]+[&]?'
|
||||
|
||||
canon_rules:
|
||||
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
||||
matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
|
||||
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -25,6 +25,8 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
|
||||
|
||||
# No matching results
|
||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
|
||||
Traceback (most recent call last):
|
||||
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||
|
||||
|
||||
# Filter cdx (default: regex)
|
||||
|
@ -1,13 +1,7 @@
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
from wbexceptions import NotFoundException
|
||||
|
||||
from itertools import chain
|
||||
from pprint import pprint
|
||||
|
||||
from pywb.cdx.cdxserver import create_cdx_server, CDXException
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.cdx.cdxserver import create_cdx_server
|
||||
|
||||
#=================================================================
|
||||
class IndexReader(object):
|
||||
@ -18,8 +12,8 @@ class IndexReader(object):
|
||||
Creates an appropriate query based on wbrequest type info
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
self.cdx_server = create_cdx_server(config)
|
||||
def __init__(self, config, ds_rules_file=None):
|
||||
self.cdx_server = create_cdx_server(config, ds_rules_file)
|
||||
|
||||
def load_for_request(self, wbrequest):
|
||||
wburl = wbrequest.wb_url
|
||||
@ -29,19 +23,14 @@ class IndexReader(object):
|
||||
|
||||
# add any custom filter from the request
|
||||
if wbrequest.query_filter:
|
||||
params['filter'] = wbrequest.query_filter
|
||||
params['filter'].extend(wbrequest.query_filter)
|
||||
|
||||
if wbrequest.custom_params:
|
||||
params.update(wbrequest.custom_params)
|
||||
|
||||
params['url'] = wburl.url
|
||||
params['allow_fuzzy'] = True
|
||||
|
||||
cdxlines = self.load_cdx(output='raw', **params)
|
||||
|
||||
cdxlines = self.peek_iter(cdxlines)
|
||||
|
||||
if cdxlines is None:
|
||||
raise NotFoundException('No Captures found for: ' + wburl.url)
|
||||
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
|
||||
|
||||
return cdxlines
|
||||
|
||||
@ -54,7 +43,7 @@ class IndexReader(object):
|
||||
|
||||
return {
|
||||
wburl.QUERY:
|
||||
{'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
|
||||
{'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit},
|
||||
|
||||
wburl.URL_QUERY:
|
||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
||||
@ -62,21 +51,12 @@ class IndexReader(object):
|
||||
},
|
||||
|
||||
wburl.REPLAY:
|
||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||
{'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||
|
||||
# BUG: resolveRevisits currently doesn't work for this type of query
|
||||
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
|
||||
# but may be an issue in proxy mode
|
||||
wburl.LATEST_REPLAY:
|
||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
||||
{'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True}
|
||||
|
||||
}[wburl.type]
|
||||
|
||||
@staticmethod
|
||||
def peek_iter(iterable):
|
||||
try:
|
||||
first = next(iterable)
|
||||
except StopIteration:
|
||||
return None
|
||||
|
||||
return chain([first], iterable)
|
||||
|
@ -21,6 +21,8 @@ DEFAULTS = {
|
||||
'error_html': 'ui/error.html',
|
||||
|
||||
'static_routes': {'static/default': 'static/'},
|
||||
|
||||
'domain_specific_rules': 'rules.yaml',
|
||||
}
|
||||
|
||||
class DictChain:
|
||||
@ -30,7 +32,7 @@ class DictChain:
|
||||
def get(self, key, default_val=None):
|
||||
for d in self.dicts:
|
||||
val = d.get(key)
|
||||
if val:
|
||||
if val is not None:
|
||||
return val
|
||||
return default_val
|
||||
|
||||
@ -52,11 +54,13 @@ def pywb_config_manual(passed_config = {}):
|
||||
for name, value in collections.iteritems():
|
||||
if isinstance(value, str):
|
||||
route_config = config
|
||||
cdx_server = IndexReader(value)
|
||||
cdx_config = value
|
||||
else:
|
||||
route_config = DictChain(value, config)
|
||||
cdx_server = IndexReader(route_config)
|
||||
cdx_config = route_config
|
||||
|
||||
ds_rules = route_config.get('domain_specific_rules', None)
|
||||
cdx_server = IndexReader(cdx_config, ds_rules)
|
||||
|
||||
wb_handler = config_utils.create_wb_handler(
|
||||
cdx_server = cdx_server,
|
||||
@ -118,7 +122,8 @@ def pywb_config(config_file = None):
|
||||
if not config_file:
|
||||
config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)
|
||||
|
||||
config = yaml.load(open(config_file))
|
||||
with open(config_file) as fh:
|
||||
config = yaml.load(fh)
|
||||
|
||||
return pywb_config_manual(config)
|
||||
|
||||
|
@ -54,8 +54,7 @@ class RewriteContent:
|
||||
# =========================================================================
|
||||
# special case -- need to ungzip the body
|
||||
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
|
||||
stream = BufferedReader(stream, 'gzip')
|
||||
|
||||
stream = BufferedReader(stream, decomp_type='gzip')
|
||||
|
||||
if rewritten_headers.charset:
|
||||
encoding = rewritten_headers.charset
|
||||
|
@ -1,14 +1,15 @@
|
||||
|
||||
|
||||
class WbException(Exception):
|
||||
pass
|
||||
|
||||
class NotFoundException(WbException):
|
||||
def status(_):
|
||||
def status(self):
|
||||
return '404 Not Found'
|
||||
|
||||
# Exceptions that effect a specific capture and result in a retry
|
||||
class CaptureException(WbException):
|
||||
def status(_):
|
||||
def status(self):
|
||||
return '500 Internal Server Error'
|
||||
|
||||
class InternalRedirect(WbException):
|
||||
|
@ -93,3 +93,6 @@ enable_cdx_api: true
|
||||
# optional reporter callback func
|
||||
# if set, called with request and cdx object
|
||||
reporter_func: pywb.run-tests.print_reporter
|
||||
|
||||
# custom rules for domain specific matching
|
||||
#domain_specific_rules: rules.yaml
|
||||
|
@ -50,6 +50,13 @@ class TestWb:
|
||||
# 1 Capture (filtered) + header
|
||||
assert len(resp.html.find_all('tr')) == 2
|
||||
|
||||
def test_calendar_query_fuzzy_match(self):
|
||||
# fuzzy match removing _= according to standard rules.yaml
|
||||
resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653')
|
||||
self._assert_basic_html(resp)
|
||||
# 17 Captures + header
|
||||
assert len(resp.html.find_all('tr')) == 18
|
||||
|
||||
def test_cdx_query(self):
|
||||
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
|
||||
self._assert_basic_text(resp)
|
||||
|
Loading…
x
Reference in New Issue
Block a user