mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdx: add domain-specific rules at the cdx layer for custom canonicalization
and 'fuzzy' matching when no captures are found, handled via cdxdomainspecific.py. BaseCDXServer contains a canonicalizer object and a fuzzy query object. Canonicalizer abstracted to a separate class (in canonicalize.py). Clean up cdx-related exceptions. Default rules are read from cdx/rules.yaml; the filename is configurable via the 'domain_specific_rules' setting in config.yaml. Fix typo in pywb/rewrite.
This commit is contained in:
parent
ab95524b7b
commit
a09dec4b3e
@ -92,4 +92,8 @@ static_routes:
|
|||||||
enable_http_proxy: true
|
enable_http_proxy: true
|
||||||
|
|
||||||
# enable cdx server api for querying cdx directly (experimental)
|
# enable cdx server api for querying cdx directly (experimental)
|
||||||
#enable_cdx_api: false
|
enable_cdx_api: true
|
||||||
|
|
||||||
|
# custom rules for domain specific matching
|
||||||
|
# set to false to disable
|
||||||
|
#domain_specific_rules: rules.yaml
|
||||||
|
74
pywb/cdx/canonicalize.py
Normal file
74
pywb/cdx/canonicalize.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
""" Standard url-canonicalzation, surt and non-surt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import surt
|
||||||
|
from cdxobject import CDXException
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class UrlCanonicalizer(object):
    """
    Callable wrapper around the module-level canonicalize() function.

    The surt_ordered flag is stored once at construction time and
    handed to canonicalize() on every call.
    """
    def __init__(self, surt_ordered=True):
        # forwarded unchanged to canonicalize() by __call__
        self.surt_ordered = surt_ordered

    def __call__(self, url):
        """ Return canonicalize(url) using the stored surt_ordered flag
        """
        ordered = self.surt_ordered
        return canonicalize(url, ordered)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    Raises CDXException if surt.surt() fails on the url.

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'
    """
    try:
        surt_key = surt.surt(url)
    except Exception:
        # any failure inside the surt library is reported as an invalid url
        raise CDXException('Invalid Url: ' + url)

    # non-surt mode: unsurt the surt to get the canonicalized non-surt url
    return surt_key if surt_ordered else unsurt(surt_key)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def unsurt(surt):
    """
    Convert a surt-form key back to plain host/path form.

    The comma-separated host labels before the first ')/' are reversed
    and joined with '.', and the ')' is dropped. A value that contains
    no ')/' is returned unchanged.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
    """
    try:
        split_at = surt.index(')/')
    except ValueError:
        # May not be a valid surt
        return surt

    labels = surt[:split_at].split(',')

    # skip the ')' itself, keep everything from the '/' onwards
    return '.'.join(reversed(labels)) + surt[split_at + 1:]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # executed as a script: run the doctests embedded in this module
    from doctest import testmod
    testmod()
|
125
pywb/cdx/cdxdomainspecific.py
Normal file
125
pywb/cdx/cdxdomainspecific.py
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
import yaml
|
||||||
|
import re
|
||||||
|
import logging
|
||||||
|
import pkgutil
|
||||||
|
|
||||||
|
from canonicalize import unsurt, UrlCanonicalizer
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def load_domain_specific_cdx_rules(filename, surt_ordered):
    """
    Load the domain-specific canonicalization and fuzzy-lookup rules.

    The rules file is read as package data (via pkgutil.get_data on
    this package) and parsed as YAML. Two optional top-level keys
    are consulted: 'canon_rules' and 'fuzzy_lookup_rules'.

    :param filename: name of the rules file, relative to this package
    :param surt_ordered: passed through to StartsWithRule.load_rules()
        and to CustomUrlCanonicalizer
    :returns: a (canon, fuzzy) tuple -- a CustomUrlCanonicalizer and a
        FuzzyQuery, each None if the file defines no rules of that kind
    """
    data = pkgutil.get_data(__package__, filename)

    # safe_load: the rules file only needs plain mappings, lists and
    # strings, and yaml.load() without an explicit Loader is rejected
    # by newer PyYAML releases. An empty file parses to None.
    config = yaml.safe_load(data) or {}

    # Load Canonicalizer Rules
    canon_rules = StartsWithRule.load_rules(config.get('canon_rules'),
                                            surt_ordered)

    if canon_rules:
        canon = CustomUrlCanonicalizer(canon_rules, surt_ordered)
    else:
        canon = None

    # Load Fuzzy Lookup Rules
    fuzzy_rules = StartsWithRule.load_rules(config.get('fuzzy_lookup_rules'),
                                            surt_ordered)

    if fuzzy_rules:
        fuzzy = FuzzyQuery(fuzzy_rules)
    else:
        fuzzy = None

    # lazy %-args: the message is only formatted if debug is enabled
    logging.debug('CANON: %s', canon)
    logging.debug('FUZZY: %s', fuzzy)
    return (canon, fuzzy)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CustomUrlCanonicalizer(UrlCanonicalizer):
    """
    Canonicalizer that post-processes the standard urlkey with a
    list of rewrite rules.

    Each rule is expected to provide 'starts' (list of key prefixes),
    'regex' (compiled pattern) and 'replace' (expansion template).
    """
    def __init__(self, rules, surt_ordered=True):
        super(CustomUrlCanonicalizer, self).__init__(surt_ordered)
        self.rules = rules

    def __call__(self, url):
        """ Return the rewritten key from the first rule that applies,
        or the standard canonicalized key if none does
        """
        urlkey = super(CustomUrlCanonicalizer, self).__call__(url)

        for rule in self.rules:
            # the regex is only tried when one of the prefixes matches
            prefix_hit = any(urlkey.startswith(prefix)
                             for prefix in rule.starts)

            found = prefix_hit and rule.regex.match(urlkey)

            # a rule without a replacement never rewrites the key
            if found and rule.replace:
                return found.expand(rule.replace)

        return urlkey
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class FuzzyQuery:
    """
    Callable that rewrites cdx query params into a looser prefix
    query, based on the first rule whose prefix and regex both match
    the urlkey.
    """
    def __init__(self, rules):
        self.rules = rules

    def __call__(self, params):
        """
        Rewrite params in place for a fuzzy retry.

        Reads params['key'] and params['url']. Returns None, leaving
        params untouched, when no rule matches; otherwise returns the
        same dict with 'matchType' set to 'prefix', 'key' reset to None,
        'url' cut off just after the first '?', and -- if the matching
        regex has exactly one group -- 'filter' set to that group.
        """
        urlkey = params['key']
        url = params['url']

        for rule in self.rules:
            if any(urlkey.startswith(prefix) for prefix in rule.starts):
                found = rule.regex.search(urlkey)
                if found:
                    break
        else:
            # loop ran to completion: no rule matched
            return None

        if len(found.groups()) == 1:
            params['filter'] = '=urlkey:' + found.group(1)

        # keep the url up to and including the '?'
        query_pos = url.find('?')
        if query_pos > 0:
            params['url'] = url[:query_pos + 1]

        params['matchType'] = 'prefix'
        params['key'] = None
        return params
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class StartsWithRule(object):
    """
    A single domain-specific rule loaded from the rules config.

    Attributes:
      starts  -- list of urlkey prefixes ('startswith' in the config,
                 a single value is wrapped into a one-element list)
      regex   -- compiled regex built from the 'matches' entry
      replace -- optional replacement template ('replace'), or None
    """
    def __init__(self, config, surt_ordered=True):
        # surt_ordered is accepted for interface compatibility only;
        # conversion to non-surt form is done explicitly via unsurt()
        starts = config.get('startswith')
        if not isinstance(starts, list):
            starts = [starts]

        self.starts = starts
        self.regex = re.compile(config.get('matches'))
        self.replace = config.get('replace')

    def unsurt(self):
        """ Convert prefixes, regex and replacement to non-surt form
        """
        # a real list (not a lazy map) so the prefixes can be
        # checked again on every lookup
        self.starts = [unsurt(prefix) for prefix in self.starts]

        # the module-level unsurt() works on strings: convert the
        # pattern text and recompile, not the compiled regex object
        self.regex = re.compile(unsurt(self.regex.pattern))

        # 'replace' is optional and may be None
        if self.replace:
            self.replace = unsurt(self.replace)

    @staticmethod
    def load_rules(rules_config, surt_ordered=True):
        """
        Build a list of StartsWithRule objects from a list of rule
        config dicts. Returns an empty list if rules_config is empty
        or None. When surt_ordered is false, each rule is converted
        to non-surt form.
        """
        if not rules_config:
            return []

        # materialize the list so callers can test its truthiness
        # and iterate over it more than once
        rules = [StartsWithRule(entry, surt_ordered)
                 for entry in rules_config]

        if not surt_ordered:
            for rule in rules:
                rule.unsurt()

        return rules
|
@ -2,6 +2,24 @@ from collections import OrderedDict
|
|||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXException(Exception):
    """ Base class for cdx-related errors
    """
    def status(self):
        """ Return the status string associated with this error
        """
        return '400 Bad Request'
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CaptureNotFoundException(CDXException):
    """ CDXException variant reporting a not-found status
    """
    def status(self):
        """ Return the status string associated with this error
        """
        return '404 Not Found'
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class AccessException(CDXException):
    """ CDXException variant reporting an access-denied status
    """
    def status(self):
        """ Return the status string associated with this error
        """
        return '403 Access Denied'
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXObject(OrderedDict):
|
class CDXObject(OrderedDict):
|
||||||
CDX_FORMATS = [
|
CDX_FORMATS = [
|
||||||
|
@ -1,82 +1,103 @@
|
|||||||
import surt
|
from canonicalize import UrlCanonicalizer
|
||||||
from cdxops import cdx_load
|
|
||||||
|
|
||||||
import itertools
|
from cdxops import cdx_load
|
||||||
|
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
||||||
|
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
|
||||||
|
from cdxdomainspecific import load_domain_specific_cdx_rules
|
||||||
|
|
||||||
|
from itertools import chain
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
|
|
||||||
from cdxobject import CDXObject
|
#=================================================================
|
||||||
|
class BaseCDXServer(object):
    """
    Common base for cdx servers: holds the url canonicalizer and the
    optional fuzzy query rewriter, and implements the shared
    'no results -> fuzzy retry -> not found' handling.

    Subclasses implement load_cdx().
    """
    def __init__(self, url_canon=None, fuzzy_query=None):
        # fall back to the default canonicalizer if none is supplied
        self.url_canon = url_canon if url_canon else UrlCanonicalizer()
        self.fuzzy_query = fuzzy_query

    def _check_cdx_iter(self, cdx_iter, params):
        """ Check cdx iter semantics
        If iter is empty (no matches), check if fuzzy matching
        is allowed, and try it -- otherwise,
        throw CaptureNotFoundException

        Returns an iterator over the cdx results. The fuzzy retry
        is only attempted when params contains a 'url'.
        """
        cdx_iter = self.peek_iter(cdx_iter)

        if cdx_iter is not None:
            return cdx_iter

        # load_cdx() accepts a query with only 'key' set, so 'url'
        # may legitimately be missing here
        url = params.get('url')

        if (self.fuzzy_query and params.get('allow_fuzzy') and
                url is not None):
            if not params.get('key'):
                params['key'] = self.url_canon(url)

            # returns the rewritten params, or None if no rule matched
            fuzzy_params = self.fuzzy_query(params)
            if fuzzy_params:
                # only one fuzzy attempt: avoid retrying recursively
                fuzzy_params['allow_fuzzy'] = False
                return self.load_cdx(**fuzzy_params)

        target = url if url is not None else params.get('key')
        msg = 'No Captures found for: %s' % target
        raise CaptureNotFoundException(msg)

    def load_cdx(self, **params):
        """ Run the cdx query described by params (subclass hook)
        """
        raise NotImplementedError('Implement in subclass')

    @staticmethod
    def peek_iter(iterable):
        """
        Return None if iterable yields nothing, otherwise an iterator
        that yields all of its items, including the first one that
        was consumed in order to peek.
        """
        # accept any iterable, not only iterators
        iterator = iter(iterable)
        try:
            first = next(iterator)
        except StopIteration:
            return None

        return chain([first], iterator)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXException(Exception):
|
class CDXServer(BaseCDXServer):
|
||||||
def status(self):
|
|
||||||
return '400 Bad Request'
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class AccessException(CDXException):
|
|
||||||
def status(self):
|
|
||||||
return '403 Bad Request'
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class CDXServer(object):
|
|
||||||
"""
|
"""
|
||||||
Top-level cdx server object which maintains a list of cdx sources,
|
Top-level cdx server object which maintains a list of cdx sources,
|
||||||
responds to queries and dispatches to the cdx ops for processing
|
responds to queries and dispatches to the cdx ops for processing
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, paths, surt_ordered=True):
|
def __init__(self, paths, url_canon=None, fuzzy_query=None):
|
||||||
|
super(CDXServer, self).__init__(url_canon, fuzzy_query)
|
||||||
self.sources = create_cdx_sources(paths)
|
self.sources = create_cdx_sources(paths)
|
||||||
self.surt_ordered = surt_ordered
|
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
# if key not set, assume 'url' is set and needs canonicalization
|
# if key not set, assume 'url' is set and needs canonicalization
|
||||||
if not params.get('key'):
|
if not params.get('key'):
|
||||||
params['key'] = self._canonicalize(params)
|
try:
|
||||||
|
url = params['url']
|
||||||
|
except KeyError:
|
||||||
|
msg = 'A url= param must be specified to query the cdx server'
|
||||||
|
raise CDXException(msg)
|
||||||
|
|
||||||
|
params['key'] = self.url_canon(url)
|
||||||
|
|
||||||
convert_old_style_params(params)
|
convert_old_style_params(params)
|
||||||
|
|
||||||
return cdx_load(self.sources, params)
|
cdx_iter = cdx_load(self.sources, params)
|
||||||
|
|
||||||
def _canonicalize(self, params):
|
return self._check_cdx_iter(cdx_iter, params)
|
||||||
"""
|
|
||||||
Canonicalize url and convert to surt
|
|
||||||
If no surt-mode, convert back to url form
|
|
||||||
as surt conversion is currently part of canonicalization
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
url = params['url']
|
|
||||||
except KeyError:
|
|
||||||
msg = 'A url= param must be specified to query the cdx server'
|
|
||||||
raise CDXException(msg)
|
|
||||||
|
|
||||||
try:
|
|
||||||
key = surt.surt(url)
|
|
||||||
except Exception as e:
|
|
||||||
raise CDXException('Invalid Url: ' + url)
|
|
||||||
|
|
||||||
# if not surt, unsurt the surt to get canonicalized non-surt url
|
|
||||||
if not self.surt_ordered:
|
|
||||||
key = unsurt(key)
|
|
||||||
|
|
||||||
return key
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'CDX server serving from ' + str(self.sources)
|
return 'CDX server serving from ' + str(self.sources)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RemoteCDXServer(object):
|
class RemoteCDXServer(BaseCDXServer):
|
||||||
"""
|
"""
|
||||||
A special cdx server that uses a single RemoteCDXSource
|
A special cdx server that uses a single RemoteCDXSource
|
||||||
It simply proxies the query params to the remote source
|
It simply proxies the query params to the remote source
|
||||||
and performs no local processing/filtering
|
and performs no local processing/filtering
|
||||||
"""
|
"""
|
||||||
def __init__(self, source):
|
def __init__(self, source, url_canon=None, fuzzy_query=None):
|
||||||
|
super(RemoteCDXServer, self).__init__(url_canon, fuzzy_query)
|
||||||
|
|
||||||
if isinstance(source, RemoteCDXSource):
|
if isinstance(source, RemoteCDXSource):
|
||||||
self.source = source
|
self.source = source
|
||||||
elif (isinstance(source, str) and
|
elif (isinstance(source, str) and
|
||||||
@ -87,18 +108,19 @@ class RemoteCDXServer(object):
|
|||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
remote_iter = self.source.load_cdx(params)
|
remote_iter = self.source.load_cdx(params)
|
||||||
|
|
||||||
# if need raw, convert to raw format here
|
# if need raw, convert to raw format here
|
||||||
if params.get('output') == 'raw':
|
if params.get('output') == 'raw':
|
||||||
return (CDXObject(cdx) for cdx in remote_iter)
|
remote_iter = (CDXObject(cdx) for cdx in remote_iter)
|
||||||
else:
|
|
||||||
return remote_iter
|
return self._check_cdx_iter(remote_iter, params)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'Remote CDX server serving from ' + str(self.sources[0])
|
return 'Remote CDX server serving from ' + str(self.sources[0])
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def create_cdx_server(config):
|
def create_cdx_server(config, ds_rules_file=None):
|
||||||
if hasattr(config, 'get'):
|
if hasattr(config, 'get'):
|
||||||
paths = config.get('index_paths')
|
paths = config.get('index_paths')
|
||||||
surt_ordered = config.get('surt_ordered', True)
|
surt_ordered = config.get('surt_ordered', True)
|
||||||
@ -108,11 +130,22 @@ def create_cdx_server(config):
|
|||||||
|
|
||||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||||
|
|
||||||
|
if ds_rules_file:
|
||||||
|
canon, fuzzy = load_domain_specific_cdx_rules(ds_rules_file,
|
||||||
|
surt_ordered)
|
||||||
|
else:
|
||||||
|
canon, fuzzy = None, None
|
||||||
|
|
||||||
|
if not canon:
|
||||||
|
canon = UrlCanonicalizer(surt_ordered)
|
||||||
|
|
||||||
if (isinstance(paths, str) and
|
if (isinstance(paths, str) and
|
||||||
any(paths.startswith(x) for x in ['http://', 'https://'])):
|
any(paths.startswith(x) for x in ['http://', 'https://'])):
|
||||||
return RemoteCDXServer(paths)
|
server_cls = RemoteCDXServer
|
||||||
else:
|
else:
|
||||||
return CDXServer(paths)
|
server_cls = CDXServer
|
||||||
|
|
||||||
|
return server_cls(paths, url_canon=canon, fuzzy_query=fuzzy)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -170,13 +203,17 @@ def convert_old_style_params(params):
|
|||||||
"""
|
"""
|
||||||
Convert old-style CDX Server param semantics
|
Convert old-style CDX Server param semantics
|
||||||
"""
|
"""
|
||||||
collapse_time = params.get('collapseTime')
|
param = params.get('collapseTime')
|
||||||
if collapse_time:
|
if param:
|
||||||
params['collapse_time'] = collapse_time
|
params['collapse_time'] = param
|
||||||
|
|
||||||
resolve_revisits = params.get('resolveRevisits')
|
param = params.get('matchType')
|
||||||
if resolve_revisits:
|
if param:
|
||||||
params['resolve_revisits'] = resolve_revisits
|
params['match_type'] = param
|
||||||
|
|
||||||
|
param = params.get('resolveRevisits')
|
||||||
|
if param:
|
||||||
|
params['resolve_revisits'] = param
|
||||||
|
|
||||||
if params.get('sort') == 'reverse':
|
if params.get('sort') == 'reverse':
|
||||||
params['reverse'] = True
|
params['reverse'] = True
|
||||||
@ -204,38 +241,3 @@ def extract_params_from_wsgi_env(env):
|
|||||||
params[name] = val[0]
|
params[name] = val[0]
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def unsurt(surt):
|
|
||||||
"""
|
|
||||||
# Simple surt
|
|
||||||
>>> unsurt('com,example)/')
|
|
||||||
'example.com)/'
|
|
||||||
|
|
||||||
# Broken surt
|
|
||||||
>>> unsurt('com,example)')
|
|
||||||
'com,example)'
|
|
||||||
|
|
||||||
# Long surt
|
|
||||||
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
|
|
||||||
index.html?a=b?c=)/')
|
|
||||||
'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
|
|
||||||
"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
index = surt.index(')/')
|
|
||||||
parts = surt[0:index].split(',')
|
|
||||||
parts.reverse()
|
|
||||||
host = '.'.join(parts)
|
|
||||||
host += surt[index:]
|
|
||||||
return host
|
|
||||||
|
|
||||||
except ValueError:
|
|
||||||
# May not be a valid surt
|
|
||||||
return surt
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import doctest
|
|
||||||
doctest.testmod()
|
|
||||||
|
24
pywb/cdx/rules.yaml
Normal file
24
pywb/cdx/rules.yaml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
|
||||||
|
fuzzy_lookup_rules:
|
||||||
|
- startswith: 'com,twitter)/i/profiles/show/'
|
||||||
|
matches: '/profiles/show/.*with_replies\?.*(max_id=[^&]+)'
|
||||||
|
|
||||||
|
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
||||||
|
matches: 'com,facebook\)/.*[?&]data=(.*(?:[&]|query_type[^,]+))'
|
||||||
|
|
||||||
|
- startswith: ['com,yimg,l)/g/combo', 'com,yahooapis,yui)/combo']
|
||||||
|
matches: '([^/]+(?:\.css|\.js))'
|
||||||
|
|
||||||
|
# matches all urls
|
||||||
|
- startswith: ''
|
||||||
|
matches: '[&?](?:_|uncache)=[\d]+[&]?'
|
||||||
|
|
||||||
|
canon_rules:
|
||||||
|
- startswith: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet'
|
||||||
|
matches: 'com,facebook\)/.*[?&]data=([^&]+).*'
|
||||||
|
replace: 'com,facebook)/ajax/pagelet/generic.php/profiletimelinesectionpagelet?data=\1'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -25,6 +25,8 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
|
|||||||
|
|
||||||
# No matching results
|
# No matching results
|
||||||
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
|
>>> cdx_ops_test('http://iana.org/dont_have_this', reverse = True, resolve_revisits = True, limit = 2)
|
||||||
|
Traceback (most recent call last):
|
||||||
|
CaptureNotFoundException: No Captures found for: http://iana.org/dont_have_this
|
||||||
|
|
||||||
|
|
||||||
# Filter cdx (default: regex)
|
# Filter cdx (default: regex)
|
||||||
|
@ -1,13 +1,7 @@
|
|||||||
import urllib
|
import urllib
|
||||||
import urllib2
|
import urllib2
|
||||||
|
|
||||||
from wbexceptions import NotFoundException
|
from pywb.cdx.cdxserver import create_cdx_server
|
||||||
|
|
||||||
from itertools import chain
|
|
||||||
from pprint import pprint
|
|
||||||
|
|
||||||
from pywb.cdx.cdxserver import create_cdx_server, CDXException
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class IndexReader(object):
|
class IndexReader(object):
|
||||||
@ -18,8 +12,8 @@ class IndexReader(object):
|
|||||||
Creates an appropriate query based on wbrequest type info
|
Creates an appropriate query based on wbrequest type info
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config, ds_rules_file=None):
|
||||||
self.cdx_server = create_cdx_server(config)
|
self.cdx_server = create_cdx_server(config, ds_rules_file)
|
||||||
|
|
||||||
def load_for_request(self, wbrequest):
|
def load_for_request(self, wbrequest):
|
||||||
wburl = wbrequest.wb_url
|
wburl = wbrequest.wb_url
|
||||||
@ -29,19 +23,14 @@ class IndexReader(object):
|
|||||||
|
|
||||||
# add any custom filter from the request
|
# add any custom filter from the request
|
||||||
if wbrequest.query_filter:
|
if wbrequest.query_filter:
|
||||||
params['filter'] = wbrequest.query_filter
|
params['filter'].extend(wbrequest.query_filter)
|
||||||
|
|
||||||
if wbrequest.custom_params:
|
if wbrequest.custom_params:
|
||||||
params.update(wbrequest.custom_params)
|
params.update(wbrequest.custom_params)
|
||||||
|
|
||||||
params['url'] = wburl.url
|
params['allow_fuzzy'] = True
|
||||||
|
|
||||||
cdxlines = self.load_cdx(output='raw', **params)
|
cdxlines = self.load_cdx(url=wburl.url, output='raw', **params)
|
||||||
|
|
||||||
cdxlines = self.peek_iter(cdxlines)
|
|
||||||
|
|
||||||
if cdxlines is None:
|
|
||||||
raise NotFoundException('No Captures found for: ' + wburl.url)
|
|
||||||
|
|
||||||
return cdxlines
|
return cdxlines
|
||||||
|
|
||||||
@ -54,7 +43,7 @@ class IndexReader(object):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
wburl.QUERY:
|
wburl.QUERY:
|
||||||
{'collapseTime': collapse_time, 'filter': '!statuscode:(500|502|504)', 'limit': limit},
|
{'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit},
|
||||||
|
|
||||||
wburl.URL_QUERY:
|
wburl.URL_QUERY:
|
||||||
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
|
||||||
@ -62,21 +51,12 @@ class IndexReader(object):
|
|||||||
},
|
},
|
||||||
|
|
||||||
wburl.REPLAY:
|
wburl.REPLAY:
|
||||||
{'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
{'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
|
||||||
|
|
||||||
# BUG: resolveRevisits currently doesn't work for this type of query
|
# BUG: resolveRevisits currently doesn't work for this type of query
|
||||||
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
|
# This is not an issue in archival mode, as there is a redirect to the actual timestamp query
|
||||||
# but may be an issue in proxy mode
|
# but may be an issue in proxy mode
|
||||||
wburl.LATEST_REPLAY:
|
wburl.LATEST_REPLAY:
|
||||||
{'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True}
|
{'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True}
|
||||||
|
|
||||||
}[wburl.type]
|
}[wburl.type]
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def peek_iter(iterable):
|
|
||||||
try:
|
|
||||||
first = next(iterable)
|
|
||||||
except StopIteration:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return chain([first], iterable)
|
|
||||||
|
@ -21,6 +21,8 @@ DEFAULTS = {
|
|||||||
'error_html': 'ui/error.html',
|
'error_html': 'ui/error.html',
|
||||||
|
|
||||||
'static_routes': {'static/default': 'static/'},
|
'static_routes': {'static/default': 'static/'},
|
||||||
|
|
||||||
|
'domain_specific_rules': 'rules.yaml',
|
||||||
}
|
}
|
||||||
|
|
||||||
class DictChain:
|
class DictChain:
|
||||||
@ -30,7 +32,7 @@ class DictChain:
|
|||||||
def get(self, key, default_val=None):
|
def get(self, key, default_val=None):
|
||||||
for d in self.dicts:
|
for d in self.dicts:
|
||||||
val = d.get(key)
|
val = d.get(key)
|
||||||
if val:
|
if val is not None:
|
||||||
return val
|
return val
|
||||||
return default_val
|
return default_val
|
||||||
|
|
||||||
@ -52,11 +54,13 @@ def pywb_config_manual(passed_config = {}):
|
|||||||
for name, value in collections.iteritems():
|
for name, value in collections.iteritems():
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
route_config = config
|
route_config = config
|
||||||
cdx_server = IndexReader(value)
|
cdx_config = value
|
||||||
else:
|
else:
|
||||||
route_config = DictChain(value, config)
|
route_config = DictChain(value, config)
|
||||||
cdx_server = IndexReader(route_config)
|
cdx_config = route_config
|
||||||
|
|
||||||
|
ds_rules = route_config.get('domain_specific_rules', None)
|
||||||
|
cdx_server = IndexReader(cdx_config, ds_rules)
|
||||||
|
|
||||||
wb_handler = config_utils.create_wb_handler(
|
wb_handler = config_utils.create_wb_handler(
|
||||||
cdx_server = cdx_server,
|
cdx_server = cdx_server,
|
||||||
@ -118,7 +122,8 @@ def pywb_config(config_file = None):
|
|||||||
if not config_file:
|
if not config_file:
|
||||||
config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)
|
config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)
|
||||||
|
|
||||||
config = yaml.load(open(config_file))
|
with open(config_file) as fh:
|
||||||
|
config = yaml.load(fh)
|
||||||
|
|
||||||
return pywb_config_manual(config)
|
return pywb_config_manual(config)
|
||||||
|
|
||||||
|
@ -54,8 +54,7 @@ class RewriteContent:
|
|||||||
# =========================================================================
|
# =========================================================================
|
||||||
# special case -- need to ungzip the body
|
# special case -- need to ungzip the body
|
||||||
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
|
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
|
||||||
stream = BufferedReader(stream, 'gzip')
|
stream = BufferedReader(stream, decomp_type='gzip')
|
||||||
|
|
||||||
|
|
||||||
if rewritten_headers.charset:
|
if rewritten_headers.charset:
|
||||||
encoding = rewritten_headers.charset
|
encoding = rewritten_headers.charset
|
||||||
|
@ -1,14 +1,15 @@
|
|||||||
|
|
||||||
|
|
||||||
class WbException(Exception):
|
class WbException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class NotFoundException(WbException):
|
class NotFoundException(WbException):
|
||||||
def status(_):
|
def status(self):
|
||||||
return '404 Not Found'
|
return '404 Not Found'
|
||||||
|
|
||||||
# Exceptions that effect a specific capture and result in a retry
|
# Exceptions that affect a specific capture and result in a retry
|
||||||
class CaptureException(WbException):
|
class CaptureException(WbException):
|
||||||
def status(_):
|
def status(self):
|
||||||
return '500 Internal Server Error'
|
return '500 Internal Server Error'
|
||||||
|
|
||||||
class InternalRedirect(WbException):
|
class InternalRedirect(WbException):
|
||||||
|
@ -93,3 +93,6 @@ enable_cdx_api: true
|
|||||||
# optional reporter callback func
|
# optional reporter callback func
|
||||||
# if set, called with request and cdx object
|
# if set, called with request and cdx object
|
||||||
reporter_func: pywb.run-tests.print_reporter
|
reporter_func: pywb.run-tests.print_reporter
|
||||||
|
|
||||||
|
# custom rules for domain specific matching
|
||||||
|
#domain_specific_rules: rules.yaml
|
||||||
|
@ -50,6 +50,13 @@ class TestWb:
|
|||||||
# 1 Capture (filtered) + header
|
# 1 Capture (filtered) + header
|
||||||
assert len(resp.html.find_all('tr')) == 2
|
assert len(resp.html.find_all('tr')) == 2
|
||||||
|
|
||||||
|
def test_calendar_query_fuzzy_match(self):
|
||||||
|
# fuzzy match removing _= according to standard rules.yaml
|
||||||
|
resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css?_=3141592653')
|
||||||
|
self._assert_basic_html(resp)
|
||||||
|
# 17 Captures + header
|
||||||
|
assert len(resp.html.find_all('tr')) == 18
|
||||||
|
|
||||||
def test_cdx_query(self):
|
def test_cdx_query(self):
|
||||||
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
|
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
|
||||||
self._assert_basic_text(resp)
|
self._assert_basic_text(resp)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user