mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
fuzzy matching: new, clean fuzzy matcher implementation for webagg
rules: default rule: fuzzy match urls ignoring prefix match (needs more testing) tests: update tests for new broad fuzzy match rule
This commit is contained in:
parent
e0878f0f67
commit
0f0c20a03a
@ -341,7 +341,9 @@ rules:
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
# Applies to all urls -- should be last
|
# Applies to all urls -- should be last
|
||||||
- url_prefix: ''
|
- url_prefix: ''
|
||||||
fuzzy_lookup:
|
fuzzy_lookup: '()'
|
||||||
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
|
||||||
filter: ['=urlkey:{0}']
|
#fuzzy_lookup:
|
||||||
replace: '?'
|
# match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
||||||
|
# filter: ['=urlkey:{0}']
|
||||||
|
# replace: '?'
|
||||||
|
148
pywb/webagg/fuzzymatcher.py
Normal file
148
pywb/webagg/fuzzymatcher.py
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
from warcio.utils import to_native_str
|
||||||
|
from pywb.utils.loaders import load_yaml_config
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from six.moves.urllib.parse import urlsplit
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Immutable record for one parsed fuzzy rule:
#   url_prefix    -- list of urlkey prefixes the rule applies to
#   regex         -- compiled pattern searched against the urlkey
#   replace_after -- substring of the url after which the url is truncated
#   filter_str    -- list of filter templates, formatted with each regex group
#   match_type    -- value stored as the query's 'matchType'
FuzzyRule = namedtuple('FuzzyRule', [
    'url_prefix',
    'regex',
    'replace_after',
    'filter_str',
    'match_type',
])
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class FuzzyMatcher(object):
    """Fallback index lookup using configurable "fuzzy" rules.

    Calling an instance runs the query against an index source. If that
    exact query yields no results, the first rule whose prefix and regex
    match the query's urlkey is used to rewrite the query (truncated url,
    different match type, extra filters), and the index source is queried
    a second time with the rewritten params.
    """

    # Defaults applied when a rule does not specify the setting itself
    DEFAULT_FILTER = ['~urlkey:{0}']
    DEFAULT_MATCH_TYPE = 'prefix'
    DEFAULT_REPLACE_AFTER = '?'

    # Query params removed from the params dict before the fuzzy re-query
    REMOVE_PARAMS = ['alt_url', 'reverse', 'closest', 'end_key']

    def __init__(self, filename):
        """Load the rules config from ``filename`` and parse each rule.

        Only rules for which ``parse_fuzzy_rule`` returns a value are kept,
        in config order, in ``self.rules``.
        """
        config = load_yaml_config(filename)
        self.rules = []
        # 'rules' may be absent or empty in the config; treat both as no rules
        for rule in config.get('rules') or []:
            rule = self.parse_fuzzy_rule(rule)
            if rule:
                self.rules.append(rule)

    def parse_fuzzy_rule(self, rule):
        """Parse one config rule into a ``FuzzyRule``.

        ``rule`` is a mapping read for the keys ``url_prefix`` (a single
        prefix or a list of prefixes) and ``fuzzy_lookup``. Supported
        ``fuzzy_lookup`` forms:

        * a dict with optional ``match``, ``replace``, ``filter`` and
          ``type`` keys; missing keys other than ``match`` fall back to
          the class defaults
        * anything else (string, list, ...) is passed to ``make_regex``
          as the match spec and all other settings use the class defaults

        Returns ``None`` when the rule has no (or a falsy) ``fuzzy_lookup``.
        """
        url_prefix = rule.get('url_prefix')
        config = rule.get('fuzzy_lookup')
        if not config:
            return None

        if not isinstance(url_prefix, list):
            url_prefix = [url_prefix]

        if isinstance(config, dict):
            regex = self.make_regex(config.get('match'))
            replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
            filter_str = config.get('filter', self.DEFAULT_FILTER)
            match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
        else:
            regex = self.make_regex(config)
            replace_after = self.DEFAULT_REPLACE_AFTER
            filter_str = self.DEFAULT_FILTER
            match_type = self.DEFAULT_MATCH_TYPE

        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)

    def get_fuzzy_match(self, params):
        """Rewrite ``params`` in place for a fuzzy query, if a rule matches.

        ``params`` must contain ``'key'`` (the urlkey) and ``'url'``. The
        first rule in ``self.rules`` whose prefix and regex both match the
        urlkey is used. On a match, ``params`` gets a new ``'url'``,
        ``'matchType'`` and ``'filter'``, and every key listed in
        ``REMOVE_PARAMS`` is removed.

        Returns the matched ``FuzzyRule``, or ``None`` (leaving ``params``
        untouched) when no rule matches.
        """
        urlkey = to_native_str(params['key'], 'utf-8')

        matched_rule = None
        filters = []

        for rule in self.rules:
            if not any(urlkey.startswith(prefix) for prefix in rule.url_prefix):
                continue

            m = rule.regex.search(urlkey)
            if not m:
                continue

            matched_rule = rule
            # one filter per (regex group, filter template) pair, group-major
            filters = [f.format(g)
                       for g in m.groups()
                       for f in rule.filter_str]
            break

        if not matched_rule:
            return None

        url = params['url']

        # Keep the url only up to and including the first 'replace_after'.
        # NOTE(review): a marker found at index 0 is deliberately(?) ignored
        # by the '> 0' check -- confirm this is intended.
        inx = url.find(matched_rule.replace_after)
        if inx > 0:
            url = url[:inx + len(matched_rule.replace_after)]

        if matched_rule.match_type == 'domain':
            host = urlsplit(url).netloc
            # drop the first host label, e.g. 'a.example.com' -> 'example.com'
            # NOTE(review): raises IndexError when host contains no '.'
            url = host.split('.', 1)[1]

        params.update({'url': url,
                       'matchType': matched_rule.match_type,
                       'filter': filters})

        for param in self.REMOVE_PARAMS:
            params.pop(param, '')

        return matched_rule

    def make_regex(self, config):
        """Compile a match spec into a regex.

        * list: treated as query arg names, see ``make_query_match_regex``
        * dict: the ``regex`` string (default ``''``) followed by the
          query-arg pattern built from its ``args`` list (default empty)
        * anything else: ``str(config)`` is compiled as-is
        """
        if isinstance(config, list):
            string = self.make_query_match_regex(config)

        elif isinstance(config, dict):
            string = config.get('regex', '')
            string += self.make_query_match_regex(config.get('args', []))

        else:
            string = str(config)

        return re.compile(string)

    def make_query_match_regex(self, params_list):
        """Build a pattern string capturing ``name=value`` for each arg name.

        Names are used in sorted order and joined with ``.*``; each name is
        regex-escaped. The caller's list is not modified (it is typically
        part of the loaded config). An empty list yields ``''``.
        """
        def conv(value):
            return '[?&]({0}=[^&]+)'.format(re.escape(value))

        # sorted() rather than list.sort(): do not mutate the caller's list
        return '.*'.join(conv(param) for param in sorted(params_list))

    def __call__(self, index_source, params):
        """Query ``index_source`` and wrap the result with fuzzy fallback.

        ``index_source(params)`` must return a ``(cdx_iter, errs)`` pair.
        Returns ``(iterator, errs)`` where the iterator is the lazy
        generator produced by ``get_fuzzy_iter``.
        """
        cdx_iter, errs = index_source(params)
        return self.get_fuzzy_iter(cdx_iter, index_source, params), errs

    def get_fuzzy_iter(self, cdx_iter, index_source, params):
        """Yield exact results, or fuzzy results if there were none.

        All items of ``cdx_iter`` are yielded first. Only if it was empty
        is ``params`` rewritten via ``get_fuzzy_match`` and ``index_source``
        queried again; results of that second query are yielded when
        ``allow_fuzzy_result`` accepts them. Errors reported by the second
        query are discarded.
        """
        found = False
        for cdx in cdx_iter:
            found = True
            yield cdx

        if found:
            return

        rule = self.get_fuzzy_match(params)
        if not rule:
            return

        new_iter, _ = index_source(params)

        for cdx in new_iter:
            if self.allow_fuzzy_result(rule, cdx):
                yield cdx

    def allow_fuzzy_result(self, rule, cdx):
        """Decide whether a fuzzy result is returned; currently always True."""
        return True
|
||||||
|
|
||||||
|
|
@ -4,8 +4,7 @@ from pywb.utils.wbexception import BadRequestException, WbException
|
|||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
from warcio.recordloader import ArchiveLoadFailed
|
from warcio.recordloader import ArchiveLoadFailed
|
||||||
|
|
||||||
from pywb.cdx.query import CDXQuery
|
from pywb.webagg.fuzzymatcher import FuzzyMatcher
|
||||||
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
|
|
||||||
|
|
||||||
import six
|
import six
|
||||||
|
|
||||||
@ -27,37 +26,6 @@ def to_link(cdx_iter, fields):
|
|||||||
content_type = 'application/link'
|
content_type = 'application/link'
|
||||||
return content_type, MementoUtils.make_timemap(cdx_iter)
|
return content_type, MementoUtils.make_timemap(cdx_iter)
|
||||||
|
|
||||||
#=============================================================================
|
|
||||||
class FuzzyMatcher(object):
    """Index lookup wrapper that retries with a fuzzy query on no results."""

    def __init__(self):
        # the loader's result is unpacked into the canonicalizer and the
        # fuzzy query callable
        self.url_canon, self.fuzzy_query = load_domain_specific_cdx_rules(
            'pywb/rules.yaml', True)

    def __call__(self, index_source, params):
        """Run the query; return (lazy result iterator, errs of first query)."""
        exact_iter, errs = index_source(params)
        return self.do_fuzzy(exact_iter, index_source, params), errs

    def do_fuzzy(self, cdx_iter, index_source, params):
        """Yield all of cdx_iter; if it was empty, yield fuzzy query results."""
        got_exact = False
        for cdx in cdx_iter:
            got_exact = True
            yield cdx

        # exact results were produced: no fuzzy lookup
        if got_exact:
            return

        fuzzy_params = self.fuzzy_query(CDXQuery(params))
        if not fuzzy_params:
            return

        fuzzy_params.pop('alt_url', '')

        # errors from the second query are not reported
        fuzzy_iter, _ = index_source(fuzzy_params)
        for cdx in fuzzy_iter:
            yield cdx
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class IndexHandler(object):
|
class IndexHandler(object):
|
||||||
@ -73,7 +41,7 @@ class IndexHandler(object):
|
|||||||
def __init__(self, index_source, opts=None, *args, **kwargs):
|
def __init__(self, index_source, opts=None, *args, **kwargs):
|
||||||
self.index_source = index_source
|
self.index_source = index_source
|
||||||
self.opts = opts or {}
|
self.opts = opts or {}
|
||||||
self.fuzzy = FuzzyMatcher()
|
self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
|
||||||
|
|
||||||
def get_supported_modes(self):
|
def get_supported_modes(self):
|
||||||
return dict(modes=['list_sources', 'index'])
|
return dict(modes=['list_sources', 'index'])
|
||||||
|
@ -334,13 +334,13 @@ foo=bar&test=abc"""
|
|||||||
assert 'ResErrors' not in resp.headers
|
assert 'ResErrors' not in resp.headers
|
||||||
|
|
||||||
def test_agg_seq_fallback_1(self):
|
def test_agg_seq_fallback_1(self):
|
||||||
resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/')
|
resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/status/200')
|
||||||
|
|
||||||
assert resp.headers['WebAgg-Source-Coll'] == 'live'
|
assert resp.headers['WebAgg-Source-Coll'] == 'live'
|
||||||
|
|
||||||
self._check_uri_date(resp, 'http://httpbin.org/', True)
|
self._check_uri_date(resp, 'http://httpbin.org/status/200', True)
|
||||||
|
|
||||||
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/', 'original')
|
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/status/200', 'original')
|
||||||
|
|
||||||
assert b'HTTP/1.1 200 OK' in resp.body
|
assert b'HTTP/1.1 200 OK' in resp.body
|
||||||
|
|
||||||
|
@ -396,10 +396,13 @@ class TestWbIntegration(BaseConfigTest):
|
|||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert '"data": "^"' in resp.text
|
assert '"data": "^"' in resp.text
|
||||||
|
|
||||||
def test_post_invalid(self):
|
def test_post_fuzzy_match(self):
|
||||||
# not json
|
resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
|
||||||
resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
|
assert resp.status_int == 200
|
||||||
assert resp.status_int == 404
|
assert '"A": "1"' in resp.text
|
||||||
|
assert '"B": "[]"' in resp.text
|
||||||
|
assert '"C": "3"' in resp.text
|
||||||
|
|
||||||
|
|
||||||
def test_post_referer_redirect(self):
|
def test_post_referer_redirect(self):
|
||||||
# allowing 307 redirects
|
# allowing 307 redirects
|
||||||
|
Loading…
x
Reference in New Issue
Block a user