1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

fuzzy matching: new, clean fuzzy matcher implementation for webagg

rules: default rule: fuzzy match urls using a prefix match that ignores the query string (needs more testing)
tests: update tests for new broad fuzzy match rule
This commit is contained in:
Ilya Kreymer 2017-03-14 11:39:36 -07:00
parent e0878f0f67
commit 0f0c20a03a
5 changed files with 166 additions and 45 deletions

View File

@ -341,7 +341,9 @@ rules:
#================================================================= #=================================================================
# Applies to all urls -- should be last # Applies to all urls -- should be last
- url_prefix: '' - url_prefix: ''
fuzzy_lookup: fuzzy_lookup: '()'
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
filter: ['=urlkey:{0}'] #fuzzy_lookup:
replace: '?' # match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
# filter: ['=urlkey:{0}']
# replace: '?'

148
pywb/webagg/fuzzymatcher.py Normal file
View File

@ -0,0 +1,148 @@
from warcio.utils import to_native_str
from pywb.utils.loaders import load_yaml_config
import re
from six.moves.urllib.parse import urlsplit
from collections import namedtuple
# ============================================================================
# Immutable record describing one parsed fuzzy-match rule. Field order is
# significant: instances are created positionally.
FuzzyRule = namedtuple(
    'FuzzyRule',
    ['url_prefix', 'regex', 'replace_after', 'filter_str', 'match_type']
)
# ============================================================================
class FuzzyMatcher(object):
    """Retry an index lookup with a broader query when the exact one is empty.

    Rules are read from the ``rules`` list of a YAML config file. Each rule
    that has a ``fuzzy_lookup`` entry is converted into a ``FuzzyRule``.
    When called, the matcher first yields the results of the unmodified
    query; only if that produced nothing does it rewrite ``params``
    according to the first matching rule and query the index source again.
    """

    # Filter template(s) applied once per regex capture group.
    DEFAULT_FILTER = ['~urlkey:{0}']
    # Value stored under 'matchType' in the rewritten params.
    DEFAULT_MATCH_TYPE = 'prefix'
    # The url is truncated just after the first occurrence of this string.
    DEFAULT_REPLACE_AFTER = '?'

    # Query params dropped from the rewritten (fuzzy) query.
    REMOVE_PARAMS = ['alt_url', 'reverse', 'closest', 'end_key']

    def __init__(self, filename):
        """Load the rules file and parse every rule that enables fuzzy lookup.

        :param filename: path/URI passed unchanged to ``load_yaml_config``.
        """
        config = load_yaml_config(filename) or {}

        self.rules = []
        # A config with no (or an empty) 'rules' entry yields no fuzzy rules
        # instead of failing on iteration over None.
        for rule in config.get('rules') or []:
            rule = self.parse_fuzzy_rule(rule)
            if rule:
                self.rules.append(rule)

    def parse_fuzzy_rule(self, rule):
        """Parse rules using all the different supported forms.

        ``fuzzy_lookup`` may be a dict with optional ``match``, ``replace``,
        ``filter`` and ``type`` keys, or any other value (string, list) that
        is handed directly to ``make_regex`` with all other settings taken
        from the class defaults.

        :param rule: a single rule mapping from the config.
        :return: a ``FuzzyRule``, or ``None`` if the rule has no (or an
                 empty) ``fuzzy_lookup`` entry.
        """
        url_prefix = rule.get('url_prefix')
        config = rule.get('fuzzy_lookup')
        if not config:
            return

        # Normalize to a list so matching can always iterate over prefixes.
        if not isinstance(url_prefix, list):
            url_prefix = [url_prefix]

        if not isinstance(config, dict):
            regex = self.make_regex(config)
            replace_after = self.DEFAULT_REPLACE_AFTER
            filter_str = self.DEFAULT_FILTER
            match_type = self.DEFAULT_MATCH_TYPE
        else:
            regex = self.make_regex(config.get('match'))
            replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
            filter_str = config.get('filter', self.DEFAULT_FILTER)
            match_type = config.get('type', self.DEFAULT_MATCH_TYPE)

        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)

    def get_fuzzy_match(self, params):
        """Find the first rule matching ``params['key']`` and rewrite ``params``.

        On a match, ``params`` is modified IN PLACE: ``url`` is replaced by
        the truncated url (or the host minus its first label for the
        ``'domain'`` match type), ``matchType`` and ``filter`` are set, and
        every key in ``REMOVE_PARAMS`` is removed.

        :param params: query params; must contain ``key`` and ``url``.
        :return: the matched ``FuzzyRule``, or ``None`` if no rule applies
                 (in which case ``params`` is left untouched).
        """
        urlkey = to_native_str(params['key'], 'utf-8')

        filters = []
        matched_rule = None

        for rule in self.rules:
            if not any(urlkey.startswith(prefix) for prefix in rule.url_prefix):
                continue

            m = rule.regex.search(urlkey)
            if not m:
                continue

            matched_rule = rule
            # One filter per (capture group, filter template) combination.
            for g in m.groups():
                for f in matched_rule.filter_str:
                    filters.append(f.format(g))

            # Only the first matching rule is used.
            break

        if not matched_rule:
            return None

        url = params['url']

        # Keep everything up to and including the marker. The strict '> 0'
        # also makes an empty marker (find() == 0) a no-op rather than
        # truncating the url to nothing.
        inx = url.find(matched_rule.replace_after)
        if inx > 0:
            url = url[:inx + len(matched_rule.replace_after)]

        if matched_rule.match_type == 'domain':
            host = urlsplit(url).netloc
            # Drop the first host label; [-1] keeps a dot-less host
            # (e.g. 'localhost') as-is instead of raising IndexError.
            url = host.split('.', 1)[-1]

        params.update({'url': url,
                       'matchType': matched_rule.match_type,
                       'filter': filters})

        for param in self.REMOVE_PARAMS:
            params.pop(param, '')

        return matched_rule

    def make_regex(self, config):
        """Compile the match pattern for a rule.

        :param config: a list of query arg names, a dict with optional
                       ``regex`` and ``args`` keys (the arg pattern is
                       appended to the regex), or any other value, which is
                       converted with ``str()`` and used as the pattern.
        :return: a compiled regular expression object.
        """
        if isinstance(config, list):
            string = self.make_query_match_regex(config)

        elif isinstance(config, dict):
            string = config.get('regex', '')
            string += self.make_query_match_regex(config.get('args', []))

        else:
            string = str(config)

        return re.compile(string)

    def make_query_match_regex(self, params_list):
        """Build a pattern capturing ``name=value`` for each listed query arg.

        Arg names are processed in sorted order and joined with ``.*``.
        The input list is not modified.

        :param params_list: list of query argument names.
        :return: the pattern as a string ('' for an empty list).
        """
        def conv(value):
            return '[?&]({0}=[^&]+)'.format(re.escape(value))

        # sorted() rather than list.sort(): the list belongs to the loaded
        # config and must not be reordered as a side effect.
        return '.*'.join([conv(param) for param in sorted(params_list)])

    def __call__(self, index_source, params):
        """Run the exact query and wrap its results with the fuzzy fallback.

        :param index_source: callable taking ``params`` and returning a
                             ``(cdx_iter, errs)`` pair.
        :param params: query params; may be rewritten in place if the fuzzy
                       fallback is triggered while iterating.
        :return: ``(iterator, errs)`` where ``errs`` comes from the exact
                 (first) query.
        """
        cdx_iter, errs = index_source(params)
        return self.get_fuzzy_iter(cdx_iter, index_source, params), errs

    def get_fuzzy_iter(self, cdx_iter, index_source, params):
        """Yield exact results, or fuzzy results if there were none.

        :param cdx_iter: iterator over results of the exact query.
        :param index_source: callable used to run the fuzzy query.
        :param params: query params, rewritten in place by
                       ``get_fuzzy_match`` before the fuzzy query.
        """
        found = False
        for cdx in cdx_iter:
            found = True
            yield cdx

        if found:
            return

        rule = self.get_fuzzy_match(params)
        if not rule:
            return

        # Errors from the second lookup are not propagated: the errs value
        # handed to the caller was already returned by __call__.
        new_iter, _ = index_source(params)

        for cdx in new_iter:
            if self.allow_fuzzy_result(rule, cdx):
                yield cdx

    def allow_fuzzy_result(self, rule, cdx):
        """Decide whether a fuzzy result is yielded; currently always True."""
        return True

View File

@ -4,8 +4,7 @@ from pywb.utils.wbexception import BadRequestException, WbException
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from warcio.recordloader import ArchiveLoadFailed from warcio.recordloader import ArchiveLoadFailed
from pywb.cdx.query import CDXQuery from pywb.webagg.fuzzymatcher import FuzzyMatcher
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
import six import six
@ -27,37 +26,6 @@ def to_link(cdx_iter, fields):
content_type = 'application/link' content_type = 'application/link'
return content_type, MementoUtils.make_timemap(cdx_iter) return content_type, MementoUtils.make_timemap(cdx_iter)
#=============================================================================
class FuzzyMatcher(object):
def __init__(self):
res = load_domain_specific_cdx_rules('pywb/rules.yaml', True)
self.url_canon, self.fuzzy_query = res
def __call__(self, index_source, params):
cdx_iter, errs = index_source(params)
return self.do_fuzzy(cdx_iter, index_source, params), errs
def do_fuzzy(self, cdx_iter, index_source, params):
found = False
for cdx in cdx_iter:
found = True
yield cdx
fuzzy_query_params = None
if not found:
query = CDXQuery(params)
fuzzy_query_params = self.fuzzy_query(query)
if not fuzzy_query_params:
return
fuzzy_query_params.pop('alt_url', '')
new_iter, errs = index_source(fuzzy_query_params)
for cdx in new_iter:
yield cdx
#============================================================================= #=============================================================================
class IndexHandler(object): class IndexHandler(object):
@ -73,7 +41,7 @@ class IndexHandler(object):
def __init__(self, index_source, opts=None, *args, **kwargs): def __init__(self, index_source, opts=None, *args, **kwargs):
self.index_source = index_source self.index_source = index_source
self.opts = opts or {} self.opts = opts or {}
self.fuzzy = FuzzyMatcher() self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
def get_supported_modes(self): def get_supported_modes(self):
return dict(modes=['list_sources', 'index']) return dict(modes=['list_sources', 'index'])

View File

@ -334,13 +334,13 @@ foo=bar&test=abc"""
assert 'ResErrors' not in resp.headers assert 'ResErrors' not in resp.headers
def test_agg_seq_fallback_1(self): def test_agg_seq_fallback_1(self):
resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/') resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/status/200')
assert resp.headers['WebAgg-Source-Coll'] == 'live' assert resp.headers['WebAgg-Source-Coll'] == 'live'
self._check_uri_date(resp, 'http://httpbin.org/', True) self._check_uri_date(resp, 'http://httpbin.org/status/200', True)
assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/', 'original') assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/status/200', 'original')
assert b'HTTP/1.1 200 OK' in resp.body assert b'HTTP/1.1 200 OK' in resp.body

View File

@ -396,10 +396,13 @@ class TestWbIntegration(BaseConfigTest):
assert resp.status_int == 200 assert resp.status_int == 200
assert '"data": "^"' in resp.text assert '"data": "^"' in resp.text
def test_post_invalid(self): def test_post_fuzzy_match(self):
# not json resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404) assert resp.status_int == 200
assert resp.status_int == 404 assert '"A": "1"' in resp.text
assert '"B": "[]"' in resp.text
assert '"C": "3"' in resp.text
def test_post_referer_redirect(self): def test_post_referer_redirect(self):
# allowing 307 redirects # allowing 307 redirects