From 0f0c20a03a4c74b6fccbf99daacc35a8c90a5659 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 14 Mar 2017 11:39:36 -0700 Subject: [PATCH] fuzzy matching: new, clean fuzzy matcher implementation for webagg rules: default rule: fuzzy match urls ignoring prefix match (needs more testing) tests: update tests for new broad fuzzy match rule --- pywb/rules.yaml | 10 +- pywb/webagg/fuzzymatcher.py | 148 ++++++++++++++++++++++++++++++ pywb/webagg/handlers.py | 36 +------- pywb/webagg/test/test_handlers.py | 6 +- tests/test_integration.py | 11 ++- 5 files changed, 166 insertions(+), 45 deletions(-) create mode 100644 pywb/webagg/fuzzymatcher.py diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 2f6fb47d..07b44112 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -341,7 +341,9 @@ rules: #================================================================= # Applies to all urls -- should be last - url_prefix: '' - fuzzy_lookup: - match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' - filter: ['=urlkey:{0}'] - replace: '?' + fuzzy_lookup: '()' + + #fuzzy_lookup: + # match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' + # filter: ['=urlkey:{0}'] + # replace: '?' diff --git a/pywb/webagg/fuzzymatcher.py b/pywb/webagg/fuzzymatcher.py new file mode 100644 index 00000000..9646bce0 --- /dev/null +++ b/pywb/webagg/fuzzymatcher.py @@ -0,0 +1,148 @@ +from warcio.utils import to_native_str +from pywb.utils.loaders import load_yaml_config + +import re + +from six.moves.urllib.parse import urlsplit +from collections import namedtuple + + +# ============================================================================ +FuzzyRule = namedtuple('FuzzyRule', + 'url_prefix, regex, replace_after, filter_str, ' + + 'match_type') + + +# ============================================================================ +class FuzzyMatcher(object): + DEFAULT_FILTER = ['~urlkey:{0}'] + DEFAULT_MATCH_TYPE = 'prefix' + DEFAULT_REPLACE_AFTER = '?' + + REMOVE_PARAMS = ['alt_url', 'reverse', 'closest', 'end_key'] + + def __init__(self, filename): + config = load_yaml_config(filename) + self.rules = [] + for rule in config.get('rules'): + rule = self.parse_fuzzy_rule(rule) + if rule: + self.rules.append(rule) + + def parse_fuzzy_rule(self, rule): + """ Parse rules using all the different supported forms + """ + url_prefix = rule.get('url_prefix') + config = rule.get('fuzzy_lookup') + if not config: + return + + if not isinstance(url_prefix, list): + url_prefix = [url_prefix] + + if not isinstance(config, dict): + regex = self.make_regex(config) + replace_after = self.DEFAULT_REPLACE_AFTER + filter_str = self.DEFAULT_FILTER + match_type = self.DEFAULT_MATCH_TYPE + + else: + regex = self.make_regex(config.get('match')) + replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER) + filter_str = config.get('filter', self.DEFAULT_FILTER) + match_type = config.get('type', self.DEFAULT_MATCH_TYPE) + + return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type) + + def get_fuzzy_match(self, params): + urlkey = to_native_str(params['key'], 'utf-8') + + filters = [] + matched_rule = None + + for rule in self.rules: + if not any((urlkey.startswith(prefix) for prefix in rule.url_prefix)): + continue + + m = rule.regex.search(urlkey) + if not m: + continue + + matched_rule = rule + groups = m.groups() + for g in groups: + for f in matched_rule.filter_str: + filters.append(f.format(g)) + + break + + if not matched_rule: + return None + + url = params['url'] + + inx = url.find(matched_rule.replace_after) + if inx > 0: + url = url[:inx + len(matched_rule.replace_after)] + + if matched_rule.match_type == 'domain': + host = urlsplit(url).netloc + url = host.split('.', 1)[1] + + params.update({'url': url, + 'matchType': matched_rule.match_type, + 'filter': filters}) + + for param in self.REMOVE_PARAMS: + params.pop(param, '') + + return matched_rule + + def make_regex(self, config): + if isinstance(config, list): + string = self.make_query_match_regex(config) + + elif isinstance(config, dict): + string = config.get('regex', '') + string += self.make_query_match_regex(config.get('args', [])) + + else: + string = str(config) + + return re.compile(string) + + def make_query_match_regex(self, params_list): + params_list.sort() + + def conv(value): + return '[?&]({0}=[^&]+)'.format(re.escape(value)) + + return '.*'.join([conv(param) for param in params_list]) + + def __call__(self, index_source, params): + cdx_iter, errs = index_source(params) + return self.get_fuzzy_iter(cdx_iter, index_source, params), errs + + def get_fuzzy_iter(self, cdx_iter, index_source, params): + found = False + for cdx in cdx_iter: + found = True + yield cdx + + if found: + return + + rule = self.get_fuzzy_match(params) + if not rule: + return + + new_iter, errs = index_source(params) + + for cdx in new_iter: + if self.allow_fuzzy_result(rule, cdx): + yield cdx + + def allow_fuzzy_result(self, rule, cdx): + return True + + diff --git a/pywb/webagg/handlers.py b/pywb/webagg/handlers.py index 6044edc7..c1b7df74 100644 --- a/pywb/webagg/handlers.py +++ b/pywb/webagg/handlers.py @@ -4,8 +4,7 @@ from pywb.utils.wbexception import BadRequestException, WbException from pywb.utils.wbexception import NotFoundException from warcio.recordloader import ArchiveLoadFailed -from pywb.cdx.query import CDXQuery -from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules +from pywb.webagg.fuzzymatcher import FuzzyMatcher import six @@ -27,37 +26,6 @@ def to_link(cdx_iter, fields): content_type = 'application/link' return content_type, MementoUtils.make_timemap(cdx_iter) -#============================================================================= -class FuzzyMatcher(object): - def __init__(self): - res = load_domain_specific_cdx_rules('pywb/rules.yaml', True) - self.url_canon, self.fuzzy_query = res - - def __call__(self, index_source, params): - cdx_iter, errs = index_source(params) - return self.do_fuzzy(cdx_iter, index_source, params), errs - - def do_fuzzy(self, cdx_iter, index_source, params): - found = False - for cdx in cdx_iter: - found = True - yield cdx - - fuzzy_query_params = None - if not found: - query = CDXQuery(params) - fuzzy_query_params = self.fuzzy_query(query) - - if not fuzzy_query_params: - return - - fuzzy_query_params.pop('alt_url', '') - - new_iter, errs = index_source(fuzzy_query_params) - - for cdx in new_iter: - yield cdx - #============================================================================= class IndexHandler(object): @@ -73,7 +41,7 @@ class IndexHandler(object): def __init__(self, index_source, opts=None, *args, **kwargs): self.index_source = index_source self.opts = opts or {} - self.fuzzy = FuzzyMatcher() + self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml') def get_supported_modes(self): return dict(modes=['list_sources', 'index']) diff --git a/pywb/webagg/test/test_handlers.py b/pywb/webagg/test/test_handlers.py index 55327d5e..e95eaa13 100644 --- a/pywb/webagg/test/test_handlers.py +++ b/pywb/webagg/test/test_handlers.py @@ -334,13 +334,13 @@ foo=bar&test=abc""" assert 'ResErrors' not in resp.headers def test_agg_seq_fallback_1(self): - resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/') + resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/status/200') assert resp.headers['WebAgg-Source-Coll'] == 'live' - self._check_uri_date(resp, 'http://httpbin.org/', True) + self._check_uri_date(resp, 'http://httpbin.org/status/200', True) - assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/', 'original') + assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/status/200', 'original') assert b'HTTP/1.1 200 OK' in resp.body diff --git a/tests/test_integration.py b/tests/test_integration.py index 3f0c6473..2eefc60d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -396,10 +396,13 @@ class TestWbIntegration(BaseConfigTest): assert resp.status_int == 200 assert '"data": "^"' in resp.text - def test_post_invalid(self): - # not json - resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404) - assert resp.status_int == 404 + def test_post_fuzzy_match(self): + resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'}) + assert resp.status_int == 200 + assert '"A": "1"' in resp.text + assert '"B": "[]"' in resp.text + assert '"C": "3"' in resp.text + def test_post_referer_redirect(self): # allowing 307 redirects