From 0f0c20a03a4c74b6fccbf99daacc35a8c90a5659 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Tue, 14 Mar 2017 11:39:36 -0700
Subject: [PATCH] fuzzy matching: new, clean fuzzy matcher implementation for
 webagg rules: default rule: fuzzy match urls ignoring prefix match (needs
 more testing) tests: update tests for new broad fuzzy match rule

---
 pywb/rules.yaml                   |  10 +-
 pywb/webagg/fuzzymatcher.py       | 148 ++++++++++++++++++++++++++++++
 pywb/webagg/handlers.py           |  36 +-------
 pywb/webagg/test/test_handlers.py |   6 +-
 tests/test_integration.py         |  11 ++-
 5 files changed, 166 insertions(+), 45 deletions(-)
 create mode 100644 pywb/webagg/fuzzymatcher.py

diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index 2f6fb47d..07b44112 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -341,7 +341,9 @@ rules:
     #=================================================================
     # Applies to all urls -- should be last
     - url_prefix: ''
-      fuzzy_lookup:
-        match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
-        filter: ['=urlkey:{0}']
-        replace: '?'
+      fuzzy_lookup: '()'
+
+      #fuzzy_lookup:
+      #  match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
+      #  filter: ['=urlkey:{0}']
+      #  replace: '?'
diff --git a/pywb/webagg/fuzzymatcher.py b/pywb/webagg/fuzzymatcher.py
new file mode 100644
index 00000000..9646bce0
--- /dev/null
+++ b/pywb/webagg/fuzzymatcher.py
@@ -0,0 +1,148 @@
+from warcio.utils import to_native_str
+from pywb.utils.loaders import load_yaml_config
+
+import re
+
+from six.moves.urllib.parse import urlsplit
+from collections import namedtuple
+
+
+# ============================================================================
+# parsed form of a single rule's 'fuzzy_lookup' config
+FuzzyRule = namedtuple('FuzzyRule', ['url_prefix', 'regex', 'replace_after',
+                                     'filter_str', 'match_type'])
+
+
+# ============================================================================
+class FuzzyMatcher(object):
+    """ Retry empty index lookups using yaml 'fuzzy_lookup' rules """
+    DEFAULT_FILTER = ['~urlkey:{0}']
+    DEFAULT_MATCH_TYPE = 'prefix'
+    DEFAULT_REPLACE_AFTER = '?'
+
+    # query params removed before the fuzzy query is issued
+    REMOVE_PARAMS = ['alt_url', 'reverse', 'closest', 'end_key']
+
+    def __init__(self, filename):
+        """ Load the rules file, keeping rules that have a 'fuzzy_lookup' """
+        # a missing or empty 'rules' entry simply means no fuzzy rules
+        rules = load_yaml_config(filename).get('rules') or []
+        parsed = (self.parse_fuzzy_rule(rule) for rule in rules)
+        self.rules = [rule for rule in parsed if rule]
+
+    def parse_fuzzy_rule(self, rule):
+        """ Parse rules using all the different supported forms:
+        'fuzzy_lookup' is a dict with 'match' and optional 'replace',
+        'filter' and 'type' keys, or a bare 'match' value using the defaults.
+        Returns a FuzzyRule, or None if the rule has no 'fuzzy_lookup'.
+        """
+        config = rule.get('fuzzy_lookup')
+        if not config:
+            return None
+
+        url_prefix = rule.get('url_prefix')
+        if not isinstance(url_prefix, list):
+            # no prefix behaves like '': the rule applies to all urls
+            url_prefix = [url_prefix or '']
+
+        if not isinstance(config, dict):
+            config = {'match': config}
+
+        return FuzzyRule(url_prefix,
+                         self.make_regex(config.get('match')),
+                         config.get('replace', self.DEFAULT_REPLACE_AFTER),
+                         config.get('filter', self.DEFAULT_FILTER),
+                         config.get('type', self.DEFAULT_MATCH_TYPE))
+
+    def get_fuzzy_match(self, params):
+        """ Find the first rule matching params['key'] and update params in
+        place ('url', 'matchType', 'filter') for the fuzzy query.
+        Returns the matched FuzzyRule, or None with params left unchanged.
+        """
+        urlkey = to_native_str(params['key'], 'utf-8')
+        for rule in self.rules:
+            if not urlkey.startswith(tuple(rule.url_prefix)):
+                continue
+            m = rule.regex.search(urlkey)
+            if m:
+                break
+        else:  # loop ended without a break: no rule matched
+            return None
+
+        # one filter per captured group and filter template; groups that did
+        # not take part in the match are None and are skipped
+        filters = [f.format(g) for g in m.groups() if g is not None
+                   for f in rule.filter_str]
+
+        # cut the url right after the first 'replace_after' marker
+        url = params['url']
+        inx = url.find(rule.replace_after)
+        if inx > 0:
+            url = url[:inx + len(rule.replace_after)]
+
+        if rule.match_type == 'domain':
+            # drop the first host label; a host without dots is kept whole
+            host = urlsplit(url).netloc
+            url = host.partition('.')[2] or host
+
+        params.update({'url': url,
+                       'matchType': rule.match_type,
+                       'filter': filters})
+        for param in self.REMOVE_PARAMS:
+            params.pop(param, None)
+
+        return rule
+
+    def make_regex(self, config):
+        """ Compile a 'match' value: a list of query arg names, a dict with
+        'regex' and/or 'args' keys, or any other value used as the regex
+        """
+        if isinstance(config, list):
+            string = self.make_query_match_regex(config)
+        elif isinstance(config, dict):
+            string = config.get('regex', '')
+            string += self.make_query_match_regex(config.get('args', []))
+        else:
+            string = str(config)
+
+        return re.compile(string)
+
+    def make_query_match_regex(self, params_list):
+        """ Build a regex capturing each 'name=value' query arg named in
+        params_list, in sorted order; params_list itself is not modified
+        """
+        def conv(value):
+            return '[?&]({0}=[^&]+)'.format(re.escape(value))
+
+        return '.*'.join(conv(param) for param in sorted(params_list))
+
+    def __call__(self, index_source, params):
+        """ Query index_source and return (cdx iterator, errs); the iterator
+        switches to the fuzzy query when the exact query has no results
+        """
+        cdx_iter, errs = index_source(params)
+        return self.get_fuzzy_iter(cdx_iter, index_source, params), errs
+
+    def get_fuzzy_iter(self, cdx_iter, index_source, params):
+        """ Yield all of cdx_iter; only when it is empty, yield the allowed
+        results of the fuzzy query instead
+        """
+        found = False
+        for cdx in cdx_iter:
+            found = True
+            yield cdx
+
+        rule = None if found else self.get_fuzzy_match(params)
+        if not rule:
+            return
+        # errs of the fuzzy lookup are not passed on to the caller
+        new_iter, _ = index_source(params)
+        for cdx in new_iter:
+            if self.allow_fuzzy_result(rule, cdx):
+                yield cdx
+
+    def allow_fuzzy_result(self, rule, cdx):
+        """ Hook for rejecting fuzzy results; accepts every cdx for now """
+        return True
+
+
diff --git a/pywb/webagg/handlers.py b/pywb/webagg/handlers.py
index 6044edc7..c1b7df74 100644
--- a/pywb/webagg/handlers.py
+++ b/pywb/webagg/handlers.py
@@ -4,8 +4,7 @@ from pywb.utils.wbexception import BadRequestException, WbException
 from pywb.utils.wbexception import NotFoundException
 from warcio.recordloader import ArchiveLoadFailed
 
-from pywb.cdx.query import CDXQuery
-from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
+from pywb.webagg.fuzzymatcher import FuzzyMatcher
 
 import six
 
@@ -27,37 +26,6 @@ def to_link(cdx_iter, fields):
     content_type = 'application/link'
     return content_type, MementoUtils.make_timemap(cdx_iter)
 
-#=============================================================================
-class FuzzyMatcher(object):
-    def __init__(self):
-        res = load_domain_specific_cdx_rules('pywb/rules.yaml', True)
-        self.url_canon, self.fuzzy_query = res
-
-    def __call__(self, index_source, params):
-        cdx_iter, errs = index_source(params)
-        return self.do_fuzzy(cdx_iter, index_source, params), errs
-
-    def do_fuzzy(self, cdx_iter, index_source, params):
-        found = False
-        for cdx in cdx_iter:
-            found = True
-            yield cdx
-
-        fuzzy_query_params = None
-        if not found:
-            query = CDXQuery(params)
-            fuzzy_query_params = self.fuzzy_query(query)
-
-        if not fuzzy_query_params:
-            return
-
-        fuzzy_query_params.pop('alt_url', '')
-
-        new_iter, errs = index_source(fuzzy_query_params)
-
-        for cdx in new_iter:
-            yield cdx
-
 
 #=============================================================================
 class IndexHandler(object):
@@ -73,7 +41,7 @@ class IndexHandler(object):
     def __init__(self, index_source, opts=None, *args, **kwargs):
         self.index_source = index_source
         self.opts = opts or {}
-        self.fuzzy = FuzzyMatcher()
+        self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
 
     def get_supported_modes(self):
         return dict(modes=['list_sources', 'index'])
diff --git a/pywb/webagg/test/test_handlers.py b/pywb/webagg/test/test_handlers.py
index 55327d5e..e95eaa13 100644
--- a/pywb/webagg/test/test_handlers.py
+++ b/pywb/webagg/test/test_handlers.py
@@ -334,13 +334,13 @@ foo=bar&test=abc"""
         assert 'ResErrors' not in resp.headers
 
     def test_agg_seq_fallback_1(self):
-        resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/')
+        resp = self.testapp.get('/fallback/resource?url=http://httpbin.org/status/200')
 
         assert resp.headers['WebAgg-Source-Coll'] == 'live'
 
-        self._check_uri_date(resp, 'http://httpbin.org/', True)
+        self._check_uri_date(resp, 'http://httpbin.org/status/200', True)
 
-        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/', 'original')
+        assert resp.headers['Link'] == MementoUtils.make_link('http://httpbin.org/status/200', 'original')
 
         assert b'HTTP/1.1 200 OK' in resp.body
 
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 3f0c6473..2eefc60d 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -396,10 +396,13 @@ class TestWbIntegration(BaseConfigTest):
         assert resp.status_int == 200
         assert '"data": "^"' in resp.text
 
-    def test_post_invalid(self):
-        # not json
-        resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
-        assert resp.status_int == 404
+    def test_post_fuzzy_match(self):
+        # NOTE(review): presumably served via the new broad fuzzy rule -- confirm
+        resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
+        assert resp.status_int == 200
+        for expected in ('"A": "1"', '"B": "[]"', '"C": "3"'):
+            assert expected in resp.text
+
 
     def test_post_referer_redirect(self):
         # allowing 307 redirects