From a82cfc1ab2ac9a0e957a7d7b0ea1f801ec0b7d31 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@gmail.com>
Date: Mon, 20 Mar 2017 14:41:12 -0700
Subject: [PATCH] rewriter: add rewrite_dash for rewriting DASH and HLS
 manifests! rewriter: refactor to use mixins to extend base rewriter (todo:
 more refactoring) fuzzy-matcher: support for additional 'match_filters' to
 filter fuzzy results via optional regexes by mime type, eg. allow more
 lenient fuzzy matching on DASH manifests than other resources (for now)
 fuzzy-matching: add WebAgg-Fuzzy-Match response header if response is fuzzy
 matched, redirect to exact match in rewriterapp

---
 pywb/rewrite/header_rewriter.py |  2 ++
 pywb/rewrite/rewrite_amf.py     |  5 ++---
 pywb/rewrite/rewrite_content.py | 16 +++++++++-----
 pywb/rules.yaml                 | 16 +++++++++-----
 pywb/static/wombat.js           |  2 +-
 pywb/urlrewrite/rewriterapp.py  | 15 +++++++++++--
 pywb/webagg/fuzzymatcher.py     | 39 ++++++++++++++++++++++++++-------
 pywb/webagg/responseloader.py   |  2 ++
 tests/test_integration.py       | 25 ++++++++++++++++-----
 9 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py
index f88f73ad..f5656170 100644
--- a/pywb/rewrite/header_rewriter.py
+++ b/pywb/rewrite/header_rewriter.py
@@ -39,6 +39,8 @@ class HeaderRewriter(object):
 
         'json': ['application/json'],
 
+        'hls': ['application/x-mpegURL'],
+
         'xml':  ['/xml', '+xml', '.xml', '.rss'],
 
         'plain': ['text/plain'],
diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py
index 9fcae784..ddd4e078 100644
--- a/pywb/rewrite/rewrite_amf.py
+++ b/pywb/rewrite/rewrite_amf.py
@@ -1,16 +1,15 @@
 from io import BytesIO
 from six.moves import zip
-from pywb.rewrite.rewrite_content import RewriteContent
 
 
 # ============================================================================
 # Expiermental: not fully tested
-class RewriteContentAMF(RewriteContent):  #pragma: no cover
+class RewriteAMFMixin(object):  #pragma: no cover
     def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
         if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
             stream = self.rewrite_amf(stream, env)
 
-        return (super(RewriteContentAMF, self).
+        return (super(RewriteAMFMixin, self).
                 handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
 
     def rewrite_amf(self, stream, env):
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 483b51fa..de239644 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -76,6 +76,15 @@ class RewriteContent(object):
 
         return (rewritten_headers, stream)
 
+    def _decoding_stream(self, rewritten_headers, stream):
+        for decomp_type in BufferedReader.get_supported_decompressors():
+            matched, stream = self._check_encoding(rewritten_headers,
+                                                   stream,
+                                                   decomp_type)
+            if matched:
+                break
+
+        return stream
 
     def _check_encoding(self, rewritten_headers, stream, enc):
         matched = False
@@ -142,12 +151,7 @@ class RewriteContent(object):
         encoding = None
         first_buff = b''
 
-        for decomp_type in BufferedReader.get_supported_decompressors():
-            matched, stream = self._check_encoding(rewritten_headers,
-                                                   stream,
-                                                   decomp_type)
-            if matched:
-                break
+        stream = self._decoding_stream(rewritten_headers, stream)
 
         if mod == 'js_':
             text_type, stream = self._resolve_text_type('js',
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index 07b44112..e731e10f 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -337,13 +337,17 @@ rules:
             - match: ''
               invalid_: ''
 
-      # all domain rules -- fallback to this dataset
+
+    # all domain rules -- fallback to this dataset
     #=================================================================
     # Applies to all urls -- should be last
     - url_prefix: ''
-      fuzzy_lookup: '()'
+      fuzzy_lookup:
+        match: '()'
+        match_filters:
+            - mime: 'application/dash+xml'
+              match: '()'
+
+            - mime: '*'
+              match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
 
-      #fuzzy_lookup:
-      #  match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
-      #  filter: ['=urlkey:{0}']
-      #  replace: '?'
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
index 8511431b..bb9e6d67 100644
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@@ -360,7 +360,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
         }
 
         if (starts_with(href, REL_PREFIX)) {
-            href = wb_info.wombat_scheme + href;
+            href = "http:" + href;
         }
 
         return href;
diff --git a/pywb/urlrewrite/rewriterapp.py b/pywb/urlrewrite/rewriterapp.py
index b609728a..f0ccb6b0 100644
--- a/pywb/urlrewrite/rewriterapp.py
+++ b/pywb/urlrewrite/rewriterapp.py
@@ -1,6 +1,9 @@
 import requests
 
-from pywb.rewrite.rewrite_amf import RewriteContentAMF
+from pywb.rewrite.rewrite_amf import RewriteAMFMixin
+from pywb.rewrite.rewrite_dash import RewriteDASHMixin
+from pywb.rewrite.rewrite_content import RewriteContent
+
 from pywb.rewrite.wburl import WbUrl
 from pywb.rewrite.url_rewriter import UrlRewriter
 
@@ -40,6 +43,11 @@ class UpstreamException(WbException):
         self.status_code = status_code
 
 
+# ============================================================================
+class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
+    pass
+
+
 # ============================================================================
 class RewriterApp(object):
     VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
@@ -56,7 +64,7 @@ class RewriterApp(object):
 
         frame_type = 'inverse' if framed_replay else False
 
-        self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
+        self.content_rewriter = Rewriter(is_framed_replay=frame_type)
 
         if not jinja_env:
             jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
@@ -198,6 +206,9 @@ class RewriterApp(object):
         cdx['timestamp'] = http_date_to_timestamp(memento_dt)
         cdx['url'] = target_uri
 
+        if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1':
+            return WbResponse.redir_response(urlrewriter.rewrite(target_uri))
+
         self._add_custom_params(cdx, r.headers, kwargs)
 
         if readd_range:
diff --git a/pywb/webagg/fuzzymatcher.py b/pywb/webagg/fuzzymatcher.py
index 9646bce0..f5926588 100644
--- a/pywb/webagg/fuzzymatcher.py
+++ b/pywb/webagg/fuzzymatcher.py
@@ -10,7 +10,7 @@ from collections import namedtuple
 # ============================================================================
 FuzzyRule = namedtuple('FuzzyRule',
                        'url_prefix, regex, replace_after, filter_str, ' +
-                       'match_type')
+                       'match_type, match_filters')
 
 
 # ============================================================================
@@ -45,14 +45,28 @@ class FuzzyMatcher(object):
             replace_after = self.DEFAULT_REPLACE_AFTER
             filter_str = self.DEFAULT_FILTER
             match_type = self.DEFAULT_MATCH_TYPE
+            match_filters = None
 
         else:
             regex = self.make_regex(config.get('match'))
             replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
             filter_str = config.get('filter', self.DEFAULT_FILTER)
             match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
+            match_filters = self._init_match_filters(config.get('match_filters'))
 
-        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
+        return FuzzyRule(url_prefix, regex, replace_after, filter_str,
+                         match_type, match_filters)
+
+    def _init_match_filters(self, filter_config):
+        if not filter_config:
+            return
+
+        filters = []
+        for filter_ in filter_config:
+            filter_['match'] = re.compile(filter_['match'])
+            filters.append(filter_)
+
+        return filters
 
     def get_fuzzy_match(self, params):
         urlkey = to_native_str(params['key'], 'utf-8')
@@ -70,9 +84,8 @@ class FuzzyMatcher(object):
 
             matched_rule = rule
             groups = m.groups()
-            for g in groups:
-                for f in matched_rule.filter_str:
-                    filters.append(f.format(g))
+            for f in matched_rule.filter_str:
+                filters.append(f.format(*groups))
 
             break
 
@@ -132,6 +145,8 @@ class FuzzyMatcher(object):
         if found:
             return
 
+        url = params['url']
+
         rule = self.get_fuzzy_match(params)
         if not rule:
             return
@@ -139,10 +154,18 @@ class FuzzyMatcher(object):
         new_iter, errs = index_source(params)
 
         for cdx in new_iter:
-            if self.allow_fuzzy_result(rule, cdx):
+            if self.allow_fuzzy_result(rule, url, cdx):
+                cdx['is_fuzzy'] = True
                 yield cdx
 
-    def allow_fuzzy_result(self, rule, cdx):
-        return True
+    def allow_fuzzy_result(self, rule, url, cdx):
+        if not rule.match_filters:
+            return True
+
+        for match_filter in rule.match_filters:
+            if match_filter['mime'] in (cdx['mime'], '*'):
+                return match_filter['match'].search(url)
+
+        return False
 
 
diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py
index 2da84652..aae4576d 100644
--- a/pywb/webagg/responseloader.py
+++ b/pywb/webagg/responseloader.py
@@ -44,6 +44,8 @@ class BaseLoader(object):
         out_headers['WebAgg-Type'] = 'warc'
         out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
         out_headers['Content-Type'] = 'application/warc-record'
+        if cdx.get('is_fuzzy'):
+            out_headers['WebAgg-Fuzzy-Match'] = '1'
 
         if not warc_headers:
             if other_headers:
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 2eefc60d..bb9aedc2 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -104,6 +104,15 @@ class TestWbIntegration(BaseConfigTest):
     #    assert 'wb.js' in resp.text
     #    assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.text
 
+    def test_replay_fuzzy_1(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?_=123')
+        assert resp.status_int == 302
+        assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://www.iana.org/')
+
+    def test_replay_no_fuzzy_match(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?foo=bar', status=404)
+        assert resp.status_int == 404
+
     #def test_replay_non_surt(self):
     #    resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
     #    self._assert_basic_html(resp)
@@ -396,13 +405,17 @@ class TestWbIntegration(BaseConfigTest):
         assert resp.status_int == 200
         assert '"data": "^"' in resp.text
 
-    def test_post_fuzzy_match(self):
-        resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
-        assert resp.status_int == 200
-        assert '"A": "1"' in resp.text
-        assert '"B": "[]"' in resp.text
-        assert '"C": "3"' in resp.text
+    def test_post_invalid(self):
+        # not json
+        resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
+        assert resp.status_int == 404
 
+    #def test_post_fuzzy_match(self):
+    #    resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
+    #    assert resp.status_int == 200
+    #    assert '"A": "1"' in resp.text
+    #    assert '"B": "[]"' in resp.text
+    #    assert '"C": "3"' in resp.text
 
     def test_post_referer_redirect(self):
         # allowing 307 redirects