rewriter: add rewrite_dash for rewriting DASH and HLS manifests!

rewriter: refactor to use mixins to extend base rewriter (todo: more refactoring)
fuzzy-matcher: support for additional 'match_filters' to filter fuzzy results via optional regexes by mime type, e.g. allow more lenient fuzzy matching on DASH manifests than other resources (for now)
fuzzy-matching: add WebAgg-Fuzzy-Match response header if response is fuzzy matched, redirect to exact match in rewriterapp
2025-03-15 00:03:28 +01:00 · 2017-03-20 14:41:12 -07:00 · 2017-03-20 14:41:12 -07:00 · a82cfc1ab2
commit a82cfc1ab2
parent 22edb2f14b
9 changed files with 90 additions and 32 deletions
--- a/pywb/rewrite/header_rewriter.py
+++ b/pywb/rewrite/header_rewriter.py
@ -39,6 +39,8 @@ class HeaderRewriter(object):

        'json': ['application/json'],

+        'hls': ['application/x-mpegURL'],
+
        'xml':  ['/xml', '+xml', '.xml', '.rss'],

        'plain': ['text/plain'],
--- a/pywb/rewrite/rewrite_amf.py
+++ b/pywb/rewrite/rewrite_amf.py
@ -1,16 +1,15 @@
 from io import BytesIO
 from six.moves import zip
-from pywb.rewrite.rewrite_content import RewriteContent


 # ============================================================================
 # Expiermental: not fully tested
-class RewriteContentAMF(RewriteContent):  #pragma: no cover
+class RewriteAMFMixin(object):  #pragma: no cover
    def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
        if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
            stream = self.rewrite_amf(stream, env)

-        return (super(RewriteContentAMF, self).
+        return (super(RewriteAMFMixin, self).
                handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))

    def rewrite_amf(self, stream, env):
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@ -76,6 +76,15 @@ class RewriteContent(object):

        return (rewritten_headers, stream)

+    def _decoding_stream(self, rewritten_headers, stream):
+        for decomp_type in BufferedReader.get_supported_decompressors():
+            matched, stream = self._check_encoding(rewritten_headers,
+                                                   stream,
+                                                   decomp_type)
+            if matched:
+                break
+
+        return stream

    def _check_encoding(self, rewritten_headers, stream, enc):
        matched = False
@ -142,12 +151,7 @@ class RewriteContent(object):
        encoding = None
        first_buff = b''

-        for decomp_type in BufferedReader.get_supported_decompressors():
-            matched, stream = self._check_encoding(rewritten_headers,
-                                                   stream,
-                                                   decomp_type)
-            if matched:
-                break
+        stream = self._decoding_stream(rewritten_headers, stream)

        if mod == 'js_':
            text_type, stream = self._resolve_text_type('js',
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@ -337,13 +337,17 @@ rules:
            - match: ''
              invalid_: ''

-      # all domain rules -- fallback to this dataset
+
+    # all domain rules -- fallback to this dataset
    #=================================================================
    # Applies to all urls -- should be last
    - url_prefix: ''
-      fuzzy_lookup: '()'
+      fuzzy_lookup:
+        match: '()'
+        match_filters:
+            - mime: 'application/dash+xml'
+              match: '()'
+
+            - mime: '*'
+              match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'

-      #fuzzy_lookup:
-      #  match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
-      #  filter: ['=urlkey:{0}']
-      #  replace: '?'
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@ -360,7 +360,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
        }

        if (starts_with(href, REL_PREFIX)) {
-            href = wb_info.wombat_scheme + href;
+            href = "http:" + href;
        }

        return href;
--- a/pywb/urlrewrite/rewriterapp.py
+++ b/pywb/urlrewrite/rewriterapp.py
@ -1,6 +1,9 @@
 import requests

-from pywb.rewrite.rewrite_amf import RewriteContentAMF
+from pywb.rewrite.rewrite_amf import RewriteAMFMixin
+from pywb.rewrite.rewrite_dash import RewriteDASHMixin
+from pywb.rewrite.rewrite_content import RewriteContent
+
 from pywb.rewrite.wburl import WbUrl
 from pywb.rewrite.url_rewriter import UrlRewriter

@ -40,6 +43,11 @@ class UpstreamException(WbException):
        self.status_code = status_code


+# ============================================================================
+class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
+    pass
+
+
 # ============================================================================
 class RewriterApp(object):
    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
@ -56,7 +64,7 @@ class RewriterApp(object):

        frame_type = 'inverse' if framed_replay else False

-        self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
+        self.content_rewriter = Rewriter(is_framed_replay=frame_type)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
@ -198,6 +206,9 @@ class RewriterApp(object):
        cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        cdx['url'] = target_uri

+        if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1':
+            return WbResponse.redir_response(urlrewriter.rewrite(target_uri))
+
        self._add_custom_params(cdx, r.headers, kwargs)

        if readd_range:
--- a/pywb/webagg/fuzzymatcher.py
+++ b/pywb/webagg/fuzzymatcher.py
@ -10,7 +10,7 @@ from collections import namedtuple
 # ============================================================================
 FuzzyRule = namedtuple('FuzzyRule',
                       'url_prefix, regex, replace_after, filter_str, ' +
-                       'match_type')
+                       'match_type, match_filters')


 # ============================================================================
@ -45,14 +45,28 @@ class FuzzyMatcher(object):
            replace_after = self.DEFAULT_REPLACE_AFTER
            filter_str = self.DEFAULT_FILTER
            match_type = self.DEFAULT_MATCH_TYPE
+            match_filters = None

        else:
            regex = self.make_regex(config.get('match'))
            replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
            filter_str = config.get('filter', self.DEFAULT_FILTER)
            match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
+            match_filters = self._init_match_filters(config.get('match_filters'))

-        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
+        return FuzzyRule(url_prefix, regex, replace_after, filter_str,
+                         match_type, match_filters)
+
+    def _init_match_filters(self, filter_config):
+        if not filter_config:
+            return
+
+        filters = []
+        for filter_ in filter_config:
+            filter_['match'] = re.compile(filter_['match'])
+            filters.append(filter_)
+
+        return filters

    def get_fuzzy_match(self, params):
        urlkey = to_native_str(params['key'], 'utf-8')
@ -70,9 +84,8 @@ class FuzzyMatcher(object):

            matched_rule = rule
            groups = m.groups()
-            for g in groups:
-                for f in matched_rule.filter_str:
-                    filters.append(f.format(g))
+            for f in matched_rule.filter_str:
+                filters.append(f.format(*groups))

            break

@ -132,6 +145,8 @@ class FuzzyMatcher(object):
        if found:
            return

+        url = params['url']
+
        rule = self.get_fuzzy_match(params)
        if not rule:
            return
@ -139,10 +154,18 @@ class FuzzyMatcher(object):
        new_iter, errs = index_source(params)

        for cdx in new_iter:
-            if self.allow_fuzzy_result(rule, cdx):
+            if self.allow_fuzzy_result(rule, url, cdx):
+                cdx['is_fuzzy'] = True
                yield cdx

-    def allow_fuzzy_result(self, rule, cdx):
-        return True
+    def allow_fuzzy_result(self, rule, url, cdx):
+        if not rule.match_filters:
+            return True
+
+        for match_filter in rule.match_filters:
+            if match_filter['mime'] in (cdx['mime'], '*'):
+                return match_filter['match'].search(url)
+
+        return False


--- a/pywb/webagg/responseloader.py
+++ b/pywb/webagg/responseloader.py
@ -44,6 +44,8 @@ class BaseLoader(object):
        out_headers['WebAgg-Type'] = 'warc'
        out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
        out_headers['Content-Type'] = 'application/warc-record'
+        if cdx.get('is_fuzzy'):
+            out_headers['WebAgg-Fuzzy-Match'] = '1'

        if not warc_headers:
            if other_headers:
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@ -104,6 +104,15 @@ class TestWbIntegration(BaseConfigTest):
    #    assert 'wb.js' in resp.text
    #    assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.text

+    def test_replay_fuzzy_1(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?_=123')
+        assert resp.status_int == 302
+        assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://www.iana.org/')
+
+    def test_replay_no_fuzzy_match(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?foo=bar', status=404)
+        assert resp.status_int == 404
+
    #def test_replay_non_surt(self):
    #    resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
    #    self._assert_basic_html(resp)
@ -396,13 +405,17 @@ class TestWbIntegration(BaseConfigTest):
        assert resp.status_int == 200
        assert '"data": "^"' in resp.text

-    def test_post_fuzzy_match(self):
-        resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
-        assert resp.status_int == 200
-        assert '"A": "1"' in resp.text
-        assert '"B": "[]"' in resp.text
-        assert '"C": "3"' in resp.text
+    def test_post_invalid(self):
+        # not json
+        resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
+        assert resp.status_int == 404

+    #def test_post_fuzzy_match(self):
+    #    resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
+    #    assert resp.status_int == 200
+    #    assert '"A": "1"' in resp.text
+    #    assert '"B": "[]"' in resp.text
+    #    assert '"C": "3"' in resp.text

    def test_post_referer_redirect(self):
        # allowing 307 redirects