Mirror of https://github.com/webrecorder/pywb.git
rewriter: add rewrite_dash for rewriting DASH and HLS manifests!
rewriter: refactor to use mixins to extend the base rewriter (todo: more refactoring)

fuzzy-matcher: support additional 'match_filters' to filter fuzzy results via optional regexes keyed by mime type, e.g. allow more lenient fuzzy matching on DASH manifests than on other resources (for now)

fuzzy-matching: add a WebAgg-Fuzzy-Match response header when a response is fuzzy matched; redirect to the exact match in rewriterapp
commit a82cfc1ab2
parent 22edb2f14b
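The redirect behavior described in the commit message works end to end roughly like this: the upstream loader flags fuzzy-matched records with a WebAgg-Fuzzy-Match: 1 response header (see the BaseLoader hunk below), and RewriterApp, on seeing that flag for a URL that differs from the one the client requested, redirects to the exact archived URL instead of serving the fuzzy result in place. A minimal standalone sketch of that decision; the helper name and argument shape are illustrative, not part of pywb:

from typing import Mapping, Optional


def fuzzy_redirect_target(requested_url: str, archived_url: str,
                          resp_headers: Mapping[str, str]) -> Optional[str]:
    """Return the URL to redirect to when a fuzzy-matched capture was served.

    Mirrors the check added in RewriterApp below: redirect only if the upstream
    response was flagged as a fuzzy match and the archived URL differs from
    the URL the client actually asked for.
    """
    if archived_url != requested_url and resp_headers.get('WebAgg-Fuzzy-Match') == '1':
        return archived_url

    return None


# A cache-busted request resolved to the clean capture, so a redirect is issued.
print(fuzzy_redirect_target('http://www.iana.org/?_=123',
                            'http://www.iana.org/',
                            {'WebAgg-Fuzzy-Match': '1'}))   # -> 'http://www.iana.org/'

In the actual app the redirect target is additionally passed through the URL rewriter (urlrewriter.rewrite) so the Location header stays inside the replay URL space.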
@@ -39,6 +39,8 @@ class HeaderRewriter(object):
 
         'json': ['application/json'],
 
+        'hls': ['application/x-mpegURL'],
+
         'xml': ['/xml', '+xml', '.xml', '.rss'],
 
         'plain': ['text/plain'],
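This hunk registers the HLS manifest content type ('application/x-mpegURL') so such responses are classified for rewriting alongside the existing text types. A small standalone sketch of the kind of lookup the REWRITE_TYPES table implies; the substring-matching semantics are an assumption based on entries like '/xml' and '+xml', and classify() is not HeaderRewriter's real API:

# Illustrative only: map a Content-Type value to a rewrite type. Exact entries
# like 'application/json' hit directly; fragments like '/xml' or '+xml' are
# assumed to match as substrings.
REWRITE_TYPES = {
    'json': ['application/json'],
    'hls': ['application/x-mpegURL'],
    'xml': ['/xml', '+xml', '.xml', '.rss'],
    'plain': ['text/plain'],
}


def classify(content_type):
    for text_type, patterns in REWRITE_TYPES.items():
        if any(pattern in content_type for pattern in patterns):
            return text_type
    return None


assert classify('application/x-mpegURL') == 'hls'
assert classify('application/rss+xml') == 'xml'   # via the '+xml' fragment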
@@ -1,16 +1,15 @@
 from io import BytesIO
 from six.moves import zip
-from pywb.rewrite.rewrite_content import RewriteContent
 
 
 # ============================================================================
 # Expiermental: not fully tested
-class RewriteContentAMF(RewriteContent): #pragma: no cover
+class RewriteAMFMixin(object): #pragma: no cover
     def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
         if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
             stream = self.rewrite_amf(stream, env)
 
-        return (super(RewriteContentAMF, self).
+        return (super(RewriteAMFMixin, self).
                 handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
 
     def rewrite_amf(self, stream, env):
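Turning the AMF handler from a RewriteContent subclass into a standalone mixin means each format handler can override handle_custom_rewrite, act only when its content type matches, and then hand off to the next class in the MRO via super(). A toy illustration of that cooperative pattern; the class names and the simplified string "stream" are stand-ins, not pywb's real classes:

class BaseRewriter(object):
    def handle_custom_rewrite(self, content_type, stream):
        # End of the chain: return the (possibly transformed) stream.
        return stream


class AMFMixin(object):
    def handle_custom_rewrite(self, content_type, stream):
        if content_type == 'application/x-amf':
            stream = '[amf-rewritten]' + stream
        return super(AMFMixin, self).handle_custom_rewrite(content_type, stream)


class DASHMixin(object):
    def handle_custom_rewrite(self, content_type, stream):
        if content_type == 'application/dash+xml':
            stream = '[dash-rewritten]' + stream
        return super(DASHMixin, self).handle_custom_rewrite(content_type, stream)


class Rewriter(DASHMixin, AMFMixin, BaseRewriter):
    pass


# Each mixin only acts on its own content type, then delegates to the next.
print(Rewriter().handle_custom_rewrite('application/dash+xml', '<MPD/>'))
# -> '[dash-rewritten]<MPD/>'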
@@ -76,6 +76,15 @@ class RewriteContent(object):
 
         return (rewritten_headers, stream)
 
+    def _decoding_stream(self, rewritten_headers, stream):
+        for decomp_type in BufferedReader.get_supported_decompressors():
+            matched, stream = self._check_encoding(rewritten_headers,
+                                                   stream,
+                                                   decomp_type)
+            if matched:
+                break
+
+        return stream
 
     def _check_encoding(self, rewritten_headers, stream, enc):
         matched = False
@@ -142,12 +151,7 @@ class RewriteContent(object):
         encoding = None
         first_buff = b''
 
-        for decomp_type in BufferedReader.get_supported_decompressors():
-            matched, stream = self._check_encoding(rewritten_headers,
-                                                   stream,
-                                                   decomp_type)
-            if matched:
-                break
+        stream = self._decoding_stream(rewritten_headers, stream)
 
         if mod == 'js_':
             text_type, stream = self._resolve_text_type('js',
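These two hunks factor the decompressor-probing loop out of the main rewrite path into a reusable _decoding_stream helper: try each supported decompressor in turn and stop at the first one whose encoding matches the response. A standalone sketch of that probe-until-match shape; probe_stream and the check callback are hypothetical stand-ins for _decoding_stream and _check_encoding:

def probe_stream(stream, candidates, check):
    """Try each candidate in order; return the stream produced by the first
    candidate that `check` accepts, or the original stream if none match."""
    for candidate in candidates:
        matched, stream = check(stream, candidate)
        if matched:
            break

    return stream


def make_check(declared_encoding):
    # Hypothetical check: accept a candidate only if it equals the declared
    # Content-Encoding, and "wrap" the stream to mark it decoded.
    def check(stream, candidate):
        if candidate == declared_encoding:
            return True, ('decoded-with-' + candidate, stream)
        return False, stream
    return check


print(probe_stream('raw-bytes', ['gzip', 'deflate'], make_check('deflate')))
# -> ('decoded-with-deflate', 'raw-bytes')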
@@ -337,13 +337,17 @@ rules:
        - match: ''
          invalid_: ''
 
-    # all domain rules -- fallback to this dataset
+
+    # all domain rules -- fallback to this dataset
     #=================================================================
     # Applies to all urls -- should be last
     - url_prefix: ''
-      fuzzy_lookup: '()'
+      fuzzy_lookup:
+        match: '()'
+        match_filters:
+          - mime: 'application/dash+xml'
+            match: '()'
+
+          - mime: '*'
+            match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
 
-      #fuzzy_lookup:
-      #  match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
-      #  filter: ['=urlkey:{0}']
-      #  replace: '?'
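The new fuzzy_lookup block keeps the catch-all match ('()' matches any URL) but gates results with match_filters: DASH manifests (application/dash+xml) are accepted unconditionally, while every other mime type ('*') is only accepted when the URL carries a cache-busting _= or uncache= parameter. A sketch of how such a block could be compiled and evaluated, modelled on the _init_match_filters and allow_fuzzy_result changes further down; this is standalone code, not pywb's:

import re

# The match_filters block from the rule above, as it would arrive from YAML.
match_filters = [
    {'mime': 'application/dash+xml', 'match': '()'},
    {'mime': '*', 'match': r'(.*)[&?](?:_|uncache)=[\d]+[&]?'},
]

# Compile the regexes once up front, like _init_match_filters.
compiled = [dict(f, match=re.compile(f['match'])) for f in match_filters]


def allow_fuzzy(url, mime):
    # The first filter whose mime matches exactly (or is '*') decides.
    for filt in compiled:
        if filt['mime'] in (mime, '*'):
            return bool(filt['match'].search(url))
    return False


assert allow_fuzzy('http://example.com/manifest.mpd?rand=99', 'application/dash+xml')
assert allow_fuzzy('http://example.com/page?_=123', 'text/html')
assert not allow_fuzzy('http://example.com/page?foo=bar', 'text/html')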
@@ -360,7 +360,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
         }
 
         if (starts_with(href, REL_PREFIX)) {
-            href = wb_info.wombat_scheme + href;
+            href = "http:" + href;
         }
 
         return href;
@@ -1,6 +1,9 @@
 import requests
 
-from pywb.rewrite.rewrite_amf import RewriteContentAMF
+from pywb.rewrite.rewrite_amf import RewriteAMFMixin
+from pywb.rewrite.rewrite_dash import RewriteDASHMixin
+from pywb.rewrite.rewrite_content import RewriteContent
 
 from pywb.rewrite.wburl import WbUrl
 from pywb.rewrite.url_rewriter import UrlRewriter
+
@@ -40,6 +43,11 @@ class UpstreamException(WbException):
         self.status_code = status_code
 
 
+# ============================================================================
+class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
+    pass
+
+
 # ============================================================================
 class RewriterApp(object):
     VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
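Method resolution order is what sequences the handlers: a call on Rewriter tries the DASH mixin first, then the AMF mixin, then falls through to the base RewriteContent implementation. A quick way to inspect that ordering, assuming the imports shown above are available (illustrative snippet, output abbreviated):

# Hypothetical interactive check; the class names come from the imports above.
from pywb.rewrite.rewrite_amf import RewriteAMFMixin
from pywb.rewrite.rewrite_dash import RewriteDASHMixin
from pywb.rewrite.rewrite_content import RewriteContent


class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
    pass


print([cls.__name__ for cls in Rewriter.__mro__])
# -> ['Rewriter', 'RewriteDASHMixin', 'RewriteAMFMixin', 'RewriteContent', ..., 'object']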
@@ -56,7 +64,7 @@ class RewriterApp(object):
 
         frame_type = 'inverse' if framed_replay else False
 
-        self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
+        self.content_rewriter = Rewriter(is_framed_replay=frame_type)
 
         if not jinja_env:
             jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
@@ -198,6 +206,9 @@ class RewriterApp(object):
             cdx['timestamp'] = http_date_to_timestamp(memento_dt)
             cdx['url'] = target_uri
 
+        if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1':
+            return WbResponse.redir_response(urlrewriter.rewrite(target_uri))
+
         self._add_custom_params(cdx, r.headers, kwargs)
 
         if readd_range:
@@ -10,7 +10,7 @@ from collections import namedtuple
 # ============================================================================
 FuzzyRule = namedtuple('FuzzyRule',
                        'url_prefix, regex, replace_after, filter_str, ' +
-                       'match_type')
+                       'match_type, match_filters')
 
 
 # ============================================================================
@@ -45,14 +45,28 @@ class FuzzyMatcher(object):
             replace_after = self.DEFAULT_REPLACE_AFTER
             filter_str = self.DEFAULT_FILTER
             match_type = self.DEFAULT_MATCH_TYPE
+            match_filters = None
 
         else:
             regex = self.make_regex(config.get('match'))
             replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
             filter_str = config.get('filter', self.DEFAULT_FILTER)
             match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
+            match_filters = self._init_match_filters(config.get('match_filters'))
 
-        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
+        return FuzzyRule(url_prefix, regex, replace_after, filter_str,
+                         match_type, match_filters)
+
+    def _init_match_filters(self, filter_config):
+        if not filter_config:
+            return
+
+        filters = []
+        for filter_ in filter_config:
+            filter_['match'] = re.compile(filter_['match'])
+            filters.append(filter_)
+
+        return filters
 
     def get_fuzzy_match(self, params):
         urlkey = to_native_str(params['key'], 'utf-8')
@@ -70,9 +84,8 @@ class FuzzyMatcher(object):
 
             matched_rule = rule
             groups = m.groups()
-            for g in groups:
-                for f in matched_rule.filter_str:
-                    filters.append(f.format(g))
+            for f in matched_rule.filter_str:
+                filters.append(f.format(*groups))
 
             break
 
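This change switches filter expansion from one pass per captured group to a single pass per filter template, with all groups supplied at once, so a template like the default '=urlkey:{0}' is expanded exactly once (and a template could reference later groups as {1}, {2}, ...). A quick illustration of the difference; the group values are made up:

groups = ('example,com)/page', 'v=2')
filters = ['=urlkey:{0}']

# Old behaviour: every captured group expanded against every filter template.
old = [f.format(g) for g in groups for f in filters]
# -> ['=urlkey:example,com)/page', '=urlkey:v=2']

# New behaviour: each template expanded once, with all groups available.
new = [f.format(*groups) for f in filters]
# -> ['=urlkey:example,com)/page']

print(old)
print(new)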
@@ -132,6 +145,8 @@ class FuzzyMatcher(object):
         if found:
             return
 
+        url = params['url']
+
         rule = self.get_fuzzy_match(params)
         if not rule:
             return
@@ -139,10 +154,18 @@ class FuzzyMatcher(object):
         new_iter, errs = index_source(params)
 
         for cdx in new_iter:
-            if self.allow_fuzzy_result(rule, cdx):
+            if self.allow_fuzzy_result(rule, url, cdx):
+                cdx['is_fuzzy'] = True
                 yield cdx
 
-    def allow_fuzzy_result(self, rule, cdx):
-        return True
+    def allow_fuzzy_result(self, rule, url, cdx):
+        if not rule.match_filters:
+            return True
+
+        for match_filter in rule.match_filters:
+            if match_filter['mime'] in (cdx['mime'], '*'):
+                return match_filter['match'].search(url)
+
+        return False
 
 
@@ -44,6 +44,8 @@ class BaseLoader(object):
         out_headers['WebAgg-Type'] = 'warc'
         out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
         out_headers['Content-Type'] = 'application/warc-record'
+        if cdx.get('is_fuzzy'):
+            out_headers['WebAgg-Fuzzy-Match'] = '1'
 
         if not warc_headers:
             if other_headers:
@@ -104,6 +104,15 @@ class TestWbIntegration(BaseConfigTest):
         # assert 'wb.js' in resp.text
         # assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.text
 
+    def test_replay_fuzzy_1(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?_=123')
+        assert resp.status_int == 302
+        assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://www.iana.org/')
+
+    def test_replay_no_fuzzy_match(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?foo=bar', status=404)
+        assert resp.status_int == 404
+
     #def test_replay_non_surt(self):
     #    resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
     #    self._assert_basic_html(resp)
@@ -396,13 +405,17 @@ class TestWbIntegration(BaseConfigTest):
         assert resp.status_int == 200
         assert '"data": "^"' in resp.text
 
-    def test_post_fuzzy_match(self):
-        resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
-        assert resp.status_int == 200
-        assert '"A": "1"' in resp.text
-        assert '"B": "[]"' in resp.text
-        assert '"C": "3"' in resp.text
+    def test_post_invalid(self):
+        # not json
+        resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
+        assert resp.status_int == 404
 
+    #def test_post_fuzzy_match(self):
+    #    resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
+    #    assert resp.status_int == 200
+    #    assert '"A": "1"' in resp.text
+    #    assert '"B": "[]"' in resp.text
+    #    assert '"C": "3"' in resp.text
 
     def test_post_referer_redirect(self):
         # allowing 307 redirects