From a82cfc1ab2ac9a0e957a7d7b0ea1f801ec0b7d31 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 20 Mar 2017 14:41:12 -0700 Subject: [PATCH] rewriter: add rewrite_dash for rewriting DASH and HLS manifests! rewriter: refactor to use mixins to extend base rewriter (todo: more refactoring) fuzzy-matcher: support for additional 'match_filters' to filter fuzzy results via optional regexes by mime type, eg. allow more lenient fuzzy matching on DASH manifests than other resources (for now) fuzzy-matching: add WebAgg-Fuzzy-Match response header if response is fuzzy matched, redirect to exact match in rewriterapp --- pywb/rewrite/header_rewriter.py | 2 ++ pywb/rewrite/rewrite_amf.py | 5 ++--- pywb/rewrite/rewrite_content.py | 16 +++++++++----- pywb/rules.yaml | 16 +++++++++----- pywb/static/wombat.js | 2 +- pywb/urlrewrite/rewriterapp.py | 15 +++++++++++-- pywb/webagg/fuzzymatcher.py | 39 ++++++++++++++++++++++++++------- pywb/webagg/responseloader.py | 2 ++ tests/test_integration.py | 25 ++++++++++++++++----- 9 files changed, 90 insertions(+), 32 deletions(-) diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index f88f73ad..f5656170 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -39,6 +39,8 @@ class HeaderRewriter(object): 'json': ['application/json'], + 'hls': ['application/x-mpegURL'], + 'xml': ['/xml', '+xml', '.xml', '.rss'], 'plain': ['text/plain'], diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py index 9fcae784..ddd4e078 100644 --- a/pywb/rewrite/rewrite_amf.py +++ b/pywb/rewrite/rewrite_amf.py @@ -1,16 +1,15 @@ from io import BytesIO from six.moves import zip -from pywb.rewrite.rewrite_content import RewriteContent # ============================================================================ # Expiermental: not fully tested -class RewriteContentAMF(RewriteContent): #pragma: no cover +class RewriteAMFMixin(object): #pragma: no cover def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env): if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf': stream = self.rewrite_amf(stream, env) - return (super(RewriteContentAMF, self). + return (super(RewriteAMFMixin, self). handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env)) def rewrite_amf(self, stream, env): diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 483b51fa..de239644 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -76,6 +76,15 @@ class RewriteContent(object): return (rewritten_headers, stream) + def _decoding_stream(self, rewritten_headers, stream): + for decomp_type in BufferedReader.get_supported_decompressors(): + matched, stream = self._check_encoding(rewritten_headers, + stream, + decomp_type) + if matched: + break + + return stream def _check_encoding(self, rewritten_headers, stream, enc): matched = False @@ -142,12 +151,7 @@ class RewriteContent(object): encoding = None first_buff = b'' - for decomp_type in BufferedReader.get_supported_decompressors(): - matched, stream = self._check_encoding(rewritten_headers, - stream, - decomp_type) - if matched: - break + stream = self._decoding_stream(rewritten_headers, stream) if mod == 'js_': text_type, stream = self._resolve_text_type('js', diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 07b44112..e731e10f 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -337,13 +337,17 @@ rules: - match: '' invalid_: '' - # all domain rules -- fallback to this dataset + + # all domain rules -- fallback to this dataset #================================================================= # Applies to all urls -- should be last - url_prefix: '' - fuzzy_lookup: '()' + fuzzy_lookup: + match: '()' + match_filters: + - mime: 'application/dash+xml' + match: '()' + + - mime: '*' + match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' - #fuzzy_lookup: - # match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' - # filter: ['=urlkey:{0}'] - # replace: '?' diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 8511431b..bb9e6d67 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -360,7 +360,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } if (starts_with(href, REL_PREFIX)) { - href = wb_info.wombat_scheme + href; + href = "http:" + href; } return href; diff --git a/pywb/urlrewrite/rewriterapp.py b/pywb/urlrewrite/rewriterapp.py index b609728a..f0ccb6b0 100644 --- a/pywb/urlrewrite/rewriterapp.py +++ b/pywb/urlrewrite/rewriterapp.py @@ -1,6 +1,9 @@ import requests -from pywb.rewrite.rewrite_amf import RewriteContentAMF +from pywb.rewrite.rewrite_amf import RewriteAMFMixin +from pywb.rewrite.rewrite_dash import RewriteDASHMixin +from pywb.rewrite.rewrite_content import RewriteContent + from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter @@ -40,6 +43,11 @@ class UpstreamException(WbException): self.status_code = status_code +# ============================================================================ +class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent): + pass + + # ============================================================================ class RewriterApp(object): VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json' @@ -56,7 +64,7 @@ class RewriterApp(object): frame_type = 'inverse' if framed_replay else False - self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type) + self.content_rewriter = Rewriter(is_framed_replay=frame_type) if not jinja_env: jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'}) @@ -198,6 +206,9 @@ class RewriterApp(object): cdx['timestamp'] = http_date_to_timestamp(memento_dt) cdx['url'] = target_uri + if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1': + return WbResponse.redir_response(urlrewriter.rewrite(target_uri)) + self._add_custom_params(cdx, r.headers, kwargs) if readd_range: diff --git a/pywb/webagg/fuzzymatcher.py b/pywb/webagg/fuzzymatcher.py index 9646bce0..f5926588 100644 --- a/pywb/webagg/fuzzymatcher.py +++ b/pywb/webagg/fuzzymatcher.py @@ -10,7 +10,7 @@ from collections import namedtuple # ============================================================================ FuzzyRule = namedtuple('FuzzyRule', 'url_prefix, regex, replace_after, filter_str, ' + - 'match_type') + 'match_type, match_filters') # ============================================================================ @@ -45,14 +45,28 @@ class FuzzyMatcher(object): replace_after = self.DEFAULT_REPLACE_AFTER filter_str = self.DEFAULT_FILTER match_type = self.DEFAULT_MATCH_TYPE + match_filters = None else: regex = self.make_regex(config.get('match')) replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER) filter_str = config.get('filter', self.DEFAULT_FILTER) match_type = config.get('type', self.DEFAULT_MATCH_TYPE) + match_filters = self._init_match_filters(config.get('match_filters')) - return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type) + return FuzzyRule(url_prefix, regex, replace_after, filter_str, + match_type, match_filters) + + def _init_match_filters(self, filter_config): + if not filter_config: + return + + filters = [] + for filter_ in filter_config: + filter_['match'] = re.compile(filter_['match']) + filters.append(filter_) + + return filters def get_fuzzy_match(self, params): urlkey = to_native_str(params['key'], 'utf-8') @@ -70,9 +84,8 @@ class FuzzyMatcher(object): matched_rule = rule groups = m.groups() - for g in groups: - for f in matched_rule.filter_str: - filters.append(f.format(g)) + for f in matched_rule.filter_str: + filters.append(f.format(*groups)) break @@ -132,6 +145,8 @@ class FuzzyMatcher(object): if found: return + url = params['url'] + rule = self.get_fuzzy_match(params) if not rule: return @@ -139,10 +154,18 @@ class FuzzyMatcher(object): new_iter, errs = index_source(params) for cdx in new_iter: - if self.allow_fuzzy_result(rule, cdx): + if self.allow_fuzzy_result(rule, url, cdx): + cdx['is_fuzzy'] = True yield cdx - def allow_fuzzy_result(self, rule, cdx): - return True + def allow_fuzzy_result(self, rule, url, cdx): + if not rule.match_filters: + return True + + for match_filter in rule.match_filters: + if match_filter['mime'] in (cdx['mime'], '*'): + return match_filter['match'].search(url) + + return False diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index 2da84652..aae4576d 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -44,6 +44,8 @@ class BaseLoader(object): out_headers['WebAgg-Type'] = 'warc' out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/') out_headers['Content-Type'] = 'application/warc-record' + if cdx.get('is_fuzzy'): + out_headers['WebAgg-Fuzzy-Match'] = '1' if not warc_headers: if other_headers: diff --git a/tests/test_integration.py b/tests/test_integration.py index 2eefc60d..bb9aedc2 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -104,6 +104,15 @@ class TestWbIntegration(BaseConfigTest): # assert 'wb.js' in resp.text # assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.text + def test_replay_fuzzy_1(self): + resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?_=123') + assert resp.status_int == 302 + assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://www.iana.org/') + + def test_replay_no_fuzzy_match(self): + resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?foo=bar', status=404) + assert resp.status_int == 404 + #def test_replay_non_surt(self): # resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1') # self._assert_basic_html(resp) @@ -396,13 +405,17 @@ class TestWbIntegration(BaseConfigTest): assert resp.status_int == 200 assert '"data": "^"' in resp.text - def test_post_fuzzy_match(self): - resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'}) - assert resp.status_int == 200 - assert '"A": "1"' in resp.text - assert '"B": "[]"' in resp.text - assert '"C": "3"' in resp.text + def test_post_invalid(self): + # not json + resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404) + assert resp.status_int == 404 + #def test_post_fuzzy_match(self): + # resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'}) + # assert resp.status_int == 200 + # assert '"A": "1"' in resp.text + # assert '"B": "[]"' in resp.text + # assert '"C": "3"' in resp.text def test_post_referer_redirect(self): # allowing 307 redirects