mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewriter: add rewrite_dash for rewriting DASH and HLS manifests!
rewriter: refactor to use mixins to extend the base rewriter (todo: more refactoring). fuzzy-matcher: support additional 'match_filters' to filter fuzzy results via optional regexes keyed by mime type, e.g. allow more lenient fuzzy matching on DASH manifests than on other resources (for now). fuzzy-matching: add a WebAgg-Fuzzy-Match response header if the response is fuzzy matched; rewriterapp then redirects to the exact match.
This commit is contained in:
parent
22edb2f14b
commit
a82cfc1ab2
@ -39,6 +39,8 @@ class HeaderRewriter(object):
|
||||
|
||||
'json': ['application/json'],
|
||||
|
||||
'hls': ['application/x-mpegURL'],
|
||||
|
||||
'xml': ['/xml', '+xml', '.xml', '.rss'],
|
||||
|
||||
'plain': ['text/plain'],
|
||||
|
@ -1,16 +1,15 @@
|
||||
from io import BytesIO
|
||||
from six.moves import zip
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Experimental: not fully tested
|
||||
class RewriteContentAMF(RewriteContent): #pragma: no cover
|
||||
class RewriteAMFMixin(object): #pragma: no cover
|
||||
def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
|
||||
if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
|
||||
stream = self.rewrite_amf(stream, env)
|
||||
|
||||
return (super(RewriteContentAMF, self).
|
||||
return (super(RewriteAMFMixin, self).
|
||||
handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
|
||||
|
||||
def rewrite_amf(self, stream, env):
|
||||
|
@ -76,6 +76,15 @@ class RewriteContent(object):
|
||||
|
||||
return (rewritten_headers, stream)
|
||||
|
||||
def _decoding_stream(self, rewritten_headers, stream):
    """Run ``stream`` through ``_check_encoding`` until a decompressor matches.

    Every type reported by ``BufferedReader.get_supported_decompressors()``
    is offered to ``_check_encoding`` in turn, and the search ends at the
    first one reported as matched.

    :param rewritten_headers: passed through unchanged to ``_check_encoding``
    :param stream: the stream to check
    :return: the stream handed back by the last ``_check_encoding`` call,
        or ``stream`` itself when there was no decompressor to try
    """
    for encoding_name in BufferedReader.get_supported_decompressors():
        # _check_encoding hands back a (matched, stream) pair; always carry
        # the returned stream forward, matched or not.
        was_matched, stream = self._check_encoding(rewritten_headers,
                                                   stream,
                                                   encoding_name)
        if was_matched:
            return stream

    return stream
|
||||
|
||||
def _check_encoding(self, rewritten_headers, stream, enc):
|
||||
matched = False
|
||||
@ -142,12 +151,7 @@ class RewriteContent(object):
|
||||
encoding = None
|
||||
first_buff = b''
|
||||
|
||||
for decomp_type in BufferedReader.get_supported_decompressors():
|
||||
matched, stream = self._check_encoding(rewritten_headers,
|
||||
stream,
|
||||
decomp_type)
|
||||
if matched:
|
||||
break
|
||||
stream = self._decoding_stream(rewritten_headers, stream)
|
||||
|
||||
if mod == 'js_':
|
||||
text_type, stream = self._resolve_text_type('js',
|
||||
|
@ -337,13 +337,17 @@ rules:
|
||||
- match: ''
|
||||
invalid_: ''
|
||||
|
||||
# all domain rules -- fallback to this dataset
|
||||
|
||||
# all domain rules -- fallback to this dataset
|
||||
#=================================================================
|
||||
# Applies to all urls -- should be last
|
||||
- url_prefix: ''
|
||||
fuzzy_lookup: '()'
|
||||
fuzzy_lookup:
|
||||
match: '()'
|
||||
match_filters:
|
||||
- mime: 'application/dash+xml'
|
||||
match: '()'
|
||||
|
||||
- mime: '*'
|
||||
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
||||
|
||||
#fuzzy_lookup:
|
||||
# match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
||||
# filter: ['=urlkey:{0}']
|
||||
# replace: '?'
|
||||
|
@ -360,7 +360,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
||||
}
|
||||
|
||||
if (starts_with(href, REL_PREFIX)) {
|
||||
href = wb_info.wombat_scheme + href;
|
||||
href = "http:" + href;
|
||||
}
|
||||
|
||||
return href;
|
||||
|
@ -1,6 +1,9 @@
|
||||
import requests
|
||||
|
||||
from pywb.rewrite.rewrite_amf import RewriteContentAMF
|
||||
from pywb.rewrite.rewrite_amf import RewriteAMFMixin
|
||||
from pywb.rewrite.rewrite_dash import RewriteDASHMixin
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
@ -40,6 +43,11 @@ class UpstreamException(WbException):
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
    """Content rewriter that layers the DASH and AMF mixins over RewriteContent.

    Defines no members of its own: all behavior comes from the bases, which
    are resolved in the order listed (DASH mixin, then AMF mixin, then
    RewriteContent).
    """
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriterApp(object):
|
||||
VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
|
||||
@ -56,7 +64,7 @@ class RewriterApp(object):
|
||||
|
||||
frame_type = 'inverse' if framed_replay else False
|
||||
|
||||
self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
|
||||
self.content_rewriter = Rewriter(is_framed_replay=frame_type)
|
||||
|
||||
if not jinja_env:
|
||||
jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
|
||||
@ -198,6 +206,9 @@ class RewriterApp(object):
|
||||
cdx['timestamp'] = http_date_to_timestamp(memento_dt)
|
||||
cdx['url'] = target_uri
|
||||
|
||||
if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1':
|
||||
return WbResponse.redir_response(urlrewriter.rewrite(target_uri))
|
||||
|
||||
self._add_custom_params(cdx, r.headers, kwargs)
|
||||
|
||||
if readd_range:
|
||||
|
@ -10,7 +10,7 @@ from collections import namedtuple
|
||||
# ============================================================================
|
||||
FuzzyRule = namedtuple('FuzzyRule',
|
||||
'url_prefix, regex, replace_after, filter_str, ' +
|
||||
'match_type')
|
||||
'match_type, match_filters')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -45,14 +45,28 @@ class FuzzyMatcher(object):
|
||||
replace_after = self.DEFAULT_REPLACE_AFTER
|
||||
filter_str = self.DEFAULT_FILTER
|
||||
match_type = self.DEFAULT_MATCH_TYPE
|
||||
match_filters = None
|
||||
|
||||
else:
|
||||
regex = self.make_regex(config.get('match'))
|
||||
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
|
||||
filter_str = config.get('filter', self.DEFAULT_FILTER)
|
||||
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
||||
match_filters = self._init_match_filters(config.get('match_filters'))
|
||||
|
||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
|
||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str,
|
||||
match_type, match_filters)
|
||||
|
||||
def _init_match_filters(self, filter_config):
|
||||
if not filter_config:
|
||||
return
|
||||
|
||||
filters = []
|
||||
for filter_ in filter_config:
|
||||
filter_['match'] = re.compile(filter_['match'])
|
||||
filters.append(filter_)
|
||||
|
||||
return filters
|
||||
|
||||
def get_fuzzy_match(self, params):
|
||||
urlkey = to_native_str(params['key'], 'utf-8')
|
||||
@ -70,9 +84,8 @@ class FuzzyMatcher(object):
|
||||
|
||||
matched_rule = rule
|
||||
groups = m.groups()
|
||||
for g in groups:
|
||||
for f in matched_rule.filter_str:
|
||||
filters.append(f.format(g))
|
||||
for f in matched_rule.filter_str:
|
||||
filters.append(f.format(*groups))
|
||||
|
||||
break
|
||||
|
||||
@ -132,6 +145,8 @@ class FuzzyMatcher(object):
|
||||
if found:
|
||||
return
|
||||
|
||||
url = params['url']
|
||||
|
||||
rule = self.get_fuzzy_match(params)
|
||||
if not rule:
|
||||
return
|
||||
@ -139,10 +154,18 @@ class FuzzyMatcher(object):
|
||||
new_iter, errs = index_source(params)
|
||||
|
||||
for cdx in new_iter:
|
||||
if self.allow_fuzzy_result(rule, cdx):
|
||||
if self.allow_fuzzy_result(rule, url, cdx):
|
||||
cdx['is_fuzzy'] = True
|
||||
yield cdx
|
||||
|
||||
def allow_fuzzy_result(self, rule, cdx):
|
||||
return True
|
||||
def allow_fuzzy_result(self, rule, url, cdx):
    """Decide whether a fuzzy-matched ``cdx`` entry is acceptable for ``url``.

    The first filter in ``rule.match_filters`` whose ``mime`` equals the
    entry's mime, or is the wildcard ``'*'``, decides the outcome: the
    result is allowed only if that filter's compiled ``match`` regex is
    found in ``url``.

    :param rule: object with a ``match_filters`` attribute (list of dicts
        with ``mime`` and compiled ``match``, or a falsy value)
    :param url: the requested url the filter regex is searched in
    :param cdx: mapping for the candidate result; its ``mime`` is read
    :return: ``True`` if the rule has no filters or the deciding filter
        matches, ``False`` if it does not match or no filter applies
    """
    if not rule.match_filters:
        return True

    # A missing mime must not raise: such an entry can then only be
    # accepted through a wildcard filter.
    mime = cdx.get('mime')

    for match_filter in rule.match_filters:
        if match_filter['mime'] in (mime, '*'):
            # Always hand back a real bool rather than a Match object or None.
            return match_filter['match'].search(url) is not None

    return False
|
||||
|
||||
|
||||
|
@ -44,6 +44,8 @@ class BaseLoader(object):
|
||||
out_headers['WebAgg-Type'] = 'warc'
|
||||
out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
|
||||
out_headers['Content-Type'] = 'application/warc-record'
|
||||
if cdx.get('is_fuzzy'):
|
||||
out_headers['WebAgg-Fuzzy-Match'] = '1'
|
||||
|
||||
if not warc_headers:
|
||||
if other_headers:
|
||||
|
@ -104,6 +104,15 @@ class TestWbIntegration(BaseConfigTest):
|
||||
# assert 'wb.js' in resp.text
|
||||
# assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.text
|
||||
|
||||
def test_replay_fuzzy_1(self):
    """Requesting the page with a ``_=123`` query gets a 302 to the bare url."""
    requested_path = '/pywb/20140127171238mp_/http://www.iana.org/?_=123'
    expected_suffix = '/pywb/20140127171238mp_/http://www.iana.org/'

    response = self.testapp.get(requested_path)

    assert response.status_int == 302
    assert response.headers['Location'].endswith(expected_suffix)
|
||||
|
||||
def test_replay_no_fuzzy_match(self):
    """Requesting the page with a ``foo=bar`` query must answer 404."""
    requested_path = '/pywb/20140127171238mp_/http://www.iana.org/?foo=bar'

    response = self.testapp.get(requested_path, status=404)

    assert response.status_int == 404
|
||||
|
||||
#def test_replay_non_surt(self):
|
||||
# resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
|
||||
# self._assert_basic_html(resp)
|
||||
@ -396,13 +405,17 @@ class TestWbIntegration(BaseConfigTest):
|
||||
assert resp.status_int == 200
|
||||
assert '"data": "^"' in resp.text
|
||||
|
||||
def test_post_fuzzy_match(self):
|
||||
resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
|
||||
assert resp.status_int == 200
|
||||
assert '"A": "1"' in resp.text
|
||||
assert '"B": "[]"' in resp.text
|
||||
assert '"C": "3"' in resp.text
|
||||
def test_post_invalid(self):
    """POSTing ``{'data': '^'}`` to the httpbin capture url must answer 404."""
    # NOTE(review): the original comment here read "not json", yet the
    # request is sent with post_json -- confirm the intended meaning.
    payload = {'data': '^'}

    response = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar',
                                      payload,
                                      status=404)

    assert response.status_int == 404
|
||||
|
||||
#def test_post_fuzzy_match(self):
|
||||
# resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
|
||||
# assert resp.status_int == 200
|
||||
# assert '"A": "1"' in resp.text
|
||||
# assert '"B": "[]"' in resp.text
|
||||
# assert '"C": "3"' in resp.text
|
||||
|
||||
def test_post_referer_redirect(self):
|
||||
# allowing 307 redirects
|
||||
|
Loading…
x
Reference in New Issue
Block a user