Mirror of https://github.com/webrecorder/pywb.git
rewriter: add rewrite_dash for rewriting DASH and HLS manifests!
rewriter: refactor to use mixins to extend the base rewriter (todo: more refactoring)

fuzzy-matcher: support additional 'match_filters' to filter fuzzy results via optional regexes keyed by mime type, e.g. allow more lenient fuzzy matching on DASH manifests than on other resources (for now)

fuzzy-matching: add a WebAgg-Fuzzy-Match response header when a response is fuzzy matched; redirect to the exact match in rewriterapp
commit a82cfc1ab2
parent 22edb2f14b
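The redirect behavior described in the commit message works end to end roughly like this: the upstream loader flags fuzzy-matched records with a WebAgg-Fuzzy-Match: 1 response header (see the BaseLoader hunk below), and RewriterApp, on seeing that flag for a URL that differs from the one the client requested, redirects to the exact archived URL instead of serving the fuzzy result in place. A minimal standalone sketch of that decision; the helper name and argument shape are illustrative, not part of pywb:

from typing import Mapping, Optional


def fuzzy_redirect_target(requested_url: str, archived_url: str,
                          resp_headers: Mapping[str, str]) -> Optional[str]:
    """Return the URL to redirect to when a fuzzy-matched capture was served.

    Mirrors the check added in RewriterApp below: redirect only if the upstream
    response was flagged as a fuzzy match and the archived URL differs from
    the URL the client actually asked for.
    """
    if archived_url != requested_url and resp_headers.get('WebAgg-Fuzzy-Match') == '1':
        return archived_url

    return None


# A cache-busted request resolved to the clean capture, so a redirect is issued.
print(fuzzy_redirect_target('http://www.iana.org/?_=123',
                            'http://www.iana.org/',
                            {'WebAgg-Fuzzy-Match': '1'}))   # -> 'http://www.iana.org/'

In the actual app the redirect target is additionally passed through the URL rewriter (urlrewriter.rewrite) so the Location header stays inside the replay URL space.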
@@ -39,6 +39,8 @@ class HeaderRewriter(object):
 
         'json': ['application/json'],
 
+        'hls': ['application/x-mpegURL'],
+
         'xml': ['/xml', '+xml', '.xml', '.rss'],
 
         'plain': ['text/plain'],
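This hunk registers the HLS manifest content type ('application/x-mpegURL') so such responses are classified for rewriting alongside the existing text types. A small standalone sketch of the kind of lookup the REWRITE_TYPES table implies; the substring-matching semantics are an assumption based on entries like '/xml' and '+xml', and classify() is not HeaderRewriter's real API:

# Illustrative only: map a Content-Type value to a rewrite type. Exact entries
# like 'application/json' hit directly; fragments like '/xml' or '+xml' are
# assumed to match as substrings.
REWRITE_TYPES = {
    'json': ['application/json'],
    'hls': ['application/x-mpegURL'],
    'xml': ['/xml', '+xml', '.xml', '.rss'],
    'plain': ['text/plain'],
}


def classify(content_type):
    for text_type, patterns in REWRITE_TYPES.items():
        if any(pattern in content_type for pattern in patterns):
            return text_type
    return None


assert classify('application/x-mpegURL') == 'hls'
assert classify('application/rss+xml') == 'xml'   # via the '+xml' fragment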
@@ -1,16 +1,15 @@
 from io import BytesIO
 from six.moves import zip
-from pywb.rewrite.rewrite_content import RewriteContent
 
 
 # ============================================================================
 # Expiermental: not fully tested
-class RewriteContentAMF(RewriteContent): #pragma: no cover
+class RewriteAMFMixin(object): #pragma: no cover
     def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
         if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
             stream = self.rewrite_amf(stream, env)
 
-        return (super(RewriteContentAMF, self).
+        return (super(RewriteAMFMixin, self).
                 handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
 
     def rewrite_amf(self, stream, env):
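Turning the AMF handler from a RewriteContent subclass into a standalone mixin means each format handler can override handle_custom_rewrite, act only when its content type matches, and then hand off to the next class in the MRO via super(). A toy illustration of that cooperative pattern; the class names and the simplified string "stream" are stand-ins, not pywb's real classes:

class BaseRewriter(object):
    def handle_custom_rewrite(self, content_type, stream):
        # End of the chain: return the (possibly transformed) stream.
        return stream


class AMFMixin(object):
    def handle_custom_rewrite(self, content_type, stream):
        if content_type == 'application/x-amf':
            stream = '[amf-rewritten]' + stream
        return super(AMFMixin, self).handle_custom_rewrite(content_type, stream)


class DASHMixin(object):
    def handle_custom_rewrite(self, content_type, stream):
        if content_type == 'application/dash+xml':
            stream = '[dash-rewritten]' + stream
        return super(DASHMixin, self).handle_custom_rewrite(content_type, stream)


class Rewriter(DASHMixin, AMFMixin, BaseRewriter):
    pass


# Each mixin only acts on its own content type, then delegates to the next.
print(Rewriter().handle_custom_rewrite('application/dash+xml', '<MPD/>'))
# -> '[dash-rewritten]<MPD/>'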
@@ -76,6 +76,15 @@ class RewriteContent(object):
 
         return (rewritten_headers, stream)
 
+    def _decoding_stream(self, rewritten_headers, stream):
+        for decomp_type in BufferedReader.get_supported_decompressors():
+            matched, stream = self._check_encoding(rewritten_headers,
+                                                   stream,
+                                                   decomp_type)
+            if matched:
+                break
+
+        return stream
 
     def _check_encoding(self, rewritten_headers, stream, enc):
         matched = False
@@ -142,12 +151,7 @@ class RewriteContent(object):
         encoding = None
         first_buff = b''
 
-        for decomp_type in BufferedReader.get_supported_decompressors():
-            matched, stream = self._check_encoding(rewritten_headers,
-                                                   stream,
-                                                   decomp_type)
-            if matched:
-                break
+        stream = self._decoding_stream(rewritten_headers, stream)
 
         if mod == 'js_':
             text_type, stream = self._resolve_text_type('js',
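These two hunks factor the decompressor-probing loop out of the main rewrite path into a reusable _decoding_stream helper: try each supported decompressor in turn and stop at the first one whose encoding matches the response. A standalone sketch of that probe-until-match shape; probe_stream and the check callback are hypothetical stand-ins for _decoding_stream and _check_encoding:

def probe_stream(stream, candidates, check):
    """Try each candidate in order; return the stream produced by the first
    candidate that `check` accepts, or the original stream if none match."""
    for candidate in candidates:
        matched, stream = check(stream, candidate)
        if matched:
            break

    return stream


def make_check(declared_encoding):
    # Hypothetical check: accept a candidate only if it equals the declared
    # Content-Encoding, and "wrap" the stream to mark it decoded.
    def check(stream, candidate):
        if candidate == declared_encoding:
            return True, ('decoded-with-' + candidate, stream)
        return False, stream
    return check


print(probe_stream('raw-bytes', ['gzip', 'deflate'], make_check('deflate')))
# -> ('decoded-with-deflate', 'raw-bytes')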
@@ -337,13 +337,17 @@ rules:
        - match: ''
          invalid_: ''
 
-    # all domain rules -- fallback to this dataset
+
+    # all domain rules -- fallback to this dataset
     #=================================================================
     # Applies to all urls -- should be last
     - url_prefix: ''
-      fuzzy_lookup: '()'
+      fuzzy_lookup:
+        match: '()'
+        match_filters:
+          - mime: 'application/dash+xml'
+            match: '()'
+
+          - mime: '*'
+            match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
 
-      #fuzzy_lookup:
-      #  match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
-      #  filter: ['=urlkey:{0}']
-      #  replace: '?'
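The new fuzzy_lookup block keeps the catch-all match ('()' matches any URL) but gates results with match_filters: DASH manifests (application/dash+xml) are accepted unconditionally, while every other mime type ('*') is only accepted when the URL carries a cache-busting _= or uncache= parameter. A sketch of how such a block could be compiled and evaluated, modelled on the _init_match_filters and allow_fuzzy_result changes further down; this is standalone code, not pywb's:

import re

# The match_filters block from the rule above, as it would arrive from YAML.
match_filters = [
    {'mime': 'application/dash+xml', 'match': '()'},
    {'mime': '*', 'match': r'(.*)[&?](?:_|uncache)=[\d]+[&]?'},
]

# Compile the regexes once up front, like _init_match_filters.
compiled = [dict(f, match=re.compile(f['match'])) for f in match_filters]


def allow_fuzzy(url, mime):
    # The first filter whose mime matches exactly (or is '*') decides.
    for filt in compiled:
        if filt['mime'] in (mime, '*'):
            return bool(filt['match'].search(url))
    return False


assert allow_fuzzy('http://example.com/manifest.mpd?rand=99', 'application/dash+xml')
assert allow_fuzzy('http://example.com/page?_=123', 'text/html')
assert not allow_fuzzy('http://example.com/page?foo=bar', 'text/html')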
@@ -360,7 +360,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
         }
 
         if (starts_with(href, REL_PREFIX)) {
-            href = wb_info.wombat_scheme + href;
+            href = "http:" + href;
         }
 
         return href;
@@ -1,6 +1,9 @@
 import requests
 
-from pywb.rewrite.rewrite_amf import RewriteContentAMF
+from pywb.rewrite.rewrite_amf import RewriteAMFMixin
+from pywb.rewrite.rewrite_dash import RewriteDASHMixin
+from pywb.rewrite.rewrite_content import RewriteContent
 
 from pywb.rewrite.wburl import WbUrl
 from pywb.rewrite.url_rewriter import UrlRewriter
+
@@ -40,6 +43,11 @@ class UpstreamException(WbException):
         self.status_code = status_code
 
 
+# ============================================================================
+class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
+    pass
+
+
 # ============================================================================
 class RewriterApp(object):
     VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
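Method resolution order is what sequences the handlers: a call on Rewriter tries the DASH mixin first, then the AMF mixin, then falls through to the base RewriteContent implementation. A quick way to inspect that ordering, assuming the imports shown above are available (illustrative snippet, output abbreviated):

# Hypothetical interactive check; the class names come from the imports above.
from pywb.rewrite.rewrite_amf import RewriteAMFMixin
from pywb.rewrite.rewrite_dash import RewriteDASHMixin
from pywb.rewrite.rewrite_content import RewriteContent


class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
    pass


print([cls.__name__ for cls in Rewriter.__mro__])
# -> ['Rewriter', 'RewriteDASHMixin', 'RewriteAMFMixin', 'RewriteContent', ..., 'object']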
@@ -56,7 +64,7 @@ class RewriterApp(object):
 
         frame_type = 'inverse' if framed_replay else False
 
-        self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
+        self.content_rewriter = Rewriter(is_framed_replay=frame_type)
 
         if not jinja_env:
             jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
@@ -198,6 +206,9 @@ class RewriterApp(object):
             cdx['timestamp'] = http_date_to_timestamp(memento_dt)
             cdx['url'] = target_uri
 
+        if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1':
+            return WbResponse.redir_response(urlrewriter.rewrite(target_uri))
+
         self._add_custom_params(cdx, r.headers, kwargs)
 
         if readd_range:
@@ -10,7 +10,7 @@ from collections import namedtuple
 # ============================================================================
 FuzzyRule = namedtuple('FuzzyRule',
                        'url_prefix, regex, replace_after, filter_str, ' +
-                       'match_type')
+                       'match_type, match_filters')
 
 
 # ============================================================================
@@ -45,14 +45,28 @@ class FuzzyMatcher(object):
             replace_after = self.DEFAULT_REPLACE_AFTER
             filter_str = self.DEFAULT_FILTER
             match_type = self.DEFAULT_MATCH_TYPE
+            match_filters = None
 
         else:
             regex = self.make_regex(config.get('match'))
             replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
             filter_str = config.get('filter', self.DEFAULT_FILTER)
             match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
+            match_filters = self._init_match_filters(config.get('match_filters'))
 
-        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
+        return FuzzyRule(url_prefix, regex, replace_after, filter_str,
+                         match_type, match_filters)
+
+    def _init_match_filters(self, filter_config):
+        if not filter_config:
+            return
+
+        filters = []
+        for filter_ in filter_config:
+            filter_['match'] = re.compile(filter_['match'])
+            filters.append(filter_)
+
+        return filters
 
     def get_fuzzy_match(self, params):
         urlkey = to_native_str(params['key'], 'utf-8')
@@ -70,9 +84,8 @@ class FuzzyMatcher(object):
 
             matched_rule = rule
             groups = m.groups()
-            for g in groups:
-                for f in matched_rule.filter_str:
-                    filters.append(f.format(g))
+            for f in matched_rule.filter_str:
+                filters.append(f.format(*groups))
 
             break
 
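This change switches filter expansion from one pass per captured group to a single pass per filter template, with all groups supplied at once, so a template like the default '=urlkey:{0}' is expanded exactly once (and a template could reference later groups as {1}, {2}, ...). A quick illustration of the difference; the group values are made up:

groups = ('example,com)/page', 'v=2')
filters = ['=urlkey:{0}']

# Old behaviour: every captured group expanded against every filter template.
old = [f.format(g) for g in groups for f in filters]
# -> ['=urlkey:example,com)/page', '=urlkey:v=2']

# New behaviour: each template expanded once, with all groups available.
new = [f.format(*groups) for f in filters]
# -> ['=urlkey:example,com)/page']

print(old)
print(new)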
@@ -132,6 +145,8 @@ class FuzzyMatcher(object):
         if found:
             return
 
+        url = params['url']
+
         rule = self.get_fuzzy_match(params)
         if not rule:
             return
@@ -139,10 +154,18 @@ class FuzzyMatcher(object):
         new_iter, errs = index_source(params)
 
         for cdx in new_iter:
-            if self.allow_fuzzy_result(rule, cdx):
+            if self.allow_fuzzy_result(rule, url, cdx):
+                cdx['is_fuzzy'] = True
                 yield cdx
 
-    def allow_fuzzy_result(self, rule, cdx):
-        return True
+    def allow_fuzzy_result(self, rule, url, cdx):
+        if not rule.match_filters:
+            return True
+
+        for match_filter in rule.match_filters:
+            if match_filter['mime'] in (cdx['mime'], '*'):
+                return match_filter['match'].search(url)
+
+        return False
 
 
@@ -44,6 +44,8 @@ class BaseLoader(object):
         out_headers['WebAgg-Type'] = 'warc'
         out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
         out_headers['Content-Type'] = 'application/warc-record'
+        if cdx.get('is_fuzzy'):
+            out_headers['WebAgg-Fuzzy-Match'] = '1'
 
         if not warc_headers:
             if other_headers:
@@ -104,6 +104,15 @@ class TestWbIntegration(BaseConfigTest):
         # assert 'wb.js' in resp.text
         # assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.text
 
+    def test_replay_fuzzy_1(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?_=123')
+        assert resp.status_int == 302
+        assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://www.iana.org/')
+
+    def test_replay_no_fuzzy_match(self):
+        resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/?foo=bar', status=404)
+        assert resp.status_int == 404
+
     #def test_replay_non_surt(self):
     #    resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
     #    self._assert_basic_html(resp)
@@ -396,13 +405,17 @@ class TestWbIntegration(BaseConfigTest):
         assert resp.status_int == 200
         assert '"data": "^"' in resp.text
 
-    def test_post_fuzzy_match(self):
-        resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
-        assert resp.status_int == 200
-        assert '"A": "1"' in resp.text
-        assert '"B": "[]"' in resp.text
-        assert '"C": "3"' in resp.text
+    def test_post_invalid(self):
+        # not json
+        resp = self.testapp.post_json('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'}, status=404)
+        assert resp.status_int == 404
 
+    #def test_post_fuzzy_match(self):
+    #    resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
+    #    assert resp.status_int == 200
+    #    assert '"A": "1"' in resp.text
+    #    assert '"B": "[]"' in resp.text
+    #    assert '"C": "3"' in resp.text
 
     def test_post_referer_redirect(self):
         # allowing 307 redirects