1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewriter: add rewrite_dash for rewriting DASH and HLS manifests!

rewriter: refactor to use mixins to extend base rewriter (todo: more refactoring)
fuzzy-matcher: support for additional 'match_filters' to filter fuzzy results via optional regexes by mime type,
eg. allow more lenient fuzzy matching on DASH manifests than other resources (for now)
fuzzy-matching: add WebAgg-Fuzzy-Match response header if response is fuzzy matched, redirect to exact match in rewriterapp
This commit is contained in:
Ilya Kreymer 2017-03-20 14:41:12 -07:00
parent 22edb2f14b
commit a82cfc1ab2
9 changed files with 90 additions and 32 deletions

View File

@ -39,6 +39,8 @@ class HeaderRewriter(object):
'json': ['application/json'],
'hls': ['application/x-mpegURL'],
'xml': ['/xml', '+xml', '.xml', '.rss'],
'plain': ['text/plain'],

View File

@ -1,16 +1,15 @@
from io import BytesIO
from six.moves import zip
from pywb.rewrite.rewrite_content import RewriteContent
# ============================================================================
# Experimental: not fully tested
class RewriteContentAMF(RewriteContent): #pragma: no cover
class RewriteAMFMixin(object): #pragma: no cover
def handle_custom_rewrite(self, rewritten_headers, stream, urlrewriter, mod, env):
if rewritten_headers.status_headers.get_header('Content-Type') == 'application/x-amf':
stream = self.rewrite_amf(stream, env)
return (super(RewriteContentAMF, self).
return (super(RewriteAMFMixin, self).
handle_custom_rewrite(rewritten_headers, stream, urlrewriter, mod, env))
def rewrite_amf(self, stream, env):

View File

@ -76,6 +76,15 @@ class RewriteContent(object):
return (rewritten_headers, stream)
def _decoding_stream(self, rewritten_headers, stream):
    """Run ``stream`` through the encoding checks for each supported decompressor.

    Every decompressor type reported by
    ``BufferedReader.get_supported_decompressors()`` is offered to
    ``_check_encoding`` in turn; the stream it hands back replaces the
    current one, and the scan ends at the first type reported as matched.

    :param rewritten_headers: rewritten headers passed through to
        ``_check_encoding``
    :param stream: the payload stream to inspect
    :return: the stream returned by the last ``_check_encoding`` call, or
        the input stream if there are no supported decompressors
    """
    for encoding in BufferedReader.get_supported_decompressors():
        is_match, stream = self._check_encoding(rewritten_headers,
                                                stream,
                                                encoding)
        if is_match:
            # first matching encoding wins; no need to try the rest
            return stream

    return stream
def _check_encoding(self, rewritten_headers, stream, enc):
matched = False
@ -142,12 +151,7 @@ class RewriteContent(object):
encoding = None
first_buff = b''
for decomp_type in BufferedReader.get_supported_decompressors():
matched, stream = self._check_encoding(rewritten_headers,
stream,
decomp_type)
if matched:
break
stream = self._decoding_stream(rewritten_headers, stream)
if mod == 'js_':
text_type, stream = self._resolve_text_type('js',

View File

@ -337,13 +337,17 @@ rules:
- match: ''
invalid_: ''
# all domain rules -- fallback to this dataset
# all domain rules -- fallback to this dataset
#=================================================================
# Applies to all urls -- should be last
- url_prefix: ''
fuzzy_lookup: '()'
fuzzy_lookup:
match: '()'
match_filters:
- mime: 'application/dash+xml'
match: '()'
- mime: '*'
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
#fuzzy_lookup:
# match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
# filter: ['=urlkey:{0}']
# replace: '?'

View File

@ -360,7 +360,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
if (starts_with(href, REL_PREFIX)) {
href = wb_info.wombat_scheme + href;
href = "http:" + href;
}
return href;

View File

@ -1,6 +1,9 @@
import requests
from pywb.rewrite.rewrite_amf import RewriteContentAMF
from pywb.rewrite.rewrite_amf import RewriteAMFMixin
from pywb.rewrite.rewrite_dash import RewriteDASHMixin
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
@ -40,6 +43,11 @@ class UpstreamException(WbException):
self.status_code = status_code
# ============================================================================
class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
    """Content rewriter assembled from mixins layered over RewriteContent.

    The class adds no behavior of its own: it only fixes the method
    resolution order so that RewriteDASHMixin is consulted first, then
    RewriteAMFMixin, then the base RewriteContent implementation.
    """
# ============================================================================
class RewriterApp(object):
VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
@ -56,7 +64,7 @@ class RewriterApp(object):
frame_type = 'inverse' if framed_replay else False
self.content_rewriter = RewriteContentAMF(is_framed_replay=frame_type)
self.content_rewriter = Rewriter(is_framed_replay=frame_type)
if not jinja_env:
jinja_env = JinjaEnv(globals={'static_path': 'static/__pywb'})
@ -198,6 +206,9 @@ class RewriterApp(object):
cdx['timestamp'] = http_date_to_timestamp(memento_dt)
cdx['url'] = target_uri
if target_uri != wb_url.url and r.headers.get('WebAgg-Fuzzy-Match') == '1':
return WbResponse.redir_response(urlrewriter.rewrite(target_uri))
self._add_custom_params(cdx, r.headers, kwargs)
if readd_range:

View File

@ -10,7 +10,7 @@ from collections import namedtuple
# ============================================================================
FuzzyRule = namedtuple('FuzzyRule',
'url_prefix, regex, replace_after, filter_str, ' +
'match_type')
'match_type, match_filters')
# ============================================================================
@ -45,14 +45,28 @@ class FuzzyMatcher(object):
replace_after = self.DEFAULT_REPLACE_AFTER
filter_str = self.DEFAULT_FILTER
match_type = self.DEFAULT_MATCH_TYPE
match_filters = None
else:
regex = self.make_regex(config.get('match'))
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
filter_str = config.get('filter', self.DEFAULT_FILTER)
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
match_filters = self._init_match_filters(config.get('match_filters'))
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type)
return FuzzyRule(url_prefix, regex, replace_after, filter_str,
match_type, match_filters)
def _init_match_filters(self, filter_config):
    """Compile the ``match`` regex of each configured match filter.

    :param filter_config: iterable of filter dicts, each holding a
        ``match`` regex string alongside its other keys (e.g. ``mime``),
        or a falsy value when no filters are configured
    :return: a new list of filter dicts whose ``match`` value is a
        compiled pattern, or ``None`` when ``filter_config`` is falsy
    """
    if not filter_config:
        return None

    # Build fresh dicts instead of writing the compiled pattern back into
    # the caller's config, so the loaded rules keep their original string
    # values and can be safely reused.
    return [dict(entry, match=re.compile(entry['match']))
            for entry in filter_config]
def get_fuzzy_match(self, params):
urlkey = to_native_str(params['key'], 'utf-8')
@ -70,9 +84,8 @@ class FuzzyMatcher(object):
matched_rule = rule
groups = m.groups()
for g in groups:
for f in matched_rule.filter_str:
filters.append(f.format(g))
for f in matched_rule.filter_str:
filters.append(f.format(*groups))
break
@ -132,6 +145,8 @@ class FuzzyMatcher(object):
if found:
return
url = params['url']
rule = self.get_fuzzy_match(params)
if not rule:
return
@ -139,10 +154,18 @@ class FuzzyMatcher(object):
new_iter, errs = index_source(params)
for cdx in new_iter:
if self.allow_fuzzy_result(rule, cdx):
if self.allow_fuzzy_result(rule, url, cdx):
cdx['is_fuzzy'] = True
yield cdx
def allow_fuzzy_result(self, rule, cdx):
return True
def allow_fuzzy_result(self, rule, url, cdx):
    """Decide whether a fuzzy-matched cdx entry may be returned for ``url``.

    When the rule has no ``match_filters`` every result is allowed.
    Otherwise the first filter whose ``mime`` equals the cdx entry's mime,
    or is the wildcard ``'*'``, decides: the result is allowed only if that
    filter's compiled ``match`` pattern is found in ``url``. If no filter
    applies, the result is rejected.

    :param rule: the matched fuzzy rule; only ``rule.match_filters`` is read
    :param url: the requested url
    :param cdx: the candidate cdx entry; its ``mime`` field selects the filter
    :return: ``True`` if the result is allowed, ``False`` otherwise
    """
    if not rule.match_filters:
        return True

    # A cdx entry without a mime field can still be handled by a '*' filter
    mime = cdx.get('mime')

    for match_filter in rule.match_filters:
        if match_filter['mime'] in (mime, '*'):
            # normalize the match object / None into a real boolean
            return bool(match_filter['match'].search(url))

    return False

View File

@ -44,6 +44,8 @@ class BaseLoader(object):
out_headers['WebAgg-Type'] = 'warc'
out_headers['WebAgg-Source-Coll'] = quote(cdx.get('source', ''), safe=':/')
out_headers['Content-Type'] = 'application/warc-record'
if cdx.get('is_fuzzy'):
out_headers['WebAgg-Fuzzy-Match'] = '1'
if not warc_headers:
if other_headers:

View File

@ -104,6 +104,15 @@ class TestWbIntegration(BaseConfigTest):
# assert 'wb.js' in resp.text
# assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.text
def test_replay_fuzzy_1(self):
    """A url with an extra ``_`` query param redirects to the url without it."""
    fuzzy_path = '/pywb/20140127171238mp_/http://www.iana.org/?_=123'
    exact_path = '/pywb/20140127171238mp_/http://www.iana.org/'

    response = self.testapp.get(fuzzy_path)

    assert response.status_int == 302
    assert response.headers['Location'].endswith(exact_path)
def test_replay_no_fuzzy_match(self):
    """A url with an unrelated query param is answered with a 404."""
    unmatched_path = '/pywb/20140127171238mp_/http://www.iana.org/?foo=bar'

    response = self.testapp.get(unmatched_path, status=404)

    assert response.status_int == 404
#def test_replay_non_surt(self):
# resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
# self._assert_basic_html(resp)
@ -396,13 +405,17 @@ class TestWbIntegration(BaseConfigTest):
assert resp.status_int == 200
assert '"data": "^"' in resp.text
def test_post_fuzzy_match(self):
resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
assert resp.status_int == 200
assert '"A": "1"' in resp.text
assert '"B": "[]"' in resp.text
assert '"C": "3"' in resp.text
def test_post_invalid(self):
    """A JSON POST to the httpbin post url is answered with a 404."""
    post_path = '/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar'
    payload = {'data': '^'}

    # not json
    response = self.testapp.post_json(post_path, payload, status=404)

    assert response.status_int == 404
#def test_post_fuzzy_match(self):
# resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': 'x'})
# assert resp.status_int == 200
# assert '"A": "1"' in resp.text
# assert '"B": "[]"' in resp.text
# assert '"C": "3"' in resp.text
def test_post_referer_redirect(self):
# allowing 307 redirects