mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite refactoring:
- rewrite headers after content to ensure content-length/content-encoding rewritten if content modified - header rewriter: remove proxyrewriter, set default rule to 'prefix' or 'keep' if url rewriting or not - set is_content_rw if record.content_stream(), assume content is modified - add BufferedRewriter as base for dash, hls, amf rewriting which processes the full stream - should_rw_content() determines if should attempt content rewriting - support banner-only insert mode: added HTMLInsertOnlyRewriter, enable if no custom JS rules - test: enable banner-only test mode
This commit is contained in:
parent
c1be7d4da5
commit
d8b67319e1
@ -7,6 +7,7 @@ from warcio.utils import to_native_str
|
||||
|
||||
import re
|
||||
import webencodings
|
||||
import tempfile
|
||||
|
||||
from pywb.webagg.utils import StreamIter, BUFF_SIZE
|
||||
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
|
||||
@ -78,11 +79,16 @@ class BaseContentRewriter(object):
|
||||
def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
|
||||
rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo)
|
||||
|
||||
if rw_type in ('js', 'js_proxy'):
|
||||
if rw_type in ('js', 'js-proxy'):
|
||||
extra_rules = []
|
||||
if 'js_regex_func' in rule:
|
||||
extra_rules = rule['js_regex_func'](rwinfo.url_rewriter)
|
||||
|
||||
# if js-proxy and no rules, default to none
|
||||
# js rewriting in proxy only if extra rules apply
|
||||
if rw_type == 'js-proxy' and not extra_rules:
|
||||
return None
|
||||
|
||||
return rw_class(rwinfo.url_rewriter, extra_rules)
|
||||
|
||||
elif rw_type != 'html':
|
||||
@ -94,6 +100,10 @@ class BaseContentRewriter(object):
|
||||
js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx)
|
||||
css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx)
|
||||
|
||||
# if no js rewriter, then do banner insert only
|
||||
if not js_rewriter:
|
||||
rw_class = self.all_rewriters.get('html-banner-only')
|
||||
|
||||
rw = rw_class(rwinfo.url_rewriter,
|
||||
js_rewriter=js_rewriter,
|
||||
css_rewriter=css_rewriter,
|
||||
@ -140,33 +150,28 @@ class BaseContentRewriter(object):
|
||||
return charset
|
||||
|
||||
def rewrite_headers(self, rwinfo):
|
||||
if rwinfo.is_url_rw():
|
||||
header_rw_name = 'header'
|
||||
else:
|
||||
header_rw_name = 'header-proxy'
|
||||
|
||||
header_rw_class = self.all_rewriters.get(header_rw_name)
|
||||
rwinfo.rw_http_headers = header_rw_class(rwinfo)()
|
||||
header_rw_class = self.all_rewriters.get('header')
|
||||
return header_rw_class(rwinfo)()
|
||||
|
||||
def __call__(self, record, url_rewriter, cookie_rewriter,
|
||||
head_insert_func=None,
|
||||
cdx=None):
|
||||
|
||||
rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter)
|
||||
|
||||
self.rewrite_headers(rwinfo)
|
||||
|
||||
content_rewriter = None
|
||||
if rwinfo.is_content_rw():
|
||||
|
||||
if rwinfo.should_rw_content():
|
||||
rule = self.get_rule(cdx)
|
||||
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
|
||||
|
||||
if content_rewriter:
|
||||
gen = content_rewriter(rwinfo)
|
||||
else:
|
||||
gen = StreamIter(rwinfo.content_stream)
|
||||
gen = StreamIter(rwinfo.record.raw_stream)
|
||||
|
||||
return rwinfo.rw_http_headers, gen, (content_rewriter != None)
|
||||
rw_http_headers = self.rewrite_headers(rwinfo)
|
||||
|
||||
return rw_http_headers, gen, (content_rewriter != None)
|
||||
|
||||
def init_js_regexs(self, regexs):
|
||||
raise NotImplemented()
|
||||
@ -175,10 +180,34 @@ class BaseContentRewriter(object):
|
||||
raise NotImplemented()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class BufferedRewriter(object):
    """Base class for rewriters that must see the full payload before rewriting.

    ``__call__`` copies the record's content stream into a spooled temporary
    file, rewinds it and passes it to ``rewrite_stream()``, which subclasses
    are expected to override.
    """

    def __init__(self, url_rewriter=None):
        """Store the (optional) url rewriter for use by subclasses."""
        self.url_rewriter = url_rewriter

    def __call__(self, rwinfo):
        """Buffer ``rwinfo.content_stream`` fully, then rewrite it.

        :param rwinfo: object exposing a readable, closeable ``content_stream``
        :return: the result of ``rewrite_stream()`` wrapped in ``StreamIter``
        """
        # Kept in memory up to BUFF_SIZE * 4 bytes, spilled to disk beyond that
        stream_buffer = tempfile.SpooledTemporaryFile(BUFF_SIZE * 4)

        # closing() guarantees the source stream is closed even on error
        with closing(rwinfo.content_stream) as fh:
            while True:
                # Read in bounded chunks so a large payload is never held
                # in memory as a single bytes object
                buff = fh.read(BUFF_SIZE)
                if not buff:
                    break

                stream_buffer.write(buff)

        # Rewind so rewrite_stream() reads from the start of the payload
        stream_buffer.seek(0)
        return StreamIter(self.rewrite_stream(stream_buffer))

    def rewrite_stream(self, stream):
        """Rewrite the fully buffered ``stream``; must be overridden.

        :param stream: seekable file-like object positioned at offset 0
        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is a non-callable constant, not an exception class;
        # NotImplementedError is the exception intended here
        raise NotImplementedError('implement in subclass')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class StreamingRewriter(object):
|
||||
def __init__(self):
|
||||
self.align_to_line = True
|
||||
def __init__(self, url_rewriter, align_to_line=True):
|
||||
self.url_rewriter = url_rewriter
|
||||
self.align_to_line = align_to_line
|
||||
|
||||
def __call__(self, rwinfo):
|
||||
gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream,
|
||||
@ -233,8 +262,8 @@ class RewriteInfo(object):
|
||||
def __init__(self, record, rewrite_types, url_rewriter, cookie_rewriter):
|
||||
self.record = record
|
||||
|
||||
self.rw_http_headers = record.http_headers
|
||||
self.content_stream = record.content_stream()
|
||||
self._content_stream = None
|
||||
self.is_content_rw = False
|
||||
|
||||
self.rewrite_types = rewrite_types
|
||||
|
||||
@ -287,15 +316,20 @@ class RewriteInfo(object):
|
||||
if self.TAG_REGEX.match(buff):
|
||||
self.text_type = 'html'
|
||||
|
||||
@property
|
||||
def content_stream(self):
|
||||
if not self._content_stream:
|
||||
self._content_stream = self.record.content_stream()
|
||||
self.is_content_rw = True
|
||||
|
||||
return self._content_stream
|
||||
|
||||
def read_and_keep(self, size):
|
||||
buff = self.content_stream.read(size)
|
||||
self.content_stream = BufferedReader(self.content_stream, starting_data=buff)
|
||||
self._content_stream = BufferedReader(self._content_stream, starting_data=buff)
|
||||
return buff
|
||||
|
||||
def is_content_rw(self):
|
||||
if not self.url_rewriter.prefix:
|
||||
return False
|
||||
|
||||
def should_rw_content(self):
|
||||
if self.url_rewriter.wburl.mod == 'id_':
|
||||
return False
|
||||
|
||||
@ -310,15 +344,15 @@ class RewriteInfo(object):
|
||||
elif not self.text_type:
|
||||
return False
|
||||
|
||||
elif self.text_type == 'css' or self.text_type == 'xml':
|
||||
if self.url_rewriter.wburl.mod == 'bn_':
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def is_url_rw(self):
|
||||
if not self.url_rewriter:
|
||||
return False
|
||||
|
||||
if self.url_rewriter.wburl.mod == 'id_':
|
||||
if self.url_rewriter.wburl.mod in ('id_', 'bn_'):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
34
pywb/rewrite/html_insert_rewriter.py
Normal file
34
pywb/rewrite/html_insert_rewriter.py
Normal file
@ -0,0 +1,34 @@
|
||||
import re
|
||||
from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class HTMLInsertOnlyRewriter(StreamingRewriter):
    """Insert a custom string directly after the opening HTML <head> tag.

    No other rewriting is performed. Only the first buffer passed to
    ``rewrite()`` is searched; every later buffer is returned untouched.
    """

    # Raw string: '\s' in a plain literal is an invalid escape sequence
    HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)

    def __init__(self, url_rewriter, **kwargs):
        """
        :param url_rewriter: passed through to ``StreamingRewriter``
        :param kwargs: must contain ``head_insert``, the string to insert
        :raises KeyError: if ``head_insert`` is not supplied
        """
        # Second argument is align_to_line, disabled for this rewriter
        super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)
        self.head_insert = kwargs['head_insert']

        # Becomes True once the first buffer has been examined
        self.done = False

    def rewrite(self, string):
        """Return ``string`` with ``head_insert`` placed after the <head> tag.

        :param string: the next chunk of text to process
        :return: the chunk, with the insert added if this is the first chunk
                 and it contains an opening <head> tag; otherwise unchanged
        """
        if self.done:
            return string

        # only try to find <head> in first buffer
        self.done = True

        m = self.HEAD_REGEX.search(string)

        # Nothing to do without a <head> tag or without insert text
        # (also avoids a TypeError when head_insert is None)
        if not m or not self.head_insert:
            return string

        inx = m.end()
        return string[:inx] + self.head_insert + string[inx:]
|
||||
|
||||
|
@ -19,7 +19,7 @@ from six import text_type
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriterMixin(object):
|
||||
class HTMLRewriterMixin(StreamingRewriter):
|
||||
"""
|
||||
HTML-Parsing Rewriter for custom rewriting, also delegates
|
||||
to rewriters for script and css
|
||||
@ -98,7 +98,7 @@ class HTMLRewriterMixin(object):
|
||||
defmod='',
|
||||
parse_comments=False):
|
||||
|
||||
self.url_rewriter = url_rewriter
|
||||
super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
|
||||
self._wb_parse_context = None
|
||||
|
||||
if js_rewriter:
|
||||
@ -443,7 +443,7 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriter(HTMLRewriterMixin, StreamingRewriter, HTMLParser):
|
||||
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
PARSETAG = re.compile('[<]')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -453,8 +453,6 @@ class HTMLRewriter(HTMLRewriterMixin, StreamingRewriter, HTMLParser):
|
||||
HTMLParser.__init__(self)
|
||||
|
||||
super(HTMLRewriter, self).__init__(*args, **kwargs)
|
||||
# for StreamingRewriter
|
||||
self.align_to_line = False
|
||||
|
||||
def reset(self):
|
||||
HTMLParser.reset(self)
|
||||
|
@ -7,10 +7,6 @@ class JSONPRewriter(StreamingRewriter):
|
||||
JSONP = re.compile(r'^(\w+)\(\{')
|
||||
CALLBACK = re.compile(r'[?].*callback=([^&]+)')
|
||||
|
||||
def __init__(self, urlrewriter):
|
||||
super(JSONPRewriter, self).__init__()
|
||||
self.urlrewriter = urlrewriter
|
||||
|
||||
def rewrite(self, string):
|
||||
# see if json is jsonp, starts with callback func
|
||||
m_json = self.JSONP.search(string)
|
||||
|
@ -1,6 +1,5 @@
|
||||
import re
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||
|
||||
|
||||
@ -44,7 +43,7 @@ class RegexRewriter(StreamingRewriter):
|
||||
#DEFAULT_OP = add_prefix
|
||||
|
||||
def __init__(self, rewriter, rules):
|
||||
super(RegexRewriter, self).__init__()
|
||||
super(RegexRewriter, self).__init__(rewriter)
|
||||
#rules = self.create_rules(http_prefix)
|
||||
|
||||
# Build regexstr, concatenating regex list
|
||||
|
@ -1,11 +1,13 @@
|
||||
from io import BytesIO
|
||||
from six.moves import zip
|
||||
|
||||
from pywb.rewrite.content_rewriter import BufferedRewriter
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Experimental: not fully tested
|
||||
class RewriteAMF(object): #pragma: no cover
|
||||
def __call__(self, rwinfo):
|
||||
class RewriteAMF(BufferedRewriter): #pragma: no cover
|
||||
def rewrite_stream(self, stream):
|
||||
try:
|
||||
from pyamf import remoting
|
||||
|
||||
@ -20,7 +22,7 @@ class RewriteAMF(object): #pragma: no cover
|
||||
res = remoting.decode(iobuff)
|
||||
|
||||
# TODO: revisit this
|
||||
inputdata = rwinfo.url_rewriter.rewrite_opts.get('pywb.inputdata')
|
||||
inputdata = url_rewriter.rewrite_opts.get('pywb.inputdata')
|
||||
|
||||
if inputdata:
|
||||
new_list = []
|
||||
@ -42,3 +44,5 @@ class RewriteAMF(object): #pragma: no cover
|
||||
traceback.print_exc()
|
||||
print(e)
|
||||
return stream
|
||||
|
||||
|
||||
|
@ -4,24 +4,14 @@ import json
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from pywb.webagg.utils import StreamIter
|
||||
from pywb.rewrite.content_rewriter import BufferedRewriter
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriteDASH(object):
|
||||
def __call__(self, rwinfo):
|
||||
buff_io = BytesIO()
|
||||
with closing(rwinfo.content_stream) as fh:
|
||||
while True:
|
||||
buff = fh.read()
|
||||
if not buff:
|
||||
break
|
||||
|
||||
buff_io.write(buff)
|
||||
|
||||
buff_io.seek(0)
|
||||
res_buff, best_ids = self.rewrite_dash(buff_io)
|
||||
return StreamIter(res_buff)
|
||||
class RewriteDASH(BufferedRewriter):
|
||||
def rewrite_stream(self, stream):
|
||||
res_buff, best_ids = self.rewrite_dash(stream)
|
||||
return res_buff
|
||||
|
||||
def rewrite_dash(self, stream):
|
||||
ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011')
|
||||
@ -70,7 +60,7 @@ def rewrite_fb_dash(string):
|
||||
buff = string.encode('utf-8').decode('unicode-escape')
|
||||
buff = buff.encode('utf-8')
|
||||
io = BytesIO(buff)
|
||||
io, best_ids = RewriteDASHMixin().rewrite_dash(io)
|
||||
io, best_ids = RewriteDASH().rewrite_dash(io)
|
||||
string = json.dumps(io.read().decode('utf-8'))
|
||||
string = string[1:-1].replace('<', r'\x3C')
|
||||
|
||||
|
@ -1,16 +1,14 @@
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pywb.webagg.utils import StreamIter
|
||||
|
||||
from pywb.rewrite.content_rewriter import BufferedRewriter
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class RewriteHLS(object):
|
||||
class RewriteHLS(BufferedRewriter):
|
||||
EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
|
||||
|
||||
def __call__(self, rwinfo):
|
||||
return StreamIter(self.rewrite_m3u8(rwinfo.content_stream))
|
||||
|
||||
def rewrite_m3u8(self, stream):
|
||||
def rewrite_stream(self, stream):
|
||||
buff = stream.read()
|
||||
|
||||
lines = buff.decode('utf-8').split('\n')
|
||||
|
@ -17,6 +17,9 @@ class PrefixHeaderRewriter(object):
|
||||
'content-location': 'url-rewrite',
|
||||
'content-base': 'url-rewrite',
|
||||
|
||||
'transfer-encoding': 'prefix',
|
||||
'connection': 'prefix',
|
||||
|
||||
'content-encoding': 'keep-if-no-content-rewrite',
|
||||
'content-length': 'content-length',
|
||||
|
||||
@ -24,13 +27,16 @@ class PrefixHeaderRewriter(object):
|
||||
'cookie': 'cookie',
|
||||
}
|
||||
|
||||
default_rule = 'prefix'
|
||||
|
||||
def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
|
||||
self.header_prefix = header_prefix
|
||||
self.rwinfo = rwinfo
|
||||
self.http_headers = rwinfo.record.http_headers
|
||||
|
||||
if rwinfo.is_url_rw():
|
||||
self.default_rule = 'prefix'
|
||||
else:
|
||||
self.default_rule = 'keep'
|
||||
|
||||
def __call__(self):
|
||||
new_headers_list = []
|
||||
for name, value in self.http_headers.headers:
|
||||
@ -54,14 +60,14 @@ class PrefixHeaderRewriter(object):
|
||||
return (name, self.rwinfo.url_rewriter.rewrite(value))
|
||||
|
||||
elif rule == 'keep-if-no-content-rewrite':
|
||||
if not self.rwinfo.is_content_rw():
|
||||
if not self.rwinfo.is_content_rw:
|
||||
return (name, value)
|
||||
|
||||
elif rule == 'content-length':
|
||||
if value == '0':
|
||||
return (name, value)
|
||||
|
||||
if not self.rwinfo.is_content_rw():
|
||||
if not self.rwinfo.is_content_rw:
|
||||
try:
|
||||
if int(value) >= 0:
|
||||
return (name, value)
|
||||
@ -92,11 +98,3 @@ class PrefixHeaderRewriter(object):
|
||||
new_headers.append(('Expires', datetime_to_http_date(dt)))
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class ProxyHeaderRewriter(PrefixHeaderRewriter):
|
||||
header_rules = {
|
||||
'transfer-encoding': 'prefix',
|
||||
'connection': 'prefix',
|
||||
}
|
||||
|
||||
default_rule = 'keep'
|
||||
|
@ -1,12 +1,13 @@
|
||||
from pywb.rewrite.content_rewriter import BaseContentRewriter
|
||||
|
||||
from pywb.rewrite.html_rewriter import HTMLRewriter
|
||||
from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter
|
||||
|
||||
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
|
||||
|
||||
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter
|
||||
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter
|
||||
|
||||
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
|
||||
|
||||
@ -19,9 +20,9 @@ from pywb.rewrite.rewrite_amf import RewriteAMF
|
||||
class DefaultRewriter(BaseContentRewriter):
|
||||
all_rewriters = {
|
||||
'header': PrefixHeaderRewriter,
|
||||
'header-proxy': ProxyHeaderRewriter,
|
||||
|
||||
'html': HTMLRewriter,
|
||||
'html-banner-only': HTMLInsertOnlyRewriter,
|
||||
|
||||
'css': CSSRewriter,
|
||||
|
||||
|
@ -152,7 +152,7 @@ class TestWbIntegration(BaseConfigTest):
|
||||
assert len(lines) == 17
|
||||
assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')
|
||||
|
||||
def _test_replay_banner_only(self):
|
||||
def test_replay_banner_only(self):
|
||||
resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved')
|
||||
|
||||
# wb.js header insertion
|
||||
|
Loading…
x
Reference in New Issue
Block a user