1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite refactoring:

- rewrite headers after content, to ensure content-length/content-encoding are rewritten if the content was modified
- header rewriter: remove ProxyHeaderRewriter; set the default rule to 'prefix' when url rewriting, 'keep' otherwise
- set is_content_rw once record.content_stream() is accessed, assuming the content is then modified
- add BufferedRewriter as base for dash, hls, amf rewriting which processes the full stream
- should_rw_content() determines if should attempt content rewriting
- support banner-only insert mode: added HTMLInsertOnlyRewriter, enable if no custom JS rules
- test: enable banner-only test mode
This commit is contained in:
Ilya Kreymer 2017-05-14 15:10:37 -07:00
parent c1be7d4da5
commit d8b67319e1
11 changed files with 131 additions and 79 deletions

View File

@ -7,6 +7,7 @@ from warcio.utils import to_native_str
import re
import webencodings
import tempfile
from pywb.webagg.utils import StreamIter, BUFF_SIZE
from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter
@ -78,11 +79,16 @@ class BaseContentRewriter(object):
def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None):
rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo)
if rw_type in ('js', 'js_proxy'):
if rw_type in ('js', 'js-proxy'):
extra_rules = []
if 'js_regex_func' in rule:
extra_rules = rule['js_regex_func'](rwinfo.url_rewriter)
# if js-proxy and no rules, default to none
# js rewriting in proxy only if extra rules apply
if rw_type == 'js-proxy' and not extra_rules:
return None
return rw_class(rwinfo.url_rewriter, extra_rules)
elif rw_type != 'html':
@ -94,6 +100,10 @@ class BaseContentRewriter(object):
js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx)
css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx)
# if no js rewriter, then do banner insert only
if not js_rewriter:
rw_class = self.all_rewriters.get('html-banner-only')
rw = rw_class(rwinfo.url_rewriter,
js_rewriter=js_rewriter,
css_rewriter=css_rewriter,
@ -140,33 +150,28 @@ class BaseContentRewriter(object):
return charset
def rewrite_headers(self, rwinfo):
if rwinfo.is_url_rw():
header_rw_name = 'header'
else:
header_rw_name = 'header-proxy'
header_rw_class = self.all_rewriters.get(header_rw_name)
rwinfo.rw_http_headers = header_rw_class(rwinfo)()
header_rw_class = self.all_rewriters.get('header')
return header_rw_class(rwinfo)()
def __call__(self, record, url_rewriter, cookie_rewriter,
head_insert_func=None,
cdx=None):
rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter)
self.rewrite_headers(rwinfo)
content_rewriter = None
if rwinfo.is_content_rw():
if rwinfo.should_rw_content():
rule = self.get_rule(cdx)
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
if content_rewriter:
gen = content_rewriter(rwinfo)
else:
gen = StreamIter(rwinfo.content_stream)
gen = StreamIter(rwinfo.record.raw_stream)
return rwinfo.rw_http_headers, gen, (content_rewriter != None)
rw_http_headers = self.rewrite_headers(rwinfo)
return rw_http_headers, gen, (content_rewriter != None)
def init_js_regexs(self, regexs):
    """Abstract hook: the base class provides no implementation.

    :param regexs: unused here; meaning is defined by the overriding subclass
    :raises NotImplementedError: always, unless overridden
    """
    # NotImplemented is a comparison sentinel and is not callable;
    # NotImplementedError is the exception meant for abstract methods
    raise NotImplementedError()
@ -175,10 +180,34 @@ class BaseContentRewriter(object):
raise NotImplemented()
# ============================================================================
class BufferedRewriter(object):
    """Base class for rewriters that process the full content stream.

    The content stream is copied into a spooled temporary file, rewound,
    and passed to ``rewrite_stream()``, which subclasses must implement.
    """

    def __init__(self, url_rewriter=None):
        """Store the (optional) url rewriter for use by subclasses."""
        self.url_rewriter = url_rewriter

    def __call__(self, rwinfo):
        """Buffer ``rwinfo.content_stream`` and rewrite it as a whole.

        :param rwinfo: object exposing a readable, closeable ``content_stream``
        :return: ``StreamIter`` over the result of ``rewrite_stream()``
        """
        # kept in memory up to BUFF_SIZE * 4 bytes, rolled over to disk beyond that
        stream_buffer = tempfile.SpooledTemporaryFile(BUFF_SIZE * 4)

        # closing() guarantees the source stream is closed even on error
        with closing(rwinfo.content_stream) as fh:
            while True:
                buff = fh.read()
                # an empty read signals the end of the stream
                if not buff:
                    break

                stream_buffer.write(buff)

        # rewind so rewrite_stream() reads from the start
        stream_buffer.seek(0)
        return StreamIter(self.rewrite_stream(stream_buffer))

    def rewrite_stream(self, stream):
        """Rewrite the fully buffered ``stream``; must be overridden.

        :param stream: seekable file-like object positioned at offset 0
        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is not callable; raise the proper exception type
        raise NotImplementedError('implement in subclass')
# ============================================================================
class StreamingRewriter(object):
def __init__(self):
self.align_to_line = True
def __init__(self, url_rewriter, align_to_line=True):
self.url_rewriter = url_rewriter
self.align_to_line = align_to_line
def __call__(self, rwinfo):
gen = self.rewrite_text_stream_to_gen(rwinfo.content_stream,
@ -233,8 +262,8 @@ class RewriteInfo(object):
def __init__(self, record, rewrite_types, url_rewriter, cookie_rewriter):
self.record = record
self.rw_http_headers = record.http_headers
self.content_stream = record.content_stream()
self._content_stream = None
self.is_content_rw = False
self.rewrite_types = rewrite_types
@ -287,15 +316,20 @@ class RewriteInfo(object):
if self.TAG_REGEX.match(buff):
self.text_type = 'html'
@property
def content_stream(self):
if not self._content_stream:
self._content_stream = self.record.content_stream()
self.is_content_rw = True
return self._content_stream
def read_and_keep(self, size):
buff = self.content_stream.read(size)
self.content_stream = BufferedReader(self.content_stream, starting_data=buff)
self._content_stream = BufferedReader(self._content_stream, starting_data=buff)
return buff
def is_content_rw(self):
if not self.url_rewriter.prefix:
return False
def should_rw_content(self):
if self.url_rewriter.wburl.mod == 'id_':
return False
@ -310,15 +344,15 @@ class RewriteInfo(object):
elif not self.text_type:
return False
elif self.text_type == 'css' or self.text_type == 'xml':
if self.url_rewriter.wburl.mod == 'bn_':
return False
return True
def is_url_rw(self):
if not self.url_rewriter:
return False
if self.url_rewriter.wburl.mod == 'id_':
if self.url_rewriter.wburl.mod in ('id_', 'bn_'):
return False
return True

View File

@ -0,0 +1,34 @@
import re
from pywb.rewrite.content_rewriter import StreamingRewriter
# ============================================================================
class HTMLInsertOnlyRewriter(StreamingRewriter):
    """ Insert custom string into HTML <head> tag
    no other rewriting performed
    """
    # raw string: avoids the invalid '\s' escape of a plain string literal
    # while compiling to exactly the same pattern
    HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)

    def __init__(self, url_rewriter, **kwargs):
        """
        :param url_rewriter: passed through to StreamingRewriter
        :param kwargs: ``head_insert`` is the string to place right after
            the opening <head> tag; other keys are ignored
        """
        super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)
        self.head_insert = kwargs.get('head_insert')

        # with nothing to insert, act as a pure pass-through instead of
        # failing on string concatenation with None
        self.done = not self.head_insert

    def rewrite(self, string):
        """Return ``string`` with ``head_insert`` added after <head>.

        Only the first buffer passed in is searched; every later call
        returns its input unchanged.
        """
        if self.done:
            return string

        # only try to find <head> in first buffer
        self.done = True
        m = self.HEAD_REGEX.search(string)
        if not m:
            return string

        inx = m.end()
        return string[:inx] + self.head_insert + string[inx:]

View File

@ -19,7 +19,7 @@ from six import text_type
#=================================================================
class HTMLRewriterMixin(object):
class HTMLRewriterMixin(StreamingRewriter):
"""
HTML-Parsing Rewriter for custom rewriting, also delegates
to rewriters for script and css
@ -98,7 +98,7 @@ class HTMLRewriterMixin(object):
defmod='',
parse_comments=False):
self.url_rewriter = url_rewriter
super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
self._wb_parse_context = None
if js_rewriter:
@ -443,7 +443,7 @@ class HTMLRewriterMixin(object):
#=================================================================
class HTMLRewriter(HTMLRewriterMixin, StreamingRewriter, HTMLParser):
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
PARSETAG = re.compile('[<]')
def __init__(self, *args, **kwargs):
@ -453,8 +453,6 @@ class HTMLRewriter(HTMLRewriterMixin, StreamingRewriter, HTMLParser):
HTMLParser.__init__(self)
super(HTMLRewriter, self).__init__(*args, **kwargs)
# for StreamingRewriter
self.align_to_line = False
def reset(self):
HTMLParser.reset(self)

View File

@ -7,10 +7,6 @@ class JSONPRewriter(StreamingRewriter):
JSONP = re.compile(r'^(\w+)\(\{')
CALLBACK = re.compile(r'[?].*callback=([^&]+)')
def __init__(self, urlrewriter):
super(JSONPRewriter, self).__init__()
self.urlrewriter = urlrewriter
def rewrite(self, string):
# see if json is jsonp, starts with callback func
m_json = self.JSONP.search(string)

View File

@ -1,6 +1,5 @@
import re
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.content_rewriter import StreamingRewriter
@ -44,7 +43,7 @@ class RegexRewriter(StreamingRewriter):
#DEFAULT_OP = add_prefix
def __init__(self, rewriter, rules):
super(RegexRewriter, self).__init__()
super(RegexRewriter, self).__init__(rewriter)
#rules = self.create_rules(http_prefix)
# Build regexstr, concatenating regex list

View File

@ -1,11 +1,13 @@
from io import BytesIO
from six.moves import zip
from pywb.rewrite.content_rewriter import BufferedRewriter
# ============================================================================
# Experimental: not fully tested
class RewriteAMF(object): #pragma: no cover
def __call__(self, rwinfo):
class RewriteAMF(BufferedRewriter): #pragma: no cover
def rewrite_stream(self, stream):
try:
from pyamf import remoting
@ -20,7 +22,7 @@ class RewriteAMF(object): #pragma: no cover
res = remoting.decode(iobuff)
# TODO: revisit this
inputdata = rwinfo.url_rewriter.rewrite_opts.get('pywb.inputdata')
inputdata = url_rewriter.rewrite_opts.get('pywb.inputdata')
if inputdata:
new_list = []
@ -42,3 +44,5 @@ class RewriteAMF(object): #pragma: no cover
traceback.print_exc()
print(e)
return stream

View File

@ -4,24 +4,14 @@ import json
import xml.etree.ElementTree as ET
from pywb.webagg.utils import StreamIter
from pywb.rewrite.content_rewriter import BufferedRewriter
# ============================================================================
class RewriteDASH(object):
def __call__(self, rwinfo):
buff_io = BytesIO()
with closing(rwinfo.content_stream) as fh:
while True:
buff = fh.read()
if not buff:
break
buff_io.write(buff)
buff_io.seek(0)
res_buff, best_ids = self.rewrite_dash(buff_io)
return StreamIter(res_buff)
class RewriteDASH(BufferedRewriter):
def rewrite_stream(self, stream):
    """Rewrite the buffered stream via ``rewrite_dash()``.

    ``rewrite_dash()`` returns a pair; only the first element (the
    rewritten buffer) is returned, the second (``best_ids``) is dropped.
    """
    rewritten, _best_ids = self.rewrite_dash(stream)
    return rewritten
def rewrite_dash(self, stream):
ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011')
@ -70,7 +60,7 @@ def rewrite_fb_dash(string):
buff = string.encode('utf-8').decode('unicode-escape')
buff = buff.encode('utf-8')
io = BytesIO(buff)
io, best_ids = RewriteDASHMixin().rewrite_dash(io)
io, best_ids = RewriteDASH().rewrite_dash(io)
string = json.dumps(io.read().decode('utf-8'))
string = string[1:-1].replace('<', r'\x3C')

View File

@ -1,16 +1,14 @@
import re
from io import BytesIO
from pywb.webagg.utils import StreamIter
from pywb.rewrite.content_rewriter import BufferedRewriter
# ============================================================================
class RewriteHLS(object):
class RewriteHLS(BufferedRewriter):
EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)')
def __call__(self, rwinfo):
return StreamIter(self.rewrite_m3u8(rwinfo.content_stream))
def rewrite_m3u8(self, stream):
def rewrite_stream(self, stream):
buff = stream.read()
lines = buff.decode('utf-8').split('\n')

View File

@ -17,6 +17,9 @@ class PrefixHeaderRewriter(object):
'content-location': 'url-rewrite',
'content-base': 'url-rewrite',
'transfer-encoding': 'prefix',
'connection': 'prefix',
'content-encoding': 'keep-if-no-content-rewrite',
'content-length': 'content-length',
@ -24,13 +27,16 @@ class PrefixHeaderRewriter(object):
'cookie': 'cookie',
}
default_rule = 'prefix'
def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'):
self.header_prefix = header_prefix
self.rwinfo = rwinfo
self.http_headers = rwinfo.record.http_headers
if rwinfo.is_url_rw():
self.default_rule = 'prefix'
else:
self.default_rule = 'keep'
def __call__(self):
new_headers_list = []
for name, value in self.http_headers.headers:
@ -54,14 +60,14 @@ class PrefixHeaderRewriter(object):
return (name, self.rwinfo.url_rewriter.rewrite(value))
elif rule == 'keep-if-no-content-rewrite':
if not self.rwinfo.is_content_rw():
if not self.rwinfo.is_content_rw:
return (name, value)
elif rule == 'content-length':
if value == '0':
return (name, value)
if not self.rwinfo.is_content_rw():
if not self.rwinfo.is_content_rw:
try:
if int(value) >= 0:
return (name, value)
@ -92,11 +98,3 @@ class PrefixHeaderRewriter(object):
new_headers.append(('Expires', datetime_to_http_date(dt)))
#=============================================================================
class ProxyHeaderRewriter(PrefixHeaderRewriter):
header_rules = {
'transfer-encoding': 'prefix',
'connection': 'prefix',
}
default_rule = 'keep'

View File

@ -1,12 +1,13 @@
from pywb.rewrite.content_rewriter import BaseContentRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter
from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter
from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter
from pywb.rewrite.jsonp_rewriter import JSONPRewriter
@ -19,9 +20,9 @@ from pywb.rewrite.rewrite_amf import RewriteAMF
class DefaultRewriter(BaseContentRewriter):
all_rewriters = {
'header': PrefixHeaderRewriter,
'header-proxy': ProxyHeaderRewriter,
'html': HTMLRewriter,
'html-banner-only': HTMLInsertOnlyRewriter,
'css': CSSRewriter,

View File

@ -152,7 +152,7 @@ class TestWbIntegration(BaseConfigTest):
assert len(lines) == 17
assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239')
def _test_replay_banner_only(self):
def test_replay_banner_only(self):
resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved')
# wb.js header insertion