diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index e94b6f21..8e61ea60 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -7,6 +7,7 @@ from warcio.utils import to_native_str import re import webencodings +import tempfile from pywb.webagg.utils import StreamIter, BUFF_SIZE from pywb.rewrite.cookie_rewriter import ExactPathCookieRewriter @@ -78,11 +79,16 @@ class BaseContentRewriter(object): def create_rewriter(self, text_type, rule, rwinfo, cdx, head_insert_func=None): rw_type, rw_class = self.get_rw_class(rule, text_type, rwinfo) - if rw_type in ('js', 'js_proxy'): + if rw_type in ('js', 'js-proxy'): extra_rules = [] if 'js_regex_func' in rule: extra_rules = rule['js_regex_func'](rwinfo.url_rewriter) + # if js-proxy and no rules, default to none + # js rewriting in proxy only if extra rules apply + if rw_type == 'js-proxy' and not extra_rules: + return None + return rw_class(rwinfo.url_rewriter, extra_rules) elif rw_type != 'html': @@ -94,6 +100,10 @@ class BaseContentRewriter(object): js_rewriter = self.create_rewriter('js', rule, rwinfo, cdx) css_rewriter = self.create_rewriter('css', rule, rwinfo, cdx) + # if no js rewriter, then do banner insert only + if not js_rewriter: + rw_class = self.all_rewriters.get('html-banner-only') + rw = rw_class(rwinfo.url_rewriter, js_rewriter=js_rewriter, css_rewriter=css_rewriter, @@ -140,33 +150,28 @@ class BaseContentRewriter(object): return charset def rewrite_headers(self, rwinfo): - if rwinfo.is_url_rw(): - header_rw_name = 'header' - else: - header_rw_name = 'header-proxy' - - header_rw_class = self.all_rewriters.get(header_rw_name) - rwinfo.rw_http_headers = header_rw_class(rwinfo)() + header_rw_class = self.all_rewriters.get('header') + return header_rw_class(rwinfo)() def __call__(self, record, url_rewriter, cookie_rewriter, head_insert_func=None, cdx=None): rwinfo = RewriteInfo(record, self.get_rewrite_types(), url_rewriter, cookie_rewriter) - - 
# ============================================================================
class BufferedRewriter(object):
    """Base class for rewriters that need the complete payload up front.

    Calling an instance drains ``rwinfo.content_stream`` into a spooled
    temporary file, rewinds it, hands it to :meth:`rewrite_stream` and
    wraps whatever that method returns in a ``StreamIter``.

    Subclasses implement :meth:`rewrite_stream` only.
    """

    def __init__(self, url_rewriter=None):
        """Store the (optional) url rewriter for use by subclasses.

        :param url_rewriter: kept as ``self.url_rewriter``; defaults to
            ``None`` so the class can be instantiated without arguments.
        """
        self.url_rewriter = url_rewriter

    def __call__(self, rwinfo):
        """Buffer the record content, rewrite it and return an iterator.

        :param rwinfo: object exposing a closable, readable
            ``content_stream`` attribute.
        :return: ``StreamIter`` over the stream returned by
            :meth:`rewrite_stream`.
        """
        # First positional argument is max_size: data is kept in memory
        # until it grows past BUFF_SIZE * 4, then rolled over to disk.
        stream_buffer = tempfile.SpooledTemporaryFile(BUFF_SIZE * 4)

        # closing() guarantees the source stream is closed even if a
        # read or write fails part-way through.
        with closing(rwinfo.content_stream) as fh:
            while True:
                buff = fh.read()
                if not buff:
                    break

                stream_buffer.write(buff)

        # Rewind so rewrite_stream() reads from the beginning.
        stream_buffer.seek(0)
        return StreamIter(self.rewrite_stream(stream_buffer))

    def rewrite_stream(self, stream):
        """Rewrite the fully buffered ``stream``; must be overridden.

        :param stream: seekable file-like object positioned at offset 0.
        :return: a readable stream with the rewritten content.
        :raises NotImplementedError: always, in this base class.
        """
        # NotImplemented is a non-callable constant (calling it raises
        # TypeError); NotImplementedError is the exception to raise here.
        raise NotImplementedError('implement in subclass')
# ============================================================================
class HTMLInsertOnlyRewriter(StreamingRewriter):
    """Insert a custom string right after the opening ``<head>`` tag.

    No other rewriting is performed: every buffer is returned unchanged
    except the first one, which gets ``head_insert`` spliced in directly
    after the first ``<head ...>`` match (if there is one).
    """

    # Raw string: ``\s`` and ``\b`` must reach the regex engine verbatim.
    # In a plain literal ``\s`` is an invalid string escape, which newer
    # Python versions warn about. The compiled pattern is unchanged.
    HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)

    def __init__(self, url_rewriter, **kwargs):
        """Set up the rewriter.

        :param url_rewriter: forwarded to ``StreamingRewriter.__init__``
            together with ``False`` as the second positional argument.
        :param kwargs: must contain ``head_insert``, the text to insert.
        :raises KeyError: if ``head_insert`` is not supplied.
        """
        super(HTMLInsertOnlyRewriter, self).__init__(url_rewriter, False)
        self.head_insert = kwargs['head_insert']

        # Flipped to True once the first buffer has been inspected.
        self.done = False

    def rewrite(self, string):
        """Return ``string`` with ``head_insert`` added after ``<head>``.

        Only the first buffer passed in is searched; later calls return
        their input untouched.

        :param string: text buffer to process.
        :return: the buffer, with the insert applied when a ``<head>``
            tag was found in the first buffer.
        """
        if self.done:
            return string

        # only try to find in first buffer
        self.done = True

        m = self.HEAD_REGEX.search(string)

        # No <head> tag, or nothing to insert (None / empty): pass the
        # buffer through instead of failing on concatenation with None.
        if not m or not self.head_insert:
            return string

        inx = m.end()
        return string[:inx] + self.head_insert + string[inx:]
b/pywb/rewrite/regex_rewriters.py index 9e4f0958..ad807e7d 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -1,6 +1,5 @@ import re -from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.content_rewriter import StreamingRewriter @@ -44,7 +43,7 @@ class RegexRewriter(StreamingRewriter): #DEFAULT_OP = add_prefix def __init__(self, rewriter, rules): - super(RegexRewriter, self).__init__() + super(RegexRewriter, self).__init__(rewriter) #rules = self.create_rules(http_prefix) # Build regexstr, concatenating regex list diff --git a/pywb/rewrite/rewrite_amf.py b/pywb/rewrite/rewrite_amf.py index 075af986..aba45edf 100644 --- a/pywb/rewrite/rewrite_amf.py +++ b/pywb/rewrite/rewrite_amf.py @@ -1,11 +1,13 @@ from io import BytesIO from six.moves import zip +from pywb.rewrite.content_rewriter import BufferedRewriter + # ============================================================================ # Experimental: not fully tested -class RewriteAMF(object): #pragma: no cover - def __call__(self, rwinfo): +class RewriteAMF(BufferedRewriter): #pragma: no cover + def rewrite_stream(self, stream): try: from pyamf import remoting @@ -20,7 +22,7 @@ class RewriteAMF(object): #pragma: no cover res = remoting.decode(iobuff) # TODO: revisit this - inputdata = rwinfo.url_rewriter.rewrite_opts.get('pywb.inputdata') + inputdata = url_rewriter.rewrite_opts.get('pywb.inputdata') if inputdata: new_list = [] @@ -42,3 +44,5 @@ class RewriteAMF(object): #pragma: no cover traceback.print_exc() print(e) return stream + + diff --git a/pywb/rewrite/rewrite_dash.py b/pywb/rewrite/rewrite_dash.py index 497a381b..ab3dd5ea 100644 --- a/pywb/rewrite/rewrite_dash.py +++ b/pywb/rewrite/rewrite_dash.py @@ -4,24 +4,14 @@ import json import xml.etree.ElementTree as ET -from pywb.webagg.utils import StreamIter +from pywb.rewrite.content_rewriter import BufferedRewriter # ============================================================================ -class 
RewriteDASH(object): - def __call__(self, rwinfo): - buff_io = BytesIO() - with closing(rwinfo.content_stream) as fh: - while True: - buff = fh.read() - if not buff: - break - - buff_io.write(buff) - - buff_io.seek(0) - res_buff, best_ids = self.rewrite_dash(buff_io) - return StreamIter(res_buff) +class RewriteDASH(BufferedRewriter): + def rewrite_stream(self, stream): + res_buff, best_ids = self.rewrite_dash(stream) + return res_buff def rewrite_dash(self, stream): ET.register_namespace('', 'urn:mpeg:dash:schema:mpd:2011') @@ -70,7 +60,7 @@ def rewrite_fb_dash(string): buff = string.encode('utf-8').decode('unicode-escape') buff = buff.encode('utf-8') io = BytesIO(buff) - io, best_ids = RewriteDASHMixin().rewrite_dash(io) + io, best_ids = RewriteDASH().rewrite_dash(io) string = json.dumps(io.read().decode('utf-8')) string = string[1:-1].replace('<', r'\x3C') diff --git a/pywb/rewrite/rewrite_hls.py b/pywb/rewrite/rewrite_hls.py index ae56129a..06be6750 100644 --- a/pywb/rewrite/rewrite_hls.py +++ b/pywb/rewrite/rewrite_hls.py @@ -1,16 +1,14 @@ import re from io import BytesIO -from pywb.webagg.utils import StreamIter + +from pywb.rewrite.content_rewriter import BufferedRewriter # ============================================================================ -class RewriteHLS(object): +class RewriteHLS(BufferedRewriter): EXT_INF = re.compile('#EXT-X-STREAM-INF:(?:.*[,])?BANDWIDTH=([\d]+)') - def __call__(self, rwinfo): - return StreamIter(self.rewrite_m3u8(rwinfo.content_stream)) - - def rewrite_m3u8(self, stream): + def rewrite_stream(self, stream): buff = stream.read() lines = buff.decode('utf-8').split('\n') diff --git a/pywb/urlrewrite/header_rewriter.py b/pywb/urlrewrite/header_rewriter.py index 8788c535..394ea0bc 100644 --- a/pywb/urlrewrite/header_rewriter.py +++ b/pywb/urlrewrite/header_rewriter.py @@ -17,6 +17,9 @@ class PrefixHeaderRewriter(object): 'content-location': 'url-rewrite', 'content-base': 'url-rewrite', + 'transfer-encoding': 'prefix', + 
'connection': 'prefix', + 'content-encoding': 'keep-if-no-content-rewrite', 'content-length': 'content-length', @@ -24,13 +27,16 @@ class PrefixHeaderRewriter(object): 'cookie': 'cookie', } - default_rule = 'prefix' - def __init__(self, rwinfo, header_prefix='X-Archive-Orig-'): self.header_prefix = header_prefix self.rwinfo = rwinfo self.http_headers = rwinfo.record.http_headers + if rwinfo.is_url_rw(): + self.default_rule = 'prefix' + else: + self.default_rule = 'keep' + def __call__(self): new_headers_list = [] for name, value in self.http_headers.headers: @@ -54,14 +60,14 @@ class PrefixHeaderRewriter(object): return (name, self.rwinfo.url_rewriter.rewrite(value)) elif rule == 'keep-if-no-content-rewrite': - if not self.rwinfo.is_content_rw(): + if not self.rwinfo.is_content_rw: return (name, value) elif rule == 'content-length': if value == '0': return (name, value) - if not self.rwinfo.is_content_rw(): + if not self.rwinfo.is_content_rw: try: if int(value) >= 0: return (name, value) @@ -92,11 +98,3 @@ class PrefixHeaderRewriter(object): new_headers.append(('Expires', datetime_to_http_date(dt))) -#============================================================================= -class ProxyHeaderRewriter(PrefixHeaderRewriter): - header_rules = { - 'transfer-encoding': 'prefix', - 'connection': 'prefix', - } - - default_rule = 'keep' diff --git a/pywb/urlrewrite/rewriter.py b/pywb/urlrewrite/rewriter.py index 9121a9e6..c8de6fc7 100644 --- a/pywb/urlrewrite/rewriter.py +++ b/pywb/urlrewrite/rewriter.py @@ -1,12 +1,13 @@ from pywb.rewrite.content_rewriter import BaseContentRewriter from pywb.rewrite.html_rewriter import HTMLRewriter +from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter -from 
pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter, ProxyHeaderRewriter +from pywb.urlrewrite.header_rewriter import PrefixHeaderRewriter from pywb.rewrite.jsonp_rewriter import JSONPRewriter @@ -19,9 +20,9 @@ from pywb.rewrite.rewrite_amf import RewriteAMF class DefaultRewriter(BaseContentRewriter): all_rewriters = { 'header': PrefixHeaderRewriter, - 'header-proxy': ProxyHeaderRewriter, 'html': HTMLRewriter, + 'html-banner-only': HTMLInsertOnlyRewriter, 'css': CSSRewriter, diff --git a/tests/test_integration.py b/tests/test_integration.py index 67ec287f..571165eb 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -152,7 +152,7 @@ class TestWbIntegration(BaseConfigTest): assert len(lines) == 17 assert lines[0].startswith('org,iana)/_css/2013.1/print.css 20140127171239') - def _test_replay_banner_only(self): + def test_replay_banner_only(self): resp = self.testapp.get('/pywb/20140126201054bn_/http://www.iana.org/domains/reserved') # wb.js header insertion