diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index 8aac2f54..abf28fc4 100644 --- a/pywb/rewrite/lxml_html_rewriter.py +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -45,6 +45,18 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): #string = string.replace(u'', u'') self.parser.feed(string) + def parse(self, stream): + self.out = self.AccumBuff() + + lxml.etree.parse(stream, self.parser) + + result = self.out.getvalue() + + # Clear buffer to create new one for next rewrite() + self.out = None + + return result + def _internal_close(self): if self.started: self.parser.close() diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 720bf9f1..c2d17047 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -123,12 +123,20 @@ class RewriteContent: return (status_headers, gen, True) + def _parse_full_gen(self, rewriter, encoding, stream): + buff = rewriter.parse(stream) + buff = buff.encode(encoding) + yield buff + # Create rewrite stream, may even be chunked by front-end def _rewriting_stream_gen(self, rewriter, encoding, stream_raw, stream, first_buff=None): + + if stream_raw: + return self._parse_full_gen(rewriter, encoding, stream) + def do_rewrite(buff): - if not stream_raw: - buff = self._decode_buff(buff, stream, encoding) + buff = self._decode_buff(buff, stream, encoding) buff = rewriter.rewrite(buff) diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index fac38789..17bf0a75 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -51,7 +51,7 @@ r""" # scheme-agnostic >>> _test_js('cool_Location = "//example.com/abc.html" //comment') -'cool_Location = "/web/20131010em_///example.com/abc.html" //comment' +'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' #================================================================= diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 9545a040..cb35607f 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -14,7 +14,7 @@ class UrlRewriter(object): NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:'] - PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:'] + PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] def __init__(self, wburl, prefix): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) @@ -32,6 +32,10 @@ class UrlRewriter(object): isAbs = any(url.startswith(x) for x in self.PROTOCOLS) + if url.startswith('//'): + isAbs = True + url = 'http:' + url + # Optimized rewriter for # -rel urls that don't start with / and # do not contain ../ and no special mod