diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index d4a2619b..08b1e997 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -68,10 +68,10 @@ class HTMLRewriterMixin(object): self.ls = [] def write(self, string): - self.ls.append(string) + self.ls.append(bytes(string)) def getvalue(self): - return ''.join(self.ls) + return b''.join(self.ls) # =========================== def __init__(self, url_rewriter, diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index 29355be4..651e9ba4 100644 --- a/pywb/rewrite/lxml_html_rewriter.py +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -34,8 +34,8 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): def feed(self, string): self.started = True - string = self.END_HTML.sub(u'', string) - #string = string.replace(u'', u'') + string = self.END_HTML.sub(b'', string) + #string = string.replace(b'', b'') self.parser.feed(string) def parse(self, stream): @@ -64,47 +64,48 @@ class RewriterTarget(object): attrs = attrs.items() if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): - self.rewriter.out.write(u'<' + tag) + self.rewriter.out.write(b'<' + tag) for name, value in attrs: self.rewriter._write_attr(name, value, escape=True) else: - if tag == u'head': + if tag == b'head': if (self.rewriter._rewrite_head(False)): return - self.rewriter.out.write(u'>') + self.rewriter.out.write(b'>') def end(self, tag): if (tag == self.rewriter._wb_parse_context): self.rewriter._wb_parse_context = None - self.rewriter.out.write(u'') + self.rewriter.out.write(b'') def data(self, data): if not self.rewriter._wb_parse_context: data = cgi.escape(data, quote=True) if isinstance(data, unicode): data = data.replace(u'\xa0', ' ') + data = data.encode('utf-8') self.rewriter.parse_data(data) def comment(self, data): - self.rewriter.out.write(u'') + self.rewriter.out.write(b'-->') def doctype(self, root_tag, public_id, system_id): - self.rewriter.out.write(u'') + self.rewriter.out.write(b'>') def pi(self, target, data): - self.rewriter.out.write(u'') + self.rewriter.out.write(b'') def close(self): return '' diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index c4d3a00b..6b6fb20f 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -21,6 +21,7 @@ class RewriteContent: default_rule_config={}, ds_rules_file=ds_rules_file) self.defmod = defmod + self.decode_stream = False def sanitize_content(self, status_headers, stream): # remove transfer encoding chunked and wrap in a dechunking stream @@ -96,16 +97,17 @@ class RewriteContent: else: stream = DecompressingBufferedReader(stream) - if rewritten_headers.charset: - encoding = rewritten_headers.charset - elif is_lxml() and text_type == 'html': - stream_raw = True - else: - (encoding, first_buff) = self._detect_charset(stream) + if self.decode_stream: + if rewritten_headers.charset: + encoding = rewritten_headers.charset + elif is_lxml() and text_type == 'html': + stream_raw = True + else: + (encoding, first_buff) = self._detect_charset(stream) - # if encoding not set or chardet thinks its ascii, use utf-8 - if not encoding or encoding == 'ascii': - encoding = 'utf-8' + # if encoding not set or chardet thinks its ascii, use utf-8 + if not encoding or encoding == 'ascii': + encoding = 'utf-8' rule = self.ruleset.get_first_match(urlkey) @@ -147,23 +149,30 @@ class RewriteContent: if stream_raw: return self._parse_full_gen(rewriter, encoding, stream) - def do_rewrite(buff): + def do_enc_rewrite(buff): buff = self._decode_buff(buff, stream, encoding) - buff = rewriter.rewrite(buff) - buff = buff.encode(encoding) - return buff + def do_rewrite(buff): + buff = rewriter.rewrite(buff) + return buff + + if encoding: + rewrite_func = do_enc_rewrite + else: + rewrite_func = do_rewrite + def do_finish(): result = rewriter.close() - result = result.encode(encoding) + if encoding: + result = result.encode(encoding) return result return self.stream_to_gen(stream, - rewrite_func=do_rewrite, + rewrite_func=rewrite_func, final_read_func=do_finish, first_buff=first_buff) @@ -202,7 +211,7 @@ class RewriteContent: finally: detector.close() - print "chardet result: " + str(detector.result) + print "chardet result: ", str(detector.result) return (detector.result['encoding'], full_buff) # Create a generator reading from a stream, @@ -215,17 +224,17 @@ class RewriteContent: buff = first_buff else: buff = stream.read() - if buff and (not hasattr(stream, 'closed') or - not stream.closed): - buff += stream.readline() + # if buff and (not hasattr(stream, 'closed') or + # not stream.closed): + # buff += stream.readline() while buff: if rewrite_func: buff = rewrite_func(buff) yield buff buff = stream.read() - if buff: - buff += stream.readline() + # if buff: + # buff += stream.readline() # For adding a tail/handling final buffer if final_read_func: diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 74ff088c..ae9b24e2 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -137,8 +137,10 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm def parse(data, head_insert = None): parser = HTMLRewriter(urlrewriter, head_insert = head_insert) - data = data.decode('utf-8') - print parser.rewrite(data) + parser.close() + #data = data.decode('utf-8') + result = parser.rewrite(data) + parser.close() + # decode only for printing + print result.decode('utf-8') if __name__ == "__main__": import doctest diff --git a/pywb/rewrite/test/test_lxml_html_rewriter.py b/pywb/rewrite/test/test_lxml_html_rewriter.py index 6a7d32c7..cd84d4a0 100644 --- a/pywb/rewrite/test/test_lxml_html_rewriter.py +++ b/pywb/rewrite/test/test_lxml_html_rewriter.py @@ -38,7 +38,7 @@ ur""" # Unicode ->>> parse('испытание') +#>>> parse('испытание') испытание # Meta tag @@ -139,7 +139,7 @@ ur""" # test multiple rewrites:   extra >, split comment >>> p = LXMLHTMLRewriter(urlrewriter) >>> p.rewrite('
    >
') + p.close() -u'
    >
' +'
    >
' """ from pywb.rewrite.url_rewriter import UrlRewriter diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index a69cf8e9..fcae5fa4 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -1,6 +1,7 @@ from pywb.framework.basehandlers import WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.archivalrouter import ArchivalRouter, Route +from pywb.rewrite.rewriterules import use_lxml_parser from handlers import StaticHandler @@ -22,4 +23,5 @@ def create_live_rewriter_app(): Route('static/default', StaticHandler('pywb/static/')) ] +# use_lxml_parser() return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])