From d6006acdc3dfab128ff9798a00c1348ab7e883c1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 6 Apr 2014 09:47:34 -0700 Subject: [PATCH] rewrite: when using lxml parser, just pass raw stream to lxml without decoding. lxml parser expects to have raw bytes and will determine encoding on its own. then serve back as utf-8 if no encoding specified. should address #36 --- pywb/rewrite/lxml_html_rewriter.py | 3 +- pywb/rewrite/rewrite_content.py | 39 ++++++++++++-------- pywb/rewrite/rewriterules.py | 11 +++++- pywb/rewrite/test/test_lxml_html_rewriter.py | 9 +++++ 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index 2c8a8b8a..8aac2f54 100644 --- a/pywb/rewrite/lxml_html_rewriter.py +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -79,7 +79,8 @@ class RewriterTarget(object): def data(self, data): if not self.rewriter._wb_parse_context: data = cgi.escape(data, quote=True) - + if isinstance(data, unicode): + data = data.replace(u'\xa0', ' ') self.rewriter.parse_data(data) def comment(self, data): diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 0acdf5a5..720bf9f1 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -6,7 +6,7 @@ from io import BytesIO from header_rewriter import RewrittenStatusAndHeaders -from rewriterules import RewriteRules +from rewriterules import RewriteRules, is_lxml from pywb.utils.dsrules import RuleSet from pywb.utils.statusandheaders import StatusAndHeaders @@ -73,21 +73,25 @@ class RewriteContent: # ==================================================================== # special case -- need to ungzip the body + text_type = rewritten_headers.text_type + stream_raw = False + encoding = None + first_buff = None + if (rewritten_headers. contains_removed_header('content-encoding', 'gzip')): stream = DecompressingBufferedReader(stream, decomp_type='gzip') if rewritten_headers.charset: encoding = rewritten_headers.charset - first_buff = None + elif is_lxml() and text_type == 'html': + stream_raw = True else: (encoding, first_buff) = self._detect_charset(stream) - # if chardet thinks its ascii, use utf-8 - if encoding == 'ascii': - encoding = 'utf-8' - - text_type = rewritten_headers.text_type + # if encoding not set or chardet thinks its ascii, use utf-8 + if not encoding or encoding == 'ascii': + encoding = 'utf-8' rule = self.ruleset.get_first_match(urlkey) @@ -108,34 +112,33 @@ class RewriteContent: js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], head_insert=head_insert_str) + else: # apply one of (js, css, xml) rewriters rewriter = rewriter_class(urlrewriter) # Create rewriting generator - gen = self._rewriting_stream_gen(rewriter, encoding, + gen = self._rewriting_stream_gen(rewriter, encoding, stream_raw, stream, first_buff) return (status_headers, gen, True) # Create rewrite stream, may even be chunked by front-end - def _rewriting_stream_gen(self, rewriter, encoding, + def _rewriting_stream_gen(self, rewriter, encoding, stream_raw, stream, first_buff=None): def do_rewrite(buff): - if encoding: + if not stream_raw: buff = self._decode_buff(buff, stream, encoding) buff = rewriter.rewrite(buff) - if encoding: - buff = buff.encode(encoding) + buff = buff.encode(encoding) return buff def do_finish(): result = rewriter.close() - if encoding: - result = result.encode(encoding) + result = result.encode(encoding) return result @@ -188,12 +191,16 @@ class RewriteContent: def stream_to_gen(stream, rewrite_func=None, final_read_func=None, first_buff=None): try: - buff = first_buff if first_buff else stream.read() + if first_buff: + buff = first_buff + else: + buff = stream.read() + stream.readline() + while buff: if rewrite_func: buff = rewrite_func(buff) yield buff - buff = stream.read() + buff = stream.read() + stream.readline() # For adding a tail/handling final buffer if final_read_func: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 03a23653..a7737248 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -9,6 +9,7 @@ from html_rewriter import HTMLRewriter import itertools HTML = HTMLRewriter +_is_lxml = False #================================================================= @@ -18,13 +19,19 @@ def use_lxml_parser(): if LXML_SUPPORTED: global HTML + global _is_lxml HTML = LXMLHTMLRewriter logging.debug('Using LXML Parser') - return True + _is_lxml = True else: # pragma: no cover logging.debug('LXML Parser not available') - return False + _is_lxml = False + return _is_lxml + + +def is_lxml(): + return _is_lxml #================================================================= class RewriteRules(BaseRule): diff --git a/pywb/rewrite/test/test_lxml_html_rewriter.py b/pywb/rewrite/test/test_lxml_html_rewriter.py index 125977e7..038de4a8 100644 --- a/pywb/rewrite/test/test_lxml_html_rewriter.py +++ b/pywb/rewrite/test/test_lxml_html_rewriter.py @@ -119,6 +119,15 @@ ur""" >>> p = LXMLHTMLRewriter(urlrewriter) >>> p.close() '' + +# test   +>>> parse(' ') +

 

+ +# test multiple rewrites:   extra >, split comment +>>> p = LXMLHTMLRewriter(urlrewriter) +>>> p.rewrite('
    >
') + p.close() +u'
    >
' """ from pywb.rewrite.url_rewriter import UrlRewriter