diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py index 2c8a8b8a..8aac2f54 100644 --- a/pywb/rewrite/lxml_html_rewriter.py +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -79,7 +79,8 @@ class RewriterTarget(object): def data(self, data): if not self.rewriter._wb_parse_context: data = cgi.escape(data, quote=True) - + if isinstance(data, unicode): + data = data.replace(u'\xa0', ' ') self.rewriter.parse_data(data) def comment(self, data): diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 0acdf5a5..720bf9f1 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -6,7 +6,7 @@ from io import BytesIO from header_rewriter import RewrittenStatusAndHeaders -from rewriterules import RewriteRules +from rewriterules import RewriteRules, is_lxml from pywb.utils.dsrules import RuleSet from pywb.utils.statusandheaders import StatusAndHeaders @@ -73,21 +73,25 @@ class RewriteContent: # ==================================================================== # special case -- need to ungzip the body + text_type = rewritten_headers.text_type + stream_raw = False + encoding = None + first_buff = None + if (rewritten_headers. contains_removed_header('content-encoding', 'gzip')): stream = DecompressingBufferedReader(stream, decomp_type='gzip') if rewritten_headers.charset: encoding = rewritten_headers.charset - first_buff = None + elif is_lxml() and text_type == 'html': + stream_raw = True else: (encoding, first_buff) = self._detect_charset(stream) - # if chardet thinks its ascii, use utf-8 - if encoding == 'ascii': - encoding = 'utf-8' - - text_type = rewritten_headers.text_type + # if encoding not set or chardet thinks its ascii, use utf-8 + if not encoding or encoding == 'ascii': + encoding = 'utf-8' rule = self.ruleset.get_first_match(urlkey) @@ -108,34 +112,33 @@ class RewriteContent: js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], head_insert=head_insert_str) + else: # apply one of (js, css, xml) rewriters rewriter = rewriter_class(urlrewriter) # Create rewriting generator - gen = self._rewriting_stream_gen(rewriter, encoding, + gen = self._rewriting_stream_gen(rewriter, encoding, stream_raw, stream, first_buff) return (status_headers, gen, True) # Create rewrite stream, may even be chunked by front-end - def _rewriting_stream_gen(self, rewriter, encoding, + def _rewriting_stream_gen(self, rewriter, encoding, stream_raw, stream, first_buff=None): def do_rewrite(buff): - if encoding: + if not stream_raw: buff = self._decode_buff(buff, stream, encoding) buff = rewriter.rewrite(buff) - if encoding: - buff = buff.encode(encoding) + buff = buff.encode(encoding) return buff def do_finish(): result = rewriter.close() - if encoding: - result = result.encode(encoding) + result = result.encode(encoding) return result @@ -188,12 +191,16 @@ class RewriteContent: def stream_to_gen(stream, rewrite_func=None, final_read_func=None, first_buff=None): try: - buff = first_buff if first_buff else stream.read() + if first_buff: + buff = first_buff + else: + buff = stream.read() + stream.readline() + while buff: if rewrite_func: buff = rewrite_func(buff) yield buff - buff = stream.read() + buff = stream.read() + stream.readline() # For adding a tail/handling final buffer if final_read_func: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 03a23653..a7737248 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -9,6 +9,7 @@ from html_rewriter import HTMLRewriter import itertools HTML = HTMLRewriter +_is_lxml = False #================================================================= @@ -18,13 +19,19 @@ def use_lxml_parser(): if LXML_SUPPORTED: global HTML + global _is_lxml HTML = LXMLHTMLRewriter logging.debug('Using LXML Parser') - return True + _is_lxml = True else: # pragma: no cover logging.debug('LXML Parser not available') - return False + _is_lxml = False + return _is_lxml + + +def is_lxml(): + return _is_lxml #================================================================= class RewriteRules(BaseRule): diff --git a/pywb/rewrite/test/test_lxml_html_rewriter.py b/pywb/rewrite/test/test_lxml_html_rewriter.py index 125977e7..038de4a8 100644 --- a/pywb/rewrite/test/test_lxml_html_rewriter.py +++ b/pywb/rewrite/test/test_lxml_html_rewriter.py @@ -119,6 +119,15 @@ ur""" >>> p = LXMLHTMLRewriter(urlrewriter) >>> p.close() '' + +# test   +>>> parse(' ') +

 

+ +# test multiple rewrites:   extra >, split comment +>>> p = LXMLHTMLRewriter(urlrewriter) +>>> p.rewrite('
    >
') + p.close() +u'
    >
' """ from pywb.rewrite.url_rewriter import UrlRewriter