From f35e82a4d5f14ecac17fa6320f50713029896af2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Mar 2014 13:17:02 -0700 Subject: [PATCH] ensure final output from close() is encoded! add config option to 'use_lxml_parser' if available, if not, will default to regular parser testing on travis with lxml (not adding to dep yet) --- .travis.yml | 1 + config.yaml | 4 ++++ pywb/core/pywb_init.py | 5 +++++ pywb/rewrite/html_rewriter.py | 3 +-- pywb/rewrite/lxml_parser.py | 28 ++++------------------------ pywb/rewrite/rewrite_content.py | 6 +++++- pywb/rewrite/rewrite_live.py | 4 ++-- pywb/rewrite/rewriterules.py | 26 ++++++++++++++------------ 8 files changed, 36 insertions(+), 41 deletions(-) diff --git a/.travis.yml b/.travis.yml index 941b0aed..daa35e75 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: # command to install dependencies install: - "pip install 'argparse>=1.2.1' --allow-all-external" + - pip install lxml - python setup.py -q install - pip install coverage pytest-cov coveralls --use-mirrors # command to run tests diff --git a/config.yaml b/config.yaml index b81e555e..3a535454 100644 --- a/config.yaml +++ b/config.yaml @@ -103,3 +103,7 @@ enable_cdx_api: true # Memento support, enable enable_memento: true + +# Use lxml parser, if available +use_lxml_parser: true + diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py index f365afef..2485be00 100644 --- a/pywb/core/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -7,6 +7,7 @@ from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.resolvingloader import ResolvingLoader from pywb.rewrite.rewrite_content import RewriteContent +from pywb.rewrite.rewriterules import use_lxml_parser from pywb.cdx.cdxserver import create_cdx_server @@ -41,6 +42,8 @@ DEFAULTS = { 'domain_specific_rules': 'pywb/rules.yaml', 'enable_memento': True, + + 'use_lxml_parser': True, } #================================================================= @@ -137,6 +140,8 @@ def create_wb_router(passed_config = {}): else: request_class = WbRequest + if config.get('use_lxml_parser', False): + use_lxml_parser() for name, value in collections.iteritems(): if isinstance(value, str): diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 2c5a18b1..9895ce2e 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -11,6 +11,7 @@ from regex_rewriters import JSRewriter, CSSRewriter import cgi + #================================================================= class HTMLRewriterMixin(object): """ @@ -198,7 +199,6 @@ class HTMLRewriterMixin(object): return True - def _write_attr(self, name, value, escape=False): # parser doesn't differentiate between 'attr=""' and just 'attr' # 'attr=""' is more common, so use that form @@ -283,7 +283,6 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): elif tag != 'head' or not self._rewrite_head(True): self.out.write('/>') - def handle_endtag(self, tag): if (tag == self._wb_parse_context): self._wb_parse_context = None diff --git a/pywb/rewrite/lxml_parser.py b/pywb/rewrite/lxml_parser.py index 4956c872..5137d74e 100644 --- a/pywb/rewrite/lxml_parser.py +++ b/pywb/rewrite/lxml_parser.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -#import lxml.html import lxml.etree import cgi import re @@ -11,6 +10,7 @@ from url_rewriter import UrlRewriter from html_rewriter import HTMLRewriterMixin +#================================================================= class LXMLHTMLRewriter(HTMLRewriterMixin): ur""" >>> parse('Text') @@ -128,9 +128,6 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): recover=True, ) - self.is_closing = False - - def feed(self, string): string = self.END_HTML.sub(u'', string) #string = string.replace(u'', u'') @@ -150,6 +147,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): return result +#================================================================= class RewriterTarget(object): def __init__(self, rewriter): self.rewriter = rewriter @@ -169,11 +167,7 @@ class RewriterTarget(object): self.rewriter.out.write(u'>') - def end(self, tag): - #if tag == 'html' and not self.rewriter.is_closing: - # raise lxml.etree.LxmlError('test') - if (tag == self.rewriter._wb_parse_context): self.rewriter._wb_parse_context = None @@ -202,21 +196,7 @@ def parse(data, head_insert=None): parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert) data = data.decode('utf-8') print parser.rewrite(data) + parser.close() - #return parser.rewrite(data) + parser.close() - if __name__ == "__main__": - - import sys - if len(sys.argv) == 1: - import doctest - doctest.testmod() - else: - parser = LXMLHTMLRewriter(urlrewriter) - x = open(sys.argv[1]) - b = x.read(81920) - while b: - result = parser.rewrite(b.decode('utf-8')) - print result.encode('utf-8') - b = x.read(81920) - print parser.close() + import doctest + doctest.testmod() diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index f6ee9a57..1d4fc9f2 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -123,7 +123,11 @@ class RewriteContent: return buff def do_finish(): - return rewriter.close() + result = rewriter.close() + if encoding: + result = result.encode(encoding) + + return result return self.stream_to_gen(stream, rewrite_func=do_rewrite, diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 0d5245e0..2c6633cd 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -66,9 +66,9 @@ def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None): status_headers, gen = result - #buff = u''.join(gen) + buff = ''.join(gen) - return (status_headers, gen) + return (status_headers, buff) #================================================================= diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 8069b99f..8d98313b 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -3,22 +3,24 @@ from pywb.utils.dsrules import BaseRule from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter -HTML = None -try: - from lxml_parser import LXMLHTMLRewriter - HTML = LXMLHTMLRewriter - pass -except ImportError: - pass - -if not HTML: - from html_rewriter import HTMLRewriter - HTML = HTMLRewriter - from header_rewriter import HeaderRewriter +from html_rewriter import HTMLRewriter import itertools +HTML = HTMLRewriter + + +#================================================================= +def use_lxml_parser(): + try: + import logging + from lxml_parser import LXMLHTMLRewriter + HTML = LXMLHTMLRewriter + logging.debug('Using LXML Parser') + except ImportError: + logging.debug('Error Loading LXML Parser') + #================================================================= class RewriteRules(BaseRule):