1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

ensure final output from close() is encoded!

add 'use_lxml_parser' config option: when enabled, the lxml parser is used
if it is available; otherwise the regular parser is used by default
testing on travis with lxml (not adding to dep yet)
This commit is contained in:
Ilya Kreymer 2014-03-17 13:17:02 -07:00
parent 1404177c6f
commit f35e82a4d5
8 changed files with 36 additions and 41 deletions

View File

@ -6,6 +6,7 @@ python:
# command to install dependencies
install:
- "pip install 'argparse>=1.2.1' --allow-all-external"
- pip install lxml
- python setup.py -q install
- pip install coverage pytest-cov coveralls --use-mirrors
# command to run tests

View File

@ -103,3 +103,7 @@ enable_cdx_api: true
# Enable Memento support
enable_memento: true
# Use lxml parser, if available
use_lxml_parser: true

View File

@ -7,6 +7,7 @@ from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewriterules import use_lxml_parser
from pywb.cdx.cdxserver import create_cdx_server
@ -41,6 +42,8 @@ DEFAULTS = {
'domain_specific_rules': 'pywb/rules.yaml',
'enable_memento': True,
'use_lxml_parser': True,
}
#=================================================================
@ -137,6 +140,8 @@ def create_wb_router(passed_config = {}):
else:
request_class = WbRequest
if config.get('use_lxml_parser', False):
use_lxml_parser()
for name, value in collections.iteritems():
if isinstance(value, str):

View File

@ -11,6 +11,7 @@ from regex_rewriters import JSRewriter, CSSRewriter
import cgi
#=================================================================
class HTMLRewriterMixin(object):
"""
@ -198,7 +199,6 @@ class HTMLRewriterMixin(object):
return True
def _write_attr(self, name, value, escape=False):
# parser doesn't differentiate between 'attr=""' and just 'attr'
# 'attr=""' is more common, so use that form
@ -283,7 +283,6 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
elif tag != 'head' or not self._rewrite_head(True):
self.out.write('/>')
def handle_endtag(self, tag):
if (tag == self._wb_parse_context):
self._wb_parse_context = None

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#import lxml.html
import lxml.etree
import cgi
import re
@ -11,6 +10,7 @@ from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriterMixin
#=================================================================
class LXMLHTMLRewriter(HTMLRewriterMixin):
ur"""
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
@ -128,9 +128,6 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
recover=True,
)
self.is_closing = False
def feed(self, string):
string = self.END_HTML.sub(u'', string)
#string = string.replace(u'</html>', u'')
@ -150,6 +147,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
return result
#=================================================================
class RewriterTarget(object):
def __init__(self, rewriter):
self.rewriter = rewriter
@ -169,11 +167,7 @@ class RewriterTarget(object):
self.rewriter.out.write(u'>')
def end(self, tag):
#if tag == 'html' and not self.rewriter.is_closing:
# raise lxml.etree.LxmlError('test')
if (tag == self.rewriter._wb_parse_context):
self.rewriter._wb_parse_context = None
@ -202,21 +196,7 @@ def parse(data, head_insert=None):
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
data = data.decode('utf-8')
print parser.rewrite(data) + parser.close()
#return parser.rewrite(data) + parser.close()
if __name__ == "__main__":
import sys
if len(sys.argv) == 1:
import doctest
doctest.testmod()
else:
parser = LXMLHTMLRewriter(urlrewriter)
x = open(sys.argv[1])
b = x.read(81920)
while b:
result = parser.rewrite(b.decode('utf-8'))
print result.encode('utf-8')
b = x.read(81920)
print parser.close()
import doctest
doctest.testmod()

View File

@ -123,7 +123,11 @@ class RewriteContent:
return buff
def do_finish():
return rewriter.close()
result = rewriter.close()
if encoding:
result = result.encode(encoding)
return result
return self.stream_to_gen(stream,
rewrite_func=do_rewrite,

View File

@ -66,9 +66,9 @@ def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
status_headers, gen = result
#buff = u''.join(gen)
buff = ''.join(gen)
return (status_headers, gen)
return (status_headers, buff)
#=================================================================

View File

@ -3,22 +3,24 @@ from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
HTML = None
try:
from lxml_parser import LXMLHTMLRewriter
HTML = LXMLHTMLRewriter
pass
except ImportError:
pass
if not HTML:
from html_rewriter import HTMLRewriter
HTML = HTMLRewriter
from header_rewriter import HeaderRewriter
from html_rewriter import HTMLRewriter
import itertools
HTML = HTMLRewriter
#=================================================================
def use_lxml_parser():
    """
    Switch the module-level ``HTML`` rewriter class to the lxml-based one.

    Tries to import ``LXMLHTMLRewriter`` from ``lxml_parser``.  On success
    the module global ``HTML`` is rebound to that class and a debug message
    is logged.  If the import raises ``ImportError``, ``HTML`` is left
    unchanged and a debug message is logged instead of propagating the
    error.

    Returns None in both cases.
    """
    # Without this declaration the assignment below only creates a
    # function-local name, so the module-level HTML would never change.
    global HTML

    # Imported outside the try block so the name is always bound when the
    # except branch runs.
    import logging

    try:
        from lxml_parser import LXMLHTMLRewriter
    except ImportError:
        logging.debug('Error Loading LXML Parser')
        return

    # NOTE(review): code that copied the old value of HTML before this call
    # will presumably keep the old class -- confirm against the users of HTML.
    HTML = LXMLHTMLRewriter
    logging.debug('Using LXML Parser')
#=================================================================
class RewriteRules(BaseRule):