mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
ensure final output from close() is encoded!
add config option 'use_lxml_parser' to use the lxml parser if available; if not, will default to the regular parser. testing on travis with lxml (not adding to dep yet)
This commit is contained in:
parent
1404177c6f
commit
f35e82a4d5
@ -6,6 +6,7 @@ python:
|
||||
# command to install dependencies
|
||||
install:
|
||||
- "pip install 'argparse>=1.2.1' --allow-all-external"
|
||||
- pip install lxml
|
||||
- python setup.py -q install
|
||||
- pip install coverage pytest-cov coveralls --use-mirrors
|
||||
# command to run tests
|
||||
|
@ -103,3 +103,7 @@ enable_cdx_api: true
|
||||
|
||||
# Memento support, enable
|
||||
enable_memento: true
|
||||
|
||||
# Use lxml parser, if available
|
||||
use_lxml_parser: true
|
||||
|
||||
|
@ -7,6 +7,7 @@ from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||
|
||||
from pywb.cdx.cdxserver import create_cdx_server
|
||||
|
||||
@ -41,6 +42,8 @@ DEFAULTS = {
|
||||
'domain_specific_rules': 'pywb/rules.yaml',
|
||||
|
||||
'enable_memento': True,
|
||||
|
||||
'use_lxml_parser': True,
|
||||
}
|
||||
|
||||
#=================================================================
|
||||
@ -137,6 +140,8 @@ def create_wb_router(passed_config = {}):
|
||||
else:
|
||||
request_class = WbRequest
|
||||
|
||||
if config.get('use_lxml_parser', False):
|
||||
use_lxml_parser()
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
if isinstance(value, str):
|
||||
|
@ -11,6 +11,7 @@ from regex_rewriters import JSRewriter, CSSRewriter
|
||||
|
||||
import cgi
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriterMixin(object):
|
||||
"""
|
||||
@ -198,7 +199,6 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _write_attr(self, name, value, escape=False):
|
||||
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
||||
# 'attr=""' is more common, so use that form
|
||||
@ -283,7 +283,6 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
elif tag != 'head' or not self._rewrite_head(True):
|
||||
self.out.write('/>')
|
||||
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if (tag == self._wb_parse_context):
|
||||
self._wb_parse_context = None
|
||||
|
@ -1,7 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
#import lxml.html
|
||||
import lxml.etree
|
||||
import cgi
|
||||
import re
|
||||
@ -11,6 +10,7 @@ from url_rewriter import UrlRewriter
|
||||
from html_rewriter import HTMLRewriterMixin
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
ur"""
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
@ -128,9 +128,6 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
recover=True,
|
||||
)
|
||||
|
||||
self.is_closing = False
|
||||
|
||||
|
||||
def feed(self, string):
|
||||
string = self.END_HTML.sub(u'', string)
|
||||
#string = string.replace(u'</html>', u'')
|
||||
@ -150,6 +147,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
return result
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriterTarget(object):
|
||||
def __init__(self, rewriter):
|
||||
self.rewriter = rewriter
|
||||
@ -169,11 +167,7 @@ class RewriterTarget(object):
|
||||
|
||||
self.rewriter.out.write(u'>')
|
||||
|
||||
|
||||
def end(self, tag):
|
||||
#if tag == 'html' and not self.rewriter.is_closing:
|
||||
# raise lxml.etree.LxmlError('test')
|
||||
|
||||
if (tag == self.rewriter._wb_parse_context):
|
||||
self.rewriter._wb_parse_context = None
|
||||
|
||||
@ -202,21 +196,7 @@ def parse(data, head_insert=None):
|
||||
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
|
||||
data = data.decode('utf-8')
|
||||
print parser.rewrite(data) + parser.close()
|
||||
#return parser.rewrite(data) + parser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
import sys
|
||||
if len(sys.argv) == 1:
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
else:
|
||||
parser = LXMLHTMLRewriter(urlrewriter)
|
||||
x = open(sys.argv[1])
|
||||
b = x.read(81920)
|
||||
while b:
|
||||
result = parser.rewrite(b.decode('utf-8'))
|
||||
print result.encode('utf-8')
|
||||
b = x.read(81920)
|
||||
print parser.close()
|
||||
|
@ -123,7 +123,11 @@ class RewriteContent:
|
||||
return buff
|
||||
|
||||
def do_finish():
|
||||
return rewriter.close()
|
||||
result = rewriter.close()
|
||||
if encoding:
|
||||
result = result.encode(encoding)
|
||||
|
||||
return result
|
||||
|
||||
return self.stream_to_gen(stream,
|
||||
rewrite_func=do_rewrite,
|
||||
|
@ -66,9 +66,9 @@ def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
|
||||
|
||||
status_headers, gen = result
|
||||
|
||||
#buff = u''.join(gen)
|
||||
buff = ''.join(gen)
|
||||
|
||||
return (status_headers, gen)
|
||||
return (status_headers, buff)
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -3,22 +3,24 @@ from pywb.utils.dsrules import BaseRule
|
||||
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||
|
||||
HTML = None
|
||||
try:
|
||||
from lxml_parser import LXMLHTMLRewriter
|
||||
HTML = LXMLHTMLRewriter
|
||||
pass
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
if not HTML:
|
||||
from html_rewriter import HTMLRewriter
|
||||
HTML = HTMLRewriter
|
||||
|
||||
from header_rewriter import HeaderRewriter
|
||||
from html_rewriter import HTMLRewriter
|
||||
|
||||
import itertools
|
||||
|
||||
HTML = HTMLRewriter
|
||||
|
||||
|
||||
#=================================================================
|
||||
def use_lxml_parser():
    """Switch the module-level ``HTML`` rewriter class to the lxml-based one.

    Tries to import ``LXMLHTMLRewriter`` from ``lxml_parser``. On success the
    module global ``HTML`` is rebound to that class; on ``ImportError`` the
    global is left untouched, so whatever parser it already named stays in
    effect. Both outcomes are reported at debug level.
    """
    # Without this declaration the assignment below would only create a
    # function-local name and the module-level selection would never change.
    global HTML

    import logging

    # Keep the guarded region down to the one import that may fail.
    try:
        from lxml_parser import LXMLHTMLRewriter
    except ImportError:
        logging.debug('Error Loading LXML Parser')
    else:
        HTML = LXMLHTMLRewriter
        logging.debug('Using LXML Parser')
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteRules(BaseRule):
|
||||
|
Loading…
x
Reference in New Issue
Block a user