mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
ensure final output from close() is encoded!
add config option 'use_lxml_parser' to use the lxml parser if available; if not, will default to the regular parser. testing on travis with lxml (not adding to dep yet)
This commit is contained in:
parent
1404177c6f
commit
f35e82a4d5
@ -6,6 +6,7 @@ python:
|
|||||||
# command to install dependencies
|
# command to install dependencies
|
||||||
install:
|
install:
|
||||||
- "pip install 'argparse>=1.2.1' --allow-all-external"
|
- "pip install 'argparse>=1.2.1' --allow-all-external"
|
||||||
|
- pip install lxml
|
||||||
- python setup.py -q install
|
- python setup.py -q install
|
||||||
- pip install coverage pytest-cov coveralls --use-mirrors
|
- pip install coverage pytest-cov coveralls --use-mirrors
|
||||||
# command to run tests
|
# command to run tests
|
||||||
|
@ -103,3 +103,7 @@ enable_cdx_api: true
|
|||||||
|
|
||||||
# Memento support, enable
|
# Memento support, enable
|
||||||
enable_memento: true
|
enable_memento: true
|
||||||
|
|
||||||
|
# Use lxml parser, if available
|
||||||
|
use_lxml_parser: true
|
||||||
|
|
||||||
|
@ -7,6 +7,7 @@ from pywb.warc.recordloader import ArcWarcRecordLoader
|
|||||||
from pywb.warc.resolvingloader import ResolvingLoader
|
from pywb.warc.resolvingloader import ResolvingLoader
|
||||||
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
|
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||||
|
|
||||||
from pywb.cdx.cdxserver import create_cdx_server
|
from pywb.cdx.cdxserver import create_cdx_server
|
||||||
|
|
||||||
@ -41,6 +42,8 @@ DEFAULTS = {
|
|||||||
'domain_specific_rules': 'pywb/rules.yaml',
|
'domain_specific_rules': 'pywb/rules.yaml',
|
||||||
|
|
||||||
'enable_memento': True,
|
'enable_memento': True,
|
||||||
|
|
||||||
|
'use_lxml_parser': True,
|
||||||
}
|
}
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -137,6 +140,8 @@ def create_wb_router(passed_config = {}):
|
|||||||
else:
|
else:
|
||||||
request_class = WbRequest
|
request_class = WbRequest
|
||||||
|
|
||||||
|
if config.get('use_lxml_parser', False):
|
||||||
|
use_lxml_parser()
|
||||||
|
|
||||||
for name, value in collections.iteritems():
|
for name, value in collections.iteritems():
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
|
@ -11,6 +11,7 @@ from regex_rewriters import JSRewriter, CSSRewriter
|
|||||||
|
|
||||||
import cgi
|
import cgi
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class HTMLRewriterMixin(object):
|
class HTMLRewriterMixin(object):
|
||||||
"""
|
"""
|
||||||
@ -198,7 +199,6 @@ class HTMLRewriterMixin(object):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _write_attr(self, name, value, escape=False):
|
def _write_attr(self, name, value, escape=False):
|
||||||
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
||||||
# 'attr=""' is more common, so use that form
|
# 'attr=""' is more common, so use that form
|
||||||
@ -283,7 +283,6 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
|||||||
elif tag != 'head' or not self._rewrite_head(True):
|
elif tag != 'head' or not self._rewrite_head(True):
|
||||||
self.out.write('/>')
|
self.out.write('/>')
|
||||||
|
|
||||||
|
|
||||||
def handle_endtag(self, tag):
|
def handle_endtag(self, tag):
|
||||||
if (tag == self._wb_parse_context):
|
if (tag == self._wb_parse_context):
|
||||||
self._wb_parse_context = None
|
self._wb_parse_context = None
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
#import lxml.html
|
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
import cgi
|
import cgi
|
||||||
import re
|
import re
|
||||||
@ -11,6 +10,7 @@ from url_rewriter import UrlRewriter
|
|||||||
from html_rewriter import HTMLRewriterMixin
|
from html_rewriter import HTMLRewriterMixin
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||||
ur"""
|
ur"""
|
||||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||||
@ -128,9 +128,6 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
recover=True,
|
recover=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.is_closing = False
|
|
||||||
|
|
||||||
|
|
||||||
def feed(self, string):
|
def feed(self, string):
|
||||||
string = self.END_HTML.sub(u'', string)
|
string = self.END_HTML.sub(u'', string)
|
||||||
#string = string.replace(u'</html>', u'')
|
#string = string.replace(u'</html>', u'')
|
||||||
@ -150,6 +147,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
class RewriterTarget(object):
|
class RewriterTarget(object):
|
||||||
def __init__(self, rewriter):
|
def __init__(self, rewriter):
|
||||||
self.rewriter = rewriter
|
self.rewriter = rewriter
|
||||||
@ -169,11 +167,7 @@ class RewriterTarget(object):
|
|||||||
|
|
||||||
self.rewriter.out.write(u'>')
|
self.rewriter.out.write(u'>')
|
||||||
|
|
||||||
|
|
||||||
def end(self, tag):
|
def end(self, tag):
|
||||||
#if tag == 'html' and not self.rewriter.is_closing:
|
|
||||||
# raise lxml.etree.LxmlError('test')
|
|
||||||
|
|
||||||
if (tag == self.rewriter._wb_parse_context):
|
if (tag == self.rewriter._wb_parse_context):
|
||||||
self.rewriter._wb_parse_context = None
|
self.rewriter._wb_parse_context = None
|
||||||
|
|
||||||
@ -202,21 +196,7 @@ def parse(data, head_insert=None):
|
|||||||
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
|
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
|
||||||
data = data.decode('utf-8')
|
data = data.decode('utf-8')
|
||||||
print parser.rewrite(data) + parser.close()
|
print parser.rewrite(data) + parser.close()
|
||||||
#return parser.rewrite(data) + parser.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
import sys
|
doctest.testmod()
|
||||||
if len(sys.argv) == 1:
|
|
||||||
import doctest
|
|
||||||
doctest.testmod()
|
|
||||||
else:
|
|
||||||
parser = LXMLHTMLRewriter(urlrewriter)
|
|
||||||
x = open(sys.argv[1])
|
|
||||||
b = x.read(81920)
|
|
||||||
while b:
|
|
||||||
result = parser.rewrite(b.decode('utf-8'))
|
|
||||||
print result.encode('utf-8')
|
|
||||||
b = x.read(81920)
|
|
||||||
print parser.close()
|
|
||||||
|
@ -123,7 +123,11 @@ class RewriteContent:
|
|||||||
return buff
|
return buff
|
||||||
|
|
||||||
def do_finish():
|
def do_finish():
|
||||||
return rewriter.close()
|
result = rewriter.close()
|
||||||
|
if encoding:
|
||||||
|
result = result.encode(encoding)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
return self.stream_to_gen(stream,
|
return self.stream_to_gen(stream,
|
||||||
rewrite_func=do_rewrite,
|
rewrite_func=do_rewrite,
|
||||||
|
@ -66,9 +66,9 @@ def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
|
|||||||
|
|
||||||
status_headers, gen = result
|
status_headers, gen = result
|
||||||
|
|
||||||
#buff = u''.join(gen)
|
buff = ''.join(gen)
|
||||||
|
|
||||||
return (status_headers, gen)
|
return (status_headers, buff)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -3,22 +3,24 @@ from pywb.utils.dsrules import BaseRule
|
|||||||
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||||
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||||
|
|
||||||
HTML = None
|
|
||||||
try:
|
|
||||||
from lxml_parser import LXMLHTMLRewriter
|
|
||||||
HTML = LXMLHTMLRewriter
|
|
||||||
pass
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if not HTML:
|
|
||||||
from html_rewriter import HTMLRewriter
|
|
||||||
HTML = HTMLRewriter
|
|
||||||
|
|
||||||
from header_rewriter import HeaderRewriter
|
from header_rewriter import HeaderRewriter
|
||||||
|
from html_rewriter import HTMLRewriter
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
|
HTML = HTMLRewriter
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def use_lxml_parser():
    """
    Switch the module-level ``HTML`` rewriter class to the lxml-based one.

    Tries to import ``LXMLHTMLRewriter`` from ``lxml_parser``. On success the
    module global ``HTML`` is rebound to that class and a debug message is
    logged. If the import fails, ``HTML`` is left untouched and the failure
    is logged at debug level instead.

    Returns None in both cases.
    """
    # Without this declaration the assignment below would only create a
    # function-local name and the module-level HTML would never change.
    global HTML

    import logging

    # Keep the try body limited to the one statement that can raise.
    try:
        from lxml_parser import LXMLHTMLRewriter
    except ImportError:
        logging.debug('Error Loading LXML Parser')
        return

    HTML = LXMLHTMLRewriter
    logging.debug('Using LXML Parser')
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RewriteRules(BaseRule):
|
class RewriteRules(BaseRule):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user