mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rewrite: when using lxml parser, just pass raw stream to lxml
without decoding. The lxml parser expects raw bytes and determines the encoding on its own. The result is then served back as utf-8 if no encoding is specified. Should address #36.
This commit is contained in:
parent
e077c23de7
commit
d6006acdc3
@ -79,7 +79,8 @@ class RewriterTarget(object):
|
|||||||
def data(self, data):
|
def data(self, data):
|
||||||
if not self.rewriter._wb_parse_context:
|
if not self.rewriter._wb_parse_context:
|
||||||
data = cgi.escape(data, quote=True)
|
data = cgi.escape(data, quote=True)
|
||||||
|
if isinstance(data, unicode):
|
||||||
|
data = data.replace(u'\xa0', ' ')
|
||||||
self.rewriter.parse_data(data)
|
self.rewriter.parse_data(data)
|
||||||
|
|
||||||
def comment(self, data):
|
def comment(self, data):
|
||||||
|
@ -6,7 +6,7 @@ from io import BytesIO
|
|||||||
|
|
||||||
from header_rewriter import RewrittenStatusAndHeaders
|
from header_rewriter import RewrittenStatusAndHeaders
|
||||||
|
|
||||||
from rewriterules import RewriteRules
|
from rewriterules import RewriteRules, is_lxml
|
||||||
|
|
||||||
from pywb.utils.dsrules import RuleSet
|
from pywb.utils.dsrules import RuleSet
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
@ -73,21 +73,25 @@ class RewriteContent:
|
|||||||
# ====================================================================
|
# ====================================================================
|
||||||
# special case -- need to ungzip the body
|
# special case -- need to ungzip the body
|
||||||
|
|
||||||
|
text_type = rewritten_headers.text_type
|
||||||
|
stream_raw = False
|
||||||
|
encoding = None
|
||||||
|
first_buff = None
|
||||||
|
|
||||||
if (rewritten_headers.
|
if (rewritten_headers.
|
||||||
contains_removed_header('content-encoding', 'gzip')):
|
contains_removed_header('content-encoding', 'gzip')):
|
||||||
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
|
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
|
||||||
|
|
||||||
if rewritten_headers.charset:
|
if rewritten_headers.charset:
|
||||||
encoding = rewritten_headers.charset
|
encoding = rewritten_headers.charset
|
||||||
first_buff = None
|
elif is_lxml() and text_type == 'html':
|
||||||
|
stream_raw = True
|
||||||
else:
|
else:
|
||||||
(encoding, first_buff) = self._detect_charset(stream)
|
(encoding, first_buff) = self._detect_charset(stream)
|
||||||
|
|
||||||
# if chardet thinks its ascii, use utf-8
|
# if encoding not set or chardet thinks its ascii, use utf-8
|
||||||
if encoding == 'ascii':
|
if not encoding or encoding == 'ascii':
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
|
|
||||||
text_type = rewritten_headers.text_type
|
|
||||||
|
|
||||||
rule = self.ruleset.get_first_match(urlkey)
|
rule = self.ruleset.get_first_match(urlkey)
|
||||||
|
|
||||||
@ -108,34 +112,33 @@ class RewriteContent:
|
|||||||
js_rewriter_class=rule.rewriters['js'],
|
js_rewriter_class=rule.rewriters['js'],
|
||||||
css_rewriter_class=rule.rewriters['css'],
|
css_rewriter_class=rule.rewriters['css'],
|
||||||
head_insert=head_insert_str)
|
head_insert=head_insert_str)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# apply one of (js, css, xml) rewriters
|
# apply one of (js, css, xml) rewriters
|
||||||
rewriter = rewriter_class(urlrewriter)
|
rewriter = rewriter_class(urlrewriter)
|
||||||
|
|
||||||
# Create rewriting generator
|
# Create rewriting generator
|
||||||
gen = self._rewriting_stream_gen(rewriter, encoding,
|
gen = self._rewriting_stream_gen(rewriter, encoding, stream_raw,
|
||||||
stream, first_buff)
|
stream, first_buff)
|
||||||
|
|
||||||
return (status_headers, gen, True)
|
return (status_headers, gen, True)
|
||||||
|
|
||||||
# Create rewrite stream, may even be chunked by front-end
|
# Create rewrite stream, may even be chunked by front-end
|
||||||
def _rewriting_stream_gen(self, rewriter, encoding,
|
def _rewriting_stream_gen(self, rewriter, encoding, stream_raw,
|
||||||
stream, first_buff=None):
|
stream, first_buff=None):
|
||||||
def do_rewrite(buff):
|
def do_rewrite(buff):
|
||||||
if encoding:
|
if not stream_raw:
|
||||||
buff = self._decode_buff(buff, stream, encoding)
|
buff = self._decode_buff(buff, stream, encoding)
|
||||||
|
|
||||||
buff = rewriter.rewrite(buff)
|
buff = rewriter.rewrite(buff)
|
||||||
|
|
||||||
if encoding:
|
buff = buff.encode(encoding)
|
||||||
buff = buff.encode(encoding)
|
|
||||||
|
|
||||||
return buff
|
return buff
|
||||||
|
|
||||||
def do_finish():
|
def do_finish():
|
||||||
result = rewriter.close()
|
result = rewriter.close()
|
||||||
if encoding:
|
result = result.encode(encoding)
|
||||||
result = result.encode(encoding)
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@ -188,12 +191,16 @@ class RewriteContent:
|
|||||||
def stream_to_gen(stream, rewrite_func=None,
|
def stream_to_gen(stream, rewrite_func=None,
|
||||||
final_read_func=None, first_buff=None):
|
final_read_func=None, first_buff=None):
|
||||||
try:
|
try:
|
||||||
buff = first_buff if first_buff else stream.read()
|
if first_buff:
|
||||||
|
buff = first_buff
|
||||||
|
else:
|
||||||
|
buff = stream.read() + stream.readline()
|
||||||
|
|
||||||
while buff:
|
while buff:
|
||||||
if rewrite_func:
|
if rewrite_func:
|
||||||
buff = rewrite_func(buff)
|
buff = rewrite_func(buff)
|
||||||
yield buff
|
yield buff
|
||||||
buff = stream.read()
|
buff = stream.read() + stream.readline()
|
||||||
|
|
||||||
# For adding a tail/handling final buffer
|
# For adding a tail/handling final buffer
|
||||||
if final_read_func:
|
if final_read_func:
|
||||||
|
@ -9,6 +9,7 @@ from html_rewriter import HTMLRewriter
|
|||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
HTML = HTMLRewriter
|
HTML = HTMLRewriter
|
||||||
|
_is_lxml = False
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -18,13 +19,19 @@ def use_lxml_parser():
|
|||||||
|
|
||||||
if LXML_SUPPORTED:
|
if LXML_SUPPORTED:
|
||||||
global HTML
|
global HTML
|
||||||
|
global _is_lxml
|
||||||
HTML = LXMLHTMLRewriter
|
HTML = LXMLHTMLRewriter
|
||||||
logging.debug('Using LXML Parser')
|
logging.debug('Using LXML Parser')
|
||||||
return True
|
_is_lxml = True
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
logging.debug('LXML Parser not available')
|
logging.debug('LXML Parser not available')
|
||||||
return False
|
_is_lxml = False
|
||||||
|
|
||||||
|
return _is_lxml
|
||||||
|
|
||||||
|
|
||||||
|
def is_lxml():
|
||||||
|
return _is_lxml
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class RewriteRules(BaseRule):
|
class RewriteRules(BaseRule):
|
||||||
|
@ -119,6 +119,15 @@ ur"""
|
|||||||
>>> p = LXMLHTMLRewriter(urlrewriter)
|
>>> p = LXMLHTMLRewriter(urlrewriter)
|
||||||
>>> p.close()
|
>>> p.close()
|
||||||
''
|
''
|
||||||
|
|
||||||
|
# test
|
||||||
|
>>> parse(' ')
|
||||||
|
<html><body><p> </p></body></html>
|
||||||
|
|
||||||
|
# test multiple rewrites: extra >, split comment
|
||||||
|
>>> p = LXMLHTMLRewriter(urlrewriter)
|
||||||
|
>>> p.rewrite('<div> > <!-- a') + p.rewrite('b --></div>') + p.close()
|
||||||
|
u'<html><body><div> > <!-- ab --></div></body></html>'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
Loading…
x
Reference in New Issue
Block a user