1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rewrite: when using lxml parser, just pass raw stream to lxml

without decoding. lxml parser expects to have raw bytes and will determine
encoding on its own. then serve back as utf-8 if no encoding specified.
should address #36
This commit is contained in:
Ilya Kreymer 2014-04-06 09:47:34 -07:00
parent e077c23de7
commit d6006acdc3
4 changed files with 43 additions and 19 deletions

View File

@ -79,7 +79,8 @@ class RewriterTarget(object):
def data(self, data): def data(self, data):
if not self.rewriter._wb_parse_context: if not self.rewriter._wb_parse_context:
data = cgi.escape(data, quote=True) data = cgi.escape(data, quote=True)
if isinstance(data, unicode):
data = data.replace(u'\xa0', '&nbsp;')
self.rewriter.parse_data(data) self.rewriter.parse_data(data)
def comment(self, data): def comment(self, data):

View File

@ -6,7 +6,7 @@ from io import BytesIO
from header_rewriter import RewrittenStatusAndHeaders from header_rewriter import RewrittenStatusAndHeaders
from rewriterules import RewriteRules from rewriterules import RewriteRules, is_lxml
from pywb.utils.dsrules import RuleSet from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
@ -73,21 +73,25 @@ class RewriteContent:
# ==================================================================== # ====================================================================
# special case -- need to ungzip the body # special case -- need to ungzip the body
text_type = rewritten_headers.text_type
stream_raw = False
encoding = None
first_buff = None
if (rewritten_headers. if (rewritten_headers.
contains_removed_header('content-encoding', 'gzip')): contains_removed_header('content-encoding', 'gzip')):
stream = DecompressingBufferedReader(stream, decomp_type='gzip') stream = DecompressingBufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset: if rewritten_headers.charset:
encoding = rewritten_headers.charset encoding = rewritten_headers.charset
first_buff = None elif is_lxml() and text_type == 'html':
stream_raw = True
else: else:
(encoding, first_buff) = self._detect_charset(stream) (encoding, first_buff) = self._detect_charset(stream)
# if chardet thinks its ascii, use utf-8 # if encoding not set or chardet thinks its ascii, use utf-8
if encoding == 'ascii': if not encoding or encoding == 'ascii':
encoding = 'utf-8' encoding = 'utf-8'
text_type = rewritten_headers.text_type
rule = self.ruleset.get_first_match(urlkey) rule = self.ruleset.get_first_match(urlkey)
@ -108,34 +112,33 @@ class RewriteContent:
js_rewriter_class=rule.rewriters['js'], js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'], css_rewriter_class=rule.rewriters['css'],
head_insert=head_insert_str) head_insert=head_insert_str)
else: else:
# apply one of (js, css, xml) rewriters # apply one of (js, css, xml) rewriters
rewriter = rewriter_class(urlrewriter) rewriter = rewriter_class(urlrewriter)
# Create rewriting generator # Create rewriting generator
gen = self._rewriting_stream_gen(rewriter, encoding, gen = self._rewriting_stream_gen(rewriter, encoding, stream_raw,
stream, first_buff) stream, first_buff)
return (status_headers, gen, True) return (status_headers, gen, True)
# Create rewrite stream, may even be chunked by front-end # Create rewrite stream, may even be chunked by front-end
def _rewriting_stream_gen(self, rewriter, encoding, def _rewriting_stream_gen(self, rewriter, encoding, stream_raw,
stream, first_buff=None): stream, first_buff=None):
def do_rewrite(buff): def do_rewrite(buff):
if encoding: if not stream_raw:
buff = self._decode_buff(buff, stream, encoding) buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff) buff = rewriter.rewrite(buff)
if encoding: buff = buff.encode(encoding)
buff = buff.encode(encoding)
return buff return buff
def do_finish(): def do_finish():
result = rewriter.close() result = rewriter.close()
if encoding: result = result.encode(encoding)
result = result.encode(encoding)
return result return result
@ -188,12 +191,16 @@ class RewriteContent:
def stream_to_gen(stream, rewrite_func=None, def stream_to_gen(stream, rewrite_func=None,
final_read_func=None, first_buff=None): final_read_func=None, first_buff=None):
try: try:
buff = first_buff if first_buff else stream.read() if first_buff:
buff = first_buff
else:
buff = stream.read() + stream.readline()
while buff: while buff:
if rewrite_func: if rewrite_func:
buff = rewrite_func(buff) buff = rewrite_func(buff)
yield buff yield buff
buff = stream.read() buff = stream.read() + stream.readline()
# For adding a tail/handling final buffer # For adding a tail/handling final buffer
if final_read_func: if final_read_func:

View File

@ -9,6 +9,7 @@ from html_rewriter import HTMLRewriter
import itertools import itertools
HTML = HTMLRewriter HTML = HTMLRewriter
_is_lxml = False
#================================================================= #=================================================================
@ -18,13 +19,19 @@ def use_lxml_parser():
if LXML_SUPPORTED: if LXML_SUPPORTED:
global HTML global HTML
global _is_lxml
HTML = LXMLHTMLRewriter HTML = LXMLHTMLRewriter
logging.debug('Using LXML Parser') logging.debug('Using LXML Parser')
return True _is_lxml = True
else: # pragma: no cover else: # pragma: no cover
logging.debug('LXML Parser not available') logging.debug('LXML Parser not available')
return False _is_lxml = False
return _is_lxml
def is_lxml():
return _is_lxml
#================================================================= #=================================================================
class RewriteRules(BaseRule): class RewriteRules(BaseRule):

View File

@ -119,6 +119,15 @@ ur"""
>>> p = LXMLHTMLRewriter(urlrewriter) >>> p = LXMLHTMLRewriter(urlrewriter)
>>> p.close() >>> p.close()
'' ''
# test &nbsp;
>>> parse('&nbsp;')
<html><body><p>&nbsp;</p></body></html>
# test multiple rewrites: &nbsp; extra >, split comment
>>> p = LXMLHTMLRewriter(urlrewriter)
>>> p.rewrite('<div>&nbsp; &nbsp; > <!-- a') + p.rewrite('b --></div>') + p.close()
u'<html><body><div>&nbsp; &nbsp; &gt; <!-- ab --></div></body></html>'
""" """
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter