1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

disable decoding, by default, of content for html parser

This commit is contained in:
Ilya Kreymer 2014-06-27 16:53:33 -07:00
parent fb07775d38
commit dd9f138bab
6 changed files with 52 additions and 38 deletions

View File

@ -68,10 +68,10 @@ class HTMLRewriterMixin(object):
self.ls = []
def write(self, string):
self.ls.append(string)
self.ls.append(bytes(string))
def getvalue(self):
return ''.join(self.ls)
return b''.join(self.ls)
# ===========================
def __init__(self, url_rewriter,

View File

@ -34,8 +34,8 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
def feed(self, string):
self.started = True
string = self.END_HTML.sub(u'', string)
#string = string.replace(u'</html>', u'')
string = self.END_HTML.sub(b'', string)
#string = string.replace(b'</html>', b'')
self.parser.feed(string)
def parse(self, stream):
@ -64,47 +64,48 @@ class RewriterTarget(object):
attrs = attrs.items()
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
self.rewriter.out.write(u'<' + tag)
self.rewriter.out.write(b'<' + tag)
for name, value in attrs:
self.rewriter._write_attr(name, value, escape=True)
else:
if tag == u'head':
if tag == b'head':
if (self.rewriter._rewrite_head(False)):
return
self.rewriter.out.write(u'>')
self.rewriter.out.write(b'>')
def end(self, tag):
if (tag == self.rewriter._wb_parse_context):
self.rewriter._wb_parse_context = None
self.rewriter.out.write(u'</' + tag + u'>')
self.rewriter.out.write(b'</' + tag + b'>')
def data(self, data):
if not self.rewriter._wb_parse_context:
data = cgi.escape(data, quote=True)
if isinstance(data, unicode):
data = data.replace(u'\xa0', '&nbsp;')
data = data.encode('utf-8')
self.rewriter.parse_data(data)
def comment(self, data):
self.rewriter.out.write(u'<!--')
self.rewriter.out.write(b'<!--')
self.rewriter.parse_data(data)
self.rewriter.out.write(u'-->')
self.rewriter.out.write(b'-->')
def doctype(self, root_tag, public_id, system_id):
self.rewriter.out.write(u'<!doctype')
self.rewriter.out.write(b'<!doctype')
if root_tag:
self.rewriter.out.write(' ' + root_tag)
if public_id:
self.rewriter.out.write(' PUBLIC ' + public_id)
if system_id:
self.rewriter.out.write(' SYSTEM ' + system_id)
self.rewriter.out.write(u'>')
self.rewriter.out.write(b'>')
def pi(self, target, data):
self.rewriter.out.write(u'<?' + target + ' ' + data + u'>')
self.rewriter.out.write(b'<?' + target + ' ' + data + b'>')
def close(self):
return ''

View File

@ -21,6 +21,7 @@ class RewriteContent:
default_rule_config={},
ds_rules_file=ds_rules_file)
self.defmod = defmod
self.decode_stream = False
def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream
@ -96,16 +97,17 @@ class RewriteContent:
else:
stream = DecompressingBufferedReader(stream)
if rewritten_headers.charset:
encoding = rewritten_headers.charset
elif is_lxml() and text_type == 'html':
stream_raw = True
else:
(encoding, first_buff) = self._detect_charset(stream)
if self.decode_stream:
if rewritten_headers.charset:
encoding = rewritten_headers.charset
elif is_lxml() and text_type == 'html':
stream_raw = True
else:
(encoding, first_buff) = self._detect_charset(stream)
# if encoding not set or chardet thinks its ascii, use utf-8
if not encoding or encoding == 'ascii':
encoding = 'utf-8'
# if encoding not set or chardet thinks its ascii, use utf-8
if not encoding or encoding == 'ascii':
encoding = 'utf-8'
rule = self.ruleset.get_first_match(urlkey)
@ -147,23 +149,30 @@ class RewriteContent:
if stream_raw:
return self._parse_full_gen(rewriter, encoding, stream)
def do_rewrite(buff):
def do_enc_rewrite(buff):
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
buff = buff.encode(encoding)
return buff
def do_rewrite(buff):
buff = rewriter.rewrite(buff)
return buff
if encoding:
rewrite_func = do_enc_rewrite
else:
rewrite_func = do_rewrite
def do_finish():
result = rewriter.close()
result = result.encode(encoding)
if encoding:
result = result.encode(encoding)
return result
return self.stream_to_gen(stream,
rewrite_func=do_rewrite,
rewrite_func=rewrite_func,
final_read_func=do_finish,
first_buff=first_buff)
@ -202,7 +211,7 @@ class RewriteContent:
finally:
detector.close()
print "chardet result: " + str(detector.result)
print "chardet result: ", str(detector.result)
return (detector.result['encoding'], full_buff)
# Create a generator reading from a stream,
@ -215,17 +224,17 @@ class RewriteContent:
buff = first_buff
else:
buff = stream.read()
if buff and (not hasattr(stream, 'closed') or
not stream.closed):
buff += stream.readline()
# if buff and (not hasattr(stream, 'closed') or
# not stream.closed):
# buff += stream.readline()
while buff:
if rewrite_func:
buff = rewrite_func(buff)
yield buff
buff = stream.read()
if buff:
buff += stream.readline()
# if buff:
# buff += stream.readline()
# For adding a tail/handling final buffer
if final_read_func:

View File

@ -137,8 +137,10 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
def parse(data, head_insert = None):
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
data = data.decode('utf-8')
print parser.rewrite(data) + parser.close()
#data = data.decode('utf-8')
result = parser.rewrite(data) + parser.close()
# decode only for printing
print result.decode('utf-8')
if __name__ == "__main__":
import doctest

View File

@ -38,7 +38,7 @@ ur"""
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
# Unicode
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
#>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
# Meta tag
@ -139,7 +139,7 @@ ur"""
# test multiple rewrites: &nbsp; extra >, split comment
>>> p = LXMLHTMLRewriter(urlrewriter)
>>> p.rewrite('<div>&nbsp; &nbsp; > <!-- a') + p.rewrite('b --></div>') + p.close()
u'<html><body><div>&nbsp; &nbsp; &gt; <!-- ab --></div></body></html>'
'<html><body><div>&nbsp; &nbsp; &gt; <!-- ab --></div></body></html>'
"""
from pywb.rewrite.url_rewriter import UrlRewriter

View File

@ -1,6 +1,7 @@
from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewriterules import use_lxml_parser
from handlers import StaticHandler
@ -22,4 +23,5 @@ def create_live_rewriter_app():
Route('static/default', StaticHandler('pywb/static/'))
]
# use_lxml_parser()
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])