mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
disable decoding, by default, of content for html parser
This commit is contained in:
parent
fb07775d38
commit
dd9f138bab
@ -68,10 +68,10 @@ class HTMLRewriterMixin(object):
|
||||
self.ls = []
|
||||
|
||||
def write(self, string):
|
||||
self.ls.append(string)
|
||||
self.ls.append(bytes(string))
|
||||
|
||||
def getvalue(self):
|
||||
return ''.join(self.ls)
|
||||
return b''.join(self.ls)
|
||||
|
||||
# ===========================
|
||||
def __init__(self, url_rewriter,
|
||||
|
@ -34,8 +34,8 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
|
||||
def feed(self, string):
|
||||
self.started = True
|
||||
string = self.END_HTML.sub(u'', string)
|
||||
#string = string.replace(u'</html>', u'')
|
||||
string = self.END_HTML.sub(b'', string)
|
||||
#string = string.replace(b'</html>', b'')
|
||||
self.parser.feed(string)
|
||||
|
||||
def parse(self, stream):
|
||||
@ -64,47 +64,48 @@ class RewriterTarget(object):
|
||||
attrs = attrs.items()
|
||||
|
||||
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
|
||||
self.rewriter.out.write(u'<' + tag)
|
||||
self.rewriter.out.write(b'<' + tag)
|
||||
|
||||
for name, value in attrs:
|
||||
self.rewriter._write_attr(name, value, escape=True)
|
||||
else:
|
||||
if tag == u'head':
|
||||
if tag == b'head':
|
||||
if (self.rewriter._rewrite_head(False)):
|
||||
return
|
||||
|
||||
self.rewriter.out.write(u'>')
|
||||
self.rewriter.out.write(b'>')
|
||||
|
||||
def end(self, tag):
|
||||
if (tag == self.rewriter._wb_parse_context):
|
||||
self.rewriter._wb_parse_context = None
|
||||
|
||||
self.rewriter.out.write(u'</' + tag + u'>')
|
||||
self.rewriter.out.write(b'</' + tag + b'>')
|
||||
|
||||
def data(self, data):
|
||||
if not self.rewriter._wb_parse_context:
|
||||
data = cgi.escape(data, quote=True)
|
||||
if isinstance(data, unicode):
|
||||
data = data.replace(u'\xa0', ' ')
|
||||
data = data.encode('utf-8')
|
||||
self.rewriter.parse_data(data)
|
||||
|
||||
def comment(self, data):
|
||||
self.rewriter.out.write(u'<!--')
|
||||
self.rewriter.out.write(b'<!--')
|
||||
self.rewriter.parse_data(data)
|
||||
self.rewriter.out.write(u'-->')
|
||||
self.rewriter.out.write(b'-->')
|
||||
|
||||
def doctype(self, root_tag, public_id, system_id):
|
||||
self.rewriter.out.write(u'<!doctype')
|
||||
self.rewriter.out.write(b'<!doctype')
|
||||
if root_tag:
|
||||
self.rewriter.out.write(' ' + root_tag)
|
||||
if public_id:
|
||||
self.rewriter.out.write(' PUBLIC ' + public_id)
|
||||
if system_id:
|
||||
self.rewriter.out.write(' SYSTEM ' + system_id)
|
||||
self.rewriter.out.write(u'>')
|
||||
self.rewriter.out.write(b'>')
|
||||
|
||||
def pi(self, target, data):
|
||||
self.rewriter.out.write(u'<?' + target + ' ' + data + u'>')
|
||||
self.rewriter.out.write(b'<?' + target + ' ' + data + b'>')
|
||||
|
||||
def close(self):
|
||||
return ''
|
||||
|
@ -21,6 +21,7 @@ class RewriteContent:
|
||||
default_rule_config={},
|
||||
ds_rules_file=ds_rules_file)
|
||||
self.defmod = defmod
|
||||
self.decode_stream = False
|
||||
|
||||
def sanitize_content(self, status_headers, stream):
|
||||
# remove transfer encoding chunked and wrap in a dechunking stream
|
||||
@ -96,16 +97,17 @@ class RewriteContent:
|
||||
else:
|
||||
stream = DecompressingBufferedReader(stream)
|
||||
|
||||
if rewritten_headers.charset:
|
||||
encoding = rewritten_headers.charset
|
||||
elif is_lxml() and text_type == 'html':
|
||||
stream_raw = True
|
||||
else:
|
||||
(encoding, first_buff) = self._detect_charset(stream)
|
||||
if self.decode_stream:
|
||||
if rewritten_headers.charset:
|
||||
encoding = rewritten_headers.charset
|
||||
elif is_lxml() and text_type == 'html':
|
||||
stream_raw = True
|
||||
else:
|
||||
(encoding, first_buff) = self._detect_charset(stream)
|
||||
|
||||
# if encoding not set or chardet thinks its ascii, use utf-8
|
||||
if not encoding or encoding == 'ascii':
|
||||
encoding = 'utf-8'
|
||||
# if encoding not set or chardet thinks its ascii, use utf-8
|
||||
if not encoding or encoding == 'ascii':
|
||||
encoding = 'utf-8'
|
||||
|
||||
rule = self.ruleset.get_first_match(urlkey)
|
||||
|
||||
@ -147,23 +149,30 @@ class RewriteContent:
|
||||
if stream_raw:
|
||||
return self._parse_full_gen(rewriter, encoding, stream)
|
||||
|
||||
def do_rewrite(buff):
|
||||
def do_enc_rewrite(buff):
|
||||
buff = self._decode_buff(buff, stream, encoding)
|
||||
|
||||
buff = rewriter.rewrite(buff)
|
||||
|
||||
buff = buff.encode(encoding)
|
||||
|
||||
return buff
|
||||
|
||||
def do_rewrite(buff):
|
||||
buff = rewriter.rewrite(buff)
|
||||
return buff
|
||||
|
||||
if encoding:
|
||||
rewrite_func = do_enc_rewrite
|
||||
else:
|
||||
rewrite_func = do_rewrite
|
||||
|
||||
def do_finish():
|
||||
result = rewriter.close()
|
||||
result = result.encode(encoding)
|
||||
if encoding:
|
||||
result = result.encode(encoding)
|
||||
|
||||
return result
|
||||
|
||||
return self.stream_to_gen(stream,
|
||||
rewrite_func=do_rewrite,
|
||||
rewrite_func=rewrite_func,
|
||||
final_read_func=do_finish,
|
||||
first_buff=first_buff)
|
||||
|
||||
@ -202,7 +211,7 @@ class RewriteContent:
|
||||
finally:
|
||||
detector.close()
|
||||
|
||||
print "chardet result: " + str(detector.result)
|
||||
print "chardet result: ", str(detector.result)
|
||||
return (detector.result['encoding'], full_buff)
|
||||
|
||||
# Create a generator reading from a stream,
|
||||
@ -215,17 +224,17 @@ class RewriteContent:
|
||||
buff = first_buff
|
||||
else:
|
||||
buff = stream.read()
|
||||
if buff and (not hasattr(stream, 'closed') or
|
||||
not stream.closed):
|
||||
buff += stream.readline()
|
||||
# if buff and (not hasattr(stream, 'closed') or
|
||||
# not stream.closed):
|
||||
# buff += stream.readline()
|
||||
|
||||
while buff:
|
||||
if rewrite_func:
|
||||
buff = rewrite_func(buff)
|
||||
yield buff
|
||||
buff = stream.read()
|
||||
if buff:
|
||||
buff += stream.readline()
|
||||
# if buff:
|
||||
# buff += stream.readline()
|
||||
|
||||
# For adding a tail/handling final buffer
|
||||
if final_read_func:
|
||||
|
@ -137,8 +137,10 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
|
||||
|
||||
def parse(data, head_insert = None):
|
||||
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
||||
data = data.decode('utf-8')
|
||||
print parser.rewrite(data) + parser.close()
|
||||
#data = data.decode('utf-8')
|
||||
result = parser.rewrite(data) + parser.close()
|
||||
# decode only for printing
|
||||
print result.decode('utf-8')
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
@ -38,7 +38,7 @@ ur"""
|
||||
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
||||
|
||||
# Unicode
|
||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||
#>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
||||
|
||||
# Meta tag
|
||||
@ -139,7 +139,7 @@ ur"""
|
||||
# test multiple rewrites: extra >, split comment
|
||||
>>> p = LXMLHTMLRewriter(urlrewriter)
|
||||
>>> p.rewrite('<div> > <!-- a') + p.rewrite('b --></div>') + p.close()
|
||||
u'<html><body><div> > <!-- ab --></div></body></html>'
|
||||
'<html><body><div> > <!-- ab --></div></body></html>'
|
||||
"""
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
@ -1,6 +1,7 @@
|
||||
from pywb.framework.basehandlers import WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||
|
||||
from handlers import StaticHandler
|
||||
|
||||
@ -22,4 +23,5 @@ def create_live_rewriter_app():
|
||||
Route('static/default', StaticHandler('pywb/static/'))
|
||||
]
|
||||
|
||||
# use_lxml_parser()
|
||||
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])
|
||||
|
Loading…
x
Reference in New Issue
Block a user