mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
disable decoding, by default, of content for html parser
This commit is contained in:
parent
fb07775d38
commit
dd9f138bab
@ -68,10 +68,10 @@ class HTMLRewriterMixin(object):
|
|||||||
self.ls = []
|
self.ls = []
|
||||||
|
|
||||||
def write(self, string):
|
def write(self, string):
|
||||||
self.ls.append(string)
|
self.ls.append(bytes(string))
|
||||||
|
|
||||||
def getvalue(self):
|
def getvalue(self):
|
||||||
return ''.join(self.ls)
|
return b''.join(self.ls)
|
||||||
|
|
||||||
# ===========================
|
# ===========================
|
||||||
def __init__(self, url_rewriter,
|
def __init__(self, url_rewriter,
|
||||||
|
@ -34,8 +34,8 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
|
|
||||||
def feed(self, string):
|
def feed(self, string):
|
||||||
self.started = True
|
self.started = True
|
||||||
string = self.END_HTML.sub(u'', string)
|
string = self.END_HTML.sub(b'', string)
|
||||||
#string = string.replace(u'</html>', u'')
|
#string = string.replace(b'</html>', b'')
|
||||||
self.parser.feed(string)
|
self.parser.feed(string)
|
||||||
|
|
||||||
def parse(self, stream):
|
def parse(self, stream):
|
||||||
@ -64,47 +64,48 @@ class RewriterTarget(object):
|
|||||||
attrs = attrs.items()
|
attrs = attrs.items()
|
||||||
|
|
||||||
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
|
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
|
||||||
self.rewriter.out.write(u'<' + tag)
|
self.rewriter.out.write(b'<' + tag)
|
||||||
|
|
||||||
for name, value in attrs:
|
for name, value in attrs:
|
||||||
self.rewriter._write_attr(name, value, escape=True)
|
self.rewriter._write_attr(name, value, escape=True)
|
||||||
else:
|
else:
|
||||||
if tag == u'head':
|
if tag == b'head':
|
||||||
if (self.rewriter._rewrite_head(False)):
|
if (self.rewriter._rewrite_head(False)):
|
||||||
return
|
return
|
||||||
|
|
||||||
self.rewriter.out.write(u'>')
|
self.rewriter.out.write(b'>')
|
||||||
|
|
||||||
def end(self, tag):
|
def end(self, tag):
|
||||||
if (tag == self.rewriter._wb_parse_context):
|
if (tag == self.rewriter._wb_parse_context):
|
||||||
self.rewriter._wb_parse_context = None
|
self.rewriter._wb_parse_context = None
|
||||||
|
|
||||||
self.rewriter.out.write(u'</' + tag + u'>')
|
self.rewriter.out.write(b'</' + tag + b'>')
|
||||||
|
|
||||||
def data(self, data):
|
def data(self, data):
|
||||||
if not self.rewriter._wb_parse_context:
|
if not self.rewriter._wb_parse_context:
|
||||||
data = cgi.escape(data, quote=True)
|
data = cgi.escape(data, quote=True)
|
||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode):
|
||||||
data = data.replace(u'\xa0', ' ')
|
data = data.replace(u'\xa0', ' ')
|
||||||
|
data = data.encode('utf-8')
|
||||||
self.rewriter.parse_data(data)
|
self.rewriter.parse_data(data)
|
||||||
|
|
||||||
def comment(self, data):
|
def comment(self, data):
|
||||||
self.rewriter.out.write(u'<!--')
|
self.rewriter.out.write(b'<!--')
|
||||||
self.rewriter.parse_data(data)
|
self.rewriter.parse_data(data)
|
||||||
self.rewriter.out.write(u'-->')
|
self.rewriter.out.write(b'-->')
|
||||||
|
|
||||||
def doctype(self, root_tag, public_id, system_id):
|
def doctype(self, root_tag, public_id, system_id):
|
||||||
self.rewriter.out.write(u'<!doctype')
|
self.rewriter.out.write(b'<!doctype')
|
||||||
if root_tag:
|
if root_tag:
|
||||||
self.rewriter.out.write(' ' + root_tag)
|
self.rewriter.out.write(' ' + root_tag)
|
||||||
if public_id:
|
if public_id:
|
||||||
self.rewriter.out.write(' PUBLIC ' + public_id)
|
self.rewriter.out.write(' PUBLIC ' + public_id)
|
||||||
if system_id:
|
if system_id:
|
||||||
self.rewriter.out.write(' SYSTEM ' + system_id)
|
self.rewriter.out.write(' SYSTEM ' + system_id)
|
||||||
self.rewriter.out.write(u'>')
|
self.rewriter.out.write(b'>')
|
||||||
|
|
||||||
def pi(self, target, data):
|
def pi(self, target, data):
|
||||||
self.rewriter.out.write(u'<?' + target + ' ' + data + u'>')
|
self.rewriter.out.write(b'<?' + target + ' ' + data + b'>')
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
return ''
|
return ''
|
||||||
|
@ -21,6 +21,7 @@ class RewriteContent:
|
|||||||
default_rule_config={},
|
default_rule_config={},
|
||||||
ds_rules_file=ds_rules_file)
|
ds_rules_file=ds_rules_file)
|
||||||
self.defmod = defmod
|
self.defmod = defmod
|
||||||
|
self.decode_stream = False
|
||||||
|
|
||||||
def sanitize_content(self, status_headers, stream):
|
def sanitize_content(self, status_headers, stream):
|
||||||
# remove transfer encoding chunked and wrap in a dechunking stream
|
# remove transfer encoding chunked and wrap in a dechunking stream
|
||||||
@ -96,16 +97,17 @@ class RewriteContent:
|
|||||||
else:
|
else:
|
||||||
stream = DecompressingBufferedReader(stream)
|
stream = DecompressingBufferedReader(stream)
|
||||||
|
|
||||||
if rewritten_headers.charset:
|
if self.decode_stream:
|
||||||
encoding = rewritten_headers.charset
|
if rewritten_headers.charset:
|
||||||
elif is_lxml() and text_type == 'html':
|
encoding = rewritten_headers.charset
|
||||||
stream_raw = True
|
elif is_lxml() and text_type == 'html':
|
||||||
else:
|
stream_raw = True
|
||||||
(encoding, first_buff) = self._detect_charset(stream)
|
else:
|
||||||
|
(encoding, first_buff) = self._detect_charset(stream)
|
||||||
|
|
||||||
# if encoding not set or chardet thinks it's ascii, use utf-8
|
# if encoding not set or chardet thinks it's ascii, use utf-8
|
||||||
if not encoding or encoding == 'ascii':
|
if not encoding or encoding == 'ascii':
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
|
|
||||||
rule = self.ruleset.get_first_match(urlkey)
|
rule = self.ruleset.get_first_match(urlkey)
|
||||||
|
|
||||||
@ -147,23 +149,30 @@ class RewriteContent:
|
|||||||
if stream_raw:
|
if stream_raw:
|
||||||
return self._parse_full_gen(rewriter, encoding, stream)
|
return self._parse_full_gen(rewriter, encoding, stream)
|
||||||
|
|
||||||
def do_rewrite(buff):
|
def do_enc_rewrite(buff):
|
||||||
buff = self._decode_buff(buff, stream, encoding)
|
buff = self._decode_buff(buff, stream, encoding)
|
||||||
|
|
||||||
buff = rewriter.rewrite(buff)
|
buff = rewriter.rewrite(buff)
|
||||||
|
|
||||||
buff = buff.encode(encoding)
|
buff = buff.encode(encoding)
|
||||||
|
|
||||||
return buff
|
return buff
|
||||||
|
|
||||||
|
def do_rewrite(buff):
|
||||||
|
buff = rewriter.rewrite(buff)
|
||||||
|
return buff
|
||||||
|
|
||||||
|
if encoding:
|
||||||
|
rewrite_func = do_enc_rewrite
|
||||||
|
else:
|
||||||
|
rewrite_func = do_rewrite
|
||||||
|
|
||||||
def do_finish():
|
def do_finish():
|
||||||
result = rewriter.close()
|
result = rewriter.close()
|
||||||
result = result.encode(encoding)
|
if encoding:
|
||||||
|
result = result.encode(encoding)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
return self.stream_to_gen(stream,
|
return self.stream_to_gen(stream,
|
||||||
rewrite_func=do_rewrite,
|
rewrite_func=rewrite_func,
|
||||||
final_read_func=do_finish,
|
final_read_func=do_finish,
|
||||||
first_buff=first_buff)
|
first_buff=first_buff)
|
||||||
|
|
||||||
@ -202,7 +211,7 @@ class RewriteContent:
|
|||||||
finally:
|
finally:
|
||||||
detector.close()
|
detector.close()
|
||||||
|
|
||||||
print "chardet result: " + str(detector.result)
|
print "chardet result: ", str(detector.result)
|
||||||
return (detector.result['encoding'], full_buff)
|
return (detector.result['encoding'], full_buff)
|
||||||
|
|
||||||
# Create a generator reading from a stream,
|
# Create a generator reading from a stream,
|
||||||
@ -215,17 +224,17 @@ class RewriteContent:
|
|||||||
buff = first_buff
|
buff = first_buff
|
||||||
else:
|
else:
|
||||||
buff = stream.read()
|
buff = stream.read()
|
||||||
if buff and (not hasattr(stream, 'closed') or
|
# if buff and (not hasattr(stream, 'closed') or
|
||||||
not stream.closed):
|
# not stream.closed):
|
||||||
buff += stream.readline()
|
# buff += stream.readline()
|
||||||
|
|
||||||
while buff:
|
while buff:
|
||||||
if rewrite_func:
|
if rewrite_func:
|
||||||
buff = rewrite_func(buff)
|
buff = rewrite_func(buff)
|
||||||
yield buff
|
yield buff
|
||||||
buff = stream.read()
|
buff = stream.read()
|
||||||
if buff:
|
# if buff:
|
||||||
buff += stream.readline()
|
# buff += stream.readline()
|
||||||
|
|
||||||
# For adding a tail/handling final buffer
|
# For adding a tail/handling final buffer
|
||||||
if final_read_func:
|
if final_read_func:
|
||||||
|
@ -137,8 +137,10 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
|
|||||||
|
|
||||||
def parse(data, head_insert = None):
|
def parse(data, head_insert = None):
|
||||||
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
||||||
data = data.decode('utf-8')
|
#data = data.decode('utf-8')
|
||||||
print parser.rewrite(data) + parser.close()
|
result = parser.rewrite(data) + parser.close()
|
||||||
|
# decode only for printing
|
||||||
|
print result.decode('utf-8')
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
@ -38,7 +38,7 @@ ur"""
|
|||||||
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
||||||
|
|
||||||
# Unicode
|
# Unicode
|
||||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
#>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||||
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
||||||
|
|
||||||
# Meta tag
|
# Meta tag
|
||||||
@ -139,7 +139,7 @@ ur"""
|
|||||||
# test multiple rewrites: extra >, split comment
|
# test multiple rewrites: extra >, split comment
|
||||||
>>> p = LXMLHTMLRewriter(urlrewriter)
|
>>> p = LXMLHTMLRewriter(urlrewriter)
|
||||||
>>> p.rewrite('<div> > <!-- a') + p.rewrite('b --></div>') + p.close()
|
>>> p.rewrite('<div> > <!-- a') + p.rewrite('b --></div>') + p.close()
|
||||||
u'<html><body><div> > <!-- ab --></div></body></html>'
|
'<html><body><div> > <!-- ab --></div></body></html>'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from pywb.framework.basehandlers import WbUrlHandler
|
from pywb.framework.basehandlers import WbUrlHandler
|
||||||
from pywb.framework.wbrequestresponse import WbResponse
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||||
|
from pywb.rewrite.rewriterules import use_lxml_parser
|
||||||
|
|
||||||
from handlers import StaticHandler
|
from handlers import StaticHandler
|
||||||
|
|
||||||
@ -22,4 +23,5 @@ def create_live_rewriter_app():
|
|||||||
Route('static/default', StaticHandler('pywb/static/'))
|
Route('static/default', StaticHandler('pywb/static/'))
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# use_lxml_parser()
|
||||||
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])
|
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user