1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

lxml: use lxml's parse interface instead of feed interface to allow lxml to handle decoding unicode data, better address #36
This commit is contained in:
Ilya Kreymer 2014-04-07 17:13:43 -07:00
parent 890c323617
commit 2a318527df
4 changed files with 28 additions and 4 deletions

View File

@ -45,6 +45,18 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
#string = string.replace(u'</html>', u'')
self.parser.feed(string)
def parse(self, stream):
    """Rewrite a whole document by handing ``stream`` to lxml's parser.

    A fresh ``AccumBuff`` is installed on ``self.out``, then
    ``lxml.etree.parse`` is run over ``stream`` using ``self.parser``.
    Whatever has accumulated in ``self.out`` once parsing finishes is
    returned.

    :param stream: source passed straight through to ``lxml.etree.parse``
        (NOTE(review): presumably a file-like object of undecoded bytes,
        confirm against the caller).
    :return: the value of the accumulation buffer after parsing.
    :raises: any exception raised by ``lxml.etree.parse`` propagates.
    """
    self.out = self.AccumBuff()
    try:
        lxml.etree.parse(stream, self.parser)
        return self.out.getvalue()
    finally:
        # Clear buffer to create new one for next rewrite(). This is done
        # in ``finally`` so a parse that raises does not leave a partially
        # filled buffer attached to the instance.
        self.out = None
def _internal_close(self):
if self.started:
self.parser.close()

View File

@ -123,12 +123,20 @@ class RewriteContent:
return (status_headers, gen, True)
def _parse_full_gen(self, rewriter, encoding, stream):
buff = rewriter.parse(stream)
buff = buff.encode(encoding)
yield buff
# Create rewrite stream, may even be chunked by front-end
def _rewriting_stream_gen(self, rewriter, encoding, stream_raw,
stream, first_buff=None):
if stream_raw:
return self._parse_full_gen(rewriter, encoding, stream)
def do_rewrite(buff):
if not stream_raw:
buff = self._decode_buff(buff, stream, encoding)
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)

View File

@ -51,7 +51,7 @@ r"""
# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010em_///example.com/abc.html" //comment'
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
#=================================================================

View File

@ -14,7 +14,7 @@ class UrlRewriter(object):
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:']
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
def __init__(self, wburl, prefix):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
@ -32,6 +32,10 @@ class UrlRewriter(object):
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
if url.startswith('//'):
isAbs = True
url = 'http:' + url
# Optimized rewriter for
# -rel urls that don't start with / and
# do not contain ../ and no special mod