mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
lxml: use lxml's parse interface instead of the feed interface to allow
lxml to handle decoding unicode data; better addresses #36
This commit is contained in:
parent
890c323617
commit
2a318527df
@ -45,6 +45,18 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
#string = string.replace(u'</html>', u'')
|
||||
self.parser.feed(string)
|
||||
|
||||
def parse(self, stream):
|
||||
self.out = self.AccumBuff()
|
||||
|
||||
lxml.etree.parse(stream, self.parser)
|
||||
|
||||
result = self.out.getvalue()
|
||||
|
||||
# Clear buffer to create new one for next rewrite()
|
||||
self.out = None
|
||||
|
||||
return result
|
||||
|
||||
def _internal_close(self):
|
||||
if self.started:
|
||||
self.parser.close()
|
||||
|
@ -123,12 +123,20 @@ class RewriteContent:
|
||||
|
||||
return (status_headers, gen, True)
|
||||
|
||||
def _parse_full_gen(self, rewriter, encoding, stream):
|
||||
buff = rewriter.parse(stream)
|
||||
buff = buff.encode(encoding)
|
||||
yield buff
|
||||
|
||||
# Create rewrite stream, may even be chunked by front-end
|
||||
def _rewriting_stream_gen(self, rewriter, encoding, stream_raw,
|
||||
stream, first_buff=None):
|
||||
|
||||
if stream_raw:
|
||||
return self._parse_full_gen(rewriter, encoding, stream)
|
||||
|
||||
def do_rewrite(buff):
|
||||
if not stream_raw:
|
||||
buff = self._decode_buff(buff, stream, encoding)
|
||||
buff = self._decode_buff(buff, stream, encoding)
|
||||
|
||||
buff = rewriter.rewrite(buff)
|
||||
|
||||
|
@ -51,7 +51,7 @@ r"""
|
||||
|
||||
# scheme-agnostic
|
||||
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
|
||||
'cool_Location = "/web/20131010em_///example.com/abc.html" //comment'
|
||||
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -14,7 +14,7 @@ class UrlRewriter(object):
|
||||
|
||||
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
|
||||
|
||||
PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||
|
||||
def __init__(self, wburl, prefix):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
@ -32,6 +32,10 @@ class UrlRewriter(object):
|
||||
|
||||
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
|
||||
|
||||
if url.startswith('//'):
|
||||
isAbs = True
|
||||
url = 'http:' + url
|
||||
|
||||
# Optimized rewriter for
|
||||
# -rel urls that don't start with / and
|
||||
# do not contain ../ and no special mod
|
||||
|
Loading…
x
Reference in New Issue
Block a user