mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
lxml: use lxml's parse interface instead of the feed interface to allow
lxml to handle decoding unicode data; better addresses #36
This commit is contained in:
parent
890c323617
commit
2a318527df
@ -45,6 +45,18 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
#string = string.replace(u'</html>', u'')
|
#string = string.replace(u'</html>', u'')
|
||||||
self.parser.feed(string)
|
self.parser.feed(string)
|
||||||
|
|
||||||
|
def parse(self, stream):
|
||||||
|
self.out = self.AccumBuff()
|
||||||
|
|
||||||
|
lxml.etree.parse(stream, self.parser)
|
||||||
|
|
||||||
|
result = self.out.getvalue()
|
||||||
|
|
||||||
|
# Clear buffer to create new one for next rewrite()
|
||||||
|
self.out = None
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def _internal_close(self):
|
def _internal_close(self):
|
||||||
if self.started:
|
if self.started:
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
|
@ -123,12 +123,20 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (status_headers, gen, True)
|
return (status_headers, gen, True)
|
||||||
|
|
||||||
|
def _parse_full_gen(self, rewriter, encoding, stream):
|
||||||
|
buff = rewriter.parse(stream)
|
||||||
|
buff = buff.encode(encoding)
|
||||||
|
yield buff
|
||||||
|
|
||||||
# Create rewrite stream, may even be chunked by front-end
|
# Create rewrite stream, may even be chunked by front-end
|
||||||
def _rewriting_stream_gen(self, rewriter, encoding, stream_raw,
|
def _rewriting_stream_gen(self, rewriter, encoding, stream_raw,
|
||||||
stream, first_buff=None):
|
stream, first_buff=None):
|
||||||
|
|
||||||
|
if stream_raw:
|
||||||
|
return self._parse_full_gen(rewriter, encoding, stream)
|
||||||
|
|
||||||
def do_rewrite(buff):
|
def do_rewrite(buff):
|
||||||
if not stream_raw:
|
buff = self._decode_buff(buff, stream, encoding)
|
||||||
buff = self._decode_buff(buff, stream, encoding)
|
|
||||||
|
|
||||||
buff = rewriter.rewrite(buff)
|
buff = rewriter.rewrite(buff)
|
||||||
|
|
||||||
|
@ -51,7 +51,7 @@ r"""
|
|||||||
|
|
||||||
# scheme-agnostic
|
# scheme-agnostic
|
||||||
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
|
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
|
||||||
'cool_Location = "/web/20131010em_///example.com/abc.html" //comment'
|
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -14,7 +14,7 @@ class UrlRewriter(object):
|
|||||||
|
|
||||||
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
|
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']
|
||||||
|
|
||||||
PROTOCOLS = ['http:', 'https:', '//', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||||
|
|
||||||
def __init__(self, wburl, prefix):
|
def __init__(self, wburl, prefix):
|
||||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||||
@ -32,6 +32,10 @@ class UrlRewriter(object):
|
|||||||
|
|
||||||
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
|
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
|
||||||
|
|
||||||
|
if url.startswith('//'):
|
||||||
|
isAbs = True
|
||||||
|
url = 'http:' + url
|
||||||
|
|
||||||
# Optimized rewriter for
|
# Optimized rewriter for
|
||||||
# -rel urls that don't start with / and
|
# -rel urls that don't start with / and
|
||||||
# do not contain ../ and no special mod
|
# do not contain ../ and no special mod
|
||||||
|
Loading…
x
Reference in New Issue
Block a user