mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Cleanup rewrite interfaces to address #13
All rewriters can support either buffered or streaming mode. In buffered mode, the full text content is written into a buffer and served with a Content-Length. In streaming mode, text is streamed as it is rewritten and no Content-Length is written. The default is to stream the response.
This commit is contained in:
parent
33c135b337
commit
7722014a96
@ -113,16 +113,23 @@ class WBHtml(HTMLParser):
|
||||
|
||||
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound']
|
||||
|
||||
class AccumBuff:
|
||||
def __init__(self):
|
||||
self.buff = ''
|
||||
|
||||
def __init__(self, rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
|
||||
def write(self, string):
|
||||
self.buff += string
|
||||
|
||||
|
||||
def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
|
||||
HTMLParser.__init__(self)
|
||||
|
||||
self.rewriter = rewriter
|
||||
self.url_rewriter = url_rewriter
|
||||
self._wbParseContext = None
|
||||
self.out = outstream if outstream else sys.stdout
|
||||
self.out = outstream if outstream else WBHtml.AccumBuff()
|
||||
|
||||
self.jsRewriter = jsRewriterClass(rewriter)
|
||||
self.cssRewriter = cssRewriterClass(rewriter)
|
||||
self.jsRewriter = jsRewriterClass(url_rewriter)
|
||||
self.cssRewriter = cssRewriterClass(url_rewriter)
|
||||
|
||||
self.headInsert = headInsert
|
||||
|
||||
@ -147,14 +154,14 @@ class WBHtml(HTMLParser):
|
||||
# ===========================
|
||||
|
||||
def _rewriteURL(self, value, mod = None):
|
||||
return self.rewriter.rewrite(value, mod) if value else None
|
||||
return self.url_rewriter.rewrite(value, mod) if value else None
|
||||
|
||||
|
||||
def _rewriteCSS(self, cssContent):
|
||||
return self.cssRewriter.replaceAll(cssContent) if cssContent else None
|
||||
return self.cssRewriter.rewrite(cssContent) if cssContent else None
|
||||
|
||||
def _rewriteScript(self, scriptContent):
|
||||
return self.jsRewriter.replaceAll(scriptContent) if scriptContent else None
|
||||
return self.jsRewriter.rewrite(scriptContent) if scriptContent else None
|
||||
|
||||
def hasAttr(self, tagAttrs, attr):
|
||||
name, value = attr
|
||||
@ -202,7 +209,7 @@ class WBHtml(HTMLParser):
|
||||
else:
|
||||
# special case: base tag
|
||||
if (tag == 'base') and (attrName == 'href') and attrValue:
|
||||
self.rewriter.setBaseUrl(attrValue)
|
||||
self.url_rewriter.setBaseUrl(attrValue)
|
||||
|
||||
rwMod = handler.get(attrName)
|
||||
if rwMod is not None:
|
||||
@ -232,14 +239,28 @@ class WBHtml(HTMLParser):
|
||||
|
||||
self.out.write(data)
|
||||
|
||||
def rewrite(self, string):
    """Feed ``string`` through the HTML parser and return the rewritten text.

    When the parser owns its output buffer (no ``outstream`` was supplied,
    or the previous call already harvested it), the text accumulated during
    this call is returned and the buffer is dropped so that the next call
    starts from an empty one.

    When the caller supplied its own ``outstream``, the rewritten text has
    already been written to that stream by the parser callbacks; in that
    case the stream is left in place and an empty string is returned
    instead of failing on the missing ``buff`` attribute.
    """
    if not self.out:
        self.out = WBHtml.AccumBuff()

    # The HTMLParser callbacks write the rewritten markup to self.out
    self.feed(string)

    # Only the internal accumulating buffer exposes ``buff``; a
    # caller-provided stream must be neither read from nor discarded.
    if not hasattr(self.out, 'buff'):
        return ''

    result = self.out.buff
    # Clear buffer to create new one for next rewrite()
    self.out = None

    return result
|
||||
|
||||
# HTMLParser overrides below
|
||||
def close(self):
|
||||
if (self._wbParseContext):
|
||||
self.feed('</' + self._wbParseContext + '>')
|
||||
result = self.rewrite('</' + self._wbParseContext + '>')
|
||||
self._wbParseContext = None
|
||||
else:
|
||||
result = ''
|
||||
|
||||
HTMLParser.close(self)
|
||||
return result
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if not self.rewriteTagAttrs(tag, attrs, False):
|
||||
@ -291,11 +312,12 @@ class WBHtml(HTMLParser):
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
|
||||
url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
|
||||
|
||||
def parse(data, headInsert = None):
|
||||
parser = WBHtml(rewriter, headInsert = headInsert)
|
||||
parser.feed(data)
|
||||
parser.close()
|
||||
parser = WBHtml(url_rewriter, headInsert = headInsert)
|
||||
print parser.rewrite(data) + parser.close()
|
||||
|
||||
doctest.testmod()
|
||||
|
||||
|
||||
|
@ -8,7 +8,7 @@ from url_rewriter import ArchivalUrlRewriter
|
||||
class RegexRewriter:
|
||||
"""
|
||||
# Test https->http converter (other tests below in subclasses)
|
||||
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
|
||||
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
|
||||
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
|
||||
"""
|
||||
|
||||
@ -48,9 +48,12 @@ class RegexRewriter:
|
||||
def filter(self, m):
|
||||
return True
|
||||
|
||||
def replaceAll(self, string):
|
||||
def rewrite(self, string):
|
||||
return self.regex.sub(lambda x: self.replace(x), string)
|
||||
|
||||
def close(self):
    """Finish rewriting and return any trailing output.

    rewrite() returns the fully substituted string for each call, so this
    rewriter has nothing left to flush; an empty string is always returned.
    """
    return ''
|
||||
|
||||
def replace(self, m):
|
||||
i = 0
|
||||
for _, op, count in self.rules:
|
||||
@ -218,13 +221,13 @@ if __name__ == "__main__":
|
||||
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
|
||||
|
||||
def test_js(string, extra = []):
|
||||
return JSRewriter(arcrw, extra).replaceAll(string)
|
||||
return JSRewriter(arcrw, extra).rewrite(string)
|
||||
|
||||
def test_xml(string):
|
||||
return XMLRewriter(arcrw).replaceAll(string)
|
||||
return XMLRewriter(arcrw).rewrite(string)
|
||||
|
||||
def test_css(string):
|
||||
return CSSRewriter(arcrw).replaceAll(string)
|
||||
return CSSRewriter(arcrw).rewrite(string)
|
||||
|
||||
|
||||
|
||||
|
128
pywb/replay.py
128
pywb/replay.py
@ -204,6 +204,9 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
self.headerRewriter = headerRewriter
|
||||
self.redir_to_exact = redir_to_exact
|
||||
|
||||
# buffer or stream rewritten response
|
||||
self.buffer_response = False
|
||||
|
||||
|
||||
def _textContentType(self, contentType):
|
||||
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
|
||||
@ -259,71 +262,97 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
encoding = 'utf-8'
|
||||
|
||||
# Buffering response for html, streaming for others?
|
||||
if rewrittenHeaders.textType == 'html':
|
||||
return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
|
||||
else:
|
||||
return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
|
||||
#if rewrittenHeaders.textType == 'html':
|
||||
# return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
|
||||
#else:
|
||||
# return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
|
||||
|
||||
textType = rewrittenHeaders.textType
|
||||
status_headers = rewrittenHeaders.status_headers
|
||||
|
||||
def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
|
||||
out = StringIO.StringIO()
|
||||
htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)
|
||||
|
||||
try:
|
||||
buff = firstBuff if firstBuff else stream.read()
|
||||
while buff:
|
||||
if encoding:
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
except UnicodeDecodeError, e:
|
||||
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
|
||||
for i in range(3):
|
||||
buff += stream.read(1)
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
htmlrewriter.feed(buff)
|
||||
buff = stream.read()
|
||||
|
||||
# Close rewriter if gracefully made it to end
|
||||
htmlrewriter.close()
|
||||
|
||||
finally:
|
||||
content = out.getvalue()
|
||||
if encoding:
|
||||
content = content.encode(encoding)
|
||||
|
||||
value = [content]
|
||||
contentLengthStr = str(len(content))
|
||||
status_headers.headers.append(('Content-Length', contentLengthStr))
|
||||
out.close()
|
||||
|
||||
return WbResponse(status_headers, value = value)
|
||||
|
||||
|
||||
def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
|
||||
if textType == 'css':
|
||||
if textType == 'html':
|
||||
rewriter = html_rewriter.WBHtml(urlrewriter, outstream = None, headInsert = self.headInsert)
|
||||
elif textType == 'css':
|
||||
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
|
||||
elif textType == 'js':
|
||||
rewriter = regex_rewriters.JSRewriter(urlrewriter)
|
||||
elif textType == 'xml':
|
||||
rewriter = regex_rewriters.XMLRewriter(urlrewriter)
|
||||
else:
|
||||
raise Exception('Unknown Text Type for Rewrite: ' + textType)
|
||||
|
||||
|
||||
if self.buffer_response:
|
||||
return self._buffer_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
|
||||
else:
|
||||
return self._stream_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
|
||||
|
||||
|
||||
# Buffer rewrite response, and serve with full Content-Length
|
||||
def _buffer_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
    """Rewrite the whole record into memory and serve it with a Content-Length.

    rewriter       -- object exposing rewrite(text) and close(), both
                      returning rewritten text
    encoding       -- charset used to decode each chunk before rewriting and
                      to encode the final content; falsy to skip both steps
    stream         -- source stream, read until it returns an empty chunk
    status_headers -- response status/headers; a Content-Length header is
                      appended to status_headers.headers
    firstBuff      -- optional chunk already read from the stream, processed
                      before anything else is read

    Returns a WbResponse whose value is a one-element list holding the
    complete rewritten content.
    """
    out = StringIO.StringIO()

    try:
        buff = firstBuff if firstBuff else stream.read()
        while buff:
            if encoding:
                buff = self._decodeBuff(buff, stream, encoding)

            out.write(rewriter.rewrite(buff))
            buff = stream.read()

        # Close rewriter if gracefully made it to end. close() returns any
        # trailing text (e.g. a closing tag emitted by the HTML rewriter),
        # which must be part of the content and of its Content-Length.
        tail = rewriter.close()
        if tail:
            out.write(tail)

    finally:
        # Runs even when rewriting failed part-way, so the buffer is always
        # released; the exception (if any) still propagates afterwards.
        content = out.getvalue()

        if encoding:
            content = content.encode(encoding)

        value = [content]
        contentLengthStr = str(len(content))
        status_headers.headers.append(('Content-Length', contentLengthStr))
        out.close()

    return WbResponse(status_headers, value = value)
|
||||
|
||||
# Stream rewrite response from record (no Content-Length), may even be chunked by front-end
|
||||
def _stream_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
|
||||
def doRewrite(buff):
|
||||
if encoding:
|
||||
buff = buff.decode(encoding)
|
||||
buff = rewriter.replaceAll(buff)
|
||||
buff = self._decodeBuff(buff, stream, encoding)
|
||||
|
||||
buff = rewriter.rewrite(buff)
|
||||
|
||||
if encoding:
|
||||
buff = buff.encode(encoding)
|
||||
|
||||
return buff
|
||||
|
||||
return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)
|
||||
def doFinish():
|
||||
return rewriter.close()
|
||||
|
||||
return WbResponse.stream_response(status_headers, stream, rewrite_func = doRewrite, final_read_func = doFinish, first_buff = firstBuff)
|
||||
|
||||
|
||||
def _decodeBuff(self, buff, stream, encoding):
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
except UnicodeDecodeError, e:
|
||||
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
|
||||
for i in range(3):
|
||||
buff += stream.read(1)
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
|
||||
return buff
|
||||
|
||||
|
||||
def _detectCharset(self, stream):
|
||||
buff = stream.read(8192)
|
||||
@ -331,6 +360,7 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
print "chardet result: " + str(result)
|
||||
return (result['encoding'], buff)
|
||||
|
||||
|
||||
def _checkRedir(self, wbrequest, cdx):
|
||||
if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
|
||||
newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original'])
|
||||
|
@ -133,15 +133,22 @@ class WbResponse:
|
||||
return WbResponse(StatusAndHeaders(status, [('Location', location)]))
|
||||
|
||||
@staticmethod
|
||||
def stream_response(status_headers, stream, proc = None, firstBuff = None):
|
||||
def stream_response(status_headers, stream, rewrite_func = None, final_read_func = None, first_buff = None):
|
||||
def streamGen():
|
||||
try:
|
||||
buff = firstBuff if firstBuff else stream.read()
|
||||
buff = first_buff if first_buff else stream.read()
|
||||
while buff:
|
||||
if proc:
|
||||
buff = proc(buff)
|
||||
if rewrite_func:
|
||||
buff = rewrite_func(buff)
|
||||
yield buff
|
||||
buff = stream.read()
|
||||
|
||||
# For adding a tail/handling final buffer
|
||||
if final_read_func:
|
||||
buff = final_read_func()
|
||||
if buff:
|
||||
yield buff
|
||||
|
||||
finally:
|
||||
stream.close()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user