1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Cleanup rewrite interfaces to address #13

All rewriters can support either buffered or streaming mode.
In buffered mode, the full text content is written into a buffer
and served with a Content-Length header.
In streaming mode, text is streamed as it is rewritten and
no Content-Length header is written.
The default is to stream the response.
This commit is contained in:
Ilya Kreymer 2014-01-22 14:03:41 -08:00
parent 33c135b337
commit 7722014a96
4 changed files with 134 additions and 72 deletions

View File

@ -113,16 +113,23 @@ class WBHtml(HTMLParser):
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound']
class AccumBuff:
def __init__(self):
self.buff = ''
def __init__(self, rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
def write(self, string):
self.buff += string
def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
HTMLParser.__init__(self)
self.rewriter = rewriter
self.url_rewriter = url_rewriter
self._wbParseContext = None
self.out = outstream if outstream else sys.stdout
self.out = outstream if outstream else WBHtml.AccumBuff()
self.jsRewriter = jsRewriterClass(rewriter)
self.cssRewriter = cssRewriterClass(rewriter)
self.jsRewriter = jsRewriterClass(url_rewriter)
self.cssRewriter = cssRewriterClass(url_rewriter)
self.headInsert = headInsert
@ -147,14 +154,14 @@ class WBHtml(HTMLParser):
# ===========================
def _rewriteURL(self, value, mod = None):
return self.rewriter.rewrite(value, mod) if value else None
return self.url_rewriter.rewrite(value, mod) if value else None
def _rewriteCSS(self, cssContent):
return self.cssRewriter.replaceAll(cssContent) if cssContent else None
return self.cssRewriter.rewrite(cssContent) if cssContent else None
def _rewriteScript(self, scriptContent):
return self.jsRewriter.replaceAll(scriptContent) if scriptContent else None
return self.jsRewriter.rewrite(scriptContent) if scriptContent else None
def hasAttr(self, tagAttrs, attr):
name, value = attr
@ -202,7 +209,7 @@ class WBHtml(HTMLParser):
else:
# special case: base tag
if (tag == 'base') and (attrName == 'href') and attrValue:
self.rewriter.setBaseUrl(attrValue)
self.url_rewriter.setBaseUrl(attrValue)
rwMod = handler.get(attrName)
if rwMod is not None:
@ -232,14 +239,28 @@ class WBHtml(HTMLParser):
self.out.write(data)
def rewrite(self, string):
    """
    Feed ``string`` to the parser and return the text rewritten by this call.

    When no output object is set, a fresh ``WBHtml.AccumBuff`` is created,
    its accumulated ``buff`` is returned after feeding, and ``self.out`` is
    reset to ``None`` so the next call starts with an empty buffer.

    When the current output object has no ``buff`` attribute (e.g. a
    caller-supplied stream), the rewritten text has already been written to
    it by the parser callbacks; in that case '' is returned and the stream
    is kept for subsequent calls instead of failing on ``.buff``.
    """
    if not self.out:
        self.out = WBHtml.AccumBuff()

    self.feed(string)

    if not hasattr(self.out, 'buff'):
        # Output went straight to the supplied stream; nothing to hand back
        return ''

    result = self.out.buff
    # Clear buffer to create new one for next rewrite()
    self.out = None
    return result
# HTMLParser overrides below
def close(self):
if (self._wbParseContext):
self.feed('</' + self._wbParseContext + '>')
result = self.rewrite('</' + self._wbParseContext + '>')
self._wbParseContext = None
else:
result = ''
HTMLParser.close(self)
return result
def handle_starttag(self, tag, attrs):
if not self.rewriteTagAttrs(tag, attrs, False):
@ -291,11 +312,12 @@ class WBHtml(HTMLParser):
if __name__ == "__main__":
import doctest
rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, headInsert = None):
parser = WBHtml(rewriter, headInsert = headInsert)
parser.feed(data)
parser.close()
parser = WBHtml(url_rewriter, headInsert = headInsert)
print parser.rewrite(data) + parser.close()
doctest.testmod()

View File

@ -8,7 +8,7 @@ from url_rewriter import ArchivalUrlRewriter
class RegexRewriter:
"""
# Test https->http converter (other tests below in subclasses)
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
"""
@ -48,9 +48,12 @@ class RegexRewriter:
# Default match filter: takes a regex match object `m` and always returns True.
# NOTE(review): presumably a hook subclasses override to skip individual
# matches -- confirm against the callers, which are not visible here.
def filter(self, m):
return True
def replaceAll(self, string):
def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
# End-of-input counterpart to rewrite(): always returns an empty string.
# NOTE(review): presumably because rewrite() returns each substitution result
# immediately, leaving no pending text to flush -- confirm.
def close(self):
return ''
def replace(self, m):
i = 0
for _, op, count in self.rules:
@ -218,13 +221,13 @@ if __name__ == "__main__":
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
def test_js(string, extra = []):
return JSRewriter(arcrw, extra).replaceAll(string)
return JSRewriter(arcrw, extra).rewrite(string)
def test_xml(string):
return XMLRewriter(arcrw).replaceAll(string)
return XMLRewriter(arcrw).rewrite(string)
def test_css(string):
return CSSRewriter(arcrw).replaceAll(string)
return CSSRewriter(arcrw).rewrite(string)

View File

@ -204,6 +204,9 @@ class RewritingReplayHandler(ReplayHandler):
self.headerRewriter = headerRewriter
self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response
self.buffer_response = False
def _textContentType(self, contentType):
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
@ -259,71 +262,97 @@ class RewritingReplayHandler(ReplayHandler):
encoding = 'utf-8'
# Buffering response for html, streaming for others?
if rewrittenHeaders.textType == 'html':
return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
else:
return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
#if rewrittenHeaders.textType == 'html':
# return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
#else:
# return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
textType = rewrittenHeaders.textType
status_headers = rewrittenHeaders.status_headers
def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
out = StringIO.StringIO()
htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)
try:
buff = firstBuff if firstBuff else stream.read()
while buff:
if encoding:
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
htmlrewriter.feed(buff)
buff = stream.read()
# Close rewriter if gracefully made it to end
htmlrewriter.close()
finally:
content = out.getvalue()
if encoding:
content = content.encode(encoding)
value = [content]
contentLengthStr = str(len(content))
status_headers.headers.append(('Content-Length', contentLengthStr))
out.close()
return WbResponse(status_headers, value = value)
def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
if textType == 'css':
if textType == 'html':
rewriter = html_rewriter.WBHtml(urlrewriter, outstream = None, headInsert = self.headInsert)
elif textType == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
elif textType == 'js':
rewriter = regex_rewriters.JSRewriter(urlrewriter)
elif textType == 'xml':
rewriter = regex_rewriters.XMLRewriter(urlrewriter)
else:
raise Exception('Unknown Text Type for Rewrite: ' + textType)
if self.buffer_response:
return self._buffer_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
else:
return self._stream_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
# Buffer rewrite response, and serve with full Content-Length
def _buffer_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
    """
    Rewrite the whole stream into memory and return it with a Content-Length.

    rewriter       -- object exposing rewrite(text) and close(), each
                      returning rewritten text
    encoding       -- charset used to decode each chunk before rewriting and
                      to encode the final content; falsy to skip both steps
    stream         -- source object read via read() until it returns nothing
    status_headers -- response status/headers; a ('Content-Length', <len>)
                      header is appended to status_headers.headers
    firstBuff      -- optional chunk already read from the stream, processed
                      before any further read()

    Returns a WbResponse whose value is a one-element list holding the
    full rewritten content.
    """
    out = StringIO.StringIO()

    try:
        buff = firstBuff if firstBuff else stream.read()
        while buff:
            if encoding:
                buff = self._decodeBuff(buff, stream, encoding)

            out.write(rewriter.rewrite(buff))
            buff = stream.read()

        # close() returns any text the rewriter emits at end of input;
        # it must be written too or the buffered response loses its tail
        # (the streaming path yields this same value as its final chunk)
        out.write(rewriter.close())

    finally:
        # Build the body and Content-Length from whatever was rewritten,
        # then release the buffer
        content = out.getvalue()

        if encoding:
            content = content.encode(encoding)

        value = [content]
        contentLengthStr = str(len(content))
        status_headers.headers.append(('Content-Length', contentLengthStr))
        out.close()

    return WbResponse(status_headers, value = value)
# Stream rewrite response from record (no Content-Length), may even be chunked by front-end
def _stream_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
def doRewrite(buff):
if encoding:
buff = buff.decode(encoding)
buff = rewriter.replaceAll(buff)
buff = self._decodeBuff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
if encoding:
buff = buff.encode(encoding)
return buff
return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)
def doFinish():
return rewriter.close()
return WbResponse.stream_response(status_headers, stream, rewrite_func = doRewrite, final_read_func = doFinish, first_buff = firstBuff)
def _decodeBuff(self, buff, stream, encoding):
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
return buff
def _detectCharset(self, stream):
buff = stream.read(8192)
@ -331,6 +360,7 @@ class RewritingReplayHandler(ReplayHandler):
print "chardet result: " + str(result)
return (result['encoding'], buff)
def _checkRedir(self, wbrequest, cdx):
if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original'])

View File

@ -133,15 +133,22 @@ class WbResponse:
return WbResponse(StatusAndHeaders(status, [('Location', location)]))
@staticmethod
def stream_response(status_headers, stream, proc = None, firstBuff = None):
def stream_response(status_headers, stream, rewrite_func = None, final_read_func = None, first_buff = None):
def streamGen():
try:
buff = firstBuff if firstBuff else stream.read()
buff = first_buff if first_buff else stream.read()
while buff:
if proc:
buff = proc(buff)
if rewrite_func:
buff = rewrite_func(buff)
yield buff
buff = stream.read()
# For adding a tail/handling final buffer
if final_read_func:
buff = final_read_func()
if buff:
yield buff
finally:
stream.close()