diff --git a/pywb/html_rewriter.py b/pywb/html_rewriter.py
index bbf30294..5b32e2e2 100644
--- a/pywb/html_rewriter.py
+++ b/pywb/html_rewriter.py
@@ -113,16 +113,23 @@ class WBHtml(HTMLParser):
HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound']
+ class AccumBuff:
+ def __init__(self):
+ self.buff = ''
- def __init__(self, rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
+ def write(self, string):
+ self.buff += string
+
+
+ def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
HTMLParser.__init__(self)
- self.rewriter = rewriter
+ self.url_rewriter = url_rewriter
self._wbParseContext = None
- self.out = outstream if outstream else sys.stdout
+ self.out = outstream if outstream else WBHtml.AccumBuff()
- self.jsRewriter = jsRewriterClass(rewriter)
- self.cssRewriter = cssRewriterClass(rewriter)
+ self.jsRewriter = jsRewriterClass(url_rewriter)
+ self.cssRewriter = cssRewriterClass(url_rewriter)
self.headInsert = headInsert
@@ -147,14 +154,14 @@ class WBHtml(HTMLParser):
# ===========================
def _rewriteURL(self, value, mod = None):
- return self.rewriter.rewrite(value, mod) if value else None
+ return self.url_rewriter.rewrite(value, mod) if value else None
def _rewriteCSS(self, cssContent):
- return self.cssRewriter.replaceAll(cssContent) if cssContent else None
+ return self.cssRewriter.rewrite(cssContent) if cssContent else None
def _rewriteScript(self, scriptContent):
- return self.jsRewriter.replaceAll(scriptContent) if scriptContent else None
+ return self.jsRewriter.rewrite(scriptContent) if scriptContent else None
def hasAttr(self, tagAttrs, attr):
name, value = attr
@@ -202,7 +209,7 @@ class WBHtml(HTMLParser):
else:
# special case: base tag
if (tag == 'base') and (attrName == 'href') and attrValue:
- self.rewriter.setBaseUrl(attrValue)
+ self.url_rewriter.setBaseUrl(attrValue)
rwMod = handler.get(attrName)
if rwMod is not None:
@@ -232,14 +239,28 @@ class WBHtml(HTMLParser):
self.out.write(data)
+    def rewrite(self, string):
+        """Feed `string` to the parser and return the rewritten text produced."""
+        if not self.out:
+            self.out = WBHtml.AccumBuff()
+        self.feed(string)
+        if not isinstance(self.out, WBHtml.AccumBuff):
+            return ''  # a caller-supplied outstream already received the output
+        # Swap in a fresh buffer (never None): parser callbacks fired later,
+        # e.g. by HTMLParser.close(), still need somewhere to write
+        result, self.out = self.out.buff, WBHtml.AccumBuff()
+        return result
     # HTMLParser overrides below
     def close(self):
         if (self._wbParseContext):
-            self.feed('' + self._wbParseContext + '>')
+            self.feed('</' + self._wbParseContext + '>')  # close the still-open tag
             self._wbParseContext = None
+        # HTMLParser.close() may flush pending text through the handlers, so
+        # the final output is collected only after it has run
         HTMLParser.close(self)
+        return self.rewrite('')
def handle_starttag(self, tag, attrs):
if not self.rewriteTagAttrs(tag, attrs, False):
@@ -291,11 +312,12 @@ class WBHtml(HTMLParser):
if __name__ == "__main__":
import doctest
- rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+ url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, headInsert = None):
- parser = WBHtml(rewriter, headInsert = headInsert)
- parser.feed(data)
- parser.close()
+ parser = WBHtml(url_rewriter, headInsert = headInsert)
+ print parser.rewrite(data) + parser.close()
doctest.testmod()
+
+
diff --git a/pywb/regex_rewriters.py b/pywb/regex_rewriters.py
index e8a505e6..7ae5acd1 100644
--- a/pywb/regex_rewriters.py
+++ b/pywb/regex_rewriters.py
@@ -8,7 +8,7 @@ from url_rewriter import ArchivalUrlRewriter
class RegexRewriter:
"""
# Test https->http converter (other tests below in subclasses)
- >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
+ >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
"""
@@ -48,9 +48,12 @@ class RegexRewriter:
def filter(self, m):
return True
- def replaceAll(self, string):
+ def rewrite(self, string):
return self.regex.sub(lambda x: self.replace(x), string)
+ def close(self):
+ return ''
+
def replace(self, m):
i = 0
for _, op, count in self.rules:
@@ -218,13 +221,13 @@ if __name__ == "__main__":
arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
def test_js(string, extra = []):
- return JSRewriter(arcrw, extra).replaceAll(string)
+ return JSRewriter(arcrw, extra).rewrite(string)
def test_xml(string):
- return XMLRewriter(arcrw).replaceAll(string)
+ return XMLRewriter(arcrw).rewrite(string)
def test_css(string):
- return CSSRewriter(arcrw).replaceAll(string)
+ return CSSRewriter(arcrw).rewrite(string)
diff --git a/pywb/replay.py b/pywb/replay.py
index ad059238..326a8933 100644
--- a/pywb/replay.py
+++ b/pywb/replay.py
@@ -204,6 +204,9 @@ class RewritingReplayHandler(ReplayHandler):
self.headerRewriter = headerRewriter
self.redir_to_exact = redir_to_exact
+ # buffer or stream rewritten response
+ self.buffer_response = False
+
def _textContentType(self, contentType):
for ctype, mimelist in self.REWRITE_TYPES.iteritems():
@@ -259,71 +262,97 @@ class RewritingReplayHandler(ReplayHandler):
encoding = 'utf-8'
# Buffering response for html, streaming for others?
- if rewrittenHeaders.textType == 'html':
- return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
- else:
- return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+ #if rewrittenHeaders.textType == 'html':
+ # return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+ #else:
+ # return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
+ textType = rewrittenHeaders.textType
+ status_headers = rewrittenHeaders.status_headers
- def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
- out = StringIO.StringIO()
- htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)
-
- try:
- buff = firstBuff if firstBuff else stream.read()
- while buff:
- if encoding:
- try:
- buff = buff.decode(encoding)
- except UnicodeDecodeError, e:
- # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
- for i in range(3):
- buff += stream.read(1)
- try:
- buff = buff.decode(encoding)
- break
- except UnicodeDecodeError:
- pass
- else:
- raise
- htmlrewriter.feed(buff)
- buff = stream.read()
-
- # Close rewriter if gracefully made it to end
- htmlrewriter.close()
-
- finally:
- content = out.getvalue()
- if encoding:
- content = content.encode(encoding)
-
- value = [content]
- contentLengthStr = str(len(content))
- status_headers.headers.append(('Content-Length', contentLengthStr))
- out.close()
-
- return WbResponse(status_headers, value = value)
-
-
- def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
- if textType == 'css':
+ if textType == 'html':
+ rewriter = html_rewriter.WBHtml(urlrewriter, outstream = None, headInsert = self.headInsert)
+ elif textType == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
elif textType == 'js':
rewriter = regex_rewriters.JSRewriter(urlrewriter)
elif textType == 'xml':
rewriter = regex_rewriters.XMLRewriter(urlrewriter)
+ else:
+ raise Exception('Unknown Text Type for Rewrite: ' + textType)
+ if self.buffer_response:
+ return self._buffer_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
+ else:
+ return self._stream_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
+
+
+    # Buffer rewrite response, and serve with full Content-Length
+    def _buffer_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
+        out = StringIO.StringIO()
+
+        try:
+            buff = firstBuff if firstBuff else stream.read()
+            while buff:
+                if encoding:
+                    buff = self._decodeBuff(buff, stream, encoding)
+
+                out.write(rewriter.rewrite(buff))
+                buff = stream.read()
+
+            # Reached the end cleanly: close() returns the rewriter's final
+            # output, which must be part of the body (and Content-Length)
+            out.write(rewriter.close())
+        finally:
+            content = out.getvalue()
+
+            if encoding:
+                content = content.encode(encoding)
+
+            value = [content]
+            contentLengthStr = str(len(content))
+            status_headers.headers.append(('Content-Length', contentLengthStr))
+            out.close()
+
+        return WbResponse(status_headers, value = value)
+
+    # Stream rewrite response from record (no Content-Length), may even be chunked by front-end
+    def _stream_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
         def doRewrite(buff):
             if encoding:
-                buff = buff.decode(encoding)
-            buff = rewriter.replaceAll(buff)
+                buff = self._decodeBuff(buff, stream, encoding)
+            buff = rewriter.rewrite(buff)
+
             if encoding:
                 buff = buff.encode(encoding)
             return buff
-        return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)
+        def doFinish():
+            # the tail is decoded text like every other chunk: encode it the same way
+            tail = rewriter.close()
+            return tail.encode(encoding) if (encoding and tail) else tail
+        return WbResponse.stream_response(status_headers, stream, rewrite_func = doRewrite, final_read_func = doFinish, first_buff = firstBuff)
+
+
+    def _decodeBuff(self, buff, stream, encoding):
+        """Decode buff with encoding, reading up to 3 more bytes from stream on failure."""
+        try:
+            return buff.decode(encoding)
+        except UnicodeDecodeError:
+            # the chunk boundary may have cut a multi-byte character apart:
+            # extend the chunk one byte at a time (at most 3) and retry
+            for _ in range(3):
+                buff += stream.read(1)
+                try:
+                    return buff.decode(encoding)
+                except UnicodeDecodeError:
+                    continue
+
+            # still undecodable after the extra bytes: propagate the error
+            raise
+
def _detectCharset(self, stream):
buff = stream.read(8192)
@@ -331,6 +360,7 @@ class RewritingReplayHandler(ReplayHandler):
print "chardet result: " + str(result)
return (result['encoding'], buff)
+
def _checkRedir(self, wbrequest, cdx):
if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original'])
diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py
index b12c6c20..2de05458 100644
--- a/pywb/wbrequestresponse.py
+++ b/pywb/wbrequestresponse.py
@@ -133,15 +133,22 @@ class WbResponse:
return WbResponse(StatusAndHeaders(status, [('Location', location)]))
@staticmethod
- def stream_response(status_headers, stream, proc = None, firstBuff = None):
+ def stream_response(status_headers, stream, rewrite_func = None, final_read_func = None, first_buff = None):
def streamGen():
try:
- buff = firstBuff if firstBuff else stream.read()
+ buff = first_buff if first_buff else stream.read()
while buff:
- if proc:
- buff = proc(buff)
+ if rewrite_func:
+ buff = rewrite_func(buff)
yield buff
buff = stream.read()
+
+ # For adding a tail/handling final buffer
+ if final_read_func:
+ buff = final_read_func()
+ if buff:
+ yield buff
+
finally:
stream.close()