diff --git a/pywb/html_rewriter.py b/pywb/html_rewriter.py
index bbf30294..5b32e2e2 100644
--- a/pywb/html_rewriter.py
+++ b/pywb/html_rewriter.py
@@ -113,16 +113,24 @@ class WBHtml(HTMLParser):
 
     HEAD_TAGS = ['html', 'head', 'base', 'link', 'meta', 'title', 'style', 'script', 'object', 'bgsound']
 
+    # Minimal write()-only sink that collects rewritten text when no outstream is given
+    class AccumBuff:
+        def __init__(self):
+            self.buff = ''
 
-    def __init__(self, rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
+        def write(self, string):
+            self.buff += string
+
+
+    def __init__(self, url_rewriter, outstream = None, headInsert = None, jsRewriterClass = JSRewriter, cssRewriterClass = CSSRewriter):
         HTMLParser.__init__(self)
 
-        self.rewriter = rewriter
+        self.url_rewriter = url_rewriter
         self._wbParseContext = None
-        self.out = outstream if outstream else sys.stdout
+        self.out = outstream if outstream else WBHtml.AccumBuff()
 
-        self.jsRewriter = jsRewriterClass(rewriter)
-        self.cssRewriter = cssRewriterClass(rewriter)
+        self.jsRewriter = jsRewriterClass(url_rewriter)
+        self.cssRewriter = cssRewriterClass(url_rewriter)
 
         self.headInsert = headInsert
 
@@ -147,14 +155,14 @@ class WBHtml(HTMLParser):
     # ===========================
 
     def _rewriteURL(self, value, mod = None):
-        return self.rewriter.rewrite(value, mod) if value else None
+        return self.url_rewriter.rewrite(value, mod) if value else None
 
 
     def _rewriteCSS(self, cssContent):
-        return self.cssRewriter.replaceAll(cssContent) if cssContent else None
+        return self.cssRewriter.rewrite(cssContent) if cssContent else None
 
     def _rewriteScript(self, scriptContent):
-        return self.jsRewriter.replaceAll(scriptContent) if scriptContent else None
+        return self.jsRewriter.rewrite(scriptContent) if scriptContent else None
 
     def hasAttr(self, tagAttrs, attr):
         name, value = attr
@@ -202,7 +210,7 @@ class WBHtml(HTMLParser):
             else:
                 # special case: base tag
                 if (tag == 'base') and (attrName == 'href') and attrValue:
-                    self.rewriter.setBaseUrl(attrValue)
+                    self.url_rewriter.setBaseUrl(attrValue)
 
                 rwMod = handler.get(attrName)
                 if rwMod is not None:
@@ -232,14 +240,38 @@ class WBHtml(HTMLParser):
 
         self.out.write(data)
 
+    def _take_output(self):
+        # Text written to a caller-supplied outstream is not ours to return
+        if not isinstance(self.out, WBHtml.AccumBuff):
+            return ''
+
+        result = self.out.buff
+        # Clear buffer to create new one for next rewrite()
+        self.out = None
+        return result
+
+    def rewrite(self, string):
+        # Rewrite one chunk of html, returning the text produced so far
+        if not self.out:
+            self.out = WBHtml.AccumBuff()
+
+        self.feed(string)
+        return self._take_output()
 
     # HTMLParser overrides below
     def close(self):
+        # Finish parsing and return any remaining rewritten text
+        if not self.out:
+            self.out = WBHtml.AccumBuff()
+
         if (self._wbParseContext):
             self.feed('')
             self._wbParseContext = None
 
+        # HTMLParser.close() can still emit buffered data through self.out,
+        # so the output is collected only after it has run
         HTMLParser.close(self)
+        return self._take_output()
 
     def handle_starttag(self, tag, attrs):
         if not self.rewriteTagAttrs(tag, attrs, False):
@@ -291,11 +323,11 @@ class WBHtml(HTMLParser):
 if __name__ == "__main__":
     import doctest
 
-    rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
+    url_rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
 
     def parse(data, headInsert = None):
-        parser = WBHtml(rewriter, headInsert = headInsert)
-        parser.feed(data)
-        parser.close()
+        parser = WBHtml(url_rewriter, headInsert = headInsert)
+        # close() returns the rewritten tail, so print it along with the body
+        print parser.rewrite(data) + parser.close()
 
     doctest.testmod()
diff --git a/pywb/regex_rewriters.py b/pywb/regex_rewriters.py
index e8a505e6..7ae5acd1 100644
--- a/pywb/regex_rewriters.py
+++ b/pywb/regex_rewriters.py
@@ -8,7 +8,7 @@ from url_rewriter import ArchivalUrlRewriter
 class RegexRewriter:
     """
     # Test https->http converter (other tests below in subclasses)
-    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).replaceAll('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
+    >>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.removeHttps, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
     'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
     """
 
@@ -48,9 +48,12 @@ class RegexRewriter:
     def filter(self, m):
         return True
 
-    def replaceAll(self, string):
+    def rewrite(self, string):
         return self.regex.sub(lambda x: self.replace(x), string)
 
+    def close(self):
+        return ''
+
     def replace(self, m):
         i = 0
         for _, op, count in self.rules:
@@ -218,13 +221,13 @@ if __name__ == "__main__":
     arcrw = ArchivalUrlRewriter('/20131010im_/http://example.com/', '/web/')
 
     def test_js(string, extra = []):
-        return JSRewriter(arcrw, extra).replaceAll(string)
+        return JSRewriter(arcrw, extra).rewrite(string)
 
     def test_xml(string):
-        return XMLRewriter(arcrw).replaceAll(string)
+        return XMLRewriter(arcrw).rewrite(string)
 
     def test_css(string):
-        return CSSRewriter(arcrw).replaceAll(string)
+        return CSSRewriter(arcrw).rewrite(string)
 
 
 
diff --git a/pywb/replay.py b/pywb/replay.py
index ad059238..326a8933 100644
--- a/pywb/replay.py
+++ b/pywb/replay.py
@@ -204,6 +204,9 @@ class RewritingReplayHandler(ReplayHandler):
         self.headerRewriter = headerRewriter
         self.redir_to_exact = redir_to_exact
 
+        # buffer or stream rewritten response
+        self.buffer_response = False
+
 
     def _textContentType(self, contentType):
         for ctype, mimelist in self.REWRITE_TYPES.iteritems():
@@ -259,71 +262,98 @@ class RewritingReplayHandler(ReplayHandler):
             encoding = 'utf-8'
 
         # Buffering response for html, streaming for others?
-        if rewrittenHeaders.textType == 'html':
-            return self._rewriteHtml(encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
-        else:
-            return self._rewriteOther(rewrittenHeaders.textType, encoding, urlrewriter, stream, rewrittenHeaders.status_headers, firstBuff)
 
+        textType = rewrittenHeaders.textType
+        status_headers = rewrittenHeaders.status_headers
 
-    def _rewriteHtml(self, encoding, urlrewriter, stream, status_headers, firstBuff = None):
-        out = StringIO.StringIO()
-        htmlrewriter = html_rewriter.WBHtml(urlrewriter, out, self.headInsert)
-
-        try:
-            buff = firstBuff if firstBuff else stream.read()
-            while buff:
-                if encoding:
-                    try:
-                        buff = buff.decode(encoding)
-                    except UnicodeDecodeError, e:
-                        # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
-                        for i in range(3):
-                            buff += stream.read(1)
-                            try:
-                                buff = buff.decode(encoding)
-                                break
-                            except UnicodeDecodeError:
-                                pass
-                        else:
-                            raise
-                htmlrewriter.feed(buff)
-                buff = stream.read()
-
-            # Close rewriter if gracefully made it to end
-            htmlrewriter.close()
-
-        finally:
-            content = out.getvalue()
-            if encoding:
-                content = content.encode(encoding)
-
-            value = [content]
-            contentLengthStr = str(len(content))
-            status_headers.headers.append(('Content-Length', contentLengthStr))
-            out.close()
-
-        return WbResponse(status_headers, value = value)
-
-
-    def _rewriteOther(self, textType, encoding, urlrewriter, stream, status_headers, firstBuff = None):
-        if textType == 'css':
+        if textType == 'html':
+            rewriter = html_rewriter.WBHtml(urlrewriter, outstream = None, headInsert = self.headInsert)
+        elif textType == 'css':
             rewriter = regex_rewriters.CSSRewriter(urlrewriter)
         elif textType == 'js':
             rewriter = regex_rewriters.JSRewriter(urlrewriter)
         elif textType == 'xml':
             rewriter = regex_rewriters.XMLRewriter(urlrewriter)
+        else:
+            raise Exception('Unknown Text Type for Rewrite: ' + textType)
 
+        if self.buffer_response:
+            return self._buffer_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
+        else:
+            return self._stream_rewrite_response(rewriter, encoding, stream, status_headers, firstBuff)
+
+
+    # Buffer rewrite response, and serve with full Content-Length
+    def _buffer_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
+        out = StringIO.StringIO()
+
+        try:
+            buff = firstBuff if firstBuff else stream.read()
+            while buff:
+                if encoding:
+                    buff = self._decodeBuff(buff, stream, encoding)
+
+                out.write(rewriter.rewrite(buff))
+                buff = stream.read()
+
+            # Close rewriter if gracefully made it to end,
+            # keeping whatever final output it still returns
+            out.write(rewriter.close())
+
+        finally:
+            content = out.getvalue()
+
+            if encoding:
+                content = content.encode(encoding)
+
+            value = [content]
+            contentLengthStr = str(len(content))
+            status_headers.headers.append(('Content-Length', contentLengthStr))
+            out.close()
+
+        return WbResponse(status_headers, value = value)
+
 
+    # Stream rewrite response from record (no Content-Length), may even be chunked by front-end
+    def _stream_rewrite_response(self, rewriter, encoding, stream, status_headers, firstBuff = None):
         def doRewrite(buff):
             if encoding:
-                buff = buff.decode(encoding)
-            buff = rewriter.replaceAll(buff)
+                buff = self._decodeBuff(buff, stream, encoding)
+
+            buff = rewriter.rewrite(buff)
+
             if encoding:
                 buff = buff.encode(encoding)
 
             return buff
 
-        return WbResponse.stream_response(status_headers, stream, doRewrite, firstBuff)
+        def doFinish():
+            # The tail returned by close() needs the same encoding as the chunks from doRewrite()
+            buff = rewriter.close()
+            if buff and encoding:
+                buff = buff.encode(encoding)
+            return buff
+
+        return WbResponse.stream_response(status_headers, stream, rewrite_func = doRewrite, final_read_func = doFinish, first_buff = firstBuff)
+
+
+    def _decodeBuff(self, buff, stream, encoding):
+        try:
+            buff = buff.decode(encoding)
+        except UnicodeDecodeError:
+            # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
+            for i in range(3):
+                buff += stream.read(1)
+                try:
+                    buff = buff.decode(encoding)
+                    break
+                except UnicodeDecodeError:
+                    pass
+            else:
+                raise
+
+        return buff
+
 
     def _detectCharset(self, stream):
         buff = stream.read(8192)
@@ -331,6 +361,7 @@ class RewritingReplayHandler(ReplayHandler):
         print "chardet result: " + str(result)
         return (result['encoding'], buff)
 
+
     def _checkRedir(self, wbrequest, cdx):
         if self.redir_to_exact and cdx and (cdx['timestamp'] != wbrequest.wb_url.timestamp):
             newUrl = wbrequest.urlrewriter.getTimestampUrl(cdx['timestamp'], cdx['original'])
diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py
index b12c6c20..2de05458 100644
--- a/pywb/wbrequestresponse.py
+++ b/pywb/wbrequestresponse.py
@@ -133,15 +133,22 @@ class WbResponse:
         return WbResponse(StatusAndHeaders(status, [('Location', location)]))
 
     @staticmethod
-    def stream_response(status_headers, stream, proc = None, firstBuff = None):
+    def stream_response(status_headers, stream, rewrite_func = None, final_read_func = None, first_buff = None):
         def streamGen():
             try:
-                buff = firstBuff if firstBuff else stream.read()
+                buff = first_buff if first_buff else stream.read()
                 while buff:
-                    if proc:
-                        buff = proc(buff)
+                    if rewrite_func:
+                        buff = rewrite_func(buff)
                     yield buff
                     buff = stream.read()
+
+                # For adding a tail/handling final buffer
+                if final_read_func:
+                    buff = final_read_func()
+                    if buff:
+                        yield buff
+
             finally:
                 stream.close()
 