diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py
index cb7456fa..d79cbcc2 100644
--- a/pywb/archiveloader.py
+++ b/pywb/archiveloader.py
@@ -282,6 +282,33 @@ class LineReader:
         self.stream.close()
         self.stream = None
 
+class ChunkedLineReader(LineReader):
+    allChunksRead = False
+
+    def _fillbuff(self, chunkSize = None):
+        if self.allChunksRead:
+            return
+
+        if not self.buff or self.buff.pos >= self.buff.len:
+            lengthHeader = self.stream.readline()
+            chunkSize = int(lengthHeader.strip().split(';')[0], 16)
+            data = ''
+            if chunkSize:
+                while len(data) < chunkSize:
+                    newData = self.stream.read(chunkSize - len(data))
+                    if not newData:
+                        raise Exception("Error reading chunked data: ran out of data before end of chunk.")
+                    data += newData
+                clrf = self.stream.read(2)
+                if clrf != '\r\n':
+                    raise Exception("Error reading chunked data: end of chunk not found where expected.")
+                if self.decomp:
+                    data = self.decomp.decompress(data)
+            else:
+                self.allChunksRead = True
+                data = ''
+
+            self.buff = StringIO.StringIO(data)
 
 #=================================================================
 if __name__ == "__main__":
diff --git a/pywb/replay.py b/pywb/replay.py
index ca993535..d5eb5c44 100644
--- a/pywb/replay.py
+++ b/pywb/replay.py
@@ -3,7 +3,7 @@ from urllib2 import URLError
 import chardet
 import redis
 
-import indexreader
+import indexreader, archiveloader
 from wbrequestresponse import WbResponse, StatusAndHeaders
 from wbarchivalurl import ArchivalUrl
 import utils
@@ -100,7 +100,7 @@ class ReplayHandler(object):
 
     def doReplay(self, cdx, wbrequest, failedFiles):
         hasCurr = (cdx['filename'] != '-')
-        hasOrig = (cdx['orig.filename'] != '-')
+        hasOrig = (cdx.get('orig.filename','-') != '-')
 
         # Case 1: non-revisit
         if (hasCurr and not hasOrig):
@@ -190,9 +190,13 @@ class RewritingReplayHandler(ReplayHandler):
         # TODO: better way to pass this?
         stream = response._stream
 
+        # handle transfer-encoding: chunked
+        if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
+            stream = archiveloader.ChunkedLineReader(stream)
+
         # special case -- need to ungzip the body
         if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
-            stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())
+            stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
 
         # TODO: is this right?
         if rewrittenHeaders.charset:
@@ -221,7 +225,19 @@ class RewritingReplayHandler(ReplayHandler):
         buff = firstBuff if firstBuff else stream.read()
         while buff:
             if encoding:
-                buff = buff.decode(encoding)
+                try:
+                    buff = buff.decode(encoding)
+                except UnicodeDecodeError, e:
+                    # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
+                    for i in range(3):
+                        buff += stream.read(1)
+                        try:
+                            buff = buff.decode(encoding)
+                            break
+                        except UnicodeDecodeError:
+                            pass
+                    else:
+                        raise
 
             htmlrewriter.feed(buff)
             buff = stream.read()
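
For context, the new ChunkedLineReader parses HTTP/1.1 chunked transfer encoding: each chunk is a hexadecimal length line (optionally followed by a ";extension"), the chunk bytes, and a trailing CRLF, with a zero-length chunk marking the end of the body. The standalone sketch below is not part of the patch; the dechunk() helper and the sample stream are illustrative only, written in Python 2 style to match the patched codebase.

# Minimal sketch of the dechunking loop that ChunkedLineReader._fillbuff
# performs incrementally; names and sample data are hypothetical.
import StringIO

def dechunk(stream):
    body = ''
    while True:
        length_header = stream.readline()
        chunk_size = int(length_header.strip().split(';')[0], 16)
        if chunk_size == 0:
            break                           # zero-length chunk ends the body
        data = ''
        while len(data) < chunk_size:
            new_data = stream.read(chunk_size - len(data))
            if not new_data:
                raise Exception('ran out of data before end of chunk')
            data += new_data
        if stream.read(2) != '\r\n':
            raise Exception('end of chunk not found where expected')
        body += data
    return body

if __name__ == '__main__':
    raw = '4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n'
    print dechunk(StringIO.StringIO(raw))   # -> 'Wikipedia'

The decode retry added in replay.py handles a read boundary that falls inside a multi-byte character: since a UTF-8 sequence is at most 4 bytes long, reading up to 3 more bytes is enough to complete any split character. A minimal illustration, again outside the patch and assuming UTF-8 content:

# A buffer cut inside a multi-byte character fails to decode until the
# missing continuation byte is appended, as the retry loop does.
snippet = u'caf\xe9'.encode('utf-8')        # 'caf\xc3\xa9'
truncated = snippet[:-1]                    # cut inside the 2-byte e-acute
try:
    truncated.decode('utf-8')
except UnicodeDecodeError:
    fixed = (truncated + snippet[-1:]).decode('utf-8')
    assert fixed == u'caf\xe9'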