From a1cd40fba1cc95c2584d83ae502d36d04a5ee872 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 20 Jan 2014 23:06:45 -0800 Subject: [PATCH] support replay of records that have Transfer-Encoding: chunked, but were not actually rewritten to the warc as chunked. Attempt to parse the chunk length and, if that fails, fall back to treating the record as not chunked --- pywb/archiveloader.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/pywb/archiveloader.py b/pywb/archiveloader.py index d79cbcc2..e7d325e1 100644 --- a/pywb/archiveloader.py +++ b/pywb/archiveloader.py @@ -262,12 +262,16 @@ class LineReader: if not self.buff or self.buff.pos >= self.buff.len: toRead = min(self.maxLen - self.numRead, self.chunkSize) if (self.maxLen > 0) else self.chunkSize data = self.stream.read(toRead) - self.numRead += len(data) + self._process_read(data) - if self.decomp: - data = self.decomp.decompress(data) + def _process_read(self, data): + self.numRead += len(data) + + if self.decomp: + data = self.decomp.decompress(data) + + self.buff = StringIO.StringIO(data) - self.buff = StringIO.StringIO(data) def read(self, length = None): self._fillbuff() @@ -282,16 +286,31 @@ class LineReader: self.stream.close() self.stream = None + class ChunkedLineReader(LineReader): allChunksRead = False + notChunked = False def _fillbuff(self, chunkSize = None): + if self.notChunked: + LineReader._fillbuff(self, chunkSize) + if self.allChunksRead: return if not self.buff or self.buff.pos >= self.buff.len: - lengthHeader = self.stream.readline() - chunkSize = int(lengthHeader.strip().split(';')[0], 16) + lengthHeader = self.stream.readline(64) + + # It's possible that non-chunked data is set with a Transfer-Encoding: chunked + # to handle this, if it's not possible to decode the chunk, then treat this as a regular LineReader + try: + chunkSize = int(lengthHeader.strip().split(';')[0], 16) + except Exception: + # can't parse the lengthHeader, treat this as
non-chunk encoded from here on + self._process_read(lengthHeader) + self.notChunked = True + return + data = '' if chunkSize: while len(data) < chunkSize: @@ -310,6 +329,7 @@ class ChunkedLineReader(LineReader): self.buff = StringIO.StringIO(data) + #================================================================= if __name__ == "__main__": import doctest