support replay of records that have Transfer-Encoding: chunked, but

were not actually rewritten to the warc as chunked. Attempt to parse chunk length, and if failed, fallback to treating record as not chunked
2025-03-24 06:59:52 +01:00 · 2014-01-20 23:06:45 -08:00 · 2014-01-20 23:06:45 -08:00 · a1cd40fba1
commit a1cd40fba1
parent 8fd10673e8
1 changed files with 26 additions and 6 deletions
--- a/pywb/archiveloader.py
+++ b/pywb/archiveloader.py
@ -262,12 +262,16 @@ class LineReader:
        if not self.buff or self.buff.pos >= self.buff.len:
            toRead =  min(self.maxLen - self.numRead, self.chunkSize) if (self.maxLen > 0) else self.chunkSize
            data = self.stream.read(toRead)
-            self.numRead += len(data)
+            self._process_read(data)
-            if self.decomp:
+    def _process_read(self, data):
-                data = self.decomp.decompress(data)
+       self.numRead += len(data)
       if self.decomp:
           data = self.decomp.decompress(data)
       self.buff = StringIO.StringIO(data)
            self.buff = StringIO.StringIO(data)
    def read(self, length = None):
        self._fillbuff()
@ -282,16 +286,31 @@ class LineReader:
            self.stream.close()
            self.stream = None
 class ChunkedLineReader(LineReader):
    allChunksRead = False
    notChunked = False
    def _fillbuff(self, chunkSize = None):
        if self.notChunked:
            LineReader._fillbuff(self, chunkSize)
        if self.allChunksRead:
            return
        if not self.buff or self.buff.pos >= self.buff.len:
-            lengthHeader = self.stream.readline()
+            lengthHeader = self.stream.readline(64)
-            chunkSize = int(lengthHeader.strip().split(';')[0], 16)
+
            # It's possible that non-chunked data is set with a Transfer-Encoding: chunked
            # to handle this, if its not possible to decode it the chunk, then treat this as a regular LineReader
            try:
                chunkSize = int(lengthHeader.strip().split(';')[0], 16)
            except Exception:
                # can't parse the lengthHeader, treat this as non-chunk encoded from here on
                self._process_read(lengthHeader)
                self.notChunked = True
                return
            data = ''
            if chunkSize:
                while len(data) < chunkSize:
@ -310,6 +329,7 @@ class ChunkedLineReader(LineReader):
            self.buff = StringIO.StringIO(data)
 #=================================================================
 if __name__ == "__main__":
    import doctest