
fix ChunkedDataReader chunked + gzip decomp: if reading one chunk yields no data

(because the gzip decompressor needs more input), keep reading further chunks until there is data
or the last chunk is reached (or an error occurs). Ensure a single read() call returns some data if there is any.
Ilya Kreymer 2014-04-25 10:30:22 -07:00
parent 53f0cb540f
commit e4262502b0
2 changed files with 31 additions and 16 deletions
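
As background for the fix described above, here is a minimal standalone sketch (Python 3, using only the standard zlib module, not pywb itself) of why a single chunk can decompress to nothing: the first 10 bytes of a gzip stream are only the header, so a decompressor fed just those bytes produces no output until more input arrives.

    import zlib

    # gzip-compress a payload (wbits = MAX_WBITS | 16 selects the gzip format)
    comp = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
    buff = comp.compress(b'ABCDEFGHIJKLMNOP') + comp.flush()

    decomp = zlib.decompressobj(zlib.MAX_WBITS | 16)
    first = decomp.decompress(buff[:10])  # 10-byte gzip header only -> b''
    rest = decomp.decompress(buff[10:])   # remainder -> the whole payload
    assert first == b'' and rest == b'ABCDEFGHIJKLMNOP'

This is exactly the case the new doctest below constructs by splitting a compressed stream at the 10-byte mark.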


@@ -103,7 +103,8 @@ class DecompressingBufferedReader(object):
             return ''
 
         self._fillbuff()
-        return self.buff.read(length)
+        buff = self.buff.read(length)
+        return buff
 
     def readline(self, length=None):
         """
@@ -162,7 +163,9 @@ class DecompressingBufferedReader(object):
 
 #=================================================================
 class ChunkedDataException(Exception):
-    pass
+    def __init__(self, msg, data=''):
+        Exception.__init__(self, msg)
+        self.data = data
 
 
 #=================================================================
@@ -187,16 +190,17 @@ class ChunkedDataReader(DecompressingBufferedReader):
         if self.not_chunked:
             return super(ChunkedDataReader, self)._fillbuff(block_size)
 
-        if self.all_chunks_read:
-            return
-
-        if self.empty():
-            length_header = self.stream.readline(64)
-            self._data = ''
-
+        # Loop over chunks until there is some data (not empty())
+        # In particular, gzipped data may require multiple chunks to
+        # return any decompressed result
+        while (self.empty() and
+               not self.all_chunks_read and
+               not self.not_chunked):
             try:
+                length_header = self.stream.readline(64)
                 self._try_decode(length_header)
-            except ChunkedDataException:
+            except ChunkedDataException as e:
                 if self.raise_chunked_data_exceptions:
                     raise
@@ -204,7 +208,7 @@ class ChunkedDataReader(DecompressingBufferedReader):
                 # It's possible that non-chunked data is served
                 # with a Transfer-Encoding: chunked.
                 # Treat this as non-chunk encoded from here on.
-                self._process_read(length_header + self._data)
+                self._process_read(length_header + e.data)
                 self.not_chunked = True
 
                 # parse as block as non-chunked
@@ -224,7 +228,8 @@ class ChunkedDataReader(DecompressingBufferedReader):
             self._process_read('')
             return
 
-        data_len = len(self._data)
+        data_len = 0
+        data = ''
 
         # read chunk
         while data_len < chunk_size:
@@ -236,20 +241,21 @@ class ChunkedDataReader(DecompressingBufferedReader):
             if not new_data:
                 if self.raise_chunked_data_exceptions:
                     msg = 'Ran out of data before end of chunk'
-                    raise ChunkedDataException(msg)
+                    raise ChunkedDataException(msg, data)
                 else:
                     chunk_size = data_len
                     self.all_chunks_read = True
 
-            self._data += new_data
-            data_len = len(self._data)
+            data += new_data
+            data_len = len(data)
 
         # if we successfully read a block without running out,
         # it should end in \r\n
         if not self.all_chunks_read:
             clrf = self.stream.read(2)
             if clrf != '\r\n':
-                raise ChunkedDataException("Chunk terminator not found.")
+                raise ChunkedDataException("Chunk terminator not found.",
+                                           data)
 
         # hand to base class for further processing
-        self._process_read(self._data)
+        self._process_read(data)
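
Taken together, these changes replace the persistent self._data buffer with a local one that is attached to any ChunkedDataException, so the fallback path can replay the already-consumed bytes as non-chunked data. As an illustration of the loop-until-data pattern (a self-contained sketch, not pywb's API; read_some, stream, and decomp are hypothetical names, and well-formed chunked input is assumed):

    import zlib

    def read_some(stream, decomp):
        # Return some decompressed bytes if any remain; b'' only at stream end.
        out = b''
        last_chunk = False
        while not out and not last_chunk:
            # each chunk begins with its size as a hex number on its own line
            chunk_size = int(stream.readline(64).split(b';')[0].strip(), 16)
            if chunk_size == 0:
                last_chunk = True
                out += decomp.flush()
            else:
                # one chunk may decompress to b'' (e.g. a gzip header only),
                # so keep consuming chunks until there is real output
                out += decomp.decompress(stream.read(chunk_size))
                stream.read(2)  # discard the \r\n chunk terminator
        return out

The loop condition mirrors the new _fillbuff: keep reading only while no data has been produced and chunks remain, so a single call returns data whenever any is available.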


@@ -73,6 +73,15 @@ Non-chunked, compressed data
 >>> DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read()
 '\nABCDEF\nGHIJ'
 
+Chunked compressed data
+Split compressed stream into 10-byte chunk and a remainder chunk
+>>> b = compress('ABCDEFGHIJKLMNOP')
+>>> l = len(b)
+>>> in_ = format(10, 'x') + "\r\n" + b[:10] + "\r\n" + format(l - 10, 'x') + "\r\n" + b[10:] + "\r\n0\r\n\r\n"
+>>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
+>>> c.read()
+'ABCDEFGHIJKLMNOP'
+
 Starts like chunked data, but isn't:
 >>> c = ChunkedDataReader(BytesIO("1\r\nxyz123!@#"));
 >>> c.read() + c.read()
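
The new doctest above is Python 2, where str is bytes. A rough Python 3 equivalent, assuming a gzip compress() helper like the one the test module defines and the historical pywb.utils.bufferedreaders import path:

    import zlib
    from io import BytesIO
    from pywb.utils.bufferedreaders import ChunkedDataReader  # path assumed

    def compress(data):
        c = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
        return c.compress(data) + c.flush()

    b = compress(b'ABCDEFGHIJKLMNOP')
    in_ = (format(10, 'x').encode() + b'\r\n' + b[:10] + b'\r\n' +
           format(len(b) - 10, 'x').encode() + b'\r\n' + b[10:] +
           b'\r\n0\r\n\r\n')
    c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
    assert c.read() == b'ABCDEFGHIJKLMNOP'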