
fix ChunkedDataReader chunked + gzip decomp: if reading one chunk yields no data

(because more data is needed for gzip decompression), keep reading blocks until there is data
or the last block is reached (or an error occurs). Ensure a single read() call returns some data if there is any.
Ilya Kreymer 2014-04-25 10:30:22 -07:00
parent 53f0cb540f
commit e4262502b0
2 changed files with 31 additions and 16 deletions
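
Background: a gzip decompressor can consume input without emitting any output, so decoding a single chunk's worth of bytes may legitimately produce nothing even though more data is coming. A quick standalone illustration with plain zlib (independent of pywb):

    import zlib

    # gzip-compress a payload, then feed the decompressor only the first
    # 10 bytes -- exactly the size of the gzip header, so it cannot emit
    # any decompressed output yet
    comp = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
    stream = comp.compress(b'ABCDEFGHIJKLMNOP') + comp.flush()

    decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)
    print(repr(decomp.decompress(stream[:10])))   # b'' -- needs more input
    print(repr(decomp.decompress(stream[10:])))   # b'ABCDEFGHIJKLMNOP'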


@@ -103,7 +103,8 @@ class DecompressingBufferedReader(object):
             return ''
 
         self._fillbuff()
-        return self.buff.read(length)
+        buff = self.buff.read(length)
+        return buff
 
     def readline(self, length=None):
         """
@@ -162,7 +163,9 @@ class DecompressingBufferedReader(object):
 #=================================================================
 class ChunkedDataException(Exception):
-    pass
+    def __init__(self, msg, data=''):
+        Exception.__init__(self, msg)
+        self.data = data
 
 
 #=================================================================
@@ -187,16 +190,17 @@ class ChunkedDataReader(DecompressingBufferedReader):
         if self.not_chunked:
             return super(ChunkedDataReader, self)._fillbuff(block_size)
 
-        if self.all_chunks_read:
-            return
-
-        if self.empty():
-            length_header = self.stream.readline(64)
-            self._data = ''
+        # Loop over chunks until there is some data (not empty())
+        # In particular, gzipped data may require multiple chunks to
+        # return any decompressed result
+        while (self.empty() and
+               not self.all_chunks_read and
+               not self.not_chunked):
             try:
+                length_header = self.stream.readline(64)
                 self._try_decode(length_header)
-            except ChunkedDataException:
+            except ChunkedDataException as e:
                 if self.raise_chunked_data_exceptions:
                     raise
@@ -204,7 +208,7 @@ class ChunkedDataReader(DecompressingBufferedReader):
                 # It's possible that non-chunked data is served
                 # with a Transfer-Encoding: chunked.
                 # Treat this as non-chunk encoded from here on.
-                self._process_read(length_header + self._data)
+                self._process_read(length_header + e.data)
                 self.not_chunked = True
 
                 # parse as block as non-chunked
@@ -224,7 +228,8 @@ class ChunkedDataReader(DecompressingBufferedReader):
             self._process_read('')
             return
 
-        data_len = len(self._data)
+        data_len = 0
+        data = ''
 
         # read chunk
         while data_len < chunk_size:
@@ -236,20 +241,21 @@ class ChunkedDataReader(DecompressingBufferedReader):
             if not new_data:
                 if self.raise_chunked_data_exceptions:
                     msg = 'Ran out of data before end of chunk'
-                    raise ChunkedDataException(msg)
+                    raise ChunkedDataException(msg, data)
                 else:
                     chunk_size = data_len
                     self.all_chunks_read = True
 
-            self._data += new_data
-            data_len = len(self._data)
+            data += new_data
+            data_len = len(data)
 
         # if we successfully read a block without running out,
         # it should end in \r\n
         if not self.all_chunks_read:
             clrf = self.stream.read(2)
             if clrf != '\r\n':
-                raise ChunkedDataException("Chunk terminator not found.")
+                raise ChunkedDataException("Chunk terminator not found.",
+                                           data)
 
         # hand to base class for further processing
-        self._process_read(self._data)
+        self._process_read(data)
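
The heart of the fix is the read-until-data loop in _fillbuff: one chunk may feed the decompressor without yielding output, so the reader now keeps consuming chunks until decompression produces bytes or the final chunk is seen. A standalone sketch of that pattern (illustrative names, not pywb's actual API):

    def fill_until_data(next_chunk, decompress):
        # next_chunk() returns a chunk's payload, or None once the
        # final (zero-length) chunk has been read; decompress() may
        # return '' when it needs more input -- previously that empty
        # result was handed to the caller as if the stream had ended
        out = ''
        while not out:
            chunk = next_chunk()
            if chunk is None:
                break
            out += decompress(chunk)
        return out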


@@ -73,6 +73,15 @@ Non-chunked, compressed data
 >>> DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read()
 '\nABCDEF\nGHIJ'
 
+Chunked compressed data
+Split compressed stream into 10-byte chunk and a remainder chunk
+>>> b = compress('ABCDEFGHIJKLMNOP')
+>>> l = len(b)
+>>> in_ = format(10, 'x') + "\r\n" + b[:10] + "\r\n" + format(l - 10, 'x') + "\r\n" + b[10:] + "\r\n0\r\n\r\n"
+>>> c = ChunkedDataReader(BytesIO(in_), decomp_type='gzip')
+>>> c.read()
+'ABCDEFGHIJKLMNOP'
+
 Starts like chunked data, but isn't:
 >>> c = ChunkedDataReader(BytesIO("1\r\nxyz123!@#"));
 >>> c.read() + c.read()
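
For reference, the framing the new test builds by hand is standard HTTP/1.1 chunked encoding: each chunk is a hex length, CRLF, the chunk data, CRLF, and a zero-length chunk terminates the body. A small helper that produces the same framing (hypothetical, not part of pywb's tests) and round-trips through the reader:

>>> def chunk_encode(pieces):
...     # each piece becomes: <hex length>\r\n<data>\r\n,
...     # followed by the terminating 0\r\n\r\n
...     chunks = [format(len(p), 'x') + '\r\n' + p + '\r\n' for p in pieces]
...     return ''.join(chunks) + '0\r\n\r\n'
>>> b = compress('ABCDEFGHIJKLMNOP')
>>> c = ChunkedDataReader(BytesIO(chunk_encode([b[:10], b[10:]])), decomp_type='gzip')
>>> c.read()
'ABCDEFGHIJKLMNOP'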