class ChunkedDataException(Exception):
    """Raised when a stream cannot be parsed as chunked transfer-encoded data."""


class ChunkedLineReader(LineReader):
    r"""
    LineReader that decodes HTTP ``Transfer-Encoding: chunked`` data.

    Each call to _fillbuff() reads one chunk (hex length line, payload,
    trailing CRLF) from self.stream and hands the payload to
    LineReader._process_read(), which also takes care of decompression
    when self.decomp is set.

    If the stream turns out not to be chunked, the reader either raises
    ChunkedDataException (raiseChunkedDataExceptions = True) or, by
    default, falls back to treating everything read so far, and the rest
    of the stream, as plain non-chunked data.

    Properly formatted chunked data:
    >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n0\r\n\r\n")); c.read()+c.read()
    '1234'

    Non-chunked data:
    >>> ChunkedLineReader(StringIO.StringIO("xyz123!@#")).read()
    'xyz123!@#'

    Starts like chunked data, but isn't (no bytes are dropped on fallback):
    >>> c=ChunkedLineReader(StringIO.StringIO("1\r\nxyz123!@#")); c.read()+c.read()
    '1\r\nxyz123!@#'

    Chunked data cut off part way through:
    >>> c=ChunkedLineReader(StringIO.StringIO("4\r\n1234\r\n4\r\n12"));c.read()+c.read()
    '123412'
    """

    # set once the terminating zero-length chunk (or a truncated chunk) was read
    allChunksRead = False
    # set once parsing failed and the stream is being read as plain data
    notChunked = False
    # if False, we'll use best-guess fallback for parse errors
    raiseChunkedDataExceptions = False

    def _fillbuff(self, chunkSize = None):
        """
        Refill self.buff with the payload of the next chunk.

        chunkSize is only honoured after falling back to non-chunked mode,
        where it is passed through to LineReader._fillbuff(); in chunked
        mode the size is taken from the chunk's own length header.

        Raises ChunkedDataException on malformed or truncated chunked data,
        but only when raiseChunkedDataExceptions is True.
        """
        if self.notChunked:
            return LineReader._fillbuff(self, chunkSize)

        if self.allChunksRead:
            return

        # only read another chunk once the current buffer is exhausted
        if not self.buff or self.buff.pos >= self.buff.len:
            lengthHeader = self.stream.readline(64)
            data = ''

            try:
                # decode length header: hex size, optionally followed by
                # ';'-separated chunk extensions, which are ignored
                try:
                    chunkSize = int(lengthHeader.strip().split(';')[0], 16)
                except ValueError:
                    raise ChunkedDataException("Couldn't decode length header '%s'" % lengthHeader)

                if chunkSize:
                    # read chunk
                    while len(data) < chunkSize:
                        newData = self.stream.read(chunkSize - len(data))

                        # if we unexpectedly run out of data, either raise an
                        # exception or just stop reading, assuming the file
                        # was cut off
                        if not newData:
                            if self.raiseChunkedDataExceptions:
                                raise ChunkedDataException("Ran out of data before end of chunk")
                            self.allChunksRead = True
                            break

                        data += newData

                    # if we successfully read a block without running out,
                    # it should end in \r\n
                    if not self.allChunksRead:
                        crlf = self.stream.read(2)
                        if crlf != '\r\n':
                            # these bytes are already consumed from the
                            # stream; keep them so the non-chunked fallback
                            # below does not silently drop payload
                            data += crlf
                            raise ChunkedDataException("Chunk terminator not found.")
                else:
                    # chunkSize 0 indicates end of file
                    self.allChunksRead = True

                # _process_read() decompresses when self.decomp is set, so
                # the payload must be handed over still compressed;
                # decompressing here as well would run the data through the
                # decompressor twice
                self._process_read(data)
            except ChunkedDataException:
                if self.raiseChunkedDataExceptions:
                    raise
                # Can't parse the data as chunked.
                # It's possible that non-chunked data is served with a
                # Transfer-Encoding: chunked header.
                # Treat this as non-chunk encoded from here on
                self._process_read(lengthHeader + data)
                self.notChunked = True