Mirror of https://github.com/webrecorder/pywb.git (synced 2025-03-24 06:59:52 +01:00)

Handle transfer-encoding:chunked; misc. replay bugs.

- Add a ChunkedLineReader to deal with replays with the
transfer-encoding: chunked header.
- Catch UnicodeDecodeErrors caused by multibyte characters getting
split during buffering.
- A couple of tiny bugs in replay.py
This commit is contained in:
Jack Cushman 2014-01-18 21:32:49 -05:00
parent 7ce6d0d22b
commit c9d0b0ba7b
2 changed files with 47 additions and 4 deletions

View File

@ -282,6 +282,33 @@ class LineReader:
self.stream.close() self.stream.close()
self.stream = None self.stream = None
class ChunkedLineReader(LineReader):
    """
    A LineReader for response bodies stored with transfer-encoding: chunked.

    Each call to _fillbuff reads exactly one chunk from the underlying
    stream: the hex length header line, then that many bytes of payload,
    then the chunk-terminating CRLF.  A zero-length chunk marks the end of
    the body; after that, _fillbuff becomes a no-op.
    """

    # Set to True once the zero-length terminating chunk has been read.
    # NOTE: class-level attribute; each instance rebinds it on first write.
    allChunksRead = False

    def _fillbuff(self, chunkSize = None):
        # NOTE(review): the chunkSize parameter is accepted only to match the
        # base LineReader._fillbuff signature -- it is ignored and immediately
        # shadowed by the size parsed from the chunk length header below.
        if self.allChunksRead:
            return

        # Only refill once the current buffer is exhausted.
        # (Relies on Python 2 StringIO's .pos/.len attributes.)
        if not self.buff or self.buff.pos >= self.buff.len:
            # Chunk header is "<hex-size>[;extensions]\r\n"; extensions after
            # ';' are discarded per the chunked transfer coding.
            lengthHeader = self.stream.readline()
            chunkSize = int(lengthHeader.strip().split(';')[0], 16)

            data = ''
            if chunkSize:
                # A single read() may return fewer bytes than requested, so
                # loop until the full chunk payload has been accumulated.
                while len(data) < chunkSize:
                    newData = self.stream.read(chunkSize - len(data))
                    if not newData:
                        raise Exception("Error reading chunked data: ran out of data before end of chunk.")
                    data += newData
                # Consume and validate the CRLF that terminates every chunk.
                # (Variable name is a typo for "crlf"; kept as-is.)
                clrf = self.stream.read(2)
                if clrf != '\r\n':
                    raise Exception("Error reading chunked data: end of chunk not found where expected.")
                # Chunked decoding happens before content decoding, so the
                # reassembled chunk may still need to be decompressed.
                if self.decomp:
                    data = self.decomp.decompress(data)
            else:
                # Zero-length chunk: end of body.
                # NOTE(review): the CRLF (and any trailers) following the
                # final chunk is not consumed here -- confirm callers never
                # read past the body on this stream.
                self.allChunksRead = True
                data = ''

            self.buff = StringIO.StringIO(data)
#================================================================= #=================================================================
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -3,7 +3,7 @@ from urllib2 import URLError
import chardet import chardet
import redis import redis
import indexreader import indexreader, archiveloader
from wbrequestresponse import WbResponse, StatusAndHeaders from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl from wbarchivalurl import ArchivalUrl
import utils import utils
@ -100,7 +100,7 @@ class ReplayHandler(object):
def doReplay(self, cdx, wbrequest, failedFiles): def doReplay(self, cdx, wbrequest, failedFiles):
hasCurr = (cdx['filename'] != '-') hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx['orig.filename'] != '-') hasOrig = (cdx.get('orig.filename','-') != '-')
# Case 1: non-revisit # Case 1: non-revisit
if (hasCurr and not hasOrig): if (hasCurr and not hasOrig):
@ -190,9 +190,13 @@ class RewritingReplayHandler(ReplayHandler):
# TODO: better way to pass this? # TODO: better way to pass this?
stream = response._stream stream = response._stream
# handle transfer-encoding: chunked
if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
stream = archiveloader.ChunkedLineReader(stream)
# special case -- need to ungzip the body # special case -- need to ungzip the body
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')): if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor()) stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
# TODO: is this right? # TODO: is this right?
if rewrittenHeaders.charset: if rewrittenHeaders.charset:
@ -221,7 +225,19 @@ class RewritingReplayHandler(ReplayHandler):
buff = firstBuff if firstBuff else stream.read() buff = firstBuff if firstBuff else stream.read()
while buff: while buff:
if encoding: if encoding:
buff = buff.decode(encoding) try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
htmlrewriter.feed(buff) htmlrewriter.feed(buff)
buff = stream.read() buff = stream.read()