1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Handle transfer-encoding:chunked; misc. replay bugs.

- Add a ChunkedLineReader to deal with replays with the
transfer-encoding: chunked header.
- Catch UnicodeDecodeErrors caused by multibyte characters getting
split during buffering.
- Fix a couple of tiny bugs in replay.py.
This commit is contained in:
Jack Cushman 2014-01-18 21:32:49 -05:00
parent 7ce6d0d22b
commit c9d0b0ba7b
2 changed files with 47 additions and 4 deletions

View File

@ -282,6 +282,33 @@ class LineReader:
self.stream.close()
self.stream = None
class ChunkedLineReader(LineReader):
    """LineReader variant that decodes a ``Transfer-Encoding: chunked`` body.

    Each refill of the internal buffer consumes exactly one chunk from the
    underlying stream: a hexadecimal length line (anything after a ``;`` on
    that line is ignored), that many bytes of payload, and a terminating
    CRLF.  A zero-length chunk marks the end of the body.
    """

    # Becomes True once the terminating zero-length chunk has been consumed;
    # from then on _fillbuff() is a no-op.
    allChunksRead = False

    def _fillbuff(self, chunkSize=None):
        """Load the next chunk into ``self.buff`` if the buffer is used up.

        The ``chunkSize`` argument is accepted for signature compatibility
        with the base class but is ignored: the size of each read is taken
        from the chunk length header in the stream.

        Raises:
            ValueError: the chunk length header is missing, not valid
                hexadecimal, or negative.
            Exception: the stream ends before the announced number of bytes
                has been read, or the chunk is not followed by CRLF.
        """
        if self.allChunksRead:
            return

        # Only fetch another chunk once the current buffer is fully consumed.
        if self.buff and self.buff.pos < self.buff.len:
            return

        lengthHeader = self.stream.readline()
        # Drop any ";extension" suffix; only the hex size matters here.
        sizeField = lengthHeader.strip().split(';')[0]
        try:
            chunkSize = int(sizeField, 16)
        except ValueError:
            # Same exception type int() would raise, but with a message that
            # points at the chunk framing (e.g. a truncated stream yields '').
            raise ValueError(
                "Error reading chunked data: invalid chunk length header %r."
                % lengthHeader)
        if chunkSize < 0:
            raise ValueError(
                "Error reading chunked data: invalid chunk length header %r."
                % lengthHeader)

        if not chunkSize:
            # A zero-length chunk terminates the body.
            self.allChunksRead = True
            self.buff = StringIO.StringIO('')
            return

        # stream.read() may return fewer bytes than requested, so keep
        # reading until the whole chunk has arrived.
        pieces = []
        remaining = chunkSize
        while remaining > 0:
            newData = self.stream.read(remaining)
            if not newData:
                raise Exception("Error reading chunked data: ran out of data before end of chunk.")
            pieces.append(newData)
            remaining -= len(newData)
        data = ''.join(pieces)

        crlf = self.stream.read(2)
        if crlf != '\r\n':
            raise Exception("Error reading chunked data: end of chunk not found where expected.")

        if self.decomp:
            # NOTE(review): a decompressor may return '' for a chunk while it
            # buffers input; whether callers treat an empty buffer as
            # end-of-stream depends on LineReader -- confirm.
            data = self.decomp.decompress(data)

        self.buff = StringIO.StringIO(data)
#=================================================================
if __name__ == "__main__":

View File

@ -3,7 +3,7 @@ from urllib2 import URLError
import chardet
import redis
import indexreader
import indexreader, archiveloader
from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl
import utils
@ -100,7 +100,7 @@ class ReplayHandler(object):
def doReplay(self, cdx, wbrequest, failedFiles):
hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx['orig.filename'] != '-')
hasOrig = (cdx.get('orig.filename','-') != '-')
# Case 1: non-revisit
if (hasCurr and not hasOrig):
@ -190,9 +190,13 @@ class RewritingReplayHandler(ReplayHandler):
# TODO: better way to pass this?
stream = response._stream
# handle transfer-encoding: chunked
if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
stream = archiveloader.ChunkedLineReader(stream)
# special case -- need to ungzip the body
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
# TODO: is this right?
if rewrittenHeaders.charset:
@ -221,7 +225,19 @@ class RewritingReplayHandler(ReplayHandler):
buff = firstBuff if firstBuff else stream.read()
while buff:
if encoding:
buff = buff.decode(encoding)
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
htmlrewriter.feed(buff)
buff = stream.read()