mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Merge pull request #2 from jcushman/master

Handle transfer-encoding:chunked; misc. replay bugs.
ikreymer committed 2014-01-19 12:05:57 -08:00
commit ab955c411b
2 changed files with 47 additions and 4 deletions


@@ -282,6 +282,33 @@ class LineReader:
         self.stream.close()
         self.stream = None
 
+class ChunkedLineReader(LineReader):
+    allChunksRead = False
+
+    def _fillbuff(self, chunkSize = None):
+        if self.allChunksRead:
+            return
+
+        if not self.buff or self.buff.pos >= self.buff.len:
+            lengthHeader = self.stream.readline()
+            chunkSize = int(lengthHeader.strip().split(';')[0], 16)
+            data = ''
+            if chunkSize:
+                while len(data) < chunkSize:
+                    newData = self.stream.read(chunkSize - len(data))
+                    if not newData:
+                        raise Exception("Error reading chunked data: ran out of data before end of chunk.")
+                    data += newData
+                clrf = self.stream.read(2)
+                if clrf != '\r\n':
+                    raise Exception("Error reading chunked data: end of chunk not found where expected.")
+                if self.decomp:
+                    data = self.decomp.decompress(data)
+            else:
+                self.allChunksRead = True
+                data = ''
+            self.buff = StringIO.StringIO(data)
+
 
 #=================================================================
 if __name__ == "__main__":
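
For context on what the new class parses: an HTTP/1.1 chunked body is a
sequence of chunks, each introduced by a hexadecimal length line (optionally
carrying ";extension" parameters, which the split(';') above discards),
followed by that many bytes of data and a CRLF, and terminated by a
zero-length chunk. Below is a minimal standalone sketch of the same parsing
loop, written in Python 3 rather than the Python 2 of this commit; dechunk is
an illustrative name, not pywb API.

import io

def dechunk(stream):
    # Decode an HTTP/1.1 chunked body: hex length line, data, CRLF,
    # repeated until a zero-length chunk marks the end.
    out = b''
    while True:
        length_header = stream.readline()          # e.g. b'1a\r\n' or b'1a;ext=1\r\n'
        chunk_size = int(length_header.strip().split(b';')[0], 16)
        if chunk_size == 0:                        # zero-length chunk: end of body
            break
        data = b''
        while len(data) < chunk_size:              # read() may return fewer bytes
            new_data = stream.read(chunk_size - len(data))
            if not new_data:
                raise ValueError('ran out of data before end of chunk')
            data += new_data
        if stream.read(2) != b'\r\n':              # every chunk ends with CRLF
            raise ValueError('end of chunk not found where expected')
        out += data
    return out

# Example: two chunks, 7 bytes ('7') and 6 bytes ('6'), then the 0-terminator.
body = io.BytesIO(b'7\r\nHello, \r\n6\r\nworld!\r\n0\r\n\r\n')
assert dechunk(body) == b'Hello, world!'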


@@ -3,7 +3,7 @@
 from urllib2 import URLError
 import chardet
 import redis
-import indexreader
+import indexreader, archiveloader
 from wbrequestresponse import WbResponse, StatusAndHeaders
 from wbarchivalurl import ArchivalUrl
 import utils
@@ -100,7 +100,7 @@ class ReplayHandler(object):
     def doReplay(self, cdx, wbrequest, failedFiles):
         hasCurr = (cdx['filename'] != '-')
-        hasOrig = (cdx['orig.filename'] != '-')
+        hasOrig = (cdx.get('orig.filename','-') != '-')
 
         # Case 1: non-revisit
         if (hasCurr and not hasOrig):
@@ -190,9 +190,13 @@ class RewritingReplayHandler(ReplayHandler):
         # TODO: better way to pass this?
         stream = response._stream
 
+        # handle transfer-encoding: chunked
+        if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
+            stream = archiveloader.ChunkedLineReader(stream)
+
         # special case -- need to ungzip the body
         if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
-            stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())
+            stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
 
         # TODO: is this right?
         if rewrittenHeaders.charset:
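
One detail worth noting in the hunk above: the order of the two wrappers
matters. On the wire a response body is content-encoded (gzip) first and
transfer-encoded (chunked) last, so replay must undo the chunking before
decompressing. A self-contained Python 3 sketch of that ordering (illustrative
code, not pywb's):

import gzip, io

payload = b'<html>hello</html>'
gzipped = gzip.compress(payload)

# Chunk the already-gzipped bytes (a single chunk for brevity):
# hex length, CRLF, data, CRLF, then the zero-length terminator.
chunked = b'%x\r\n%s\r\n0\r\n\r\n' % (len(gzipped), gzipped)

# Undo in reverse order: de-chunk first, then gunzip.
stream = io.BytesIO(chunked)
size = int(stream.readline().strip(), 16)
dechunked = stream.read(size)
assert gzip.decompress(dechunked) == payload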
@@ -221,7 +225,19 @@ class RewritingReplayHandler(ReplayHandler):
         buff = firstBuff if firstBuff else stream.read()
         while buff:
             if encoding:
-                buff = buff.decode(encoding)
+                try:
+                    buff = buff.decode(encoding)
+                except UnicodeDecodeError, e:
+                    # chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
+                    for i in range(3):
+                        buff += stream.read(1)
+                        try:
+                            buff = buff.decode(encoding)
+                            break
+                        except UnicodeDecodeError:
+                            pass
+                    else:
+                        raise
 
             htmlrewriter.feed(buff)
             buff = stream.read()
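
The try/except added in this last hunk guards against a real hazard of
streaming decode: a read boundary can fall in the middle of a multibyte
character, leaving the buffer temporarily undecodable. Since a UTF-8 code
point is at most 4 bytes, pulling in 1-3 more bytes is always enough to
complete a split character. A self-contained Python 3 illustration (not pywb
code):

import io

stream = io.BytesIO('café!'.encode('utf-8'))  # b'caf\xc3\xa9!'
buff = stream.read(4)                         # splits the 2-byte b'\xc3\xa9'

try:
    text = buff.decode('utf-8')
except UnicodeDecodeError:
    for _ in range(3):                        # same bounded retry as the commit
        buff += stream.read(1)
        try:
            text = buff.decode('utf-8')
            break
        except UnicodeDecodeError:
            pass
    else:
        raise                                 # genuinely bad bytes: re-raise

assert text == 'café'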