mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge pull request #2 from jcushman/master
Handle transfer-encoding:chunked; misc. replay bugs.
This commit is contained in:
commit
ab955c411b
@ -282,6 +282,33 @@ class LineReader:
|
||||
self.stream.close()
|
||||
self.stream = None
|
||||
|
||||
class ChunkedLineReader(LineReader):
    """LineReader that transparently decodes an HTTP chunked-encoded stream.

    Each refill consumes exactly one chunk from the underlying stream; the
    zero-length terminating chunk flips allChunksRead and yields no data.
    Chunk payloads are optionally decompressed via self.decomp.
    """

    # becomes True once the terminating zero-length chunk has been consumed
    allChunksRead = False

    def _fillbuff(self, chunkSize=None):
        # the terminating chunk was already seen: nothing left to read
        if self.allChunksRead:
            return

        # still unread data in the current buffer -- no refill needed
        if self.buff and self.buff.pos < self.buff.len:
            return

        # chunk header line: hex length, optionally followed by ';<extensions>'
        sizeLine = self.stream.readline()
        chunkSize = int(sizeLine.strip().split(';')[0], 16)

        if not chunkSize:
            # zero-length chunk terminates the chunked body
            self.allChunksRead = True
            self.buff = StringIO.StringIO('')
            return

        # the chunk payload may span several reads on the raw stream
        chunk = ''
        remaining = chunkSize
        while remaining:
            piece = self.stream.read(remaining)
            if not piece:
                raise Exception("Error reading chunked data: ran out of data before end of chunk.")
            chunk += piece
            remaining -= len(piece)

        # every chunk payload is followed by a CRLF terminator
        terminator = self.stream.read(2)
        if terminator != '\r\n':
            raise Exception("Error reading chunked data: end of chunk not found where expected.")

        if self.decomp:
            chunk = self.decomp.decompress(chunk)

        self.buff = StringIO.StringIO(chunk)
||||
#=================================================================
|
||||
if __name__ == "__main__":
|
||||
|
@ -3,7 +3,7 @@ from urllib2 import URLError
|
||||
import chardet
|
||||
import redis
|
||||
|
||||
import indexreader
|
||||
import indexreader, archiveloader
|
||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||
from wbarchivalurl import ArchivalUrl
|
||||
import utils
|
||||
@ -100,7 +100,7 @@ class ReplayHandler(object):
|
||||
|
||||
def doReplay(self, cdx, wbrequest, failedFiles):
|
||||
hasCurr = (cdx['filename'] != '-')
|
||||
hasOrig = (cdx['orig.filename'] != '-')
|
||||
hasOrig = (cdx.get('orig.filename','-') != '-')
|
||||
|
||||
# Case 1: non-revisit
|
||||
if (hasCurr and not hasOrig):
|
||||
@ -190,9 +190,13 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
# TODO: better way to pass this?
|
||||
stream = response._stream
|
||||
|
||||
# handle transfer-encoding: chunked
|
||||
if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
|
||||
stream = archiveloader.ChunkedLineReader(stream)
|
||||
|
||||
# special case -- need to ungzip the body
|
||||
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
|
||||
stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())
|
||||
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
|
||||
|
||||
# TODO: is this right?
|
||||
if rewrittenHeaders.charset:
|
||||
@ -221,7 +225,19 @@ class RewritingReplayHandler(ReplayHandler):
|
||||
buff = firstBuff if firstBuff else stream.read()
|
||||
while buff:
|
||||
if encoding:
|
||||
buff = buff.decode(encoding)
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
except UnicodeDecodeError, e:
|
||||
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
|
||||
for i in range(3):
|
||||
buff += stream.read(1)
|
||||
try:
|
||||
buff = buff.decode(encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
htmlrewriter.feed(buff)
|
||||
buff = stream.read()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user