mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge pull request #2 from jcushman/master
Handle transfer-encoding:chunked; misc. replay bugs.
This commit is contained in:
commit
ab955c411b
@ -282,6 +282,33 @@ class LineReader:
|
|||||||
self.stream.close()
|
self.stream.close()
|
||||||
self.stream = None
|
self.stream = None
|
||||||
|
|
||||||
|
class ChunkedLineReader(LineReader):
    """LineReader variant that decodes an HTTP ``Transfer-Encoding: chunked`` body.

    Each refill of the internal buffer consumes exactly one chunk from the
    underlying stream: the hexadecimal size line (anything after a ';' on
    that line is ignored), the chunk payload, and the CRLF that follows it.
    A chunk of size zero marks the end of the body; nothing after it
    (e.g. trailer lines) is read by this class.

    NOTE(review): ``self.stream``, ``self.buff`` and ``self.decomp`` are
    presumably initialised by LineReader -- confirm against the base class.
    """

    # True once the zero-length terminating chunk has been seen. This is a
    # class-level default; the assignment in _fillbuff() shadows it with a
    # per-instance value.
    allChunksRead = False

    def _fillbuff(self, chunkSize = None):
        """Replace the buffer with the next decoded chunk, if one is needed.

        Does nothing when the terminating chunk has already been read or
        when the current buffer still holds unread data.

        The ``chunkSize`` argument is accepted but ignored: the size is
        always taken from the chunk header read from the stream.

        Raises:
            ValueError: the chunk-size line is not a valid hexadecimal
                number (this includes an empty line at end of stream).
            Exception: the stream ends before the announced number of bytes
                has been read, or the chunk is not followed by CRLF.
        """
        if self.allChunksRead:
            return

        # Only fetch a new chunk when there is no buffer yet or the current
        # one has been fully consumed.
        if self.buff and self.buff.pos < self.buff.len:
            return

        lengthHeader = self.stream.readline()
        try:
            # Chunk extensions (";name=value") may follow the size; drop them.
            chunkSize = int(lengthHeader.strip().split(';')[0], 16)
        except ValueError:
            # Same exception type the bare int() call raised, but with a
            # message that points at the chunked decoding step.
            raise ValueError("Error reading chunked data: invalid chunk size line %r." % (lengthHeader,))

        if not chunkSize:
            # Zero-length chunk: end of body. Leave an empty buffer behind.
            self.allChunksRead = True
            self.buff = StringIO.StringIO('')
            return

        # read() may return fewer bytes than requested, so keep reading
        # until the whole chunk has arrived. Pieces are collected in a list
        # and joined once to avoid repeated string concatenation.
        pieces = []
        remaining = chunkSize
        while remaining > 0:
            newData = self.stream.read(remaining)
            if not newData:
                raise Exception("Error reading chunked data: ran out of data before end of chunk.")
            pieces.append(newData)
            remaining -= len(newData)
        data = ''.join(pieces)

        # Every chunk payload must be terminated by CRLF.
        crlf = self.stream.read(2)
        if crlf != '\r\n':
            raise Exception("Error reading chunked data: end of chunk not found where expected.")

        if self.decomp:
            data = self.decomp.decompress(data)

        self.buff = StringIO.StringIO(data)
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -3,7 +3,7 @@ from urllib2 import URLError
|
|||||||
import chardet
|
import chardet
|
||||||
import redis
|
import redis
|
||||||
|
|
||||||
import indexreader
|
import indexreader, archiveloader
|
||||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||||
from wbarchivalurl import ArchivalUrl
|
from wbarchivalurl import ArchivalUrl
|
||||||
import utils
|
import utils
|
||||||
@ -100,7 +100,7 @@ class ReplayHandler(object):
|
|||||||
|
|
||||||
def doReplay(self, cdx, wbrequest, failedFiles):
|
def doReplay(self, cdx, wbrequest, failedFiles):
|
||||||
hasCurr = (cdx['filename'] != '-')
|
hasCurr = (cdx['filename'] != '-')
|
||||||
hasOrig = (cdx['orig.filename'] != '-')
|
hasOrig = (cdx.get('orig.filename','-') != '-')
|
||||||
|
|
||||||
# Case 1: non-revisit
|
# Case 1: non-revisit
|
||||||
if (hasCurr and not hasOrig):
|
if (hasCurr and not hasOrig):
|
||||||
@ -190,9 +190,13 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
# TODO: better way to pass this?
|
# TODO: better way to pass this?
|
||||||
stream = response._stream
|
stream = response._stream
|
||||||
|
|
||||||
|
# handle transfer-encoding: chunked
|
||||||
|
if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
|
||||||
|
stream = archiveloader.ChunkedLineReader(stream)
|
||||||
|
|
||||||
# special case -- need to ungzip the body
|
# special case -- need to ungzip the body
|
||||||
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
|
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
|
||||||
stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())
|
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
|
||||||
|
|
||||||
# TODO: is this right?
|
# TODO: is this right?
|
||||||
if rewrittenHeaders.charset:
|
if rewrittenHeaders.charset:
|
||||||
@ -221,7 +225,19 @@ class RewritingReplayHandler(ReplayHandler):
|
|||||||
buff = firstBuff if firstBuff else stream.read()
|
buff = firstBuff if firstBuff else stream.read()
|
||||||
while buff:
|
while buff:
|
||||||
if encoding:
|
if encoding:
|
||||||
buff = buff.decode(encoding)
|
try:
|
||||||
|
buff = buff.decode(encoding)
|
||||||
|
except UnicodeDecodeError, e:
|
||||||
|
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
|
||||||
|
for i in range(3):
|
||||||
|
buff += stream.read(1)
|
||||||
|
try:
|
||||||
|
buff = buff.decode(encoding)
|
||||||
|
break
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
raise
|
||||||
htmlrewriter.feed(buff)
|
htmlrewriter.feed(buff)
|
||||||
buff = stream.read()
|
buff = stream.read()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user