1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'master' of github.com:ikreymer/pywb into work

This commit is contained in:
Ilya Kreymer 2014-01-19 12:31:53 -08:00
commit 6cb1743163
2 changed files with 47 additions and 4 deletions

View File

@ -282,6 +282,33 @@ class LineReader:
self.stream.close()
self.stream = None
class ChunkedLineReader(LineReader):
    """
    LineReader variant that decodes an HTTP 'Transfer-Encoding: chunked'
    body while reading from the underlying stream.

    Each refill of the internal buffer consumes exactly one chunk
    (length header, payload, trailing CRLF) from self.stream. If a
    decompressor is set on self.decomp, each chunk payload is passed
    through it before being buffered.
    """

    # Becomes True (on the instance) once the zero-length terminating
    # chunk has been read; after that _fillbuff() is a no-op.
    allChunksRead = False

    def _fillbuff(self, chunkSize = None):
        """
        Refill self.buff with the payload of the next chunk, if the current
        buffer is missing or fully consumed.

        chunkSize -- accepted so the signature matches the caller's
                     expectations, but the passed-in value is ignored: the
                     size is always taken from the chunk length header.

        Raises ValueError if the chunk length header is missing, not valid
        hexadecimal, or negative.
        Raises Exception if the stream ends before the full chunk payload
        is read, or if the chunk is not followed by CRLF.
        """
        if self.allChunksRead:
            return

        # Only read another chunk once the current buffer is exhausted
        if not self.buff or self.buff.pos >= self.buff.len:
            lengthHeader = self.stream.readline()

            # The size is hexadecimal and may be followed by ';'-separated
            # chunk extensions, which are ignored here
            sizeField = lengthHeader.strip().split(';')[0]
            try:
                chunkSize = int(sizeField, 16)
            except ValueError:
                # covers an empty header (stream ended early) and garbage
                chunkSize = -1

            if chunkSize < 0:
                raise ValueError("Error reading chunked data: invalid chunk length header %r." % lengthHeader)

            data = ''

            if chunkSize:
                # stream.read() may return fewer bytes than requested,
                # so keep reading until the whole chunk is collected
                while len(data) < chunkSize:
                    newData = self.stream.read(chunkSize - len(data))
                    if not newData:
                        raise Exception("Error reading chunked data: ran out of data before end of chunk.")
                    data += newData

                # every chunk payload must be terminated by CRLF
                crlf = self.stream.read(2)
                if crlf != '\r\n':
                    raise Exception("Error reading chunked data: end of chunk not found where expected.")

                if self.decomp:
                    data = self.decomp.decompress(data)
            else:
                # zero-length chunk marks the end of the chunked body
                self.allChunksRead = True
                data = ''

            self.buff = StringIO.StringIO(data)
#=================================================================
if __name__ == "__main__":

View File

@ -4,7 +4,7 @@ import chardet
import redis
import copy
import indexreader
import indexreader, archiveloader
from wbrequestresponse import WbResponse, StatusAndHeaders
from wbarchivalurl import ArchivalUrl
import utils
@ -101,7 +101,7 @@ class ReplayHandler(object):
def doReplay(self, cdx, wbrequest, query, failedFiles):
hasCurr = (cdx['filename'] != '-')
hasOrig = (cdx['orig.filename'] != '-')
hasOrig = (cdx.get('orig.filename','-') != '-')
# Case 1: non-revisit
if (hasCurr and not hasOrig):
@ -239,9 +239,13 @@ class RewritingReplayHandler(ReplayHandler):
# TODO: better way to pass this?
stream = response._stream
# handle transfer-encoding: chunked
if (rewrittenHeaders.containsRemovedHeader('transfer-encoding', 'chunked')):
stream = archiveloader.ChunkedLineReader(stream)
# special case -- need to ungzip the body
if (rewrittenHeaders.containsRemovedHeader('content-encoding', 'gzip')):
stream = archiveloader.LineStream(stream, decomp = utils.create_decompressor())
stream = archiveloader.LineReader(stream, decomp = utils.create_decompressor())
# TODO: is this right?
if rewrittenHeaders.charset:
@ -270,7 +274,19 @@ class RewritingReplayHandler(ReplayHandler):
buff = firstBuff if firstBuff else stream.read()
while buff:
if encoding:
buff = buff.decode(encoding)
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
# chunk may have cut apart unicode bytes -- add 1-3 bytes and retry
for i in range(3):
buff += stream.read(1)
try:
buff = buff.decode(encoding)
break
except UnicodeDecodeError:
pass
else:
raise
htmlrewriter.feed(buff)
buff = stream.read()