1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rename BufferedReader -> DecompressingBufferedReader

Remove max_len from DecompressingBufferedReader, since the limit was applied
to the compressed size rather than the original (decompressed) size.
Add an integration test verifying the Content-Length of a larger file.
This commit is contained in:
Ilya Kreymer 2014-02-20 11:53:08 -08:00
parent 433b150542
commit 922917a631
5 changed files with 26 additions and 24 deletions

View File

@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
class RewriteContent:
@ -54,7 +54,7 @@ class RewriteContent:
# =========================================================================
# special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, decomp_type='gzip')
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset:
encoding = rewritten_headers.charset

View File

@ -11,7 +11,7 @@ def gzip_decompressor():
#=================================================================
class BufferedReader(object):
class DecompressingBufferedReader(object):
"""
A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to
@ -29,7 +29,7 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
def __init__(self, stream, block_size=1024, decomp_type=None):
self.stream = stream
self.block_size = block_size
@ -44,19 +44,13 @@ class BufferedReader(object):
self.buff = None
self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None):
if not block_size:
block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len:
if self.max_len > 0:
to_read = min(self.max_len - self.num_read, self.block_size)
else:
to_read = self.block_size
data = self.stream.read(to_read)
data = self.stream.read(self.block_size)
self._process_read(data)
def _process_read(self, data):
@ -97,7 +91,7 @@ class ChunkedDataException(Exception):
#=================================================================
class ChunkedDataReader(BufferedReader):
class ChunkedDataReader(DecompressingBufferedReader):
r"""
A ChunkedDataReader is a DecompressingBufferedReader which also supports
de-chunking of the data if it happens to be HTTP 'chunk-encoded'.
@ -133,7 +127,7 @@ class ChunkedDataReader(BufferedReader):
def _fillbuff(self, block_size=None):
if self.not_chunked:
return BufferedReader._fillbuff(self, block_size)
return super(ChunkedDataReader, self)._fillbuff(block_size)
if self.all_chunks_read:
return

View File

@ -23,12 +23,12 @@
>>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
#BufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
#DecompressingBufferedReader readline()
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'
#BufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
#DecompressingBufferedReader readline() with decompression
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
@ -41,7 +41,7 @@ import os
import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'

View File

@ -7,7 +7,7 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@ -43,13 +43,13 @@ class ArcWarcRecordLoader:
'': file
}
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
self.loaders = loaders
if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size
self.block_size = block_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@ -73,9 +73,12 @@ class ArcWarcRecordLoader:
decomp_type = 'gzip'
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
# Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
(the_format, rec_headers) = self._load_headers(stream)
(the_format, rec_headers) = self._detect_type_load_headers(stream)
if the_format == 'arc':
rec_type = 'response'
@ -111,7 +114,7 @@ class ArcWarcRecordLoader:
return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers)
def _load_headers(self, stream):
def _detect_type_load_headers(self, stream):
"""
Try parsing record as WARC, then try parsing as ARC.
if neither one succeeds, we're out of luck.

View File

@ -75,6 +75,11 @@ class TestWb:
assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
def test_replay_content_length_1(self):
# Replay check: the Content-Length response header must equal the length of
# the body actually returned for this capture.
# test larger file, rewritten file (svg!)
resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
assert resp.headers['Content-Length'] == str(len(resp.body))
def test_redirect_1(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')