diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 9f904764..81cd23c9 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader +from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader class RewriteContent: @@ -54,7 +54,7 @@ class RewriteContent: # ========================================================================= # special case -- need to ungzip the body if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): - stream = BufferedReader(stream, decomp_type='gzip') + stream = DecompressingBufferedReader(stream, decomp_type='gzip') if rewritten_headers.charset: encoding = rewritten_headers.charset diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index 27a3ed33..845e97a7 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -11,7 +11,7 @@ def gzip_decompressor(): #================================================================= -class BufferedReader(object): +class DecompressingBufferedReader(object): """ A wrapping line reader which wraps an existing reader. Read operations operate on underlying buffer, which is filled to @@ -29,7 +29,7 @@ class BufferedReader(object): DECOMPRESSORS = {'gzip': gzip_decompressor} - def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None): + def __init__(self, stream, block_size=1024, decomp_type=None): self.stream = stream self.block_size = block_size @@ -44,19 +44,13 @@ class BufferedReader(object): self.buff = None self.num_read = 0 - self.max_len = max_len def _fillbuff(self, block_size=None): if not block_size: block_size = self.block_size if not self.buff or self.buff.pos >= self.buff.len: - if self.max_len > 0: - to_read = min(self.max_len - self.num_read, self.block_size) - else: - to_read = self.block_size - - data = self.stream.read(to_read) + data = self.stream.read(self.block_size) self._process_read(data) def _process_read(self, data): @@ -97,7 +91,7 @@ class ChunkedDataException(Exception): #================================================================= -class ChunkedDataReader(BufferedReader): +class ChunkedDataReader(DecompressingBufferedReader): r""" A ChunkedDataReader is a BufferedReader which also supports de-chunking of the data if it happens to be http 'chunk-encoded'. @@ -133,7 +127,7 @@ class ChunkedDataReader(BufferedReader): def _fillbuff(self, block_size=None): if self.not_chunked: - return BufferedReader._fillbuff(self, block_size) + return super(ChunkedDataReader, self)._fillbuff(block_size) if self.all_chunks_read: return diff --git a/pywb/utils/test/loaders_test.py b/pywb/utils/test/loaders_test.py index 73d4b3dd..a18499b8 100644 --- a/pywb/utils/test/loaders_test.py +++ b/pywb/utils/test/loaders_test.py @@ -23,12 +23,12 @@ >>> seek_read_full(sr, 100) 'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n' -#BufferedReader readline() ->>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() +#DecompressingBufferedReader readline() +>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() ' CDX N b a m s k r M S V g\\n' -#BufferedReader readline() with decompression ->>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() +#DecompressingBufferedReader readline() with decompression +>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() ' CDX N b a m s k r M S V g\\n' >>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() @@ -41,7 +41,7 @@ import os import StringIO from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker from pywb.utils.loaders import LimitReader, SeekableTextFileReader -from pywb.utils.bufferedreaders import BufferedReader +from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb import get_test_dir #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 05973f6b..9f595301 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -7,7 +7,7 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.loaders import FileLoader, HttpLoader -from pywb.utils.bufferedreaders import BufferedReader +from pywb.utils.bufferedreaders import DecompressingBufferedReader #================================================================= ArcWarcRecord = collections.namedtuple('ArchiveRecord', @@ -43,13 +43,13 @@ class ArcWarcRecordLoader: '': file } - def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192): + def __init__(self, loaders={}, cookie_maker=None, block_size=8192): self.loaders = loaders if not self.loaders: self.loaders = self.create_default_loaders(cookie_maker) - self.chunk_size = chunk_size + self.block_size = block_size self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) @@ -73,9 +73,12 @@ class ArcWarcRecordLoader: decomp_type = 'gzip' - stream = BufferedReader(raw, length, self.chunk_size, decomp_type) + # Create decompressing stream + stream = DecompressingBufferedReader(stream = raw, + decomp_type = decomp_type, + block_size = self.block_size) - (the_format, rec_headers) = self._load_headers(stream) + (the_format, rec_headers) = self._detect_type_load_headers(stream) if the_format == 'arc': rec_type = 'response' @@ -111,7 +114,7 @@ class ArcWarcRecordLoader: return ArcWarcRecord((the_format, rec_type), rec_headers, stream, status_headers) - def _load_headers(self, stream): + def _detect_type_load_headers(self, stream): """ Try parsing record as WARC, then try parsing as ARC. if neither one succeeds, we're out of luck. diff --git a/tests/test_integration.py b/tests/test_integration.py index ec7fd6bd..f8e614cc 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -75,6 +75,11 @@ class TestWb: assert 'wb.js' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body + def test_replay_content_length_1(self): + # test larger file, rewritten file (svg!) + resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg') + assert resp.headers['Content-Length'] == str(len(resp.body)) + def test_redirect_1(self): resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')