1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rename BufferedReader -> DecompressingBufferedReader

Remove max_len from DecompressingBufferedReader, as it applied to
the compressed size, not the original (decompressed) size.
Add an integration test verifying the content length of a larger file.
This commit is contained in:
Ilya Kreymer 2014-02-20 11:53:08 -08:00
parent 433b150542
commit 922917a631
5 changed files with 26 additions and 24 deletions

View File

@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
class RewriteContent: class RewriteContent:
@ -54,7 +54,7 @@ class RewriteContent:
# ========================================================================= # =========================================================================
# special case -- need to ungzip the body # special case -- need to ungzip the body
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')): if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
stream = BufferedReader(stream, decomp_type='gzip') stream = DecompressingBufferedReader(stream, decomp_type='gzip')
if rewritten_headers.charset: if rewritten_headers.charset:
encoding = rewritten_headers.charset encoding = rewritten_headers.charset

View File

@ -11,7 +11,7 @@ def gzip_decompressor():
#================================================================= #=================================================================
class BufferedReader(object): class DecompressingBufferedReader(object):
""" """
A wrapping line reader which wraps an existing reader. A wrapping line reader which wraps an existing reader.
Read operations operate on underlying buffer, which is filled to Read operations operate on underlying buffer, which is filled to
@ -29,7 +29,7 @@ class BufferedReader(object):
DECOMPRESSORS = {'gzip': gzip_decompressor} DECOMPRESSORS = {'gzip': gzip_decompressor}
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None): def __init__(self, stream, block_size=1024, decomp_type=None):
self.stream = stream self.stream = stream
self.block_size = block_size self.block_size = block_size
@ -44,19 +44,13 @@ class BufferedReader(object):
self.buff = None self.buff = None
self.num_read = 0 self.num_read = 0
self.max_len = max_len
def _fillbuff(self, block_size=None): def _fillbuff(self, block_size=None):
if not block_size: if not block_size:
block_size = self.block_size block_size = self.block_size
if not self.buff or self.buff.pos >= self.buff.len: if not self.buff or self.buff.pos >= self.buff.len:
if self.max_len > 0: data = self.stream.read(self.block_size)
to_read = min(self.max_len - self.num_read, self.block_size)
else:
to_read = self.block_size
data = self.stream.read(to_read)
self._process_read(data) self._process_read(data)
def _process_read(self, data): def _process_read(self, data):
@ -97,7 +91,7 @@ class ChunkedDataException(Exception):
#================================================================= #=================================================================
class ChunkedDataReader(BufferedReader): class ChunkedDataReader(DecompressingBufferedReader):
r""" r"""
A ChunkedDataReader is a BufferedReader which also supports de-chunking A ChunkedDataReader is a BufferedReader which also supports de-chunking
of the data if it happens to be http 'chunk-encoded'. of the data if it happens to be http 'chunk-encoded'.
@ -133,7 +127,7 @@ class ChunkedDataReader(BufferedReader):
def _fillbuff(self, block_size=None): def _fillbuff(self, block_size=None):
if self.not_chunked: if self.not_chunked:
return BufferedReader._fillbuff(self, block_size) return super(ChunkedDataReader, self)._fillbuff(block_size)
if self.all_chunks_read: if self.all_chunks_read:
return return

View File

@ -23,12 +23,12 @@
>>> seek_read_full(sr, 100) >>> seek_read_full(sr, 100)
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n' 'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
#BufferedReader readline() #DecompressingBufferedReader readline()
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n' ' CDX N b a m s k r M S V g\\n'
#BufferedReader readline() with decompression #DecompressingBufferedReader readline() with decompression
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n' ' CDX N b a m s k r M S V g\\n'
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() >>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
@ -41,7 +41,7 @@ import os
import StringIO import StringIO
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
from pywb.utils.loaders import LimitReader, SeekableTextFileReader from pywb.utils.loaders import LimitReader, SeekableTextFileReader
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'

View File

@ -7,7 +7,7 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import FileLoader, HttpLoader from pywb.utils.loaders import FileLoader, HttpLoader
from pywb.utils.bufferedreaders import BufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
#================================================================= #=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord', ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@ -43,13 +43,13 @@ class ArcWarcRecordLoader:
'': file '': file
} }
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192): def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
self.loaders = loaders self.loaders = loaders
if not self.loaders: if not self.loaders:
self.loaders = self.create_default_loaders(cookie_maker) self.loaders = self.create_default_loaders(cookie_maker)
self.chunk_size = chunk_size self.block_size = block_size
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
@ -73,9 +73,12 @@ class ArcWarcRecordLoader:
decomp_type = 'gzip' decomp_type = 'gzip'
stream = BufferedReader(raw, length, self.chunk_size, decomp_type) # Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
(the_format, rec_headers) = self._load_headers(stream) (the_format, rec_headers) = self._detect_type_load_headers(stream)
if the_format == 'arc': if the_format == 'arc':
rec_type = 'response' rec_type = 'response'
@ -111,7 +114,7 @@ class ArcWarcRecordLoader:
return ArcWarcRecord((the_format, rec_type), return ArcWarcRecord((the_format, rec_type),
rec_headers, stream, status_headers) rec_headers, stream, status_headers)
def _load_headers(self, stream): def _detect_type_load_headers(self, stream):
""" """
Try parsing record as WARC, then try parsing as ARC. Try parsing record as WARC, then try parsing as ARC.
if neither one succeeds, we're out of luck. if neither one succeeds, we're out of luck.

View File

@ -75,6 +75,11 @@ class TestWb:
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
def test_replay_content_length_1(self):
# test larger file, rewritten file (svg!)
resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
assert resp.headers['Content-Length'] == str(len(resp.body))
def test_redirect_1(self): def test_redirect_1(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/') resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')