mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rename BufferedReader -> DecompressingBufferedReader
Remove max_len from DecompressingBufferedReader, as it was applied to the compressed size rather than the original (decompressed) size. Add an integration test verifying the Content-Length of a larger file.
This commit is contained in:
parent
433b150542
commit
922917a631
@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
|
||||
from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
|
||||
|
||||
class RewriteContent:
|
||||
|
||||
@ -54,7 +54,7 @@ class RewriteContent:
|
||||
# =========================================================================
|
||||
# special case -- need to ungzip the body
|
||||
if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
|
||||
stream = BufferedReader(stream, decomp_type='gzip')
|
||||
stream = DecompressingBufferedReader(stream, decomp_type='gzip')
|
||||
|
||||
if rewritten_headers.charset:
|
||||
encoding = rewritten_headers.charset
|
||||
|
@ -11,7 +11,7 @@ def gzip_decompressor():
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BufferedReader(object):
|
||||
class DecompressingBufferedReader(object):
|
||||
"""
|
||||
A wrapping line reader which wraps an existing reader.
|
||||
Read operations operate on underlying buffer, which is filled to
|
||||
@ -29,7 +29,7 @@ class BufferedReader(object):
|
||||
|
||||
DECOMPRESSORS = {'gzip': gzip_decompressor}
|
||||
|
||||
def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
|
||||
def __init__(self, stream, block_size=1024, decomp_type=None):
|
||||
self.stream = stream
|
||||
self.block_size = block_size
|
||||
|
||||
@ -44,19 +44,13 @@ class BufferedReader(object):
|
||||
|
||||
self.buff = None
|
||||
self.num_read = 0
|
||||
self.max_len = max_len
|
||||
|
||||
def _fillbuff(self, block_size=None):
|
||||
if not block_size:
|
||||
block_size = self.block_size
|
||||
|
||||
if not self.buff or self.buff.pos >= self.buff.len:
|
||||
if self.max_len > 0:
|
||||
to_read = min(self.max_len - self.num_read, self.block_size)
|
||||
else:
|
||||
to_read = self.block_size
|
||||
|
||||
data = self.stream.read(to_read)
|
||||
data = self.stream.read(self.block_size)
|
||||
self._process_read(data)
|
||||
|
||||
def _process_read(self, data):
|
||||
@ -97,7 +91,7 @@ class ChunkedDataException(Exception):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ChunkedDataReader(BufferedReader):
|
||||
class ChunkedDataReader(DecompressingBufferedReader):
|
||||
r"""
|
||||
A ChunkedDataReader is a DecompressingBufferedReader which also supports de-chunking
|
||||
of the data if it happens to be http 'chunk-encoded'.
|
||||
@ -133,7 +127,7 @@ class ChunkedDataReader(BufferedReader):
|
||||
|
||||
def _fillbuff(self, block_size=None):
|
||||
if self.not_chunked:
|
||||
return BufferedReader._fillbuff(self, block_size)
|
||||
return super(ChunkedDataReader, self)._fillbuff(block_size)
|
||||
|
||||
if self.all_chunks_read:
|
||||
return
|
||||
|
@ -23,12 +23,12 @@
|
||||
>>> seek_read_full(sr, 100)
|
||||
'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'
|
||||
|
||||
#BufferedReader readline()
|
||||
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
|
||||
#DecompressingBufferedReader readline()
|
||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
|
||||
' CDX N b a m s k r M S V g\\n'
|
||||
|
||||
#BufferedReader readline() with decompression
|
||||
>>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
||||
#DecompressingBufferedReader readline() with decompression
|
||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
||||
' CDX N b a m s k r M S V g\\n'
|
||||
|
||||
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
||||
@ -41,7 +41,7 @@ import os
|
||||
import StringIO
|
||||
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
|
||||
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
|
||||
from pywb.utils.bufferedreaders import BufferedReader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb import get_test_dir
|
||||
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
|
||||
|
@ -7,7 +7,7 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||
|
||||
from pywb.utils.loaders import FileLoader, HttpLoader
|
||||
from pywb.utils.bufferedreaders import BufferedReader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
#=================================================================
|
||||
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
|
||||
@ -43,13 +43,13 @@ class ArcWarcRecordLoader:
|
||||
'': file
|
||||
}
|
||||
|
||||
def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
|
||||
def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
|
||||
self.loaders = loaders
|
||||
|
||||
if not self.loaders:
|
||||
self.loaders = self.create_default_loaders(cookie_maker)
|
||||
|
||||
self.chunk_size = chunk_size
|
||||
self.block_size = block_size
|
||||
|
||||
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
||||
|
||||
@ -73,9 +73,12 @@ class ArcWarcRecordLoader:
|
||||
|
||||
decomp_type = 'gzip'
|
||||
|
||||
stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
|
||||
# Create decompressing stream
|
||||
stream = DecompressingBufferedReader(stream = raw,
|
||||
decomp_type = decomp_type,
|
||||
block_size = self.block_size)
|
||||
|
||||
(the_format, rec_headers) = self._load_headers(stream)
|
||||
(the_format, rec_headers) = self._detect_type_load_headers(stream)
|
||||
|
||||
if the_format == 'arc':
|
||||
rec_type = 'response'
|
||||
@ -111,7 +114,7 @@ class ArcWarcRecordLoader:
|
||||
return ArcWarcRecord((the_format, rec_type),
|
||||
rec_headers, stream, status_headers)
|
||||
|
||||
def _load_headers(self, stream):
|
||||
def _detect_type_load_headers(self, stream):
|
||||
"""
|
||||
Try parsing record as WARC, then try parsing as ARC.
|
||||
if neither one succeeds, we're out of luck.
|
||||
|
@ -75,6 +75,11 @@ class TestWb:
|
||||
assert 'wb.js' in resp.body
|
||||
assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body
|
||||
|
||||
def test_replay_content_length_1(self):
|
||||
# test larger file, rewritten file (svg!)
|
||||
resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
|
||||
assert resp.headers['Content-Length'] == str(len(resp.body))
|
||||
|
||||
|
||||
def test_redirect_1(self):
|
||||
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
|
||||
|
Loading…
x
Reference in New Issue
Block a user