rename BufferedReader -> DecompressingBufferedReader
Remove max_len from DecompressingBufferedReader, since it applied to the compressed size rather than the original (decompressed) size. Add an integration test verifying the Content-Length of a larger rewritten file.
parent 433b150542
commit 922917a631
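Background for the max_len removal: a byte cap inside the decompressing reader counts bytes read from the underlying (compressed) stream, not the bytes handed back to the caller, so it cannot act as a limit on the original content length. The snippet below is illustrative only (not part of the commit) and uses the standard gzip/zlib modules to make the mismatch concrete.

import gzip
import io
import zlib

# 10,000 bytes of original content gzip down to a few dozen bytes.
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as gz:
    gz.write(b'a' * 10000)
compressed = buf.getvalue()

# A cap applied to the *compressed* input (what max_len measured) does not
# bound the *original* data that the decompressor hands back.
decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)   # gzip framing
out = decomp.decompress(compressed[:1024])

print(len(compressed))   # tiny, well under 1024
print(len(out))          # 10000 -- far beyond any 1024-byte cap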
@@ -6,7 +6,7 @@ from regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
 from header_rewriter import HeaderRewriter, RewrittenStatusAndHeaders

 from pywb.utils.statusandheaders import StatusAndHeaders
-from pywb.utils.bufferedreaders import BufferedReader, ChunkedDataReader
+from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader

 class RewriteContent:

@@ -54,7 +54,7 @@ class RewriteContent:
         # =========================================================================
         # special case -- need to ungzip the body
         if (rewritten_headers.contains_removed_header('content-encoding', 'gzip')):
-            stream = BufferedReader(stream, decomp_type='gzip')
+            stream = DecompressingBufferedReader(stream, decomp_type='gzip')

         if rewritten_headers.charset:
             encoding = rewritten_headers.charset
@@ -11,7 +11,7 @@ def gzip_decompressor():


 #=================================================================
-class BufferedReader(object):
+class DecompressingBufferedReader(object):
     """
     A wrapping line reader which wraps an existing reader.
     Read operations operate on underlying buffer, which is filled to
@@ -29,7 +29,7 @@ class BufferedReader(object):

     DECOMPRESSORS = {'gzip': gzip_decompressor}

-    def __init__(self, stream, max_len=0, block_size=1024, decomp_type=None):
+    def __init__(self, stream, block_size=1024, decomp_type=None):
         self.stream = stream
         self.block_size = block_size

@@ -44,19 +44,13 @@ class BufferedReader(object):

         self.buff = None
         self.num_read = 0
-        self.max_len = max_len

     def _fillbuff(self, block_size=None):
         if not block_size:
             block_size = self.block_size

         if not self.buff or self.buff.pos >= self.buff.len:
-            if self.max_len > 0:
-                to_read = min(self.max_len - self.num_read, self.block_size)
-            else:
-                to_read = self.block_size
-
-            data = self.stream.read(to_read)
+            data = self.stream.read(self.block_size)
             self._process_read(data)

     def _process_read(self, data):
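Roughly, the simplified _fillbuff above now always requests one block_size chunk from the wrapped stream and pushes it through the decompressor when one is configured, instead of deriving a to_read bound from max_len. A standalone sketch of that per-fill behaviour, assuming gzip_decompressor() returns a zlib decompressobj set up for gzip framing (not part of the commit):

import zlib

def fill_once(stream, decompressor=None, block_size=1024):
    # One buffer fill: read a fixed-size block of (possibly compressed)
    # input and decompress it; no max_len bookkeeping on the compressed side.
    data = stream.read(block_size)
    if decompressor and data:
        data = decompressor.decompress(data)
    return data

# Usage sketch (file name is illustrative):
# with open('iana.cdx.gz', 'rb') as fh:
#     decomp = zlib.decompressobj(16 + zlib.MAX_WBITS)  # gzip framing
#     chunk = fill_once(fh, decomp)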
@@ -97,7 +91,7 @@ class ChunkedDataException(Exception):


 #=================================================================
-class ChunkedDataReader(BufferedReader):
+class ChunkedDataReader(DecompressingBufferedReader):
     r"""
     A ChunkedDataReader is a BufferedReader which also supports de-chunking
     of the data if it happens to be http 'chunk-encoded'.
@@ -133,7 +127,7 @@ class ChunkedDataReader(BufferedReader):

     def _fillbuff(self, block_size=None):
         if self.not_chunked:
-            return BufferedReader._fillbuff(self, block_size)
+            return super(ChunkedDataReader, self)._fillbuff(block_size)

         if self.all_chunks_read:
             return
@@ -23,12 +23,12 @@
 >>> seek_read_full(sr, 100)
 'org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf application/octet-stream 200 LNMEDYOENSOEI5VPADCKL3CB6N3GWXPR - - 34054 620049 iana.warc.gz\\n'

-#BufferedReader readline()
->>> BufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
+#DecompressingBufferedReader readline()
+>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
 ' CDX N b a m s k r M S V g\\n'

-#BufferedReader readline() with decompression
->>> BufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
+#DecompressingBufferedReader readline() with decompression
+>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
 ' CDX N b a m s k r M S V g\\n'

 >>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
@@ -41,7 +41,7 @@ import os
 import StringIO
 from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
 from pywb.utils.loaders import LimitReader, SeekableTextFileReader
-from pywb.utils.bufferedreaders import BufferedReader
+from pywb.utils.bufferedreaders import DecompressingBufferedReader

 from pywb import get_test_dir
 #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
@@ -7,7 +7,7 @@ from pywb.utils.statusandheaders import StatusAndHeadersParser
 from pywb.utils.statusandheaders import StatusAndHeadersParserException

 from pywb.utils.loaders import FileLoader, HttpLoader
-from pywb.utils.bufferedreaders import BufferedReader
+from pywb.utils.bufferedreaders import DecompressingBufferedReader

 #=================================================================
 ArcWarcRecord = collections.namedtuple('ArchiveRecord',
@@ -43,13 +43,13 @@ class ArcWarcRecordLoader:
                  '': file
                  }

-    def __init__(self, loaders={}, cookie_maker=None, chunk_size=8192):
+    def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
         self.loaders = loaders

         if not self.loaders:
             self.loaders = self.create_default_loaders(cookie_maker)

-        self.chunk_size = chunk_size
+        self.block_size = block_size

         self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)

@@ -73,9 +73,12 @@ class ArcWarcRecordLoader:

             decomp_type = 'gzip'

-        stream = BufferedReader(raw, length, self.chunk_size, decomp_type)
+        # Create decompressing stream
+        stream = DecompressingBufferedReader(stream = raw,
+                                             decomp_type = decomp_type,
+                                             block_size = self.block_size)

-        (the_format, rec_headers) = self._load_headers(stream)
+        (the_format, rec_headers) = self._detect_type_load_headers(stream)

         if the_format == 'arc':
             rec_type = 'response'
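Note that the new call above passes the raw stream straight in and no longer forwards a length to the reader. If a byte limit on the record is still wanted, it belongs on the compressed side, before decompression. A sketch of that pattern, assuming LimitReader from pywb.utils.loaders (imported in the test module earlier in this diff) caps a stream at a given number of raw bytes; this is an illustration, not what this hunk does:

from pywb.utils.loaders import LimitReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader

def open_record_stream(raw, length, decomp_type, block_size=8192):
    # The cap applies to the stored (compressed) record bytes, while read()
    # on the wrapping reader still returns the full original content.
    limited = LimitReader(raw, length)
    return DecompressingBufferedReader(stream=limited,
                                       decomp_type=decomp_type,
                                       block_size=block_size)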
@@ -111,7 +114,7 @@ class ArcWarcRecordLoader:
         return ArcWarcRecord((the_format, rec_type),
                              rec_headers, stream, status_headers)

-    def _load_headers(self, stream):
+    def _detect_type_load_headers(self, stream):
         """
         Try parsing record as WARC, then try parsing as ARC.
         if neither one succeeds, we're out of luck.
@@ -75,6 +75,11 @@ class TestWb:
         assert 'wb.js' in resp.body
         assert '/pywb/20140127171238/http://www.iana.org/time-zones' in resp.body

+    def test_replay_content_length_1(self):
+        # test larger file, rewritten file (svg!)
+        resp = self.testapp.get('/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')
+        assert resp.headers['Content-Length'] == str(len(resp.body))
+

     def test_redirect_1(self):
         resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
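The added test relies on the webtest-style response object used throughout this class: resp.body is the fully read body and resp.headers exposes the response headers. A hypothetical standalone helper expressing the same check (names assumed, not part of the commit):

def assert_content_length_matches(testapp, url):
    # The Content-Length header advertised by the replay response should
    # equal the size of the body actually delivered after rewriting.
    resp = testapp.get(url)
    assert resp.headers['Content-Length'] == str(len(resp.body))

# e.g. assert_content_length_matches(self.testapp,
#          '/pywb/20140126200654/http://www.iana.org/_img/2013.1/rir-map.svg')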