diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index f434e492..aece175f 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -30,7 +30,7 @@ class DecompressingBufferedReader(object): DECOMPRESSORS = {'gzip': gzip_decompressor} def __init__(self, stream, block_size=1024, - decomp_type=None, + decomp_type='gzip', starting_data=None): self.stream = stream self.block_size = block_size diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index 558f8782..d061218c 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -10,8 +10,8 @@ r""" >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline() ' CDX N b a m s k r M S V g\n' -# decompress with on the fly compression ->>> DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n')), decomp_type = 'gzip').read() +# decompress with on the fly compression, default gzip compression +>>> DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read() 'ABC\n1234\n' # error: invalid compress type @@ -27,6 +27,11 @@ Exception: Decompression type not supported: bzip2 Traceback (most recent call last): error: Error -3 while decompressing: incorrect header check +# invalid output when reading compressed data as not compressed +>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != 'ABC' +True + + # DecompressingBufferedReader readline() with decompression (zipnum file, no header) >>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline() 'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n' @@ -60,6 +65,14 @@ Non-chunked data: >>> ChunkedDataReader(BytesIO("xyz123!@#")).read() 'xyz123!@#' +Non-chunked, compressed data +>>> ChunkedDataReader(BytesIO(compress('ABCDEF'))).read() 
+'ABCDEF' + +Non-chunked, compressed data, wrapped in an outer DecompressingBufferedReader +>>> DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read() +'\nABCDEF\nGHIJ' + Starts like chunked data, but isn't: >>> c = ChunkedDataReader(BytesIO("1\r\nxyz123!@#")); >>> c.read() + c.read()