DecompressingBufferedReader: default to 'gzip' decompression instead of none.

ChunkedDataReader also automatically attempts decompression by default. Add tests to verify.
2025-03-15 00:03:28 +01:00 · 2014-04-08 21:49:04 -07:00 · 2014-04-08 21:49:04 -07:00 · 8897a0a7c9
commit 8897a0a7c9
parent 02fe78cb0b
2 changed files with 16 additions and 3 deletions
--- a/pywb/utils/bufferedreaders.py
+++ b/pywb/utils/bufferedreaders.py
@@ -30,7 +30,7 @@ class DecompressingBufferedReader(object):
    DECOMPRESSORS = {'gzip': gzip_decompressor}

    def __init__(self, stream, block_size=1024,
-                 decomp_type=None,
+                 decomp_type='gzip',
                 starting_data=None):
        self.stream = stream
        self.block_size = block_size
--- a/pywb/utils/test/test_bufferedreaders.py
+++ b/pywb/utils/test/test_bufferedreaders.py
@@ -10,8 +10,8 @@ r"""
 >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb'), decomp_type = 'gzip').readline()
 ' CDX N b a m s k r M S V g\n'

-# decompress with on the fly compression
->>> DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n')), decomp_type = 'gzip').read()
+# decompress with on the fly compression, default gzip compression
+>>> DecompressingBufferedReader(BytesIO(compress('ABC\n1234\n'))).read()
 'ABC\n1234\n'

 # error: invalid compress type
@@ -27,6 +27,11 @@ Exception: Decompression type not supported: bzip2
 Traceback (most recent call last):
 error: Error -3 while decompressing: incorrect header check

+# invalid output when reading compressed data as not compressed
+>>> DecompressingBufferedReader(BytesIO(compress('ABC')), decomp_type = None).read() != 'ABC'
+True
+
+
 # DecompressingBufferedReader readline() with decompression (zipnum file, no header)
 >>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
 'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\n'
@@ -60,6 +65,14 @@ Non-chunked data:
 >>> ChunkedDataReader(BytesIO("xyz123!@#")).read()
 'xyz123!@#'

+Non-chunked, compressed data
+>>> ChunkedDataReader(BytesIO(compress('ABCDEF'))).read()
+'ABCDEF'
+
+Non-chunked, compressed data
+>>> DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read()
+'\nABCDEF\nGHIJ'
+
 Starts like chunked data, but isn't:
 >>> c = ChunkedDataReader(BytesIO("1\r\nxyz123!@#"));
 >>> c.read() + c.read()