diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index c62f39d9..6d4ced33 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -81,7 +81,12 @@ class RewriteContent: if (rewritten_headers. contains_removed_header('content-encoding', 'gzip')): - stream = DecompressingBufferedReader(stream, decomp_type='gzip') + + #optimize: if already a ChunkedDataReader, add gzip + if isinstance(stream, ChunkedDataReader): + stream.set_decomp('gzip') + else: + stream = DecompressingBufferedReader(stream, decomp_type='gzip') if rewritten_headers.charset: encoding = rewritten_headers.charset diff --git a/pywb/utils/bufferedreaders.py b/pywb/utils/bufferedreaders.py index ba0ffc0e..7e461dee 100644 --- a/pywb/utils/bufferedreaders.py +++ b/pywb/utils/bufferedreaders.py @@ -11,7 +11,7 @@ def gzip_decompressor(): #================================================================= -class DecompressingBufferedReader(object): +class BufferedReader(object): """ A wrapping line reader which wraps an existing reader. Read operations operate on underlying buffer, which is filled to @@ -20,9 +20,12 @@ class DecompressingBufferedReader(object): If an optional decompress type is specified, data is fed through the decompressor when read from the buffer. Currently supported decompression: gzip + If unspecified, default decompression is None - If decompression fails on first try, data is assumed to be decompressed - and no exception is thrown. If a failure occurs after data has been + If decompression is specified, and decompress fails on first try, + data is assumed to not be compressed and no exception is thrown. + + If a failure occurs after data has been partially decompressed, the exception is propagated. 
""" @@ -30,7 +33,7 @@ class DecompressingBufferedReader(object): DECOMPRESSORS = {'gzip': gzip_decompressor} def __init__(self, stream, block_size=1024, - decomp_type='gzip', + decomp_type=None, starting_data=None): self.stream = stream self.block_size = block_size @@ -42,6 +45,12 @@ class DecompressingBufferedReader(object): self.num_read = 0 self.buff_size = 0 + def set_decomp(self, decomp_type): + if self.num_read > 0: + raise Exception('Attempting to change decompression mid-stream') + + self._init_decomp(decomp_type) + def _init_decomp(self, decomp_type): if decomp_type: try: @@ -161,6 +170,18 @@ class DecompressingBufferedReader(object): self.stream = None +#================================================================= +class DecompressingBufferedReader(BufferedReader): + """ + A BufferedReader which defaults to gzip decompression, + (unless different type specified) + """ + def __init__(self, *args, **kwargs): + if 'decomp_type' not in kwargs: + kwargs['decomp_type'] = 'gzip' + super(DecompressingBufferedReader, self).__init__(*args, **kwargs) + + #================================================================= class ChunkedDataException(Exception): def __init__(self, msg, data=''): @@ -169,7 +190,7 @@ class ChunkedDataException(Exception): #================================================================= -class ChunkedDataReader(DecompressingBufferedReader): +class ChunkedDataReader(BufferedReader): r""" A ChunkedDataReader is a DecompressingBufferedReader which also supports de-chunking of the data if it happens diff --git a/pywb/utils/test/test_bufferedreaders.py b/pywb/utils/test/test_bufferedreaders.py index c764e09d..cd5f3787 100644 --- a/pywb/utils/test/test_bufferedreaders.py +++ b/pywb/utils/test/test_bufferedreaders.py @@ -65,11 +65,15 @@ Non-chunked data: >>> ChunkedDataReader(BytesIO("xyz123!@#")).read() 'xyz123!@#' -Non-chunked, compressed data ->>> ChunkedDataReader(BytesIO(compress('ABCDEF'))).read() +Non-chunked, compressed data, 
specify decomp_type +>>> ChunkedDataReader(BytesIO(compress('ABCDEF')), decomp_type='gzip').read() 'ABCDEF' -Non-chunked, compressed data +Non-chunked, compressed data, specify compression separately +>>> c = ChunkedDataReader(BytesIO(compress('ABCDEF'))); c.set_decomp('gzip'); c.read() +'ABCDEF' + +Non-chunked, compressed data, wrap in DecompressingBufferedReader >>> DecompressingBufferedReader(ChunkedDataReader(BytesIO(compress('\nABCDEF\nGHIJ')))).read() '\nABCDEF\nGHIJ'