diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 680db354..7e03b5a8 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -2,7 +2,7 @@ from io import BytesIO from contextlib import closing -from warcio.bufferedreaders import BufferedReader +from warcio.bufferedreaders import BufferedReader, ChunkedDataReader from warcio.utils import to_native_str import re @@ -163,15 +163,26 @@ class BaseContentRewriter(object): rule = self.get_rule(cdx) content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func) + gen = None + if content_rewriter: gen = content_rewriter(rwinfo) elif rwinfo.is_content_rw: gen = StreamIter(rwinfo.content_stream) - else: - gen = StreamIter(rwinfo.record.raw_stream) rw_http_headers = self.rewrite_headers(rwinfo) + if not gen: + # if not rewriting content, still need to dechunk + # to conform to WSGI spec + if rwinfo.is_chunked: + stream = ChunkedDataReader(rwinfo.record.raw_stream, + decomp_type=None) + else: + stream = rwinfo.record.raw_stream + + gen = StreamIter(stream) + return rw_http_headers, gen, (content_rewriter != None) def init_js_regexs(self, regexs): @@ -280,6 +291,7 @@ class RewriteInfo(object): self._content_stream = None self.is_content_rw = False + self.is_chunked = False self.rewrite_types = content_rewriter.get_rewrite_types() diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index e851c442..410c1659 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -60,7 +60,7 @@ class DefaultHeaderRewriter(object): 'strict-transport-security': 'prefix', 'trailer': 'prefix', - 'transfer-encoding': 'prefix', + 'transfer-encoding': 'transfer-encoding', 'tk': 'prefix', 'upgrade': 'prefix', @@ -133,6 +133,10 @@ class DefaultHeaderRewriter(object): return (self.header_prefix + name, value) + elif rule == 'transfer-encoding': + self.rwinfo.is_chunked = True + return (self.header_prefix + name, value) + elif rule == 'cookie': if self.rwinfo.cookie_rewriter: return self.rwinfo.cookie_rewriter.rewrite(value) diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 981a0018..56f9ccc5 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -6,6 +6,8 @@ from io import BytesIO from pywb.warcserver.index.cdxobject import CDXObject from pywb.utils.canonicalize import canonicalize +from pywb.utils.io import chunk_encode_iter + from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.default_rewriter import DefaultRewriter @@ -153,6 +155,34 @@ class TestContentRewriter(object): assert is_rw == False + def test_binary_dechunk(self): + headers = {'Content-Type': 'application/octet-stream', + 'Transfer-Encoding': 'chunked'} + + content = b''.join(chunk_encode_iter([b'ABCD'] * 10)).decode('utf-8') + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = ''.join(['ABCD'] * 10) + assert b''.join(gen).decode('utf-8') == exp + + assert is_rw == False + + assert ('Transfer-Encoding', 'chunked') not in headers.headers + + def test_binary_dechunk_not_actually_chunked(self): + headers = {'Content-Type': 'application/octet-stream', + 'Transfer-Encoding': 'chunked'} + + content = ''.join(['ABCD'] * 10) + headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_') + + exp = ''.join(['ABCD'] * 10) + assert b''.join(gen).decode('utf-8') == exp + + assert is_rw == False + + assert ('Transfer-Encoding', 'chunked') not in headers.headers + def test_rewrite_json(self): headers = {'Content-Type': 'application/json'} content = '/**/ jQuery_ABC({"foo": "bar"});' diff --git a/requirements.txt b/requirements.txt index 246dc957..1c5ee468 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ six -warcio>=1.4.0 +warcio>=1.5.0 chardet requests redis