1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

content-rewriter: if not rewriting content, still need to dechunk any chunk-encoded responses to conform to WSGI

header_rewriter: check if the 'transfer-encoding' header is set, to mark the response for dechunking
update dependency to warcio>=1.5.0 for better detection of chunked data by ChunkedDataReader
tests: add tests to ensure a chunk-encoded response is dechunked, and that the case where the 'transfer-encoding' header is present but the body is not actually chunked is handled properly
This commit is contained in:
Ilya Kreymer 2017-10-26 20:37:17 -07:00
parent af0f9c22cb
commit 77a2e5370f
4 changed files with 51 additions and 5 deletions

View File

@ -2,7 +2,7 @@ from io import BytesIO
from contextlib import closing
from warcio.bufferedreaders import BufferedReader
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
from warcio.utils import to_native_str
import re
@ -163,15 +163,26 @@ class BaseContentRewriter(object):
rule = self.get_rule(cdx)
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
gen = None
if content_rewriter:
gen = content_rewriter(rwinfo)
elif rwinfo.is_content_rw:
gen = StreamIter(rwinfo.content_stream)
else:
gen = StreamIter(rwinfo.record.raw_stream)
rw_http_headers = self.rewrite_headers(rwinfo)
if not gen:
# if not rewriting content, still need to dechunk
# to conform to WSGI spec
if rwinfo.is_chunked:
stream = ChunkedDataReader(rwinfo.record.raw_stream,
decomp_type=None)
else:
stream = rwinfo.record.raw_stream
gen = StreamIter(stream)
return rw_http_headers, gen, (content_rewriter != None)
def init_js_regexs(self, regexs):
@ -280,6 +291,7 @@ class RewriteInfo(object):
self._content_stream = None
self.is_content_rw = False
self.is_chunked = False
self.rewrite_types = content_rewriter.get_rewrite_types()

View File

@ -60,7 +60,7 @@ class DefaultHeaderRewriter(object):
'strict-transport-security': 'prefix',
'trailer': 'prefix',
'transfer-encoding': 'prefix',
'transfer-encoding': 'transfer-encoding',
'tk': 'prefix',
'upgrade': 'prefix',
@ -133,6 +133,10 @@ class DefaultHeaderRewriter(object):
return (self.header_prefix + name, value)
elif rule == 'transfer-encoding':
self.rwinfo.is_chunked = True
return (self.header_prefix + name, value)
elif rule == 'cookie':
if self.rwinfo.cookie_rewriter:
return self.rwinfo.cookie_rewriter.rewrite(value)

View File

@ -6,6 +6,8 @@ from io import BytesIO
from pywb.warcserver.index.cdxobject import CDXObject
from pywb.utils.canonicalize import canonicalize
from pywb.utils.io import chunk_encode_iter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.default_rewriter import DefaultRewriter
@ -153,6 +155,34 @@ class TestContentRewriter(object):
assert is_rw == False
def test_binary_dechunk(self):
    """Binary response sent with chunked encoding is dechunked, not rewritten.

    The body is built with ``chunk_encode_iter`` and the record carries a
    ``Transfer-Encoding: chunked`` header; the output must be the plain
    payload, and the original header must no longer appear verbatim.
    """
    expected = ''.join(['ABCD'] * 10)

    record_headers = {'Content-Type': 'application/octet-stream',
                      'Transfer-Encoding': 'chunked'}
    chunked_body = b''.join(chunk_encode_iter([b'ABCD'] * 10)).decode('utf-8')

    headers, gen, is_rw = self.rewrite_record(record_headers, chunked_body, ts='201701mp_')

    body = b''.join(gen).decode('utf-8')
    assert body == expected
    assert is_rw == False
    assert ('Transfer-Encoding', 'chunked') not in headers.headers
def test_binary_dechunk_not_actually_chunked(self):
    """``Transfer-Encoding: chunked`` header present, but body is not chunked.

    The plain payload must come through unchanged, content must not be
    marked as rewritten, and the original header must no longer appear
    verbatim.
    """
    plain_body = ''.join(['ABCD'] * 10)
    expected = ''.join(['ABCD'] * 10)

    record_headers = {'Content-Type': 'application/octet-stream',
                      'Transfer-Encoding': 'chunked'}

    headers, gen, is_rw = self.rewrite_record(record_headers, plain_body, ts='201701mp_')

    body = b''.join(gen).decode('utf-8')
    assert body == expected
    assert is_rw == False
    assert ('Transfer-Encoding', 'chunked') not in headers.headers
def test_rewrite_json(self):
headers = {'Content-Type': 'application/json'}
content = '/**/ jQuery_ABC({"foo": "bar"});'

View File

@ -1,5 +1,5 @@
six
warcio>=1.4.0
warcio>=1.5.0
chardet
requests
redis