mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
content-rewriter: if not rewriting content, still need to dechunk any chunk-encoded responses to conform to WSGI
header_rewriter: check if the 'transfer-encoding' header is set, to mark the response for dechunking; update dependency to warcio>=1.5.0 for better detection of chunked data by ChunkedDataReader; tests: add tests to ensure dechunking of chunk-encoded responses, and proper handling of the case where the 'transfer-encoding' header is present but the body is not actually chunked
This commit is contained in:
parent
af0f9c22cb
commit
77a2e5370f
@ -2,7 +2,7 @@ from io import BytesIO
|
||||
|
||||
from contextlib import closing
|
||||
|
||||
from warcio.bufferedreaders import BufferedReader
|
||||
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
|
||||
from warcio.utils import to_native_str
|
||||
|
||||
import re
|
||||
@ -163,15 +163,26 @@ class BaseContentRewriter(object):
|
||||
rule = self.get_rule(cdx)
|
||||
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
|
||||
|
||||
gen = None
|
||||
|
||||
if content_rewriter:
|
||||
gen = content_rewriter(rwinfo)
|
||||
elif rwinfo.is_content_rw:
|
||||
gen = StreamIter(rwinfo.content_stream)
|
||||
else:
|
||||
gen = StreamIter(rwinfo.record.raw_stream)
|
||||
|
||||
rw_http_headers = self.rewrite_headers(rwinfo)
|
||||
|
||||
if not gen:
|
||||
# if not rewriting content, still need to dechunk
|
||||
# to conform to WSGI spec
|
||||
if rwinfo.is_chunked:
|
||||
stream = ChunkedDataReader(rwinfo.record.raw_stream,
|
||||
decomp_type=None)
|
||||
else:
|
||||
stream = rwinfo.record.raw_stream
|
||||
|
||||
gen = StreamIter(stream)
|
||||
|
||||
return rw_http_headers, gen, (content_rewriter != None)
|
||||
|
||||
def init_js_regexs(self, regexs):
|
||||
@ -280,6 +291,7 @@ class RewriteInfo(object):
|
||||
|
||||
self._content_stream = None
|
||||
self.is_content_rw = False
|
||||
self.is_chunked = False
|
||||
|
||||
self.rewrite_types = content_rewriter.get_rewrite_types()
|
||||
|
||||
|
@ -60,7 +60,7 @@ class DefaultHeaderRewriter(object):
|
||||
'strict-transport-security': 'prefix',
|
||||
|
||||
'trailer': 'prefix',
|
||||
'transfer-encoding': 'prefix',
|
||||
'transfer-encoding': 'transfer-encoding',
|
||||
'tk': 'prefix',
|
||||
|
||||
'upgrade': 'prefix',
|
||||
@ -133,6 +133,10 @@ class DefaultHeaderRewriter(object):
|
||||
|
||||
return (self.header_prefix + name, value)
|
||||
|
||||
elif rule == 'transfer-encoding':
|
||||
self.rwinfo.is_chunked = True
|
||||
return (self.header_prefix + name, value)
|
||||
|
||||
elif rule == 'cookie':
|
||||
if self.rwinfo.cookie_rewriter:
|
||||
return self.rwinfo.cookie_rewriter.rewrite(value)
|
||||
|
@ -6,6 +6,8 @@ from io import BytesIO
|
||||
from pywb.warcserver.index.cdxobject import CDXObject
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
from pywb.utils.io import chunk_encode_iter
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.default_rewriter import DefaultRewriter
|
||||
@ -153,6 +155,34 @@ class TestContentRewriter(object):
|
||||
|
||||
assert is_rw == False
|
||||
|
||||
def test_binary_dechunk(self):
    # A chunk-encoded binary body must come out dechunked even though
    # no content rewriting is applied to it.
    pieces = [b'ABCD'] * 10
    chunked_body = b''.join(chunk_encode_iter(pieces))

    headers, gen, is_rw = self.rewrite_record(
        {'Content-Type': 'application/octet-stream',
         'Transfer-Encoding': 'chunked'},
        chunked_body.decode('utf-8'),
        ts='201701mp_')

    # The emitted body is the plain payload, with no chunk framing left.
    body = b''.join(gen)
    assert body.decode('utf-8') == 'ABCD' * 10

    # Binary content is passed through, not rewritten.
    assert is_rw == False

    # The original header must not be emitted unchanged.
    assert ('Transfer-Encoding', 'chunked') not in headers.headers
|
||||
|
||||
def test_binary_dechunk_not_actually_chunked(self):
    # The record declares 'Transfer-Encoding: chunked' but its body is
    # plain, unchunked data; the body must still come through intact.
    plain_body = 'ABCD' * 10

    headers, gen, is_rw = self.rewrite_record(
        {'Content-Type': 'application/octet-stream',
         'Transfer-Encoding': 'chunked'},
        plain_body,
        ts='201701mp_')

    # The emitted body equals the input payload unchanged.
    body = b''.join(gen)
    assert body.decode('utf-8') == 'ABCD' * 10

    # Binary content is passed through, not rewritten.
    assert is_rw == False

    # The original header must not be emitted unchanged.
    assert ('Transfer-Encoding', 'chunked') not in headers.headers
|
||||
|
||||
def test_rewrite_json(self):
|
||||
headers = {'Content-Type': 'application/json'}
|
||||
content = '/**/ jQuery_ABC({"foo": "bar"});'
|
||||
|
@ -1,5 +1,5 @@
|
||||
six
|
||||
warcio>=1.4.0
|
||||
warcio>=1.5.0
|
||||
chardet
|
||||
requests
|
||||
redis
|
||||
|
Loading…
x
Reference in New Issue
Block a user