mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
content-rewriter: if not rewriting content, still need to dechunk any chunk-encoded responses to conform to WSGI
header_rewriter: check if the 'transfer-encoding' header is set, to mark the response for dechunking; update dependency to warcio>=1.5.0 for better detection of chunked data by ChunkedDataReader; tests: add tests to ensure dechunking of a chunk-encoded response, and proper handling of the case where the 'transfer-encoding' header is present but the body is not actually chunked
This commit is contained in:
parent
af0f9c22cb
commit
77a2e5370f
@ -2,7 +2,7 @@ from io import BytesIO
|
|||||||
|
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
from warcio.bufferedreaders import BufferedReader
|
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
|
||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
import re
|
import re
|
||||||
@ -163,15 +163,26 @@ class BaseContentRewriter(object):
|
|||||||
rule = self.get_rule(cdx)
|
rule = self.get_rule(cdx)
|
||||||
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
|
content_rewriter = self.create_rewriter(rwinfo.text_type, rule, rwinfo, cdx, head_insert_func)
|
||||||
|
|
||||||
|
gen = None
|
||||||
|
|
||||||
if content_rewriter:
|
if content_rewriter:
|
||||||
gen = content_rewriter(rwinfo)
|
gen = content_rewriter(rwinfo)
|
||||||
elif rwinfo.is_content_rw:
|
elif rwinfo.is_content_rw:
|
||||||
gen = StreamIter(rwinfo.content_stream)
|
gen = StreamIter(rwinfo.content_stream)
|
||||||
else:
|
|
||||||
gen = StreamIter(rwinfo.record.raw_stream)
|
|
||||||
|
|
||||||
rw_http_headers = self.rewrite_headers(rwinfo)
|
rw_http_headers = self.rewrite_headers(rwinfo)
|
||||||
|
|
||||||
|
if not gen:
|
||||||
|
# if not rewriting content, still need to dechunk
|
||||||
|
# to conform to WSGI spec
|
||||||
|
if rwinfo.is_chunked:
|
||||||
|
stream = ChunkedDataReader(rwinfo.record.raw_stream,
|
||||||
|
decomp_type=None)
|
||||||
|
else:
|
||||||
|
stream = rwinfo.record.raw_stream
|
||||||
|
|
||||||
|
gen = StreamIter(stream)
|
||||||
|
|
||||||
return rw_http_headers, gen, (content_rewriter != None)
|
return rw_http_headers, gen, (content_rewriter != None)
|
||||||
|
|
||||||
def init_js_regexs(self, regexs):
|
def init_js_regexs(self, regexs):
|
||||||
@ -280,6 +291,7 @@ class RewriteInfo(object):
|
|||||||
|
|
||||||
self._content_stream = None
|
self._content_stream = None
|
||||||
self.is_content_rw = False
|
self.is_content_rw = False
|
||||||
|
self.is_chunked = False
|
||||||
|
|
||||||
self.rewrite_types = content_rewriter.get_rewrite_types()
|
self.rewrite_types = content_rewriter.get_rewrite_types()
|
||||||
|
|
||||||
|
@ -60,7 +60,7 @@ class DefaultHeaderRewriter(object):
|
|||||||
'strict-transport-security': 'prefix',
|
'strict-transport-security': 'prefix',
|
||||||
|
|
||||||
'trailer': 'prefix',
|
'trailer': 'prefix',
|
||||||
'transfer-encoding': 'prefix',
|
'transfer-encoding': 'transfer-encoding',
|
||||||
'tk': 'prefix',
|
'tk': 'prefix',
|
||||||
|
|
||||||
'upgrade': 'prefix',
|
'upgrade': 'prefix',
|
||||||
@ -133,6 +133,10 @@ class DefaultHeaderRewriter(object):
|
|||||||
|
|
||||||
return (self.header_prefix + name, value)
|
return (self.header_prefix + name, value)
|
||||||
|
|
||||||
|
elif rule == 'transfer-encoding':
|
||||||
|
self.rwinfo.is_chunked = True
|
||||||
|
return (self.header_prefix + name, value)
|
||||||
|
|
||||||
elif rule == 'cookie':
|
elif rule == 'cookie':
|
||||||
if self.rwinfo.cookie_rewriter:
|
if self.rwinfo.cookie_rewriter:
|
||||||
return self.rwinfo.cookie_rewriter.rewrite(value)
|
return self.rwinfo.cookie_rewriter.rewrite(value)
|
||||||
|
@ -6,6 +6,8 @@ from io import BytesIO
|
|||||||
from pywb.warcserver.index.cdxobject import CDXObject
|
from pywb.warcserver.index.cdxobject import CDXObject
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
|
from pywb.utils.io import chunk_encode_iter
|
||||||
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.rewrite.wburl import WbUrl
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.rewrite.default_rewriter import DefaultRewriter
|
from pywb.rewrite.default_rewriter import DefaultRewriter
|
||||||
@ -153,6 +155,34 @@ class TestContentRewriter(object):
|
|||||||
|
|
||||||
assert is_rw == False
|
assert is_rw == False
|
||||||
|
|
||||||
|
def test_binary_dechunk(self):
|
||||||
|
headers = {'Content-Type': 'application/octet-stream',
|
||||||
|
'Transfer-Encoding': 'chunked'}
|
||||||
|
|
||||||
|
content = b''.join(chunk_encode_iter([b'ABCD'] * 10)).decode('utf-8')
|
||||||
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||||
|
|
||||||
|
exp = ''.join(['ABCD'] * 10)
|
||||||
|
assert b''.join(gen).decode('utf-8') == exp
|
||||||
|
|
||||||
|
assert is_rw == False
|
||||||
|
|
||||||
|
assert ('Transfer-Encoding', 'chunked') not in headers.headers
|
||||||
|
|
||||||
|
def test_binary_dechunk_not_actually_chunked(self):
|
||||||
|
headers = {'Content-Type': 'application/octet-stream',
|
||||||
|
'Transfer-Encoding': 'chunked'}
|
||||||
|
|
||||||
|
content = ''.join(['ABCD'] * 10)
|
||||||
|
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||||
|
|
||||||
|
exp = ''.join(['ABCD'] * 10)
|
||||||
|
assert b''.join(gen).decode('utf-8') == exp
|
||||||
|
|
||||||
|
assert is_rw == False
|
||||||
|
|
||||||
|
assert ('Transfer-Encoding', 'chunked') not in headers.headers
|
||||||
|
|
||||||
def test_rewrite_json(self):
|
def test_rewrite_json(self):
|
||||||
headers = {'Content-Type': 'application/json'}
|
headers = {'Content-Type': 'application/json'}
|
||||||
content = '/**/ jQuery_ABC({"foo": "bar"});'
|
content = '/**/ jQuery_ABC({"foo": "bar"});'
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
six
|
six
|
||||||
warcio>=1.4.0
|
warcio>=1.5.0
|
||||||
chardet
|
chardet
|
||||||
requests
|
requests
|
||||||
redis
|
redis
|
||||||
|
Loading…
x
Reference in New Issue
Block a user