1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Handle Content-Type multipart/form-data without boundary (#599)

* Handle Content-Type multipart/form-data without boundary

* Add tests for multipart/form-data change
This commit is contained in:
Lauren Ko 2020-12-16 21:00:02 -06:00 committed by GitHub
parent de81efac78
commit b66608c5f3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 115 additions and 6 deletions

View File

@ -463,6 +463,104 @@ com,example)/xyz.pdf 20140401052011 http://example.com/xyz.pdf application/http
"""
def test_multipart_form():
    """Index a POST request whose multipart/form-data body declares a boundary.

    The WARC input holds a response record followed by its concurrent request
    record. With ``append_post=True`` the form field ``q`` from the request
    body is expected to show up in the url key of both resulting CDX lines.
    """
    test_data = b'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\
WARC-Date: 2020-11-19T19:54:34Z\r\n\
WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
Content-Type: application/http;msgtype=response\r\n\
WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\
Content-Length: 48\r\n\
WARC-Block-Digest: sha1:XN45YTSBLG5PLJ4HA7DRDYGJBM5VW4UO\r\n\
\r\n\
Content-Type: text/html; charset="utf-8"\r\n\
\r\n\
ABCD\r\n\
\r\n\
\r\n\
\r\n\
WARC/1.0\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:3084e79c-ae58-4bfd-8590-fcf2830fe896>\r\n\
WARC-Date: 2020-11-19T19:54:34Z\r\n\
WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
WARC-Concurrent-To: <urn:uuid:073fac44-c383-4a2b-980d-76fec83bd20d>\r\n\
WARC-Block-Digest: sha1:LNYP3X3NWXQLUGDI745P4L4FK27XGP24\r\n\
Content-Type: application/http;msgtype=request\r\n\
Content-Length: 321\r\n\
\r\n\
POST /ajax/bz?foo=bar HTTP/1.1\r\n\
Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
content-Length: 199\r\n\
\r\n\
------WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
Content-Disposition: form-data; name="q"\r\n\
\r\n\
[{"webSessionId":"pb2tr7:vx83uz:fdi8ta","user":"0"}]\r\n\
------WebKitFormBoundaryWUBf9liofZK0nuJd--\r\n\
\r\n\
'

    options = dict(include_all=True, append_post=True)

    buff = BytesIO()
    test_record = BytesIO(test_data)

    write_cdx_index(buff, test_record, 'test.warc.gz', **options)

    # The CDX legend line starts with the field delimiter (a space), so the
    # expected output must begin with b' CDX', not b'CDX'.
    assert buff.getvalue() == b"""\
 CDX N b a m s k r M S V g
com,example)/ajax/bz?foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar unk text/html; 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 420 0 test.warc.gz
com,example)/ajax/bz?foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar multipart/form-data - - - - 701 428 test.warc.gz
"""
def test_multipart_form_no_boundary():
    """Index a POST request whose multipart/form-data type has no boundary.

    The request's Content-Type is plain ``multipart/form-data`` with no
    ``boundary`` parameter. With ``append_post=True`` the expected url key of
    both CDX lines carries the request body as a ``__wb_post_data`` query
    parameter instead of parsed form fields.
    """
    test_data = b'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\
WARC-Date: 2020-11-19T14:02:52Z\r\n\
WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
WARC-IP-Address: 18.221.6.219\r\n\
Content-Type: application/http;msgtype=response\r\n\
WARC-Payload-Digest: sha1:SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5\r\n\
Content-Length: 41\r\n\
WARC-Block-Digest: sha1:JXKKZNALIPOW7J2FX5XUTGQZXKBSGZLU\r\n\
\r\n\
Content-Type: multipart/form-data\r\n\
\r\n\
ABCD\r\n\
\r\n\
\r\n\
\r\n\
WARC/1.0\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:d5e7186f-5725-4ed1-b199-56fbdf4bd805>\r\n\
WARC-Date: 2020-11-19T14:02:52Z\r\n\
WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
WARC-Concurrent-To: <urn:uuid:3bc1606a-d517-487e-a6d5-bfeaebda2ec3>\r\n\
WARC-Block-Digest: sha1:QJ2YUIKEWDSCLK5A2DHGLQ7WWEKYMO3W\r\n\
Content-Type: application/http;msgtype=request\r\n\
Content-Length: 111\r\n\
\r\n\
POST /core/story?v=77797 HTTP/1.1\r\n\
Content-Length: 19\r\n\
Content-Type: multipart/form-data\r\n\
\r\n\
{"text": "default"}\r\n\
\r\n\
'

    options = dict(include_all=True, append_post=True)

    buff = BytesIO()
    test_record = BytesIO(test_data)

    write_cdx_index(buff, test_record, 'test.warc.gz', **options)

    # The CDX legend line starts with the field delimiter (a space), so the
    # expected output must begin with b' CDX', not b'CDX'.
    assert buff.getvalue() == b"""\
 CDX N b a m s k r M S V g
com,connatix,capi)/core/story?__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 unk multipart/form-data SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5 - - 453 0 test.warc.gz
com,connatix,capi)/core/story?__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 multipart/form-data - - - - 500 461 test.warc.gz
"""
if __name__ == "__main__":
    # Executed as a script: run the doctests found in this module.
    from doctest import testmod
    testmod()

View File

@ -259,13 +259,17 @@ class MethodQueryCanonicalizer(object):
if PY3:
args['encoding'] = 'utf-8'
data = cgi.FieldStorage(**args)
try:
data = cgi.FieldStorage(**args)
except ValueError:
# Content-Type multipart/form-data may lack "boundary" info
query = handle_binary(query)
else:
values = []
for item in data.list:
values.append((item.name, item.value))
values = []
for item in data.list:
values.append((item.name, item.value))
query = urlencode(values, True)
query = urlencode(values, True)
elif mime.startswith('application/x-amf'):
query = self.amf_parse(query, environ)

View File

@ -143,6 +143,13 @@ class TestPostQueryExtract(object):
#base64 encoded data
assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=gTZsYEygNFAO4HICtYkZAGZQ2w6wAiw='
def test_post_extract_no_boundary_in_multipart_form_mimetype(self):
    """A multipart/form-data POST without a boundary is appended as base64 ``__wb_post_data``."""
    body = self.post_data
    canonicalizer = MethodQueryCanonicalizer('POST', 'multipart/form-data',
                                             len(body), BytesIO(body))

    expected = 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
    assert canonicalizer.append_query('http://example.com/') == expected
def test_options(self):
    """An OPTIONS request with an empty body only gets the ``__pywb_method`` marker appended."""
    empty_body = BytesIO()
    mq = MethodQueryCanonicalizer('OPTIONS', '', 0, empty_body)

    result = mq.append_query('http://example.com/')
    assert result == 'http://example.com/?__pywb_method=options'