Fall back to binary POST encoding when a multipart request cannot be parsed.

MethodQueryCanonicalizer now wraps cgi.FieldStorage(**args) in
try/except ValueError. When parsing fails (the in-code comment names a
multipart/form-data Content-Type without "boundary" info), the raw body is
passed to handle_binary() instead of being url-encoded field by field.
Tests cover a well-formed multipart body and a boundary-less one, both in the
CDX indexer and in the POST query extractor.

NOTE(review): this copy of the patch arrived with its line breaks and
indentation collapsed. Line structure was rebuilt to match the hunk line
counts. Absolute indentation in the inputrequest.py hunk and the blank context
lines are reconstructed; verify them against the target files.

NOTE(review): the WARC-Record-ID / WARC-Concurrent-To values were empty in
this copy. The expected CDX lengths and offsets only add up when each value is
a 47-byte token, so synthetic <urn:uuid:...> placeholders were inserted. The
real identifiers, and therefore the blob hashes on the "index" lines, differ
from upstream.

diff --git a/pywb/indexer/test/test_indexing.py b/pywb/indexer/test/test_indexing.py
index 5d39f6f5..908cfd96 100644
--- a/pywb/indexer/test/test_indexing.py
+++ b/pywb/indexer/test/test_indexing.py
@@ -463,6 +463,104 @@ com,example)/xyz.pdf 20140401052011 http://example.com/xyz.pdf application/http
 """
 
 
+def test_multipart_form():
+    test_data = b'\
+WARC/1.0\r\n\
+WARC-Type: response\r\n\
+WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000001>\r\n\
+WARC-Date: 2020-11-19T19:54:34Z\r\n\
+WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
+Content-Type: application/http;msgtype=response\r\n\
+WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\
+Content-Length: 48\r\n\
+WARC-Block-Digest: sha1:XN45YTSBLG5PLJ4HA7DRDYGJBM5VW4UO\r\n\
+\r\n\
+Content-Type: text/html; charset="utf-8"\r\n\
+\r\n\
+ABCD\r\n\
+\r\n\
+\r\n\
+\r\n\
+WARC/1.0\r\n\
+WARC-Type: request\r\n\
+WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000002>\r\n\
+WARC-Date: 2020-11-19T19:54:34Z\r\n\
+WARC-Target-URI: https://example.com/ajax/bz?foo=bar\r\n\
+WARC-Concurrent-To: <urn:uuid:00000000-0000-0000-0000-000000000001>\r\n\
+WARC-Block-Digest: sha1:LNYP3X3NWXQLUGDI745P4L4FK27XGP24\r\n\
+Content-Type: application/http;msgtype=request\r\n\
+Content-Length: 321\r\n\
+\r\n\
+POST /ajax/bz?foo=bar HTTP/1.1\r\n\
+Content-Type: multipart/form-data; boundary=----WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
+content-Length: 199\r\n\
+\r\n\
+------WebKitFormBoundaryWUBf9liofZK0nuJd\r\n\
+Content-Disposition: form-data; name="q"\r\n\
+\r\n\
+[{"webSessionId":"pb2tr7:vx83uz:fdi8ta","user":"0"}]\r\n\
+------WebKitFormBoundaryWUBf9liofZK0nuJd--\r\n\
+\r\n\
+'
+    options = dict(include_all=True, append_post=True)
+    buff = BytesIO()
+    test_record = BytesIO(test_data)
+    write_cdx_index(buff, test_record, 'test.warc.gz', **options)
+    print(buff.getvalue())
+    assert buff.getvalue() == b"""\
+ CDX N b a m s k r M S V g
+com,example)/ajax/bz?foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar unk text/html; 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 420 0 test.warc.gz
+com,example)/ajax/bz?foo=bar&q=[{"websessionid":"pb2tr7:vx83uz:fdi8ta","user":"0"}] 20201119195434 https://example.com/ajax/bz?foo=bar multipart/form-data - - - - 701 428 test.warc.gz
+"""
+
+
+def test_multipart_form_no_boundary():
+    test_data = b'\
+WARC/1.0\r\n\
+WARC-Type: response\r\n\
+WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000003>\r\n\
+WARC-Date: 2020-11-19T14:02:52Z\r\n\
+WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
+WARC-IP-Address: 18.221.6.219\r\n\
+Content-Type: application/http;msgtype=response\r\n\
+WARC-Payload-Digest: sha1:SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5\r\n\
+Content-Length: 41\r\n\
+WARC-Block-Digest: sha1:JXKKZNALIPOW7J2FX5XUTGQZXKBSGZLU\r\n\
+\r\n\
+Content-Type: multipart/form-data\r\n\
+\r\n\
+ABCD\r\n\
+\r\n\
+\r\n\
+\r\n\
+WARC/1.0\r\n\
+WARC-Type: request\r\n\
+WARC-Record-ID: <urn:uuid:00000000-0000-0000-0000-000000000004>\r\n\
+WARC-Date: 2020-11-19T14:02:52Z\r\n\
+WARC-Target-URI: https://capi.connatix.com/core/story?v=77797\r\n\
+WARC-Concurrent-To: <urn:uuid:00000000-0000-0000-0000-000000000003>\r\n\
+WARC-Block-Digest: sha1:QJ2YUIKEWDSCLK5A2DHGLQ7WWEKYMO3W\r\n\
+Content-Type: application/http;msgtype=request\r\n\
+Content-Length: 111\r\n\
+\r\n\
+POST /core/story?v=77797 HTTP/1.1\r\n\
+Content-Length: 19\r\n\
+Content-Type: multipart/form-data\r\n\
+\r\n\
+{"text": "default"}\r\n\
+\r\n\
+'
+    options = dict(include_all=True, append_post=True)
+    buff = BytesIO()
+    test_record = BytesIO(test_data)
+    write_cdx_index(buff, test_record, 'test.warc.gz', **options)
+    assert buff.getvalue() == b"""\
+ CDX N b a m s k r M S V g
+com,connatix,capi)/core/story?__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 unk multipart/form-data SIGZ3RJW5J7DUKEZ4R7RSYUZNGLETIS5 - - 453 0 test.warc.gz
+com,connatix,capi)/core/story?__wb_post_data=eyj0zxh0ijogimrlzmf1bhqifq==&v=77797 20201119140252 https://capi.connatix.com/core/story?v=77797 multipart/form-data - - - - 500 461 test.warc.gz
+"""
+
+
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
diff --git a/pywb/warcserver/inputrequest.py b/pywb/warcserver/inputrequest.py
index f616648e..2d8e2b06 100644
--- a/pywb/warcserver/inputrequest.py
+++ b/pywb/warcserver/inputrequest.py
@@ -259,13 +259,17 @@ class MethodQueryCanonicalizer(object):
             if PY3:
                 args['encoding'] = 'utf-8'
 
-            data = cgi.FieldStorage(**args)
+            try:
+                data = cgi.FieldStorage(**args)
+            except ValueError:
+                # Content-Type multipart/form-data may lack "boundary" info
+                query = handle_binary(query)
+            else:
+                values = []
+                for item in data.list:
+                    values.append((item.name, item.value))
 
-            values = []
-            for item in data.list:
-                values.append((item.name, item.value))
-
-            query = urlencode(values, True)
+                query = urlencode(values, True)
 
         elif mime.startswith('application/x-amf'):
             query = self.amf_parse(query, environ)
diff --git a/pywb/warcserver/test/test_inputreq.py b/pywb/warcserver/test/test_inputreq.py
index fe5cf67c..48abcdb4 100644
--- a/pywb/warcserver/test/test_inputreq.py
+++ b/pywb/warcserver/test/test_inputreq.py
@@ -143,6 +143,13 @@ class TestPostQueryExtract(object):
         #base64 encoded data
         assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=gTZsYEygNFAO4HICtYkZAGZQ2w6wAiw='
 
+    def test_post_extract_no_boundary_in_multipart_form_mimetype(self):
+        mq = MethodQueryCanonicalizer('POST', 'multipart/form-data',
+                                      len(self.post_data), BytesIO(self.post_data))
+
+        assert mq.append_query('http://example.com/') == 'http://example.com/?__wb_post_data=Zm9vPWJhciZkaXI9JTJGYmF6'
+
+
     def test_options(self):
         mq = MethodQueryCanonicalizer('OPTIONS', '', 0, BytesIO())
         assert mq.append_query('http://example.com/') == 'http://example.com/?__pywb_method=options'