From faae23d764f5e581076a7ac4441e5c20e6eb2cf1 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 27 Sep 2017 17:29:55 -0700
Subject: [PATCH] allow very long request header lines, to support large warcprox-meta header values

---
 setup.py               |  2 +-
 tests/test_warcprox.py | 38 ++++++++++++++++++++++++++++++++++++++
 warcprox/mitmproxy.py  |  6 ++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 25a2e49..0824491 100755
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.2b1.dev100',
+        version='2.2b1.dev101',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index dd80a86..8b44974 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1429,6 +1429,44 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
                 elif record.rec_type == 'request':
                     assert record.http_headers.get_header('via') == '1.1 warcprox'
 
+def test_long_warcprox_meta(
+        warcprox_, http_daemon, archiving_proxies, playback_proxies):
+    url = 'http://localhost:%s/b/g' % http_daemon.server_port
+
+    # create a very long warcprox-meta header
+    headers = {'Warcprox-Meta': json.dumps({
+        'x':'y'*1000000, 'warc-prefix': 'test_long_warcprox_meta'})}
+    response = requests.get(
+            url, proxies=archiving_proxies, headers=headers, verify=False)
+    assert response.status_code == 200
+
+    # wait for writer thread to process
+    time.sleep(0.5)
+    while not all(wwt.idle for wwt in warcprox_.warc_writer_threads):
+        time.sleep(0.5)
+    time.sleep(0.5)
+
+    # check that warcprox-meta was parsed and honored ("warc-prefix" param)
+    assert warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"]
+    writer = warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"]
+    warc_path = os.path.join(writer.directory, writer._f_finalname)
+    warcprox_.warc_writer_threads[0].writer_pool.warc_writers["test_long_warcprox_meta"].close_writer()
+    assert os.path.exists(warc_path)
+
+    # read the warc
+    with open(warc_path, 'rb') as f:
+        rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
+        record = next(rec_iter)
+        assert record.rec_type == 'warcinfo'
+        record = next(rec_iter)
+        assert record.rec_type == 'response'
+        assert record.rec_headers.get_header('warc-target-uri') == url
+        record = next(rec_iter)
+        assert record.rec_type == 'request'
+        assert record.rec_headers.get_header('warc-target-uri') == url
+        with pytest.raises(StopIteration):
+            next(rec_iter)
+
 if __name__ == '__main__':
     pytest.main()
 
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 6297dcc..914fb52 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -37,6 +37,12 @@ except ImportError:
     import urlparse as urllib_parse
 try:
     import http.client as http_client
+    # In python3 http.client.parse_headers() enforces http_client._MAXLINE
+    # as max length of an http header line, but we want to support very
+    # long warcprox-meta headers, so we tweak it here. Python2 doesn't seem
+    # to enforce any limit. Multiline headers could be an option but it
+    # turns out those are illegal as of RFC 7230. Plus, this is easier.
+    http_client._MAXLINE = 4194304 # 4 MiB
 except ImportError:
     import httplib as http_client
 import socket