From faae23d764f5e581076a7ac4441e5c20e6eb2cf1 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Wed, 27 Sep 2017 17:29:55 -0700
Subject: [PATCH] allow very long request header lines, to support large
 warcprox-meta header values

---
 setup.py               |  2 +-
 tests/test_warcprox.py | 38 ++++++++++++++++++++++++++++++++++++++
 warcprox/mitmproxy.py  |  6 ++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 25a2e49..0824491 100755
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.2b1.dev100',
+        version='2.2b1.dev101',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index dd80a86..8b44974 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -1429,6 +1429,44 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
                 elif record.rec_type == 'request':
                     assert record.http_headers.get_header('via') == '1.1 warcprox'
 
+def test_long_warcprox_meta(
+        warcprox_, http_daemon, archiving_proxies, playback_proxies):
+    """A request with a ~1 MB Warcprox-Meta header is proxied and archived."""
+    prefix = 'test_long_warcprox_meta'
+    url = 'http://localhost:%s/b/g' % http_daemon.server_port
+
+    # build a warcprox-meta value padded with a million characters
+    meta = json.dumps({'x': 'y' * 1000000, 'warc-prefix': prefix})
+    response = requests.get(
+            url, proxies=archiving_proxies, verify=False,
+            headers={'Warcprox-Meta': meta})
+    assert response.status_code == 200
+
+    # give the writer threads time to pick up and finish the recorded url
+    time.sleep(0.5)
+    while not all(wwt.idle for wwt in warcprox_.warc_writer_threads):
+        time.sleep(0.5)
+    time.sleep(0.5)
+
+    # check that warcprox-meta was parsed and honored ("warc-prefix" param)
+    writers = warcprox_.warc_writer_threads[0].writer_pool.warc_writers
+    writer = writers[prefix]
+    assert writer
+    warc_path = os.path.join(writer.directory, writer._f_finalname)
+    writer.close_writer()
+    assert os.path.exists(warc_path)
+
+    # the warc must hold exactly: warcinfo, response, request (in that order)
+    with open(warc_path, 'rb') as f:
+        rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))
+        assert next(rec_iter).rec_type == 'warcinfo'
+        for expected_type in ('response', 'request'):
+            record = next(rec_iter)
+            assert record.rec_type == expected_type
+            assert record.rec_headers.get_header('warc-target-uri') == url
+        with pytest.raises(StopIteration):
+            next(rec_iter)
+
 if __name__ == '__main__':
     pytest.main()
 
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 6297dcc..914fb52 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -37,6 +37,12 @@ except ImportError:
     import urlparse as urllib_parse
 try:
     import http.client as http_client
+    # In Python 3, http.client.parse_headers() enforces http_client._MAXLINE
+    # as the max length of an HTTP header line, but we want to support very
+    # long warcprox-meta headers, so we raise it here. Python 2 doesn't seem
+    # to enforce any limit. Multiline headers could be an option, but it
+    # turns out those are illegal as of RFC 7230. Plus, this is easier.
+    http_client._MAXLINE = 4194304  # 4 MiB
 except ImportError:
     import httplib as http_client
 import socket