mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
allow very long request header lines, to support large warcprox-meta header values
This commit is contained in:
parent
8bfda9f4b3
commit
faae23d764
2
setup.py
2
setup.py
@ -49,7 +49,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.2b1.dev100',
|
version='2.2b1.dev101',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -1429,6 +1429,44 @@ def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback
|
|||||||
elif record.rec_type == 'request':
|
elif record.rec_type == 'request':
|
||||||
assert record.http_headers.get_header('via') == '1.1 warcprox'
|
assert record.http_headers.get_header('via') == '1.1 warcprox'
|
||||||
|
|
||||||
|
def test_long_warcprox_meta(
        warcprox_, http_daemon, archiving_proxies, playback_proxies):
    """Check that a very long Warcprox-Meta request header is accepted.

    Sends a request through the archiving proxy with a Warcprox-Meta header
    carrying a 1,000,000 character value plus a "warc-prefix" setting, then
    verifies that the response is 200, that a warc writer exists for that
    prefix, and that the resulting warc holds exactly a warcinfo, a response
    and a request record for the fetched url.

    All arguments are pytest fixtures; ``playback_proxies`` is accepted but
    not used by this test.
    """
    url = 'http://localhost:%s/b/g' % http_daemon.server_port
    warc_prefix = 'test_long_warcprox_meta'

    # create a very long warcprox-meta header
    headers = {'Warcprox-Meta': json.dumps({
        'x': 'y' * 1000000, 'warc-prefix': warc_prefix})}
    response = requests.get(
            url, proxies=archiving_proxies, headers=headers, verify=False)
    assert response.status_code == 200

    # wait for writer thread to process; fail instead of hanging forever if
    # the writer threads never report idle
    deadline = time.time() + 60
    time.sleep(0.5)
    while not all(wwt.idle for wwt in warcprox_.warc_writer_threads):
        assert time.time() < deadline, (
                'timed out waiting for warc writer threads to become idle')
        time.sleep(0.5)
    time.sleep(0.5)

    # check that warcprox-meta was parsed and honored ("warc-prefix" param)
    warc_writers = warcprox_.warc_writer_threads[0].writer_pool.warc_writers
    assert warc_prefix in warc_writers
    writer = warc_writers[warc_prefix]
    # the path is computed before close_writer(), same order as the original
    # test (presumably the writer's file name state changes on close --
    # confirm against the writer implementation)
    warc_path = os.path.join(writer.directory, writer._f_finalname)
    writer.close_writer()
    assert os.path.exists(warc_path)

    # read the warc
    with open(warc_path, 'rb') as f:
        rec_iter = iter(warcio.archiveiterator.ArchiveIterator(f))

        record = next(rec_iter)
        assert record.rec_type == 'warcinfo'

        record = next(rec_iter)
        assert record.rec_type == 'response'
        assert record.rec_headers.get_header('warc-target-uri') == url

        record = next(rec_iter)
        assert record.rec_type == 'request'
        assert record.rec_headers.get_header('warc-target-uri') == url

        # nothing else should have been written under this prefix
        with pytest.raises(StopIteration):
            next(rec_iter)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pytest.main()
|
pytest.main()
|
||||||
|
|
||||||
|
@ -37,6 +37,12 @@ except ImportError:
|
|||||||
import urlparse as urllib_parse
|
import urlparse as urllib_parse
|
||||||
try:
|
try:
|
||||||
import http.client as http_client
|
import http.client as http_client
|
||||||
|
# In python3 http.client.parse_headers() enforces http_client._MAXLINE
|
||||||
|
# as max length of an http header line, but we want to support very
|
||||||
|
# long warcprox-meta headers, so we tweak it here. Python2 doesn't seem
|
||||||
|
# to enforce any limit. Multiline headers could be an option but it
|
||||||
|
# turns out those are illegal as of RFC 7230. Plus, this is easier.
|
||||||
|
http_client._MAXLINE = 4194304 # 4 MiB
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import httplib as http_client
|
import httplib as http_client
|
||||||
import socket
|
import socket
|
||||||
|
Loading…
x
Reference in New Issue
Block a user