From ca7625b18d3316b79725ceacdaf275078a32dc77 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 28 Apr 2017 11:07:33 -0700
Subject: [PATCH] set via header on request and response, record request via in
 warc (because it is sent to the remote site), do not record response via in
 warc (because it is not sent by the remote site)

---
Review notes (text in this section is ignored by git am):
 * Line structure restored; leading whitespace of context lines was
   reconstructed and should be checked against the target tree.
 * mitmproxy.py: the response path keeps using self.msg (python2's
   httplib.HTTPResponse has no .headers; on python3 both names refer to the
   same object) and divides by 10.0 so python2 does not emit "1.0 warcprox".
 * test_warcprox.py: the debugging monkeypatch of HTTPConnection.send stays
   commented out; the playback assertions now inspect
   playback_response.status_code and playback_response.headers.
 * The "index" blob ids of the two files above still refer to the original
   commit; apply with plain "git apply"/"git am" (no 3-way fallback).

 .travis.yml            |  8 ++++----
 setup.py               |  2 +-
 tests/run-tests.sh     |  8 ++++----
 tests/test_warcprox.py | 20 ++++++++++++++++++++
 warcprox/mitmproxy.py  | 15 +++++++++++++++
 5 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 0f7a315..3d02ebf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,12 +30,12 @@ before_install:
 - docker run -d --publish=28015:28015 rethinkdb
 
 before_script:
-- pip install . pytest requests
+- pip install . pytest requests warcio
 
 script:
-- py.test -vv tests
-- py.test -vv --rethinkdb-servers=localhost tests
-- py.test -vv --rethinkdb-servers=localhost --rethinkdb-big-table tests
+- py.test -v tests
+- py.test -v --rethinkdb-servers=localhost tests
+- py.test -v --rethinkdb-servers=localhost --rethinkdb-big-table tests
 
 notifications:
   slack:
diff --git a/setup.py b/setup.py
index 4483f14..547e1fa 100755
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.1b1.dev71',
+        version='2.1b1.dev72',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',
diff --git a/tests/run-tests.sh b/tests/run-tests.sh
index 0c5b254..334cfc2 100755
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -40,9 +40,9 @@ do
             && (cd /warcprox && git diff HEAD) | patch -p1 \
             && virtualenv -p $python /tmp/venv \
             && source /tmp/venv/bin/activate \
-            && pip --log-file /tmp/pip.log install . pytest requests \
-            && py.test -vv tests \
-            && py.test -vv --rethinkdb-servers=localhost tests \
-            && py.test -vv --rethinkdb-servers=localhost --rethinkdb-big-table tests"
+            && pip --log-file /tmp/pip.log install . pytest requests warcio \
+            && py.test -v tests \
+            && py.test -v --rethinkdb-servers=localhost tests \
+            && py.test -v --rethinkdb-servers=localhost --rethinkdb-big-table tests"
 done
 
diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py
index c61f6d8..e4692cf 100755
--- a/tests/test_warcprox.py
+++ b/tests/test_warcprox.py
@@ -45,6 +45,7 @@ import signal
 from collections import Counter
 import socket
 import datetime
+import warcio.archiveiterator
 
 try:
     import http.server as http_server
@@ -1391,6 +1392,25 @@ def test_choose_a_port_for_me(service_registry):
         controller.stop.set()
         th.join()
 
+def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback_proxies):
+    url = 'http://localhost:%s/a/z' % http_daemon.server_port
+    response = requests.get(url, proxies=archiving_proxies)
+    assert response.headers['via'] == '1.1 warcprox'
+
+    playback_response = _poll_playback_until(
+            playback_proxies, url, status=200, timeout_sec=10)
+    assert playback_response.status_code == 200
+    assert 'via' not in playback_response.headers
+
+    warc = warcprox_.warc_writer_thread.writer_pool.default_warc_writer._fpath
+    with open(warc, 'rb') as f:
+        for record in warcio.archiveiterator.ArchiveIterator(f):
+            if record.rec_headers.get_header('warc-target-uri') == url:
+                if record.rec_type == 'response':
+                    assert not record.http_headers.get_header('via')
+                elif record.rec_type == 'request':
+                    assert record.http_headers.get_header('via') == '1.1 warcprox'
+
 if __name__ == '__main__':
     pytest.main()
 
diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py
index 5f592e4..d69f26e 100644
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@@ -163,7 +163,9 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
 
         status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
                 self.status, self.reason)
+        self.msg['Via'] = via_header_value(
+                self.msg.get('Via'), '%0.1f' % (self.version / 10.0))
         for k,v in self.msg.items():
             if k.lower() not in (
                     'connection', 'proxy-connection', 'keep-alive',
                     'proxy-authenticate', 'proxy-authorization', 'upgrade',
@@ -174,6 +176,15 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
 
         self.recorder.payload_starts_now()
 
+def via_header_value(orig, request_version):
+    via = orig
+    if via:
+        via += ', '
+    else:
+        via = ''
+    via = via + '%s %s' % (request_version, 'warcprox')
+    return via
+
 class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
     '''
     An http proxy implementation of BaseHTTPRequestHandler, that acts as a
@@ -356,6 +367,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
                     'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
             del self.headers[key]
 
+        self.headers['Via'] = via_header_value(
+                self.headers.get('Via'),
+                self.request_version.replace('HTTP/', ''))
+
         # Add headers to the request
         # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
         req_str += '\r\n'.join(