Set the Via header on both the request and the response. Record the request Via header in the WARC (because it is sent to the remote site); do not record the response Via header in the WARC (because it is not sent by the remote site).

This commit is contained in:
Noah Levitt 2017-04-28 11:07:33 -07:00
parent 47680cc17d
commit ca7625b18d
5 changed files with 46 additions and 11 deletions

View File

@ -30,12 +30,12 @@ before_install:
- docker run -d --publish=28015:28015 rethinkdb
before_script:
- pip install . pytest requests
- pip install . pytest requests warcio
script:
- py.test -vv tests
- py.test -vv --rethinkdb-servers=localhost tests
- py.test -vv --rethinkdb-servers=localhost --rethinkdb-big-table tests
- py.test -v tests
- py.test -v --rethinkdb-servers=localhost tests
- py.test -v --rethinkdb-servers=localhost --rethinkdb-big-table tests
notifications:
slack:

View File

@ -51,7 +51,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.1b1.dev71',
version='2.1b1.dev72',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -40,9 +40,9 @@ do
&& (cd /warcprox && git diff HEAD) | patch -p1 \
&& virtualenv -p $python /tmp/venv \
&& source /tmp/venv/bin/activate \
&& pip --log-file /tmp/pip.log install . pytest requests \
&& py.test -vv tests \
&& py.test -vv --rethinkdb-servers=localhost tests \
&& py.test -vv --rethinkdb-servers=localhost --rethinkdb-big-table tests"
&& pip --log-file /tmp/pip.log install . pytest requests warcio \
&& py.test -v tests \
&& py.test -v --rethinkdb-servers=localhost tests \
&& py.test -v --rethinkdb-servers=localhost --rethinkdb-big-table tests"
done

View File

@ -45,6 +45,7 @@ import signal
from collections import Counter
import socket
import datetime
import warcio.archiveiterator
try:
import http.server as http_server
@ -79,7 +80,7 @@ def _send(self, data):
logging.info('sending data from %s', repr(data))
orig_send(self, data)
### uncomment this to see raw requests going over the wire
# http_client.HTTPConnection.send = _send
http_client.HTTPConnection.send = _send
logging.basicConfig(
stream=sys.stdout, level=logging.INFO, # level=warcprox.TRACE,
@ -1391,6 +1392,25 @@ def test_choose_a_port_for_me(service_registry):
controller.stop.set()
th.join()
def test_via_response_header(warcprox_, http_daemon, archiving_proxies, playback_proxies):
    '''
    Checks where the `Via` header added by the proxy shows up.

    The response delivered to the proxy client must carry
    `Via: 1.1 warcprox`. The played-back response and the archived
    `response` record must not carry a `Via` header. The archived `request`
    record must carry `Via: 1.1 warcprox`.
    '''
    url = 'http://localhost:%s/a/z' % http_daemon.server_port
    response = requests.get(url, proxies=archiving_proxies)
    assert response.status_code == 200
    assert response.headers['via'] == '1.1 warcprox'

    playback_response = _poll_playback_until(
            playback_proxies, url, status=200, timeout_sec=10)
    assert playback_response.status_code == 200
    # Look at the headers mapping explicitly: `in` applied to the Response
    # object itself iterates over body chunks and never inspects headers.
    assert 'via' not in playback_response.headers

    warc = warcprox_.warc_writer_thread.writer_pool.default_warc_writer._fpath
    with open(warc, 'rb') as f:
        for record in warcio.archiveiterator.ArchiveIterator(f):
            # only records for the url fetched above are of interest
            if record.rec_headers.get_header('warc-target-uri') == url:
                if record.rec_type == 'response':
                    assert not record.http_headers.get_header('via')
                elif record.rec_type == 'request':
                    assert record.http_headers.get_header('via') == '1.1 warcprox'
if __name__ == '__main__':
pytest.main()

View File

@ -163,7 +163,9 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
self.status, self.reason)
for k,v in self.msg.items():
self.headers['Via'] = via_header_value(
self.headers.get('Via'), '%0.1f' % (self.version / 10))
for k,v in self.headers.items():
if k.lower() not in (
'connection', 'proxy-connection', 'keep-alive',
'proxy-authenticate', 'proxy-authorization', 'upgrade',
@ -174,6 +176,15 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
self.recorder.payload_starts_now()
def via_header_value(orig, request_version):
    '''
    Returns the `Via` header value with this proxy's entry appended.

    `orig` is the existing `Via` value, or None/empty if there is none.
    `request_version` is the protocol version string to report, e.g. '1.1'.
    The new entry is '<request_version> warcprox', joined to a non-empty
    `orig` with ', '.
    '''
    this_hop = '%s %s' % (request_version, 'warcprox')
    if not orig:
        return this_hop
    return orig + ', ' + this_hop
class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'''
An http proxy implementation of BaseHTTPRequestHandler, that acts as a
@ -356,6 +367,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
del self.headers[key]
self.headers['Via'] = via_header_value(
self.headers.get('Via'),
self.request_version.replace('HTTP/', ''))
# Add headers to the request
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
req_str += '\r\n'.join(