From 3d9a22b6c7855f60dea5f2f772432d548a6e6fd4 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sun, 29 Oct 2017 18:48:08 +0000 Subject: [PATCH 1/4] Return capture timestamp When the client request has the HTTP header ``Warcprox-Meta: {"return-capture-timestamp": 1}``, add the WARC record timestamp to the response in the following HTTP header: ``Warcprox-Meta: {"capture-timestamp": "%Y-%m-%d %H:%M:%S"}``. Add unit test. --- tests/test_warcprox.py | 15 +++++++++++++++ warcprox/mitmproxy.py | 14 +++++++++++--- warcprox/warcproxy.py | 7 ++++++- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index b24a5c8..22d4597 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -555,6 +555,21 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" +def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): + url = 'http://localhost:{}/i/j'.format(http_daemon.server_port) + request_meta = {"return-capture-timestamp": 1} + headers = {"Warcprox-Meta": json.dumps(request_meta)} + response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) + assert response.status_code == 200 + assert response.headers['Warcprox-Meta'] + data = json.loads(response.headers['Warcprox-Meta']) + assert data['capture-timestamp'] + try: + dt = datetime.datetime.strptime(data['capture-timestamp'], '%Y-%m-%d %H:%M:%S') + assert dt + except ValueError: + pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp']) + def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port) url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port) diff --git a/warcprox/mitmproxy.py 
b/warcprox/mitmproxy.py index 914fb52..e60c07e 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -45,6 +45,7 @@ try: http_client._MAXLINE = 4194304 # 4 MiB except ImportError: import httplib as http_client +import json import socket import logging import ssl @@ -163,13 +164,17 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.fp, proxy_client, digest_algorithm, url=url) self.fp = self.recorder - def begin(self): + def begin(self, timestamp=None): http_client.HTTPResponse.begin(self) # reads status line, headers status_and_headers = 'HTTP/1.1 {} {}\r\n'.format( self.status, self.reason) self.msg['Via'] = via_header_value( self.msg.get('Via'), '%0.1f' % (self.version / 10.0)) + if timestamp: + rmeta = {"capture-timestamp": timestamp.strftime('%Y-%m-%d %H:%M:%S')} + self.msg['Warcprox-Meta'] = json.dumps(rmeta, separators=',:') + for k,v in self.msg.items(): if k.lower() not in ( 'connection', 'proxy-connection', 'keep-alive', @@ -361,12 +366,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.logger.error("exception proxying request", exc_info=True) raise - def _proxy_request(self): + def _proxy_request(self, timestamp=None): ''' Sends the request to the remote server, then uses a ProxyingRecorder to read the response and send it to the proxy client, while recording the bytes in transit. Returns a tuple (request, response) where request is the raw request bytes, and response is a ProxyingRecorder. + + :param timestamp: generated on warcprox._proxy_request. It is the + timestamp written in the WARC record for this request. 
''' # Build request req_str = '{} {} {}\r\n'.format( @@ -407,7 +415,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_sock, proxy_client=self.connection, digest_algorithm=self.server.digest_algorithm, url=self.url, method=self.command) - prox_rec_res.begin() + prox_rec_res.begin(timestamp=timestamp) buf = prox_rec_res.read(8192) while buf != b'': diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 06983ed..48dc5cd 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -180,8 +180,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): remote_ip = self._remote_server_sock.getpeername()[0] timestamp = datetime.datetime.utcnow() + if warcprox_meta and 'return-capture-timestamp' in warcprox_meta: + return_timestamp = timestamp + else: + return_timestamp = None + req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request( - self) + self, timestamp=return_timestamp) content_type = None try: From 56f0118374495f397de1f40b96b426a5c9789d44 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 31 Oct 2017 10:49:10 +0000 Subject: [PATCH 2/4] Replace timestamp parameter with more generic request/response syntax Replace the timestamp parameter with the more generic extra_response_headers={}. When the request has the header ``Warcprox-Meta: {"accept":["capture-metadata"]}``, the response has the following header: ``Warcprox-Meta: {"capture-metadata":{"timestamp":"2017-10-31T10:47:50Z"}}``. Update unit test. --- tests/test_warcprox.py | 7 ++++--- warcprox/mitmproxy.py | 10 +++++----- warcprox/warcproxy.py | 11 +++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 22d4597..1752b94 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -557,15 +557,16 @@ def test_limits(http_daemon, warcprox_, archiving_proxies): def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies): url = 
'http://localhost:{}/i/j'.format(http_daemon.server_port) - request_meta = {"return-capture-timestamp": 1} + request_meta = {"accept": ["capture-metadata"]} headers = {"Warcprox-Meta": json.dumps(request_meta)} response = requests.get(url, proxies=archiving_proxies, headers=headers, stream=True) assert response.status_code == 200 assert response.headers['Warcprox-Meta'] data = json.loads(response.headers['Warcprox-Meta']) - assert data['capture-timestamp'] + assert data['capture-metadata'] try: - dt = datetime.datetime.strptime(data['capture-timestamp'], '%Y-%m-%d %H:%M:%S') + dt = datetime.datetime.strptime(data['capture-metadata']['timestamp'], + '%Y-%m-%dT%H:%M:%SZ') assert dt except ValueError: pytest.fail('Invalid capture-timestamp format %s', data['capture-timestamp']) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index e60c07e..e2cc321 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -164,15 +164,15 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.fp, proxy_client, digest_algorithm, url=url) self.fp = self.recorder - def begin(self, timestamp=None): + def begin(self, extra_response_headers={}): http_client.HTTPResponse.begin(self) # reads status line, headers status_and_headers = 'HTTP/1.1 {} {}\r\n'.format( self.status, self.reason) self.msg['Via'] = via_header_value( self.msg.get('Via'), '%0.1f' % (self.version / 10.0)) - if timestamp: - rmeta = {"capture-timestamp": timestamp.strftime('%Y-%m-%d %H:%M:%S')} + if extra_response_headers: + rmeta = {"capture-metadata": extra_response_headers} self.msg['Warcprox-Meta'] = json.dumps(rmeta, separators=',:') for k,v in self.msg.items(): @@ -366,7 +366,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.logger.error("exception proxying request", exc_info=True) raise - def _proxy_request(self, timestamp=None): + def _proxy_request(self, extra_response_headers={}): ''' Sends the request to the remote server, then uses a ProxyingRecorder to read 
the response and send it to the proxy client, while recording the @@ -415,7 +415,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_sock, proxy_client=self.connection, digest_algorithm=self.server.digest_algorithm, url=self.url, method=self.command) - prox_rec_res.begin(timestamp=timestamp) + prox_rec_res.begin(extra_response_headers=extra_response_headers) buf = prox_rec_res.read(8192) while buf != b'': diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 48dc5cd..ec613ab 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -179,14 +179,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): remote_ip = self._remote_server_sock.getpeername()[0] timestamp = datetime.datetime.utcnow() - - if warcprox_meta and 'return-capture-timestamp' in warcprox_meta: - return_timestamp = timestamp - else: - return_timestamp = None + extra_response_headers = {} + if warcprox_meta and 'accept' in warcprox_meta and \ + 'capture-metadata' in warcprox_meta['accept']: + extra_response_headers['timestamp'] = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ') req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request( - self, timestamp=return_timestamp) + self, extra_response_headers=extra_response_headers) content_type = None try: From ca3121102ef3e67ef33b0e0ad1d6424fcaa11b31 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 2 Nov 2017 08:24:28 +0000 Subject: [PATCH 3/4] Move Warcprox-Meta header construction to warcproxy --- warcprox/mitmproxy.py | 4 ++-- warcprox/warcproxy.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index e2cc321..f6ea742 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -172,8 +172,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): self.msg['Via'] = via_header_value( self.msg.get('Via'), '%0.1f' % (self.version / 10.0)) if extra_response_headers: - rmeta = {"capture-metadata": 
extra_response_headers} - self.msg['Warcprox-Meta'] = json.dumps(rmeta, separators=',:') + for header, value in extra_response_headers.items(): + self.msg[header] = value for k,v in self.msg.items(): if k.lower() not in ( diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index ec613ab..d37e588 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -182,7 +182,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): extra_response_headers = {} if warcprox_meta and 'accept' in warcprox_meta and \ 'capture-metadata' in warcprox_meta['accept']: - extra_response_headers['timestamp'] = timestamp.strftime('%Y-%m-%dT%H:%M:%SZ') + rmeta = {'capture-metadata': {'timestamp': timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')}} + extra_response_headers['Warcprox-Meta'] = json.dumps(rmeta, separators=',:') req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request( self, extra_response_headers=extra_response_headers) From d174e736be08dd075df80543d54f3f4a65bd3722 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 2 Nov 2017 19:43:45 +0000 Subject: [PATCH 4/4] Update docstring --- warcprox/mitmproxy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index f6ea742..b14cddf 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -373,8 +373,9 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): bytes in transit. Returns a tuple (request, response) where request is the raw request bytes, and response is a ProxyingRecorder. - :param timestamp: generated on warcprox._proxy_request. It is the - timestamp written in the WARC record for this request. + :param extra_response_headers: generated on warcprox._proxy_request. + It may contain extra HTTP headers such as ``Warcprox-Meta`` which + are written in the WARC record for this request. ''' # Build request req_str = '{} {} {}\r\n'.format(