Return capture timestamp

When the client request has the HTTP header ``Warcprox-Meta: {"return-capture-timestamp": 1}``,
add the WARC record timestamp to the response in the following HTTP header:
``Warcprox-Meta: {"capture-timestamp": "%Y-%m-%d %H:%M:%S"}``.

Add unit test.
This commit is contained in:
Vangelis Banos 2017-10-29 18:48:08 +00:00
parent 8ead8182e1
commit 3d9a22b6c7
3 changed files with 32 additions and 4 deletions

View File

@ -555,6 +555,21 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"
def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
    """Check that the capture timestamp is returned when the client asks for it.

    Sends a GET request through ``archiving_proxies`` with the request header
    ``Warcprox-Meta: {"return-capture-timestamp": 1}`` and expects the
    response to carry a ``Warcprox-Meta`` header whose JSON value contains a
    ``capture-timestamp`` entry formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    url = 'http://localhost:{}/i/j'.format(http_daemon.server_port)
    request_meta = {"return-capture-timestamp": 1}
    headers = {"Warcprox-Meta": json.dumps(request_meta)}
    response = requests.get(
        url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 200
    assert response.headers['Warcprox-Meta']
    data = json.loads(response.headers['Warcprox-Meta'])
    assert data['capture-timestamp']
    try:
        dt = datetime.datetime.strptime(
            data['capture-timestamp'], '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # pytest.fail() takes one message string; its second positional
        # parameter is `pytrace`, so the offending value has to be
        # interpolated into the message rather than passed separately.
        pytest.fail(
            'Invalid capture-timestamp format %s' % data['capture-timestamp'])
    assert dt
def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)
url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port)

View File

@ -45,6 +45,7 @@ try:
http_client._MAXLINE = 4194304 # 4 MiB
except ImportError:
import httplib as http_client
import json
import socket
import logging
import ssl
@ -163,13 +164,17 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
self.fp, proxy_client, digest_algorithm, url=url)
self.fp = self.recorder
def begin(self):
def begin(self, timestamp=None):
http_client.HTTPResponse.begin(self) # reads status line, headers
status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
self.status, self.reason)
self.msg['Via'] = via_header_value(
self.msg.get('Via'), '%0.1f' % (self.version / 10.0))
if timestamp:
rmeta = {"capture-timestamp": timestamp.strftime('%Y-%m-%d %H:%M:%S')}
self.msg['Warcprox-Meta'] = json.dumps(rmeta, separators=',:')
for k,v in self.msg.items():
if k.lower() not in (
'connection', 'proxy-connection', 'keep-alive',
@ -361,12 +366,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self.logger.error("exception proxying request", exc_info=True)
raise
def _proxy_request(self):
def _proxy_request(self, timestamp=None):
'''
Sends the request to the remote server, then uses a ProxyingRecorder to
read the response and send it to the proxy client, while recording the
bytes in transit. Returns a tuple (request, response) where request is
the raw request bytes, and response is a ProxyingRecorder.
:param timestamp: generated on warcprox._proxy_request. It is the
timestamp written in the WARC record for this request.
'''
# Build request
req_str = '{} {} {}\r\n'.format(
@ -407,7 +415,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._remote_server_sock, proxy_client=self.connection,
digest_algorithm=self.server.digest_algorithm,
url=self.url, method=self.command)
prox_rec_res.begin()
prox_rec_res.begin(timestamp=timestamp)
buf = prox_rec_res.read(8192)
while buf != b'':

View File

@ -180,8 +180,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
remote_ip = self._remote_server_sock.getpeername()[0]
timestamp = datetime.datetime.utcnow()
if warcprox_meta and 'return-capture-timestamp' in warcprox_meta:
return_timestamp = timestamp
else:
return_timestamp = None
req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
self)
self, timestamp=return_timestamp)
content_type = None
try: