Return capture timestamp

When a client request has the HTTP header ``Warcprox-Meta: {"return-capture-timestamp": 1}``,
add the WARC record timestamp to the response in the following HTTP header:
``Warcprox-Meta: {"capture-timestamp": "%Y-%m-%d %H:%M:%S"}``.

Add unit test.
This commit is contained in:
Vangelis Banos 2017-10-29 18:48:08 +00:00
parent 8ead8182e1
commit 3d9a22b6c7
3 changed files with 32 additions and 4 deletions

View File

@ -555,6 +555,21 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"
def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
    """Check that the capture timestamp is returned when the client asks for it.

    Sends a GET through the archiving proxy with a ``Warcprox-Meta`` request
    header containing ``{"return-capture-timestamp": 1}``, then verifies that
    the response carries a ``Warcprox-Meta`` header whose JSON payload has a
    ``capture-timestamp`` value formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    url = 'http://localhost:{}/i/j'.format(http_daemon.server_port)
    request_meta = {"return-capture-timestamp": 1}
    headers = {"Warcprox-Meta": json.dumps(request_meta)}
    response = requests.get(
        url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 200
    assert response.headers['Warcprox-Meta']
    data = json.loads(response.headers['Warcprox-Meta'])
    assert data['capture-timestamp']
    try:
        # strptime raises ValueError when the value does not match the format;
        # a successful parse is all that needs to be checked here.
        datetime.datetime.strptime(
            data['capture-timestamp'], '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # pytest.fail() takes a single message argument and does no %-style
        # interpolation, so the message has to be formatted before the call.
        pytest.fail(
            'Invalid capture-timestamp format %s' % data['capture-timestamp'])
def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port) url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)
url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port) url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port)

View File

@ -45,6 +45,7 @@ try:
http_client._MAXLINE = 4194304 # 4 MiB http_client._MAXLINE = 4194304 # 4 MiB
except ImportError: except ImportError:
import httplib as http_client import httplib as http_client
import json
import socket import socket
import logging import logging
import ssl import ssl
@ -163,13 +164,17 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
self.fp, proxy_client, digest_algorithm, url=url) self.fp, proxy_client, digest_algorithm, url=url)
self.fp = self.recorder self.fp = self.recorder
def begin(self): def begin(self, timestamp=None):
http_client.HTTPResponse.begin(self) # reads status line, headers http_client.HTTPResponse.begin(self) # reads status line, headers
status_and_headers = 'HTTP/1.1 {} {}\r\n'.format( status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
self.status, self.reason) self.status, self.reason)
self.msg['Via'] = via_header_value( self.msg['Via'] = via_header_value(
self.msg.get('Via'), '%0.1f' % (self.version / 10.0)) self.msg.get('Via'), '%0.1f' % (self.version / 10.0))
if timestamp:
rmeta = {"capture-timestamp": timestamp.strftime('%Y-%m-%d %H:%M:%S')}
self.msg['Warcprox-Meta'] = json.dumps(rmeta, separators=',:')
for k,v in self.msg.items(): for k,v in self.msg.items():
if k.lower() not in ( if k.lower() not in (
'connection', 'proxy-connection', 'keep-alive', 'connection', 'proxy-connection', 'keep-alive',
@ -361,12 +366,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self.logger.error("exception proxying request", exc_info=True) self.logger.error("exception proxying request", exc_info=True)
raise raise
def _proxy_request(self): def _proxy_request(self, timestamp=None):
''' '''
Sends the request to the remote server, then uses a ProxyingRecorder to Sends the request to the remote server, then uses a ProxyingRecorder to
read the response and send it to the proxy client, while recording the read the response and send it to the proxy client, while recording the
bytes in transit. Returns a tuple (request, response) where request is bytes in transit. Returns a tuple (request, response) where request is
the raw request bytes, and response is a ProxyingRecorder. the raw request bytes, and response is a ProxyingRecorder.
:param timestamp: generated on warcprox._proxy_request. It is the
timestamp written in the WARC record for this request.
''' '''
# Build request # Build request
req_str = '{} {} {}\r\n'.format( req_str = '{} {} {}\r\n'.format(
@ -407,7 +415,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._remote_server_sock, proxy_client=self.connection, self._remote_server_sock, proxy_client=self.connection,
digest_algorithm=self.server.digest_algorithm, digest_algorithm=self.server.digest_algorithm,
url=self.url, method=self.command) url=self.url, method=self.command)
prox_rec_res.begin() prox_rec_res.begin(timestamp=timestamp)
buf = prox_rec_res.read(8192) buf = prox_rec_res.read(8192)
while buf != b'': while buf != b'':

View File

@ -180,8 +180,13 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
remote_ip = self._remote_server_sock.getpeername()[0] remote_ip = self._remote_server_sock.getpeername()[0]
timestamp = datetime.datetime.utcnow() timestamp = datetime.datetime.utcnow()
if warcprox_meta and 'return-capture-timestamp' in warcprox_meta:
return_timestamp = timestamp
else:
return_timestamp = None
req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request( req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
self) self, timestamp=return_timestamp)
content_type = None content_type = None
try: try: