Merge pull request #45 from vbanos/return-capture-timestamp

Return capture timestamp
This commit is contained in:
Noah Levitt 2017-11-02 12:45:16 -07:00 committed by GitHub
commit 57d7795ced
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 4 deletions

View File

@ -555,6 +555,22 @@ def test_limits(http_daemon, warcprox_, archiving_proxies):
assert response.headers["content-type"] == "text/plain;charset=utf-8" assert response.headers["content-type"] == "text/plain;charset=utf-8"
assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n" assert response.raw.data == b"request rejected by warcprox: reached limit test_limits_bucket/total/urls=10\n"
def test_return_capture_timestamp(http_daemon, warcprox_, archiving_proxies):
    """Check that warcprox returns the capture timestamp when asked to.

    Sends a request through the archiving proxy with a ``Warcprox-Meta``
    request header whose ``accept`` list contains ``capture-metadata``, then
    verifies that the response carries a ``Warcprox-Meta`` header whose JSON
    payload has a ``capture-metadata.timestamp`` value formatted as
    ``%Y-%m-%dT%H:%M:%SZ``.
    """
    url = 'http://localhost:{}/i/j'.format(http_daemon.server_port)
    request_meta = {"accept": ["capture-metadata"]}
    headers = {"Warcprox-Meta": json.dumps(request_meta)}
    response = requests.get(
            url, proxies=archiving_proxies, headers=headers, stream=True)
    assert response.status_code == 200
    assert response.headers['Warcprox-Meta']

    data = json.loads(response.headers['Warcprox-Meta'])
    assert data['capture-metadata']

    timestamp = data['capture-metadata']['timestamp']
    try:
        # strptime raises ValueError when the string does not match the
        # format, which is the only thing this test needs to detect.
        datetime.datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%SZ')
    except ValueError:
        # pytest.fail takes a single message string (its second positional
        # argument is ``pytrace``), so the message is formatted up front.
        pytest.fail('Invalid capture-timestamp format %s' % timestamp)
def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies): def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, playback_proxies):
url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port) url1 = 'http://localhost:{}/k/l'.format(http_daemon.server_port)
url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port) url2 = 'https://localhost:{}/k/l'.format(https_daemon.server_port)

View File

@ -45,6 +45,7 @@ try:
http_client._MAXLINE = 4194304 # 4 MiB http_client._MAXLINE = 4194304 # 4 MiB
except ImportError: except ImportError:
import httplib as http_client import httplib as http_client
import json
import socket import socket
import logging import logging
import ssl import ssl
@ -163,13 +164,17 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse):
self.fp, proxy_client, digest_algorithm, url=url) self.fp, proxy_client, digest_algorithm, url=url)
self.fp = self.recorder self.fp = self.recorder
def begin(self): def begin(self, extra_response_headers={}):
http_client.HTTPResponse.begin(self) # reads status line, headers http_client.HTTPResponse.begin(self) # reads status line, headers
status_and_headers = 'HTTP/1.1 {} {}\r\n'.format( status_and_headers = 'HTTP/1.1 {} {}\r\n'.format(
self.status, self.reason) self.status, self.reason)
self.msg['Via'] = via_header_value( self.msg['Via'] = via_header_value(
self.msg.get('Via'), '%0.1f' % (self.version / 10.0)) self.msg.get('Via'), '%0.1f' % (self.version / 10.0))
if extra_response_headers:
for header, value in extra_response_headers.items():
self.msg[header] = value
for k,v in self.msg.items(): for k,v in self.msg.items():
if k.lower() not in ( if k.lower() not in (
'connection', 'proxy-connection', 'keep-alive', 'connection', 'proxy-connection', 'keep-alive',
@ -361,12 +366,16 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self.logger.error("exception proxying request", exc_info=True) self.logger.error("exception proxying request", exc_info=True)
raise raise
def _proxy_request(self): def _proxy_request(self, extra_response_headers={}):
''' '''
Sends the request to the remote server, then uses a ProxyingRecorder to Sends the request to the remote server, then uses a ProxyingRecorder to
read the response and send it to the proxy client, while recording the read the response and send it to the proxy client, while recording the
bytes in transit. Returns a tuple (request, response) where request is bytes in transit. Returns a tuple (request, response) where request is
the raw request bytes, and response is a ProxyingRecorder. the raw request bytes, and response is a ProxyingRecorder.
:param extra_response_headers: generated on warcprox._proxy_request.
It may contain extra HTTP headers such as ``Warcprox-Meta`` which
are written in the WARC record for this request.
''' '''
# Build request # Build request
req_str = '{} {} {}\r\n'.format( req_str = '{} {} {}\r\n'.format(
@ -407,7 +416,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._remote_server_sock, proxy_client=self.connection, self._remote_server_sock, proxy_client=self.connection,
digest_algorithm=self.server.digest_algorithm, digest_algorithm=self.server.digest_algorithm,
url=self.url, method=self.command) url=self.url, method=self.command)
prox_rec_res.begin() prox_rec_res.begin(extra_response_headers=extra_response_headers)
buf = prox_rec_res.read(8192) buf = prox_rec_res.read(8192)
while buf != b'': while buf != b'':

View File

@ -179,9 +179,14 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
remote_ip = self._remote_server_sock.getpeername()[0] remote_ip = self._remote_server_sock.getpeername()[0]
timestamp = datetime.datetime.utcnow() timestamp = datetime.datetime.utcnow()
extra_response_headers = {}
if warcprox_meta and 'accept' in warcprox_meta and \
'capture-metadata' in warcprox_meta['accept']:
rmeta = {'capture-metadata': {'timestamp': timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')}}
extra_response_headers['Warcprox-Meta'] = json.dumps(rmeta, separators=',:')
req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request( req, prox_rec_res = warcprox.mitmproxy.MitmProxyHandler._proxy_request(
self) self, extra_response_headers=extra_response_headers)
content_type = None content_type = None
try: try: