Compute "mimetype" (the Content-Type value without its ;params) in one place, in RecordedUrl, and also record host and duration (time spent serving the request)

This commit is contained in:
Noah Levitt 2015-09-04 01:32:55 +00:00
parent fee200c72c
commit b30218027e
3 changed files with 17 additions and 12 deletions

View File

@ -64,12 +64,6 @@ class RethinkCaptures:
canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False)
mimetype = recorded_url.content_type
if mimetype:
n = mimetype.find(";")
if n >= 0:
mimetype = mimetype[:n]
entry = {
# id only specified for rethinkdb partitioning
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
@ -82,7 +76,7 @@ class RethinkCaptures:
"warc_type": records[0].type.decode("utf-8"),
"warc_id": records[0].id.decode("utf-8"),
"sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"),
"content_type": mimetype,
"content_type": recorded_url.mimetype,
"response_code": recorded_url.status,
"http_method": recorded_url.method,
"bucket": bucket,

View File

@ -64,7 +64,7 @@ class WarcRecordBuilder:
principal_record = self.build_warc_record(url=recorded_url.url,
warc_date=warc_date, data=recorded_url.request_data,
warc_type=recorded_url.custom_type,
content_type=recorded_url.content_type)
content_type=recorded_url.content_type.encode("latin1"))
return (principal_record,)
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,

View File

@ -237,7 +237,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
status=prox_rec_res.status, size=prox_rec_res.recorder.len,
client_ip=self.client_address[0],
content_type=prox_rec_res.getheader("Content-Type"),
method=self.command, timestamp=timestamp)
method=self.command, timestamp=timestamp,
host=self.hostname, duration=datetime.datetime.utcnow()-timestamp)
self.server.recorded_url_q.put(recorded_url)
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
@ -278,7 +279,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
response_recorder=None,
remote_ip=b'',
warcprox_meta=warcprox_meta,
content_type=self.headers['Content-Type'].encode('latin1'),
content_type=self.headers['Content-Type'],
custom_type=warc_type or self.headers['WARC-Type'],
status=204, size=len(request_data),
client_ip=self.client_address[0],
@ -309,7 +310,7 @@ class RecordedUrl:
def __init__(self, url, request_data, response_recorder, remote_ip,
warcprox_meta=None, content_type=None, custom_type=None,
status=None, size=None, client_ip=None, method=None,
timestamp=None):
timestamp=None, host=None, duration=None):
# XXX should test what happens with non-ascii url (when does
# url-encoding happen?)
if type(url) is not bytes:
@ -330,14 +331,24 @@ class RecordedUrl:
else:
self.warcprox_meta = {}
if isinstance(content_type, bytes):
raise Exception("content_type is not supposed to be bytes!")
self.content_type = content_type
self.custom_type = custom_type
self.mimetype = content_type
if self.mimetype:
n = self.mimetype.find(";")
if n >= 0:
self.mimetype = self.mimetype[:n]
self.custom_type = custom_type
self.status = status
self.size = size
self.client_ip = client_ip
self.method = method
self.timestamp = timestamp
self.host = host
self.duration = duration
def __del__(self):
self.logger.debug("finished with %s", self)