From b30218027e0cc5ad17c52ce839279e4775d70cd7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 4 Sep 2015 01:32:55 +0000 Subject: [PATCH] get "mimetype" (without ;params) from content-type in one place in RecordedUrl, and also note host and duration (time spent serving request) --- warcprox/bigtable.py | 8 +------- warcprox/warc.py | 2 +- warcprox/warcproxy.py | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index ea38cc9..a799d78 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -64,12 +64,6 @@ class RethinkCaptures: canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False) - mimetype = recorded_url.content_type - if mimetype: - n = mimetype.find(";") - if n >= 0: - mimetype = mimetype[:n] - entry = { # id only specified for rethinkdb partitioning "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), @@ -82,7 +76,7 @@ class RethinkCaptures: "warc_type": records[0].type.decode("utf-8"), "warc_id": records[0].id.decode("utf-8"), "sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"), - "content_type": mimetype, + "content_type": recorded_url.mimetype, "response_code": recorded_url.status, "http_method": recorded_url.method, "bucket": bucket, diff --git a/warcprox/warc.py b/warcprox/warc.py index 1c535ae..bea4a89 100644 --- a/warcprox/warc.py +++ b/warcprox/warc.py @@ -64,7 +64,7 @@ class WarcRecordBuilder: principal_record = self.build_warc_record(url=recorded_url.url, warc_date=warc_date, data=recorded_url.request_data, warc_type=recorded_url.custom_type, - content_type=recorded_url.content_type) + content_type=recorded_url.content_type.encode("latin1")) return (principal_record,) def build_warc_record(self, url, warc_date=None, recorder=None, data=None, diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 9b82ac5..ef95d28 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -237,7 +237,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): status=prox_rec_res.status, size=prox_rec_res.recorder.len, client_ip=self.client_address[0], content_type=prox_rec_res.getheader("Content-Type"), - method=self.command, timestamp=timestamp) + method=self.command, timestamp=timestamp, + host=self.hostname, duration=datetime.datetime.utcnow()-timestamp) self.server.recorded_url_q.put(recorded_url) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) @@ -278,7 +279,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): response_recorder=None, remote_ip=b'', warcprox_meta=warcprox_meta, - content_type=self.headers['Content-Type'].encode('latin1'), + content_type=self.headers['Content-Type'], custom_type=warc_type or self.headers['WARC-Type'], status=204, size=len(request_data), client_ip=self.client_address[0], @@ -309,7 +310,7 @@ class RecordedUrl: def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, - timestamp=None): + timestamp=None, host=None, duration=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -330,14 +331,24 @@ class RecordedUrl: else: self.warcprox_meta = {} + if isinstance(content_type, bytes): + raise Exception("content_type is not supposed to be bytes!") self.content_type = content_type - self.custom_type = custom_type + self.mimetype = content_type + if self.mimetype: + n = self.mimetype.find(";") + if n >= 0: + self.mimetype = self.mimetype[:n] + + self.custom_type = custom_type self.status = status self.size = size self.client_ip = client_ip self.method = method self.timestamp = timestamp + self.host = host + self.duration = duration def __del__(self): self.logger.debug("finished with %s", self)