mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Get "mimetype" (without ;params) from Content-Type in one place, in RecordedUrl; also record host and duration (time spent serving the request)
This commit is contained in:
parent
fee200c72c
commit
b30218027e
@ -64,12 +64,6 @@ class RethinkCaptures:
|
|||||||
|
|
||||||
canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False)
|
canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False)
|
||||||
|
|
||||||
mimetype = recorded_url.content_type
|
|
||||||
if mimetype:
|
|
||||||
n = mimetype.find(";")
|
|
||||||
if n >= 0:
|
|
||||||
mimetype = mimetype[:n]
|
|
||||||
|
|
||||||
entry = {
|
entry = {
|
||||||
# id only specified for rethinkdb partitioning
|
# id only specified for rethinkdb partitioning
|
||||||
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
|
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
|
||||||
@ -82,7 +76,7 @@ class RethinkCaptures:
|
|||||||
"warc_type": records[0].type.decode("utf-8"),
|
"warc_type": records[0].type.decode("utf-8"),
|
||||||
"warc_id": records[0].id.decode("utf-8"),
|
"warc_id": records[0].id.decode("utf-8"),
|
||||||
"sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"),
|
"sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"),
|
||||||
"content_type": mimetype,
|
"content_type": recorded_url.mimetype,
|
||||||
"response_code": recorded_url.status,
|
"response_code": recorded_url.status,
|
||||||
"http_method": recorded_url.method,
|
"http_method": recorded_url.method,
|
||||||
"bucket": bucket,
|
"bucket": bucket,
|
||||||
|
@ -64,7 +64,7 @@ class WarcRecordBuilder:
|
|||||||
principal_record = self.build_warc_record(url=recorded_url.url,
|
principal_record = self.build_warc_record(url=recorded_url.url,
|
||||||
warc_date=warc_date, data=recorded_url.request_data,
|
warc_date=warc_date, data=recorded_url.request_data,
|
||||||
warc_type=recorded_url.custom_type,
|
warc_type=recorded_url.custom_type,
|
||||||
content_type=recorded_url.content_type)
|
content_type=recorded_url.content_type.encode("latin1"))
|
||||||
return (principal_record,)
|
return (principal_record,)
|
||||||
|
|
||||||
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
|
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
|
||||||
|
@ -237,7 +237,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
status=prox_rec_res.status, size=prox_rec_res.recorder.len,
|
status=prox_rec_res.status, size=prox_rec_res.recorder.len,
|
||||||
client_ip=self.client_address[0],
|
client_ip=self.client_address[0],
|
||||||
content_type=prox_rec_res.getheader("Content-Type"),
|
content_type=prox_rec_res.getheader("Content-Type"),
|
||||||
method=self.command, timestamp=timestamp)
|
method=self.command, timestamp=timestamp,
|
||||||
|
host=self.hostname, duration=datetime.datetime.utcnow()-timestamp)
|
||||||
self.server.recorded_url_q.put(recorded_url)
|
self.server.recorded_url_q.put(recorded_url)
|
||||||
|
|
||||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||||
@ -278,7 +279,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
response_recorder=None,
|
response_recorder=None,
|
||||||
remote_ip=b'',
|
remote_ip=b'',
|
||||||
warcprox_meta=warcprox_meta,
|
warcprox_meta=warcprox_meta,
|
||||||
content_type=self.headers['Content-Type'].encode('latin1'),
|
content_type=self.headers['Content-Type'],
|
||||||
custom_type=warc_type or self.headers['WARC-Type'],
|
custom_type=warc_type or self.headers['WARC-Type'],
|
||||||
status=204, size=len(request_data),
|
status=204, size=len(request_data),
|
||||||
client_ip=self.client_address[0],
|
client_ip=self.client_address[0],
|
||||||
@ -309,7 +310,7 @@ class RecordedUrl:
|
|||||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||||
warcprox_meta=None, content_type=None, custom_type=None,
|
warcprox_meta=None, content_type=None, custom_type=None,
|
||||||
status=None, size=None, client_ip=None, method=None,
|
status=None, size=None, client_ip=None, method=None,
|
||||||
timestamp=None):
|
timestamp=None, host=None, duration=None):
|
||||||
# XXX should test what happens with non-ascii url (when does
|
# XXX should test what happens with non-ascii url (when does
|
||||||
# url-encoding happen?)
|
# url-encoding happen?)
|
||||||
if type(url) is not bytes:
|
if type(url) is not bytes:
|
||||||
@ -330,14 +331,24 @@ class RecordedUrl:
|
|||||||
else:
|
else:
|
||||||
self.warcprox_meta = {}
|
self.warcprox_meta = {}
|
||||||
|
|
||||||
|
if isinstance(content_type, bytes):
|
||||||
|
raise Exception("content_type is not supposed to be bytes!")
|
||||||
self.content_type = content_type
|
self.content_type = content_type
|
||||||
self.custom_type = custom_type
|
|
||||||
|
|
||||||
|
self.mimetype = content_type
|
||||||
|
if self.mimetype:
|
||||||
|
n = self.mimetype.find(";")
|
||||||
|
if n >= 0:
|
||||||
|
self.mimetype = self.mimetype[:n]
|
||||||
|
|
||||||
|
self.custom_type = custom_type
|
||||||
self.status = status
|
self.status = status
|
||||||
self.size = size
|
self.size = size
|
||||||
self.client_ip = client_ip
|
self.client_ip = client_ip
|
||||||
self.method = method
|
self.method = method
|
||||||
self.timestamp = timestamp
|
self.timestamp = timestamp
|
||||||
|
self.host = host
|
||||||
|
self.duration = duration
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
self.logger.debug("finished with %s", self)
|
self.logger.debug("finished with %s", self)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user