mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Derive "mimetype" (the content-type without ;params) in one place, in RecordedUrl, and also record host and duration (time spent serving the request)
This commit is contained in:
parent
fee200c72c
commit
b30218027e
@ -64,12 +64,6 @@ class RethinkCaptures:
|
||||
|
||||
canon_surt = surt.surt(recorded_url.url.decode("utf-8"), trailing_comma=True, host_massage=False)
|
||||
|
||||
mimetype = recorded_url.content_type
|
||||
if mimetype:
|
||||
n = mimetype.find(";")
|
||||
if n >= 0:
|
||||
mimetype = mimetype[:n]
|
||||
|
||||
entry = {
|
||||
# id only specified for rethinkdb partitioning
|
||||
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
|
||||
@ -82,7 +76,7 @@ class RethinkCaptures:
|
||||
"warc_type": records[0].type.decode("utf-8"),
|
||||
"warc_id": records[0].id.decode("utf-8"),
|
||||
"sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"),
|
||||
"content_type": mimetype,
|
||||
"content_type": recorded_url.mimetype,
|
||||
"response_code": recorded_url.status,
|
||||
"http_method": recorded_url.method,
|
||||
"bucket": bucket,
|
||||
|
@ -64,7 +64,7 @@ class WarcRecordBuilder:
|
||||
principal_record = self.build_warc_record(url=recorded_url.url,
|
||||
warc_date=warc_date, data=recorded_url.request_data,
|
||||
warc_type=recorded_url.custom_type,
|
||||
content_type=recorded_url.content_type)
|
||||
content_type=recorded_url.content_type.encode("latin1"))
|
||||
return (principal_record,)
|
||||
|
||||
def build_warc_record(self, url, warc_date=None, recorder=None, data=None,
|
||||
|
@ -237,7 +237,8 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
status=prox_rec_res.status, size=prox_rec_res.recorder.len,
|
||||
client_ip=self.client_address[0],
|
||||
content_type=prox_rec_res.getheader("Content-Type"),
|
||||
method=self.command, timestamp=timestamp)
|
||||
method=self.command, timestamp=timestamp,
|
||||
host=self.hostname, duration=datetime.datetime.utcnow()-timestamp)
|
||||
self.server.recorded_url_q.put(recorded_url)
|
||||
|
||||
self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
|
||||
@ -278,7 +279,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
response_recorder=None,
|
||||
remote_ip=b'',
|
||||
warcprox_meta=warcprox_meta,
|
||||
content_type=self.headers['Content-Type'].encode('latin1'),
|
||||
content_type=self.headers['Content-Type'],
|
||||
custom_type=warc_type or self.headers['WARC-Type'],
|
||||
status=204, size=len(request_data),
|
||||
client_ip=self.client_address[0],
|
||||
@ -309,7 +310,7 @@ class RecordedUrl:
|
||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||
warcprox_meta=None, content_type=None, custom_type=None,
|
||||
status=None, size=None, client_ip=None, method=None,
|
||||
timestamp=None):
|
||||
timestamp=None, host=None, duration=None):
|
||||
# XXX should test what happens with non-ascii url (when does
|
||||
# url-encoding happen?)
|
||||
if type(url) is not bytes:
|
||||
@ -330,14 +331,24 @@ class RecordedUrl:
|
||||
else:
|
||||
self.warcprox_meta = {}
|
||||
|
||||
if isinstance(content_type, bytes):
|
||||
raise Exception("content_type is not supposed to be bytes!")
|
||||
self.content_type = content_type
|
||||
self.custom_type = custom_type
|
||||
|
||||
self.mimetype = content_type
|
||||
if self.mimetype:
|
||||
n = self.mimetype.find(";")
|
||||
if n >= 0:
|
||||
self.mimetype = self.mimetype[:n]
|
||||
|
||||
self.custom_type = custom_type
|
||||
self.status = status
|
||||
self.size = size
|
||||
self.client_ip = client_ip
|
||||
self.method = method
|
||||
self.timestamp = timestamp
|
||||
self.host = host
|
||||
self.duration = duration
|
||||
|
||||
def __del__(self):
|
||||
self.logger.debug("finished with %s", self)
|
||||
|
Loading…
x
Reference in New Issue
Block a user