mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
logging and exception handling tweaks
This commit is contained in:
parent
eb7de9d3f9
commit
86eab2119a
@ -32,7 +32,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
self.url = self.path
|
self.url = self.path
|
||||||
u = urllib_parse.urlparse(self.url)
|
u = urllib_parse.urlparse(self.url)
|
||||||
if u.scheme != 'http':
|
if u.scheme != 'http':
|
||||||
raise Exception('Unknown scheme %s' % repr(u.scheme))
|
raise Exception('unable to parse request "{}" as a proxy request'.format(self.requestline))
|
||||||
self.hostname = u.hostname
|
self.hostname = u.hostname
|
||||||
self.port = u.port or 80
|
self.port = u.port or 80
|
||||||
self.path = urllib_parse.urlunparse(
|
self.path = urllib_parse.urlunparse(
|
||||||
@ -83,6 +83,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
self._transition_to_ssl()
|
self._transition_to_ssl()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
try:
|
try:
|
||||||
|
self.logger.error("problem with connect line {}: {}".format(repr(self.requestline), e))
|
||||||
if type(e) is socket.timeout:
|
if type(e) is socket.timeout:
|
||||||
self.send_error(504, str(e))
|
self.send_error(504, str(e))
|
||||||
else:
|
else:
|
||||||
@ -129,13 +130,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
self._connect_to_host()
|
self._connect_to_host()
|
||||||
assert self.url
|
assert self.url
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e))
|
||||||
self.send_error(500, str(e))
|
self.send_error(500, str(e))
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
# if self.is_connect we already connected in do_CONNECT
|
# if self.is_connect we already connected in do_CONNECT
|
||||||
self.url = self._construct_tunneled_url()
|
self.url = self._construct_tunneled_url()
|
||||||
|
|
||||||
self._proxy_request()
|
try:
|
||||||
|
self._proxy_request()
|
||||||
|
except:
|
||||||
|
self.logger.error("exception from {}".format(self._proxy_request), exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
def _special_request(self, method, type_):
|
def _special_request(self, method, type_):
|
||||||
raise Exception('Not supported')
|
raise Exception('Not supported')
|
||||||
@ -147,12 +153,4 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
if item.startswith('do_'):
|
if item.startswith('do_'):
|
||||||
return self.do_COMMAND
|
return self.do_COMMAND
|
||||||
|
|
||||||
def log_error(self, fmt, *args):
|
|
||||||
self.logger.error("{0} - - [{1}] {2}".format(self.address_string(),
|
|
||||||
self.log_date_time_string(), fmt % args))
|
|
||||||
|
|
||||||
def log_message(self, fmt, *args):
|
|
||||||
self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__,
|
|
||||||
self.address_string(), self.log_date_time_string(), fmt % args))
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -210,8 +210,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
|
|
||||||
recorded_url = RecordedUrl(url=self.url, request_data=req,
|
recorded_url = RecordedUrl(url=self.url, request_data=req,
|
||||||
response_recorder=h.recorder, remote_ip=remote_ip,
|
response_recorder=h.recorder, remote_ip=remote_ip,
|
||||||
warcprox_meta=warcprox_meta, method=self.command,
|
warcprox_meta=warcprox_meta,
|
||||||
status=h.status, size=h.recorder.len)
|
status=h.status, size=h.recorder.len,
|
||||||
|
client_ip=self.client_address[0],
|
||||||
|
content_type=h.getheader("Content-Type"),
|
||||||
|
method=self.command)
|
||||||
self.server.recorded_url_q.put(recorded_url)
|
self.server.recorded_url_q.put(recorded_url)
|
||||||
|
|
||||||
return recorded_url
|
return recorded_url
|
||||||
@ -233,8 +236,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
warcprox_meta=warcprox_meta,
|
warcprox_meta=warcprox_meta,
|
||||||
content_type=self.headers['Content-Type'].encode('latin1'),
|
content_type=self.headers['Content-Type'].encode('latin1'),
|
||||||
custom_type=type_,
|
custom_type=type_,
|
||||||
method=method,
|
status=204, size=len(request_data),
|
||||||
status=204, size=len(request_data))
|
client_ip=self.client_address[0],
|
||||||
|
method=method)
|
||||||
|
|
||||||
self.server.recorded_url_q.put(rec_custom)
|
self.server.recorded_url_q.put(rec_custom)
|
||||||
self.send_response(204, 'OK')
|
self.send_response(204, 'OK')
|
||||||
@ -254,10 +258,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
# logging better handled elsewhere?
|
# logging better handled elsewhere?
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class RecordedUrl(object):
|
class RecordedUrl(object):
|
||||||
def __init__(self, url, request_data, response_recorder, remote_ip,
|
def __init__(self, url, request_data, response_recorder, remote_ip,
|
||||||
warcprox_meta=None, content_type=None, custom_type=None,
|
warcprox_meta=None, content_type=None, custom_type=None,
|
||||||
method=None, status=None, size=None):
|
status=None, size=None, client_ip=None, method=None):
|
||||||
# XXX should test what happens with non-ascii url (when does
|
# XXX should test what happens with non-ascii url (when does
|
||||||
# url-encoding happen?)
|
# url-encoding happen?)
|
||||||
if type(url) is not bytes:
|
if type(url) is not bytes:
|
||||||
@ -281,9 +286,10 @@ class RecordedUrl(object):
|
|||||||
self.content_type = content_type
|
self.content_type = content_type
|
||||||
self.custom_type = custom_type
|
self.custom_type = custom_type
|
||||||
|
|
||||||
self.method = method
|
|
||||||
self.status = status
|
self.status = status
|
||||||
self.size = size
|
self.size = size
|
||||||
|
self.client_ip = client_ip
|
||||||
|
self.method = method
|
||||||
|
|
||||||
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
|
class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
|
||||||
logger = logging.getLogger("warcprox.warcprox.WarcProxy")
|
logger = logging.getLogger("warcprox.warcprox.WarcProxy")
|
||||||
|
@ -226,6 +226,12 @@ class WarcWriter:
|
|||||||
|
|
||||||
return self._f
|
return self._f
|
||||||
|
|
||||||
|
def _decode(self, x):
    """Return *x* as text, decoding it from UTF-8 when it is ``bytes``.

    Any value that is not ``bytes`` (text, numbers, ``None``) is returned
    unchanged, so callers can pass fields of mixed type straight into a
    log format call.

    Bytes that are not valid UTF-8 are decoded with the ``"replace"``
    error handler instead of raising ``UnicodeDecodeError``. The values
    passed here are only rendered into a log line, and they are not
    guaranteed to be UTF-8 (a content type may have been produced with
    ``.encode('latin1')``), so a bad byte must not abort the caller.
    """
    if isinstance(x, bytes):
        # Positional error handler keeps this call valid on both
        # Python 2.7 and Python 3.
        return x.decode("utf-8", "replace")
    return x
|
||||||
|
|
||||||
def _final_tasks(self, recorded_url, recordset, recordset_offset):
|
def _final_tasks(self, recorded_url, recordset, recordset_offset):
|
||||||
if (self.dedup_db is not None
|
if (self.dedup_db is not None
|
||||||
and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
||||||
@ -245,11 +251,20 @@ class WarcWriter:
|
|||||||
payload_digest = recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
|
payload_digest = recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
|
||||||
except:
|
except:
|
||||||
payload_digest = "-"
|
payload_digest = "-"
|
||||||
|
mimetype = self._decode(recorded_url.content_type)
|
||||||
|
mimetype = mimetype[:mimetype.find(";")]
|
||||||
|
|
||||||
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
|
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
|
||||||
self.logger.info("{} {} {} size={} {} {} offset={}".format(
|
self.logger.info("{} {} {} {} {} size={} {} {} offset={}".format(
|
||||||
recorded_url.status, recorded_url.method,
|
self._decode(recorded_url.client_ip),
|
||||||
recorded_url.url.decode('utf-8'), recorded_url.size,
|
self._decode(recorded_url.status),
|
||||||
payload_digest, self._f_finalname, recordset_offset))
|
self._decode(recorded_url.method),
|
||||||
|
self._decode(recorded_url.url),
|
||||||
|
mimetype,
|
||||||
|
recorded_url.size,
|
||||||
|
self._decode(payload_digest),
|
||||||
|
self._decode(self._f_finalname),
|
||||||
|
recordset_offset))
|
||||||
|
|
||||||
def write_records(self, recorded_url):
|
def write_records(self, recorded_url):
|
||||||
recordset = self.build_warc_records(recorded_url)
|
recordset = self.build_warc_records(recorded_url)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user