diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 74b6ca7..b1a9a5b 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -32,7 +32,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.url = self.path u = urllib_parse.urlparse(self.url) if u.scheme != 'http': - raise Exception('Unknown scheme %s' % repr(u.scheme)) + raise Exception('unable to parse request "{}" as a proxy request'.format(self.requestline)) self.hostname = u.hostname self.port = u.port or 80 self.path = urllib_parse.urlunparse( @@ -83,6 +83,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._transition_to_ssl() except Exception as e: try: + self.logger.error("problem with connect line {}: {}".format(repr(self.requestline), e)) if type(e) is socket.timeout: self.send_error(504, str(e)) else: @@ -129,13 +130,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._connect_to_host() assert self.url except Exception as e: + self.logger.error("problem processing request {}: {}".format(repr(self.requestline), e)) self.send_error(500, str(e)) return else: # if self.is_connect we already connected in do_CONNECT self.url = self._construct_tunneled_url() - self._proxy_request() + try: + self._proxy_request() + except: + self.logger.error("exception from {}".format(self._proxy_request), exc_info=True) + raise def _special_request(self, method, type_): raise Exception('Not supported') @@ -147,12 +153,4 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): if item.startswith('do_'): return self.do_COMMAND - def log_error(self, fmt, *args): - self.logger.error("{0} - - [{1}] {2}".format(self.address_string(), - self.log_date_time_string(), fmt % args)) - - def log_message(self, fmt, *args): - self.logger.info("{} {} - - [{}] {}".format(self.__class__.__name__, - self.address_string(), self.log_date_time_string(), fmt % args)) - diff --git a/warcprox/warcprox.py b/warcprox/warcprox.py index 59115fb..4f0cafe 100644 --- a/warcprox/warcprox.py +++ b/warcprox/warcprox.py @@ -210,8 +210,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): recorded_url = RecordedUrl(url=self.url, request_data=req, response_recorder=h.recorder, remote_ip=remote_ip, - warcprox_meta=warcprox_meta, method=self.command, - status=h.status, size=h.recorder.len) + warcprox_meta=warcprox_meta, + status=h.status, size=h.recorder.len, + client_ip=self.client_address[0], + content_type=h.getheader("Content-Type"), + method=self.command) self.server.recorded_url_q.put(recorded_url) return recorded_url @@ -233,8 +236,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox_meta=warcprox_meta, content_type=self.headers['Content-Type'].encode('latin1'), custom_type=type_, - method=method, - status=204, size=len(request_data)) + status=204, size=len(request_data), + client_ip=self.client_address[0], + method=method) self.server.recorded_url_q.put(rec_custom) self.send_response(204, 'OK') @@ -254,10 +258,11 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): # logging better handled elsewhere? pass + class RecordedUrl(object): def __init__(self, url, request_data, response_recorder, remote_ip, warcprox_meta=None, content_type=None, custom_type=None, - method=None, status=None, size=None): + status=None, size=None, client_ip=None, method=None): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -281,9 +286,10 @@ class RecordedUrl(object): self.content_type = content_type self.custom_type = custom_type - self.method = method self.status = status self.size = size + self.client_ip = client_ip + self.method = method class WarcProxy(socketserver.ThreadingMixIn, http_server.HTTPServer): logger = logging.getLogger("warcprox.warcprox.WarcProxy") diff --git a/warcprox/warcwriter.py b/warcprox/warcwriter.py index 7146904..936e485 100644 --- a/warcprox/warcwriter.py +++ b/warcprox/warcwriter.py @@ -226,6 +226,12 @@ class WarcWriter: return self._f + def _decode(self, x): + if isinstance(x, bytes): + return x.decode("utf-8") + else: + return x + def _final_tasks(self, recorded_url, recordset, recordset_offset): if (self.dedup_db is not None and recordset[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE @@ -245,11 +251,20 @@ class WarcWriter: payload_digest = recordset[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") except: payload_digest = "-" + mimetype = self._decode(recorded_url.content_type) + mimetype = mimetype[:mimetype.find(";")] + # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} - self.logger.info("{} {} {} size={} {} {} offset={}".format( - recorded_url.status, recorded_url.method, - recorded_url.url.decode('utf-8'), recorded_url.size, - payload_digest, self._f_finalname, recordset_offset)) + self.logger.info("{} {} {} {} {} size={} {} {} offset={}".format( + self._decode(recorded_url.client_ip), + self._decode(recorded_url.status), + self._decode(recorded_url.method), + self._decode(recorded_url.url), + mimetype, + recorded_url.size, + self._decode(payload_digest), + self._decode(self._f_finalname), + recordset_offset)) def write_records(self, recorded_url): recordset = self.build_warc_records(recorded_url)