diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 3bbefb9..8b61d52 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -363,7 +363,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): else: self.send_error(500, str(e)) except Exception as f: - self.logger.warning("failed to send error response ({}) to proxy client: {}".format(e, f)) + self.logger.warning("failed to send error response ({}) to proxy client: {}".format(e, f), exc_info=True) return # Reload! @@ -489,6 +489,31 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.server.unregister_remote_server_sock( self._remote_server_conn.sock) + def _swallow_hop_by_hop_headers(self): + ''' + Swallow headers that don't make sense to forward on, i.e. + most hop-by-hop headers. + + http://tools.ietf.org/html/rfc2616#section-13.5. + ''' + # self.headers is an email.message.Message, which is case-insensitive + # and doesn't throw KeyError in __delitem__ + for key in ( + 'Warcprox-Meta', 'Connection', 'Proxy-Connection', 'Keep-Alive', + 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'): + del self.headers[key] + + def _build_request(self): + req_str = '{} {} {}\r\n'.format( + self.command, self.path, self.request_version) + + # Add headers to the request + # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( + req_str += '\r\n'.join( + '{}: {}'.format(k,v) for (k,v) in self.headers.items()) + + req = req_str.encode('latin1') + b'\r\n\r\n' + def _inner_proxy_request(self, extra_response_headers={}): ''' Sends the request to the remote server, then uses a ProxyingRecorder to @@ -500,29 +525,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): It may contain extra HTTP headers such as ``Warcprox-Meta`` which are written in the WARC record for this request. ''' - # Build request - req_str = '{} {} {}\r\n'.format( - self.command, self.path, self.request_version) - - # Swallow headers that don't make sense to forward on, i.e. most - # hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5. - # self.headers is an email.message.Message, which is case-insensitive - # and doesn't throw KeyError in __delitem__ - for key in ( - 'Connection', 'Proxy-Connection', 'Keep-Alive', - 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'): - del self.headers[key] - + self._swallow_hop_by_hop_headers() self.headers['Via'] = via_header_value( self.headers.get('Via'), self.request_version.replace('HTTP/', '')) - - # Add headers to the request - # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( - req_str += '\r\n'.join( - '{}: {}'.format(k,v) for (k,v) in self.headers.items()) - - req = req_str.encode('latin1') + b'\r\n\r\n' + req = self._build_request() # Append message body if present to the request if 'Content-Length' in self.headers: @@ -548,7 +555,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): try: buf = prox_rec_res.read(65536) except http_client.IncompleteRead as e: - self.logger.warn('%s from %s', e, self.url) + self.logger.warning('%s from %s', e, self.url) buf = e.partial if (self._max_resource_size and diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 17d0682..c6f9f34 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -188,16 +188,21 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self._enforce_limits_and_blocks() return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self) - def _proxy_request(self): - warcprox_meta = None + def _parse_warcprox_meta(self): + ''' + :return: Warcprox-Meta request header value as a dictionary, or None + ''' raw_warcprox_meta = self.headers.get('Warcprox-Meta') self.logger.trace( - 'request for %s Warcprox-Meta header: %s', self.url, - raw_warcprox_meta) + 'request for %s Warcprox-Meta header: %s', self.url, + raw_warcprox_meta) if raw_warcprox_meta: - warcprox_meta = json.loads(raw_warcprox_meta) - del self.headers['Warcprox-Meta'] + return json.loads(raw_warcprox_meta) + else: + return None + def _proxy_request(self): + warcprox_meta = self._parse_warcprox_meta() remote_ip = self._remote_server_conn.sock.getpeername()[0] timestamp = doublethink.utcnow() extra_response_headers = {} @@ -343,36 +348,21 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): except: self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True) raise + def send_error(self, code, message=None, explain=None, exception=None): super().send_error(code, message=message, explain=explain, exception=exception) - # Build request - req_str = '{} {} {}\r\n'.format( - self.command, self.path, self.request_version) + # If error happens during CONNECT handling and before the inner request, self.url + # is unset, and self.path is something like 'example.com:443' + urlish = self.url or self.path - # Swallow headers that don't make sense to forward on, i.e. most - # hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5. - # self.headers is an email.message.Message, which is case-insensitive - # and doesn't throw KeyError in __delitem__ - for key in ( - 'Connection', 'Proxy-Connection', 'Keep-Alive', - 'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'): - del self.headers[key] + warcprox_meta = self._parse_warcprox_meta() + self._swallow_hop_by_hop_headers() + request_data = self._build_request() - # Add headers to the request - # XXX in at least python3.3 str(self.headers) uses \n not \r\n :( - req_str += '\r\n'.join( - '{}: {}'.format(k,v) for (k,v) in self.headers.items()) - - warcprox_meta = None - raw_warcprox_meta = self.headers.get('Warcprox-Meta') - if raw_warcprox_meta: - warcprox_meta = json.loads(raw_warcprox_meta) - - req = req_str.encode('latin1') + b'\r\n\r\n' failed_url = FailedUrl( - url=self.url, - request_data=req, + url=urlish, + request_data=request_data, warcprox_meta=warcprox_meta, status=code, client_ip=self.client_address[0], @@ -386,7 +376,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self.server.recorded_url_q.put(failed_url) - def log_message(self, fmt, *args): # logging better handled elsewhere? pass