mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Share code, handle exception during CONNECT
This commit is contained in:
parent
4ceebe1fa9
commit
a5e9c27223
@ -363,7 +363,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
else:
|
else:
|
||||||
self.send_error(500, str(e))
|
self.send_error(500, str(e))
|
||||||
except Exception as f:
|
except Exception as f:
|
||||||
self.logger.warning("failed to send error response ({}) to proxy client: {}".format(e, f))
|
self.logger.warning("failed to send error response ({}) to proxy client: {}".format(e, f), exc_info=True)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Reload!
|
# Reload!
|
||||||
@ -489,6 +489,31 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
self.server.unregister_remote_server_sock(
|
self.server.unregister_remote_server_sock(
|
||||||
self._remote_server_conn.sock)
|
self._remote_server_conn.sock)
|
||||||
|
|
||||||
|
def _swallow_hop_by_hop_headers(self):
    '''
    Remove, in place, the request headers that should not be forwarded
    to the remote server, i.e. most hop-by-hop headers plus the
    ``Warcprox-Meta`` header.

    http://tools.ietf.org/html/rfc2616#section-13.5.
    '''
    unforwardable = (
            'Warcprox-Meta', 'Connection', 'Proxy-Connection', 'Keep-Alive',
            'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade')
    # self.headers is an email.message.Message: lookups are
    # case-insensitive and deleting an absent header does not raise
    # KeyError, so no membership check is needed first
    request_headers = self.headers
    for header_name in unforwardable:
        del request_headers[header_name]
|
||||||
|
|
||||||
|
def _build_request(self):
    '''
    Serialize the request line and the current request headers into the
    raw bytes of an HTTP request head.

    Reads ``self.command``, ``self.path``, ``self.request_version`` and
    ``self.headers``. No message body is appended; callers add it
    themselves if there is one.

    :return: request line and headers, CRLF-separated, latin1-encoded and
        terminated by an empty line
    '''
    request_line = '{} {} {}'.format(
            self.command, self.path, self.request_version)

    # Add headers to the request
    # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
    header_lines = [
            '{}: {}'.format(k, v) for (k, v) in self.headers.items()]

    # Joining the request line together with the headers keeps the head
    # well-formed when there are no headers at all: exactly one empty line
    # ends it, rather than an extra stray CRLF.
    req_str = '\r\n'.join([request_line] + header_lines)

    # The callers use the result (``req = self._build_request()``), so the
    # encoded bytes must actually be returned.
    return req_str.encode('latin1') + b'\r\n\r\n'
|
||||||
|
|
||||||
def _inner_proxy_request(self, extra_response_headers={}):
|
def _inner_proxy_request(self, extra_response_headers={}):
|
||||||
'''
|
'''
|
||||||
Sends the request to the remote server, then uses a ProxyingRecorder to
|
Sends the request to the remote server, then uses a ProxyingRecorder to
|
||||||
@ -500,29 +525,11 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
It may contain extra HTTP headers such as ``Warcprox-Meta`` which
|
It may contain extra HTTP headers such as ``Warcprox-Meta`` which
|
||||||
are written in the WARC record for this request.
|
are written in the WARC record for this request.
|
||||||
'''
|
'''
|
||||||
# Build request
|
self._swallow_hop_by_hop_headers()
|
||||||
req_str = '{} {} {}\r\n'.format(
|
|
||||||
self.command, self.path, self.request_version)
|
|
||||||
|
|
||||||
# Swallow headers that don't make sense to forward on, i.e. most
|
|
||||||
# hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
|
|
||||||
# self.headers is an email.message.Message, which is case-insensitive
|
|
||||||
# and doesn't throw KeyError in __delitem__
|
|
||||||
for key in (
|
|
||||||
'Connection', 'Proxy-Connection', 'Keep-Alive',
|
|
||||||
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
|
|
||||||
del self.headers[key]
|
|
||||||
|
|
||||||
self.headers['Via'] = via_header_value(
|
self.headers['Via'] = via_header_value(
|
||||||
self.headers.get('Via'),
|
self.headers.get('Via'),
|
||||||
self.request_version.replace('HTTP/', ''))
|
self.request_version.replace('HTTP/', ''))
|
||||||
|
req = self._build_request()
|
||||||
# Add headers to the request
|
|
||||||
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
|
|
||||||
req_str += '\r\n'.join(
|
|
||||||
'{}: {}'.format(k,v) for (k,v) in self.headers.items())
|
|
||||||
|
|
||||||
req = req_str.encode('latin1') + b'\r\n\r\n'
|
|
||||||
|
|
||||||
# Append message body if present to the request
|
# Append message body if present to the request
|
||||||
if 'Content-Length' in self.headers:
|
if 'Content-Length' in self.headers:
|
||||||
@ -548,7 +555,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
|||||||
try:
|
try:
|
||||||
buf = prox_rec_res.read(65536)
|
buf = prox_rec_res.read(65536)
|
||||||
except http_client.IncompleteRead as e:
|
except http_client.IncompleteRead as e:
|
||||||
self.logger.warn('%s from %s', e, self.url)
|
self.logger.warning('%s from %s', e, self.url)
|
||||||
buf = e.partial
|
buf = e.partial
|
||||||
|
|
||||||
if (self._max_resource_size and
|
if (self._max_resource_size and
|
||||||
|
@ -188,16 +188,21 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
self._enforce_limits_and_blocks()
|
self._enforce_limits_and_blocks()
|
||||||
return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
|
return warcprox.mitmproxy.MitmProxyHandler._connect_to_remote_server(self)
|
||||||
|
|
||||||
def _proxy_request(self):
|
def _parse_warcprox_meta(self):
|
||||||
warcprox_meta = None
|
'''
|
||||||
|
:return: Warcprox-Meta request header value as a dictionary, or None
|
||||||
|
'''
|
||||||
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
|
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
|
||||||
self.logger.trace(
|
self.logger.trace(
|
||||||
'request for %s Warcprox-Meta header: %s', self.url,
|
'request for %s Warcprox-Meta header: %s', self.url,
|
||||||
raw_warcprox_meta)
|
raw_warcprox_meta)
|
||||||
if raw_warcprox_meta:
|
if raw_warcprox_meta:
|
||||||
warcprox_meta = json.loads(raw_warcprox_meta)
|
return json.loads(raw_warcprox_meta)
|
||||||
del self.headers['Warcprox-Meta']
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _proxy_request(self):
|
||||||
|
warcprox_meta = self._parse_warcprox_meta()
|
||||||
remote_ip = self._remote_server_conn.sock.getpeername()[0]
|
remote_ip = self._remote_server_conn.sock.getpeername()[0]
|
||||||
timestamp = doublethink.utcnow()
|
timestamp = doublethink.utcnow()
|
||||||
extra_response_headers = {}
|
extra_response_headers = {}
|
||||||
@ -343,36 +348,21 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
except:
|
except:
|
||||||
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
|
self.logger.error("uncaught exception in do_WARCPROX_WRITE_RECORD", exc_info=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def send_error(self, code, message=None, explain=None, exception=None):
|
def send_error(self, code, message=None, explain=None, exception=None):
|
||||||
super().send_error(code, message=message, explain=explain, exception=exception)
|
super().send_error(code, message=message, explain=explain, exception=exception)
|
||||||
|
|
||||||
# Build request
|
# If error happens during CONNECT handling and before the inner request, self.url
|
||||||
req_str = '{} {} {}\r\n'.format(
|
# is unset, and self.path is something like 'example.com:443'
|
||||||
self.command, self.path, self.request_version)
|
urlish = self.url or self.path
|
||||||
|
|
||||||
# Swallow headers that don't make sense to forward on, i.e. most
|
warcprox_meta = self._parse_warcprox_meta()
|
||||||
# hop-by-hop headers. http://tools.ietf.org/html/rfc2616#section-13.5.
|
self._swallow_hop_by_hop_headers()
|
||||||
# self.headers is an email.message.Message, which is case-insensitive
|
request_data = self._build_request()
|
||||||
# and doesn't throw KeyError in __delitem__
|
|
||||||
for key in (
|
|
||||||
'Connection', 'Proxy-Connection', 'Keep-Alive',
|
|
||||||
'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
|
|
||||||
del self.headers[key]
|
|
||||||
|
|
||||||
# Add headers to the request
|
|
||||||
# XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
|
|
||||||
req_str += '\r\n'.join(
|
|
||||||
'{}: {}'.format(k,v) for (k,v) in self.headers.items())
|
|
||||||
|
|
||||||
warcprox_meta = None
|
|
||||||
raw_warcprox_meta = self.headers.get('Warcprox-Meta')
|
|
||||||
if raw_warcprox_meta:
|
|
||||||
warcprox_meta = json.loads(raw_warcprox_meta)
|
|
||||||
|
|
||||||
req = req_str.encode('latin1') + b'\r\n\r\n'
|
|
||||||
failed_url = FailedUrl(
|
failed_url = FailedUrl(
|
||||||
url=self.url,
|
url=urlish,
|
||||||
request_data=req,
|
request_data=request_data,
|
||||||
warcprox_meta=warcprox_meta,
|
warcprox_meta=warcprox_meta,
|
||||||
status=code,
|
status=code,
|
||||||
client_ip=self.client_address[0],
|
client_ip=self.client_address[0],
|
||||||
@ -386,7 +376,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
|||||||
|
|
||||||
self.server.recorded_url_q.put(failed_url)
|
self.server.recorded_url_q.put(failed_url)
|
||||||
|
|
||||||
|
|
||||||
def log_message(self, fmt, *args):
    '''
    Intentionally do nothing: the format string and its arguments are
    discarded, nothing is formatted or written.
    '''
    # logging better handled elsewhere?
    return None
|
||||||
|
Loading…
x
Reference in New Issue
Block a user