From f51f2ec225d8fec39bdc5848d76e8c6a1c762b86 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 14 May 2019 15:51:11 -0700 Subject: [PATCH] some tweaks to error responses use 502, 504 when appropriate, and don't send `str(e)` as in the http status line, because that is often an ugly jumble --- setup.py | 2 +- tests/test_warcprox.py | 4 ++-- warcprox/__init__.py | 9 +++++++++ warcprox/mitmproxy.py | 31 +++++++++++++++++++++---------- warcprox/warcproxy.py | 2 +- 5 files changed, 34 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index 29221af..56e8213 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.11', + version='2.4.12', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 4323a6c..d34bb43 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1724,13 +1724,13 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies): url = 'http://localhost:%s/b/b' % http_daemon.server_port headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})} response = requests.get(url, proxies=archiving_proxies, headers=headers) - assert response.status_code == 500 + assert response.status_code == 400 assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' url = 'http://localhost:%s/b/c' % http_daemon.server_port headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})} response = requests.get(url, proxies=archiving_proxies, headers=headers) - assert response.status_code == 500 + assert response.status_code == 400 assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix' def test_crawl_log(warcprox_, http_daemon, archiving_proxies): diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 852d3fc..9cd09a8 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -78,6 +78,15 @@ class RequestBlockedByRule(Exception): def __str__(self): return "%s: %s" % (self.__class__.__name__, self.msg) +class BadRequest(Exception): + ''' + Raised in case of a request deemed unacceptable by warcprox. + ''' + def __init__(self, msg): + self.msg = msg + def __str__(self): + return "%s: %s" % (self.__class__.__name__, self.msg) + class BasePostfetchProcessor(threading.Thread): logger = logging.getLogger("warcprox.BasePostfetchProcessor") diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index b158162..c1f01bd 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -77,7 +77,7 @@ import time import collections import cProfile from urllib3.util import is_connection_dropped -from urllib3.exceptions import NewConnectionError +from urllib3.exceptions import TimeoutError, HTTPError import doublethink class ProxyingRecorder(object): @@ -385,15 +385,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._determine_host_port() assert self.url # Check if target hostname:port is in `bad_hostnames_ports` cache - # to avoid retrying to connect. cached is a tuple containing - # (status_code, error message) + # to avoid retrying to connect. Cached value is http status code. cached = None hostname_port = self._hostname_port_cache_key() with self.server.bad_hostnames_ports_lock: cached = self.server.bad_hostnames_ports.get(hostname_port) if cached: self.logger.info('Cannot connect to %s (cache)', hostname_port) - self.send_error(cached[0], cached[1]) + self.send_error(cached) return # Connect to destination self._connect_to_remote_server() @@ -401,20 +400,32 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): # limit enforcers have already sent the appropriate response self.logger.info("%r: %r", self.requestline, e) return + except warcprox.BadRequest as e: + self.send_error(400, e.msg) + return except Exception as e: # If connection fails, add hostname:port to cache to avoid slow # subsequent reconnection attempts. `NewConnectionError` can be # caused by many types of errors which are handled by urllib3. - if type(e) in (socket.timeout, NewConnectionError): + response_code = 500 + cache = False + if isinstance(e, (socket.timeout, TimeoutError,)): + response_code = 504 + cache = True + elif isinstance(e, HTTPError): + response_code = 502 + cache = True + + if cache: host_port = self._hostname_port_cache_key() with self.server.bad_hostnames_ports_lock: - self.server.bad_hostnames_ports[host_port] = (500, str(e)) + self.server.bad_hostnames_ports[host_port] = response_code self.logger.info('bad_hostnames_ports cache size: %d', len(self.server.bad_hostnames_ports)) self.logger.error( "problem processing request %r: %r", self.requestline, e, exc_info=True) - self.send_error(500, str(e)) + self.send_error(response_code) return try: @@ -429,7 +440,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.logger.error( 'error from remote server(?) %r: %r', self.requestline, e, exc_info=True) - self.send_error(502, str(e)) + self.send_error(502) return def send_error(self, code, message=None, explain=None): @@ -556,10 +567,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): # downloading. Its caused by prox_rec_res.begin(...) which calls # http_client._read_status(). In that case, the host is also bad # and we must add it to `bad_hostnames_ports` cache. - if type(e) == http_client.RemoteDisconnected: + if isinstance(e, http_client.RemoteDisconnected): host_port = self._hostname_port_cache_key() with self.server.bad_hostnames_ports_lock: - self.server.bad_hostnames_ports[host_port] = (502, str(e)) + self.server.bad_hostnames_ports[host_port] = 502 self.logger.info('bad_hostnames_ports cache size: %d', len(self.server.bad_hostnames_ports)) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2d072b9..9b8545d 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -170,7 +170,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): if warcprox_meta and 'warc-prefix' in warcprox_meta and ( '/' in warcprox_meta['warc-prefix'] or '\\' in warcprox_meta['warc-prefix']): - raise Exception( + raise warcprox.BadRequest( "request rejected by warcprox: slash and backslash are not " "permitted in warc-prefix")