some tweaks to error responses

use 502, 504 when appropriate, and don't send `str(e)` as in the http
status line, because that is often an ugly jumble
This commit is contained in:
Noah Levitt 2019-05-14 15:51:11 -07:00
parent 2772b80fab
commit f51f2ec225
5 changed files with 34 additions and 14 deletions

View File

@ -43,7 +43,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.11',
version='2.4.12',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -1724,13 +1724,13 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
url = 'http://localhost:%s/b/b' % http_daemon.server_port
headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
response = requests.get(url, proxies=archiving_proxies, headers=headers)
assert response.status_code == 500
assert response.status_code == 400
assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
url = 'http://localhost:%s/b/c' % http_daemon.server_port
headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
response = requests.get(url, proxies=archiving_proxies, headers=headers)
assert response.status_code == 500
assert response.status_code == 400
assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
def test_crawl_log(warcprox_, http_daemon, archiving_proxies):

View File

@ -78,6 +78,15 @@ class RequestBlockedByRule(Exception):
def __str__(self):
return "%s: %s" % (self.__class__.__name__, self.msg)
class BadRequest(Exception):
'''
Raised in case of a request deemed unacceptable by warcprox.
'''
def __init__(self, msg):
self.msg = msg
def __str__(self):
return "%s: %s" % (self.__class__.__name__, self.msg)
class BasePostfetchProcessor(threading.Thread):
logger = logging.getLogger("warcprox.BasePostfetchProcessor")

View File

@ -77,7 +77,7 @@ import time
import collections
import cProfile
from urllib3.util import is_connection_dropped
from urllib3.exceptions import NewConnectionError
from urllib3.exceptions import TimeoutError, HTTPError
import doublethink
class ProxyingRecorder(object):
@ -385,15 +385,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._determine_host_port()
assert self.url
# Check if target hostname:port is in `bad_hostnames_ports` cache
# to avoid retrying to connect. cached is a tuple containing
# (status_code, error message)
# to avoid retrying to connect. Cached value is http status code.
cached = None
hostname_port = self._hostname_port_cache_key()
with self.server.bad_hostnames_ports_lock:
cached = self.server.bad_hostnames_ports.get(hostname_port)
if cached:
self.logger.info('Cannot connect to %s (cache)', hostname_port)
self.send_error(cached[0], cached[1])
self.send_error(cached)
return
# Connect to destination
self._connect_to_remote_server()
@ -401,20 +400,32 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
# limit enforcers have already sent the appropriate response
self.logger.info("%r: %r", self.requestline, e)
return
except warcprox.BadRequest as e:
self.send_error(400, e.msg)
return
except Exception as e:
# If connection fails, add hostname:port to cache to avoid slow
# subsequent reconnection attempts. `NewConnectionError` can be
# caused by many types of errors which are handled by urllib3.
if type(e) in (socket.timeout, NewConnectionError):
response_code = 500
cache = False
if isinstance(e, (socket.timeout, TimeoutError,)):
response_code = 504
cache = True
elif isinstance(e, HTTPError):
response_code = 502
cache = True
if cache:
host_port = self._hostname_port_cache_key()
with self.server.bad_hostnames_ports_lock:
self.server.bad_hostnames_ports[host_port] = (500, str(e))
self.server.bad_hostnames_ports[host_port] = response_code
self.logger.info('bad_hostnames_ports cache size: %d',
len(self.server.bad_hostnames_ports))
self.logger.error(
"problem processing request %r: %r",
self.requestline, e, exc_info=True)
self.send_error(500, str(e))
self.send_error(response_code)
return
try:
@ -429,7 +440,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self.logger.error(
'error from remote server(?) %r: %r',
self.requestline, e, exc_info=True)
self.send_error(502, str(e))
self.send_error(502)
return
def send_error(self, code, message=None, explain=None):
@ -556,10 +567,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
# downloading. Its caused by prox_rec_res.begin(...) which calls
# http_client._read_status(). In that case, the host is also bad
# and we must add it to `bad_hostnames_ports` cache.
if type(e) == http_client.RemoteDisconnected:
if isinstance(e, http_client.RemoteDisconnected):
host_port = self._hostname_port_cache_key()
with self.server.bad_hostnames_ports_lock:
self.server.bad_hostnames_ports[host_port] = (502, str(e))
self.server.bad_hostnames_ports[host_port] = 502
self.logger.info('bad_hostnames_ports cache size: %d',
len(self.server.bad_hostnames_ports))

View File

@ -170,7 +170,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
if warcprox_meta and 'warc-prefix' in warcprox_meta and (
'/' in warcprox_meta['warc-prefix']
or '\\' in warcprox_meta['warc-prefix']):
raise Exception(
raise warcprox.BadRequest(
"request rejected by warcprox: slash and backslash are not "
"permitted in warc-prefix")