mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
some tweaks to error responses
use 502, 504 when appropriate, and don't send `str(e)` as in the http status line, because that is often an ugly jumble
This commit is contained in:
parent
2772b80fab
commit
f51f2ec225
2
setup.py
2
setup.py
@ -43,7 +43,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.4.11',
|
||||
version='2.4.12',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -1724,13 +1724,13 @@ def test_slash_in_warc_prefix(warcprox_, http_daemon, archiving_proxies):
|
||||
url = 'http://localhost:%s/b/b' % http_daemon.server_port
|
||||
headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"../../../../etc/a"})}
|
||||
response = requests.get(url, proxies=archiving_proxies, headers=headers)
|
||||
assert response.status_code == 500
|
||||
assert response.status_code == 400
|
||||
assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
|
||||
|
||||
url = 'http://localhost:%s/b/c' % http_daemon.server_port
|
||||
headers = {"Warcprox-Meta": json.dumps({"warc-prefix":"..\\..\\..\\derp\\monkey"})}
|
||||
response = requests.get(url, proxies=archiving_proxies, headers=headers)
|
||||
assert response.status_code == 500
|
||||
assert response.status_code == 400
|
||||
assert response.reason == 'request rejected by warcprox: slash and backslash are not permitted in warc-prefix'
|
||||
|
||||
def test_crawl_log(warcprox_, http_daemon, archiving_proxies):
|
||||
|
@ -78,6 +78,15 @@ class RequestBlockedByRule(Exception):
|
||||
def __str__(self):
|
||||
return "%s: %s" % (self.__class__.__name__, self.msg)
|
||||
|
||||
class BadRequest(Exception):
|
||||
'''
|
||||
Raised in case of a request deemed unacceptable by warcprox.
|
||||
'''
|
||||
def __init__(self, msg):
|
||||
self.msg = msg
|
||||
def __str__(self):
|
||||
return "%s: %s" % (self.__class__.__name__, self.msg)
|
||||
|
||||
class BasePostfetchProcessor(threading.Thread):
|
||||
logger = logging.getLogger("warcprox.BasePostfetchProcessor")
|
||||
|
||||
|
@ -77,7 +77,7 @@ import time
|
||||
import collections
|
||||
import cProfile
|
||||
from urllib3.util import is_connection_dropped
|
||||
from urllib3.exceptions import NewConnectionError
|
||||
from urllib3.exceptions import TimeoutError, HTTPError
|
||||
import doublethink
|
||||
|
||||
class ProxyingRecorder(object):
|
||||
@ -385,15 +385,14 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self._determine_host_port()
|
||||
assert self.url
|
||||
# Check if target hostname:port is in `bad_hostnames_ports` cache
|
||||
# to avoid retrying to connect. cached is a tuple containing
|
||||
# (status_code, error message)
|
||||
# to avoid retrying to connect. Cached value is http status code.
|
||||
cached = None
|
||||
hostname_port = self._hostname_port_cache_key()
|
||||
with self.server.bad_hostnames_ports_lock:
|
||||
cached = self.server.bad_hostnames_ports.get(hostname_port)
|
||||
if cached:
|
||||
self.logger.info('Cannot connect to %s (cache)', hostname_port)
|
||||
self.send_error(cached[0], cached[1])
|
||||
self.send_error(cached)
|
||||
return
|
||||
# Connect to destination
|
||||
self._connect_to_remote_server()
|
||||
@ -401,20 +400,32 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
# limit enforcers have already sent the appropriate response
|
||||
self.logger.info("%r: %r", self.requestline, e)
|
||||
return
|
||||
except warcprox.BadRequest as e:
|
||||
self.send_error(400, e.msg)
|
||||
return
|
||||
except Exception as e:
|
||||
# If connection fails, add hostname:port to cache to avoid slow
|
||||
# subsequent reconnection attempts. `NewConnectionError` can be
|
||||
# caused by many types of errors which are handled by urllib3.
|
||||
if type(e) in (socket.timeout, NewConnectionError):
|
||||
response_code = 500
|
||||
cache = False
|
||||
if isinstance(e, (socket.timeout, TimeoutError,)):
|
||||
response_code = 504
|
||||
cache = True
|
||||
elif isinstance(e, HTTPError):
|
||||
response_code = 502
|
||||
cache = True
|
||||
|
||||
if cache:
|
||||
host_port = self._hostname_port_cache_key()
|
||||
with self.server.bad_hostnames_ports_lock:
|
||||
self.server.bad_hostnames_ports[host_port] = (500, str(e))
|
||||
self.server.bad_hostnames_ports[host_port] = response_code
|
||||
self.logger.info('bad_hostnames_ports cache size: %d',
|
||||
len(self.server.bad_hostnames_ports))
|
||||
self.logger.error(
|
||||
"problem processing request %r: %r",
|
||||
self.requestline, e, exc_info=True)
|
||||
self.send_error(500, str(e))
|
||||
self.send_error(response_code)
|
||||
return
|
||||
|
||||
try:
|
||||
@ -429,7 +440,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
self.logger.error(
|
||||
'error from remote server(?) %r: %r',
|
||||
self.requestline, e, exc_info=True)
|
||||
self.send_error(502, str(e))
|
||||
self.send_error(502)
|
||||
return
|
||||
|
||||
def send_error(self, code, message=None, explain=None):
|
||||
@ -556,10 +567,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
|
||||
# downloading. Its caused by prox_rec_res.begin(...) which calls
|
||||
# http_client._read_status(). In that case, the host is also bad
|
||||
# and we must add it to `bad_hostnames_ports` cache.
|
||||
if type(e) == http_client.RemoteDisconnected:
|
||||
if isinstance(e, http_client.RemoteDisconnected):
|
||||
host_port = self._hostname_port_cache_key()
|
||||
with self.server.bad_hostnames_ports_lock:
|
||||
self.server.bad_hostnames_ports[host_port] = (502, str(e))
|
||||
self.server.bad_hostnames_ports[host_port] = 502
|
||||
self.logger.info('bad_hostnames_ports cache size: %d',
|
||||
len(self.server.bad_hostnames_ports))
|
||||
|
||||
|
@ -170,7 +170,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler):
|
||||
if warcprox_meta and 'warc-prefix' in warcprox_meta and (
|
||||
'/' in warcprox_meta['warc-prefix']
|
||||
or '\\' in warcprox_meta['warc-prefix']):
|
||||
raise Exception(
|
||||
raise warcprox.BadRequest(
|
||||
"request rejected by warcprox: slash and backslash are not "
|
||||
"permitted in warc-prefix")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user