Merge pull request #61 from vbanos/remote-server-timeout

Make remote server connection timeout configurable
This commit is contained in:
Noah Levitt 2018-01-31 11:36:11 -08:00 committed by GitHub
commit 0f16585a24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 30 additions and 4 deletions

View File

@ -249,6 +249,14 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler):
elif self.path == '/empty-response':
headers = b''
payload = b''
elif self.path == '/slow-response':
time.sleep(6)
headers = (b'HTTP/1.1 200 OK\r\n'
+ b'Content-Type: text/plain\r\n'
+ b'\r\n')
payload = b'Test.'
actual_headers = (b'Content-Type: text/plain\r\n'
+ b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n')
else:
payload = b'404 Not Found\n'
headers = (b'HTTP/1.1 404 Not Found\r\n'
@ -356,7 +364,8 @@ def warcprox_(request):
'--port=0',
'--playback-port=0',
'--onion-tor-socks-proxy=localhost:9050',
'--crawl-log-dir=crawl-logs']
'--crawl-log-dir=crawl-logs',
'--socket-timeout=4']
if request.config.getoption('--rethinkdb-dedup-url'):
argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url'))
# test these here only
@ -1711,6 +1720,16 @@ def test_long_warcprox_meta(
with pytest.raises(StopIteration):
next(rec_iter)
def test_socket_timeout_response(
        warcprox_, http_daemon, https_daemon, archiving_proxies,
        playback_proxies):
    """Check that a too-slow origin server surfaces as a 502 via the proxy.

    The proxy under test is launched with --socket-timeout=4, while the
    /slow-response endpoint of the test http server sleeps for 6 seconds
    before answering, so the proxy is expected to give up on the remote
    server first.
    """
    port = http_daemon.server_port
    slow_url = 'http://localhost:%s/slow-response' % port
    proxied = requests.get(
            slow_url, proxies=archiving_proxies, verify=False)
    assert proxied.status_code == 502
def test_empty_response(
warcprox_, http_daemon, https_daemon, archiving_proxies,
playback_proxies):

View File

@ -162,6 +162,10 @@ def _build_arg_parser(prog='warcprox'):
default=None, help=(
'host:port of tor socks proxy, used only to connect to '
'.onion sites'))
# Configurable connection socket timeout in seconds; when not given, the proxy handler's built-in default of 60 sec applies.
arg_parser.add_argument(
'--socket-timeout', dest='socket_timeout', type=float,
default=None, help=argparse.SUPPRESS)
arg_parser.add_argument(
'--crawl-log-dir', dest='crawl_log_dir', default=None, help=(
'if specified, write crawl log files in the specified '

View File

@ -205,12 +205,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
and records the bytes in transit as it proxies them.
'''
logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler")
_socket_timeout = 60
def __init__(self, request, client_address, server):
threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1])
self.is_connect = False
self._headers_buffer = []
request.settimeout(60) # XXX what value should this have?
request.settimeout(self._socket_timeout)
http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server)
def _determine_host_port(self):
@ -247,8 +248,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler):
self._remote_server_sock = socket.socket()
self._remote_server_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
# XXX what value should this timeout have?
self._remote_server_sock.settimeout(60)
self._remote_server_sock.settimeout(self._socket_timeout)
self._remote_server_sock.connect((self.hostname, int(self.port)))
# Wrap socket if SSL is required

View File

@ -397,6 +397,9 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object):
WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy
WarcProxyHandler.onion_tor_socks_proxy_port = None
if options.socket_timeout:
WarcProxyHandler._socket_timeout = options.socket_timeout
http_server.HTTPServer.__init__(
self, server_address, WarcProxyHandler, bind_and_activate=True)