diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 886902f..0e0b298 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -249,6 +249,14 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): elif self.path == '/empty-response': headers = b'' payload = b'' + elif self.path == '/slow-response': + time.sleep(6) + headers = (b'HTTP/1.1 200 OK\r\n' + + b'Content-Type: text/plain\r\n' + + b'\r\n') + payload = b'Test.' + actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -356,7 +364,8 @@ def warcprox_(request): '--port=0', '--playback-port=0', '--onion-tor-socks-proxy=localhost:9050', - '--crawl-log-dir=crawl-logs'] + '--crawl-log-dir=crawl-logs', + '--socket-timeout=4'] if request.config.getoption('--rethinkdb-dedup-url'): argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url')) # test these here only @@ -1711,6 +1720,16 @@ def test_long_warcprox_meta( with pytest.raises(StopIteration): next(rec_iter) +def test_socket_timeout_response( + warcprox_, http_daemon, https_daemon, archiving_proxies, + playback_proxies): + """Response will timeout because we use --socket-timeout=4 whereas the + target URL will return after 6 sec. + """ + url = 'http://localhost:%s/slow-response' % http_daemon.server_port + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 502 + def test_empty_response( warcprox_, http_daemon, https_daemon, archiving_proxies, playback_proxies): diff --git a/warcprox/main.py b/warcprox/main.py index f663d0d..8d16d3b 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -162,6 +162,10 @@ def _build_arg_parser(prog='warcprox'): default=None, help=( 'host:port of tor socks proxy, used only to connect to ' '.onion sites')) + # Configurable connection socket timeout, default is 60 sec. + arg_parser.add_argument( + '--socket-timeout', dest='socket_timeout', type=float, + default=None, help=argparse.SUPPRESS) arg_parser.add_argument( '--crawl-log-dir', dest='crawl_log_dir', default=None, help=( 'if specified, write crawl log files in the specified ' diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 7792c5c..95d5b31 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -205,12 +205,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): and records the bytes in transit as it proxies them. ''' logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") + _socket_timeout = 60 def __init__(self, request, client_address, server): threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) self.is_connect = False self._headers_buffer = [] - request.settimeout(60) # XXX what value should this have? + request.settimeout(self._socket_timeout) http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server) def _determine_host_port(self): @@ -247,8 +248,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_sock = socket.socket() self._remote_server_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) - # XXX what value should this timeout have? - self._remote_server_sock.settimeout(60) + self._remote_server_sock.settimeout(self._socket_timeout) self._remote_server_sock.connect((self.hostname, int(self.port))) # Wrap socket if SSL is required diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 7ae5ab4..5b36300 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -397,6 +397,9 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy WarcProxyHandler.onion_tor_socks_proxy_port = None + if options.socket_timeout: + WarcProxyHandler._socket_timeout = options.socket_timeout + http_server.HTTPServer.__init__( self, server_address, WarcProxyHandler, bind_and_activate=True)