From 6b8440e39d7849894401c4259fac59959066ee6d Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sat, 27 Jan 2018 15:38:44 +0000 Subject: [PATCH 1/3] Make remote server connection timeout configurable Default is 60 sec (the previously hard-coded value) and you can override it with --remote-server-timeout=XX --- warcprox/main.py | 4 ++++ warcprox/mitmproxy.py | 3 ++- warcprox/warcproxy.py | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/warcprox/main.py b/warcprox/main.py index 1f270a1..f13d2dd 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -162,6 +162,10 @@ def _build_arg_parser(prog='warcprox'): default=None, help=( 'host:port of tor socks proxy, used only to connect to ' '.onion sites')) + # Configurable connection timeout to target sites, default is 60 sec. + arg_parser.add_argument( + '--remote-server-timeout', dest='remote_server_timeout', type=float, + default=None, help=argparse.SUPPRESS) arg_parser.add_argument( '--crawl-log-dir', dest='crawl_log_dir', default=None, help=( 'if specified, write crawl log files in the specified ' diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 7792c5c..b25901b 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -205,6 +205,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): and records the bytes in transit as it proxies them. ''' logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") + _remote_server_timeout = 60 def __init__(self, request, client_address, server): threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) @@ -248,7 +249,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) # XXX what value should this timeout have? 
- self._remote_server_sock.settimeout(60) + self._remote_server_sock.settimeout(self._remote_server_timeout) self._remote_server_sock.connect((self.hostname, int(self.port))) # Wrap socket if SSL is required diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 7ae5ab4..6d8accb 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -397,6 +397,9 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy WarcProxyHandler.onion_tor_socks_proxy_port = None + if options.remote_server_timeout: + WarcProxyHandler._remote_server_timeout = options.remote_server_timeout + http_server.HTTPServer.__init__( self, server_address, WarcProxyHandler, bind_and_activate=True) From ca78293abdda19926768d26a68775319ff20fe5f Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 30 Jan 2018 07:03:58 +0000 Subject: [PATCH 2/3] Rename remote-server-timeout to socket-timeout Also apply it to both remote target and local proxy client connections. --- warcprox/main.py | 4 ++-- warcprox/mitmproxy.py | 7 +++---- warcprox/warcproxy.py | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index f13d2dd..358611d 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -162,9 +162,9 @@ def _build_arg_parser(prog='warcprox'): default=None, help=( 'host:port of tor socks proxy, used only to connect to ' '.onion sites')) - # Configurable connection timeout to target sites, default is 60 sec. + # Configurable connection socket timeout, default is 60 sec. 
arg_parser.add_argument( - '--remote-server-timeout', dest='remote_server_timeout', type=float, + '--socket-timeout', dest='socket_timeout', type=float, default=None, help=argparse.SUPPRESS) arg_parser.add_argument( '--crawl-log-dir', dest='crawl_log_dir', default=None, help=( diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index b25901b..95d5b31 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -205,13 +205,13 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): and records the bytes in transit as it proxies them. ''' logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") - _remote_server_timeout = 60 + _socket_timeout = 60 def __init__(self, request, client_address, server): threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) self.is_connect = False self._headers_buffer = [] - request.settimeout(60) # XXX what value should this have? + request.settimeout(self._socket_timeout) http_server.BaseHTTPRequestHandler.__init__(self, request, client_address, server) def _determine_host_port(self): @@ -248,8 +248,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_sock = socket.socket() self._remote_server_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) - # XXX what value should this timeout have? 
- self._remote_server_sock.settimeout(self._remote_server_timeout) + self._remote_server_sock.settimeout(self._socket_timeout) self._remote_server_sock.connect((self.hostname, int(self.port))) # Wrap socket if SSL is required diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 6d8accb..5b36300 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -397,8 +397,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy WarcProxyHandler.onion_tor_socks_proxy_port = None - if options.remote_server_timeout: - WarcProxyHandler._remote_server_timeout = options.remote_server_timeout + if options.socket_timeout: + WarcProxyHandler._socket_timeout = options.socket_timeout http_server.HTTPServer.__init__( self, server_address, WarcProxyHandler, bind_and_activate=True) From d01b356493ada2cdb6f1b7d8a1758fa1f8338d26 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 30 Jan 2018 23:02:10 +0000 Subject: [PATCH 3/3] Add socket-timeout unit test Add socket-timeout=4 in ``warcprox_`` test fixture. Create test URL `/slow-response` which returns after 6 sec. Trying to access the target URL raises a ``socket.timeout`` and returns HTTP status 502. The new ``--socket-timeout`` option does not hurt any other test using the ``warcprox_`` fixture because they are too fast anyway. --- tests/test_warcprox.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index d091542..c37174f 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -249,6 +249,14 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): elif self.path == '/empty-response': headers = b'' payload = b'' + elif self.path == '/slow-response': + time.sleep(6) + headers = (b'HTTP/1.1 200 OK\r\n' + + b'Content-Type: text/plain\r\n' + + b'\r\n') + payload = b'Test.' 
+ actual_headers = (b'Content-Type: text/plain\r\n' + + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n') else: payload = b'404 Not Found\n' headers = (b'HTTP/1.1 404 Not Found\r\n' @@ -356,7 +364,8 @@ def warcprox_(request): '--port=0', '--playback-port=0', '--onion-tor-socks-proxy=localhost:9050', - '--crawl-log-dir=crawl-logs'] + '--crawl-log-dir=crawl-logs', + '--socket-timeout=4'] if request.config.getoption('--rethinkdb-dedup-url'): argv.append('--rethinkdb-dedup-url=%s' % request.config.getoption('--rethinkdb-dedup-url')) # test these here only @@ -1701,6 +1710,16 @@ def test_long_warcprox_meta( with pytest.raises(StopIteration): next(rec_iter) +def test_socket_timeout_response( + warcprox_, http_daemon, https_daemon, archiving_proxies, + playback_proxies): + """Response will timeout because we use --socket-timeout=4 whereas the + target URL will return after 6 sec. + """ + url = 'http://localhost:%s/slow-response' % http_daemon.server_port + response = requests.get(url, proxies=archiving_proxies, verify=False) + assert response.status_code == 502 + def test_empty_response( warcprox_, http_daemon, https_daemon, archiving_proxies, playback_proxies):