From 3e165916f0d55f9956a139f21d73c36e413ff984 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 11:54:57 +0000 Subject: [PATCH 1/8] Remote server connection pool Use urllib3 connection pooling to improve remote server connection speed. Our aim is to reuse socket connections to the same target hosts when possible. Initialize a `urllib3.PoolManager` in `SingleThreadedWarcProxy` and use it in `MitmProxyHandler` to connect to remote servers. Socket read / write and ssl / socks code is exactly the same, only the connection management changes. Use arbitrary settings: `num_pools=2000` and maxsize=100 (number of connections per host) for now. Maybe we can come up with better values in the future. --- warcprox/mitmproxy.py | 92 ++++++++++++++++++++++++++----------------- warcprox/warcproxy.py | 7 ++-- 2 files changed, 59 insertions(+), 40 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 1bbd930..c1be952 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -65,6 +65,8 @@ import time import collections import cProfile + + class ProxyingRecorder(object): """ Wraps a socket._fileobject, recording the bytes as they are read, @@ -236,44 +238,54 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.hostname = urlcanon.normalize_host(host).decode('ascii') def _connect_to_remote_server(self): - # Connect to destination - if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): - self.logger.info( - "using tor socks proxy at %s:%s to connect to %s", - self.onion_tor_socks_proxy_host, - self.onion_tor_socks_proxy_port or 1080, self.hostname) - self._remote_server_sock = socks.socksocket() - self._remote_server_sock.set_proxy( - socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, - port=self.onion_tor_socks_proxy_port, rdns=True) - else: - self._remote_server_sock = socket.socket() - self._remote_server_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + ''' + Connect to destination. 
+ Note that connection_from_host has hard-coded `scheme='http'` + to avoid internal urllib3 logic when scheme is https. We handle ssl and + socks inside the current method. + self._conn_pool._get_conn() will either return an existing connection + or a new one. If its new, it needs initialization. + ''' + self._conn_pool = self.server.remote_connection_pool.connection_from_host( + host=self.hostname, port=int(self.port), scheme='http', + pool_kwargs={'maxsize': 100}) - self._remote_server_sock.settimeout(self._socket_timeout) - self._remote_server_sock.connect((self.hostname, int(self.port))) + self._remote_server_conn = self._conn_pool._get_conn() + if self._remote_server_conn.sock is None: + if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): + self.logger.info( + "using tor socks proxy at %s:%s to connect to %s", + self.onion_tor_socks_proxy_host, + self.onion_tor_socks_proxy_port or 1080, self.hostname) + self._remote_server_conn.sock = socks.socksocket() + self._remote_server_sock.set_proxy( + socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, + port=self.onion_tor_socks_proxy_port, rdns=True) + else: + self._remote_server_conn.timeout = self._socket_timeout + self._remote_server_conn.connect() - # Wrap socket if SSL is required - if self.is_connect: - try: - context = ssl.create_default_context() - context.check_hostname = False - context.verify_mode = ssl.CERT_NONE - self._remote_server_sock = context.wrap_socket( - self._remote_server_sock, server_hostname=self.hostname) - except AttributeError: + # Wrap socket if SSL is required + if self.is_connect: try: - self._remote_server_sock = ssl.wrap_socket( - self._remote_server_sock) - except ssl.SSLError: - self.logger.warn( - "failed to establish ssl connection to %s; python " - "ssl library does not support SNI, considering " - "upgrading to python >= 2.7.9 or python 3.4", - self.hostname) + context = ssl.create_default_context() + context.check_hostname = False + context.verify_mode = 
ssl.CERT_NONE + self._remote_server_conn.sock = context.wrap_socket( + self._remote_server_conn.sock, server_hostname=self.hostname) + except AttributeError: + try: + self._remote_server_conn.sock = ssl.wrap_socket( + self._remote_server_conn.sock) + except ssl.SSLError: + self.logger.warn( + "failed to establish ssl connection to %s; python " + "ssl library does not support SNI, considering " + "upgrading to python >= 2.7.9 or python 3.4", + self.hostname) raise - return self._remote_server_sock + return self._remote_server_conn.sock def _transition_to_ssl(self): certfile = self.server.ca.get_wildcard_cert(self.hostname) @@ -416,14 +428,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): req += self.rfile.read(int(self.headers['Content-Length'])) prox_rec_res = None + connection_is_fine = False try: self.logger.debug('sending to remote server req=%r', req) # Send it down the pipe! - self._remote_server_sock.sendall(req) + self._remote_server_conn.sock.sendall(req) prox_rec_res = ProxyingRecordingHTTPResponse( - self._remote_server_sock, proxy_client=self.connection, + self._remote_server_conn.sock, proxy_client=self.connection, digest_algorithm=self.server.digest_algorithm, url=self.url, method=self.command) prox_rec_res.begin(extra_response_headers=extra_response_headers) @@ -439,12 +452,17 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._max_resource_size, self.url) break + connection_is_fine = True self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) finally: - # Let's close off the remote end + # Let's close off the remote end. If remote connection is fine, + # put it back in the pool to reuse it later. 
if prox_rec_res: prox_rec_res.close() - self._remote_server_sock.close() + if connection_is_fine: + self._conn_pool._put_conn(self._remote_conn) + else: + self._remote_server_conn.sock.close() return req, prox_rec_res diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5b42655..c66c33d 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -43,6 +43,7 @@ import warcprox import datetime import urlcanon import os +from urllib3 import PoolManager class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -173,7 +174,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): limits and block rules in the Warcprox-Meta request header, if any. Raises `warcprox.RequestBlockedByRule` if a rule has been enforced. Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which - initializes `self._remote_server_sock`. + initializes `self._remote_server_conn`. ''' if 'Warcprox-Meta' in self.headers: warcprox_meta = json.loads(self.headers['Warcprox-Meta']) @@ -192,7 +193,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox_meta = json.loads(raw_warcprox_meta) del self.headers['Warcprox-Meta'] - remote_ip = self._remote_server_sock.getpeername()[0] + remote_ip = self._remote_server_conn.sock.getpeername()[0] timestamp = datetime.datetime.utcnow() extra_response_headers = {} if warcprox_meta and 'accept' in warcprox_meta and \ @@ -387,7 +388,7 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): self.status_callback = status_callback self.stats_db = stats_db self.options = options - + self.remote_connection_pool = PoolManager(num_pools=2000) server_address = ( options.address or 'localhost', options.port if options.port is not None else 8000) From 2df4fe305643026c2be5596059122db30e53bd78 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 11:58:07 +0000 Subject: [PATCH 2/8] Remove whitespace --- warcprox/mitmproxy.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c1be952..d92e416 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -65,8 +65,6 @@ import time import collections import cProfile - - class ProxyingRecorder(object): """ Wraps a socket._fileobject, recording the bytes as they are read, From 9a797fe612dda1dcdb048ec57e80019a08640566 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 12:34:52 +0000 Subject: [PATCH 3/8] Fix typo --- warcprox/mitmproxy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index d92e416..6cf12dc 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -282,7 +282,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): "upgrading to python >= 2.7.9 or python 3.4", self.hostname) raise - return self._remote_server_conn.sock def _transition_to_ssl(self): @@ -458,7 +457,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): if prox_rec_res: prox_rec_res.close() if connection_is_fine: - self._conn_pool._put_conn(self._remote_conn) + self._conn_pool._put_conn(self._remote_server_conn) else: self._remote_server_conn.sock.close() From 3bb93556628d71d9d44a7529a0bdee2ff84a7dcb Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 13:26:26 +0000 Subject: [PATCH 4/8] Extra connection evaluation before putting it back to the pool Use `urllib3.util.is_connection_dropped` to check that the connection is fine before putting it back to the pool to be reused later. 
--- warcprox/mitmproxy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 6cf12dc..c75c1e2 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -64,6 +64,7 @@ import urlcanon import time import collections import cProfile +from urllib3.util import is_connection_dropped class ProxyingRecorder(object): """ @@ -456,7 +457,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): # put it back in the pool to reuse it later. if prox_rec_res: prox_rec_res.close() - if connection_is_fine: + if connection_is_fine and not is_connection_dropped(self._remote_server_conn): self._conn_pool._put_conn(self._remote_server_conn) else: self._remote_server_conn.sock.close() From 435b0ec24be892ee06b74df5cd1dde4bd43a493e Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 6 Mar 2018 09:58:56 +0000 Subject: [PATCH 5/8] Address unit test failure in Python 3.4 --- warcprox/mitmproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c75c1e2..9d076e3 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -250,7 +250,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): pool_kwargs={'maxsize': 100}) self._remote_server_conn = self._conn_pool._get_conn() - if self._remote_server_conn.sock is None: + if is_connection_dropped(self._remote_server_conn): if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): self.logger.info( "using tor socks proxy at %s:%s to connect to %s", From 1d5692dd13f0a6ecbcc35adae7d4bab3f2798330 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 16 Mar 2018 13:10:29 +0000 Subject: [PATCH 6/8] Reduce the PoolManager num_pools size and fix bugs Define PoolManager num_pools size as `max(max_threads, 500)` and reduce each pool size from 100 to 30. The aim is to limit the total number of open connections. Fix remote SOCKS connection typo. 
Now that we reuse remote connections, it's better NOT to remove the `keep-alive` request header. We need to send it to the remote host to make it keep the connection open if possible. --- warcprox/mitmproxy.py | 6 +++--- warcprox/warcproxy.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 9d076e3..1482210 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -176,7 +176,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): for k,v in self.msg.items(): if k.lower() not in ( - 'connection', 'proxy-connection', 'keep-alive', + 'connection', 'proxy-connection', 'proxy-authenticate', 'proxy-authorization', 'upgrade', 'strict-transport-security'): status_and_headers += '{}: {}\r\n'.format(k, v) @@ -247,7 +247,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): ''' self._conn_pool = self.server.remote_connection_pool.connection_from_host( host=self.hostname, port=int(self.port), scheme='http', - pool_kwargs={'maxsize': 100}) + pool_kwargs={'maxsize': 30}) self._remote_server_conn = self._conn_pool._get_conn() if is_connection_dropped(self._remote_server_conn): @@ -257,7 +257,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.onion_tor_socks_proxy_host, self.onion_tor_socks_proxy_port or 1080, self.hostname) self._remote_server_conn.sock = socks.socksocket() - self._remote_server_sock.set_proxy( + self._remote_server_conn.sock.set_proxy( socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, port=self.onion_tor_socks_proxy_port, rdns=True) else: diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index c66c33d..88a9c34 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -388,7 +388,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): self.status_callback = status_callback self.stats_db = stats_db self.options = options - self.remote_connection_pool = PoolManager(num_pools=2000) + self.remote_connection_pool = 
PoolManager( + num_pools=max(options.max_threads, 500) if options.max_threads else 500) server_address = ( options.address or 'localhost', options.port if options.port is not None else 8000) From 0002d29f0d734c5d1f230965f90f43db1d3eccb7 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 16 Mar 2018 21:06:34 +0000 Subject: [PATCH 7/8] Improve Connection Pool Set connection pool maxsize to 6 (borrowing from browser behavior). Set num_pools to `max_threads / 6` but set a minimum of 200 for the cases that we use a very low number of `max_threads`. Remove `connection_is_fine` variable from connection code. Fix http headers bug introduced in the previous commit. --- warcprox/mitmproxy.py | 16 +++++++--------- warcprox/warcproxy.py | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 1482210..e87975c 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -176,7 +176,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): for k,v in self.msg.items(): if k.lower() not in ( - 'connection', 'proxy-connection', + 'connection', 'proxy-connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'upgrade', 'strict-transport-security'): status_and_headers += '{}: {}\r\n'.format(k, v) @@ -247,7 +247,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): ''' self._conn_pool = self.server.remote_connection_pool.connection_from_host( host=self.hostname, port=int(self.port), scheme='http', - pool_kwargs={'maxsize': 30}) + pool_kwargs={'maxsize': 6}) self._remote_server_conn = self._conn_pool._get_conn() if is_connection_dropped(self._remote_server_conn): @@ -426,7 +426,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): req += self.rfile.read(int(self.headers['Content-Length'])) prox_rec_res = None - connection_is_fine = False try: self.logger.debug('sending to remote server req=%r', req) @@ -450,17 +449,16 @@ class 
MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._max_resource_size, self.url) break - connection_is_fine = True self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) - finally: # Let's close off the remote end. If remote connection is fine, # put it back in the pool to reuse it later. + if not is_connection_dropped(self._remote_server_conn): + self._conn_pool._put_conn(self._remote_server_conn) + except: + self._remote_server_conn.sock.close() + finally: if prox_rec_res: prox_rec_res.close() - if connection_is_fine and not is_connection_dropped(self._remote_server_conn): - self._conn_pool._put_conn(self._remote_server_conn) - else: - self._remote_server_conn.sock.close() return req, prox_rec_res diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 88a9c34..97da984 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -389,7 +389,7 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): self.stats_db = stats_db self.options = options self.remote_connection_pool = PoolManager( - num_pools=max(options.max_threads, 500) if options.max_threads else 500) + num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200) server_address = ( options.address or 'localhost', options.port if options.port is not None else 8000) From 0404ad239ff8d67a8a3263719dd7e16f80d84c94 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 20 Mar 2018 07:35:49 +0000 Subject: [PATCH 8/8] Fix SOCKS connection error --- warcprox/mitmproxy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index e87975c..5a47398 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -260,6 +260,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_conn.sock.set_proxy( socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, port=self.onion_tor_socks_proxy_port, rdns=True) + self._remote_server_conn.timeout = self._socket_timeout + 
self._remote_server_conn.sock.connect((self.hostname, int(self.port))) else: self._remote_server_conn.timeout = self._socket_timeout self._remote_server_conn.connect()