From 3e165916f0d55f9956a139f21d73c36e413ff984 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 11:54:57 +0000 Subject: [PATCH 01/14] Remote server connection pool Use urllib3 connection pooling to improve remote server connection speed. Our aim is to reuse socket connections to the same target hosts when possible. Initialize a `urllib3.PoolManager` in `SingleThreadedWarcProxy` and use it in `MitmProxyHandler` to connect to remote servers. Socket read / write and ssl / socks code is exactly the same, only the connection management changes. Use arbitrary settings: pool_size=2000 and maxsize=100 (number of connections per host) for now. Maybe we can come up with better values in the future. --- warcprox/mitmproxy.py | 92 ++++++++++++++++++++++++++----------------- warcprox/warcproxy.py | 7 ++-- 2 files changed, 59 insertions(+), 40 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 1bbd930..c1be952 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -65,6 +65,8 @@ import time import collections import cProfile + + class ProxyingRecorder(object): """ Wraps a socket._fileobject, recording the bytes as they are read, @@ -236,44 +238,54 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.hostname = urlcanon.normalize_host(host).decode('ascii') def _connect_to_remote_server(self): - # Connect to destination - if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): - self.logger.info( - "using tor socks proxy at %s:%s to connect to %s", - self.onion_tor_socks_proxy_host, - self.onion_tor_socks_proxy_port or 1080, self.hostname) - self._remote_server_sock = socks.socksocket() - self._remote_server_sock.set_proxy( - socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, - port=self.onion_tor_socks_proxy_port, rdns=True) - else: - self._remote_server_sock = socket.socket() - self._remote_server_sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + ''' + Connect to destination. 
+ Note that connection_from_host has hard-coded `scheme='http'` + to avoid internal urllib3 logic when scheme is https. We handle ssl and + socks inside the current method. + self._conn_pool._get_conn() will either return an existing connection + or a new one. If its new, it needs initialization. + ''' + self._conn_pool = self.server.remote_connection_pool.connection_from_host( + host=self.hostname, port=int(self.port), scheme='http', + pool_kwargs={'maxsize': 100}) - self._remote_server_sock.settimeout(self._socket_timeout) - self._remote_server_sock.connect((self.hostname, int(self.port))) + self._remote_server_conn = self._conn_pool._get_conn() + if self._remote_server_conn.sock is None: + if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): + self.logger.info( + "using tor socks proxy at %s:%s to connect to %s", + self.onion_tor_socks_proxy_host, + self.onion_tor_socks_proxy_port or 1080, self.hostname) + self._remote_server_conn.sock = socks.socksocket() + self._remote_server_sock.set_proxy( + socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, + port=self.onion_tor_socks_proxy_port, rdns=True) + else: + self._remote_server_conn.timeout = self._socket_timeout + self._remote_server_conn.connect() - # Wrap socket if SSL is required - if self.is_connect: - try: - context = ssl.create_default_context() - context.check_hostname = False - context.verify_mode = ssl.CERT_NONE - self._remote_server_sock = context.wrap_socket( - self._remote_server_sock, server_hostname=self.hostname) - except AttributeError: + # Wrap socket if SSL is required + if self.is_connect: try: - self._remote_server_sock = ssl.wrap_socket( - self._remote_server_sock) - except ssl.SSLError: - self.logger.warn( - "failed to establish ssl connection to %s; python " - "ssl library does not support SNI, considering " - "upgrading to python >= 2.7.9 or python 3.4", - self.hostname) + context = ssl.create_default_context() + context.check_hostname = False + context.verify_mode = 
ssl.CERT_NONE + self._remote_server_conn.sock = context.wrap_socket( + self._remote_server_conn.sock, server_hostname=self.hostname) + except AttributeError: + try: + self._remote_server_conn.sock = ssl.wrap_socket( + self._remote_server_conn.sock) + except ssl.SSLError: + self.logger.warn( + "failed to establish ssl connection to %s; python " + "ssl library does not support SNI, considering " + "upgrading to python >= 2.7.9 or python 3.4", + self.hostname) raise - return self._remote_server_sock + return self._remote_server_conn.sock def _transition_to_ssl(self): certfile = self.server.ca.get_wildcard_cert(self.hostname) @@ -416,14 +428,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): req += self.rfile.read(int(self.headers['Content-Length'])) prox_rec_res = None + connection_is_fine = False try: self.logger.debug('sending to remote server req=%r', req) # Send it down the pipe! - self._remote_server_sock.sendall(req) + self._remote_server_conn.sock.sendall(req) prox_rec_res = ProxyingRecordingHTTPResponse( - self._remote_server_sock, proxy_client=self.connection, + self._remote_server_conn.sock, proxy_client=self.connection, digest_algorithm=self.server.digest_algorithm, url=self.url, method=self.command) prox_rec_res.begin(extra_response_headers=extra_response_headers) @@ -439,12 +452,17 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._max_resource_size, self.url) break + connection_is_fine = True self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) finally: - # Let's close off the remote end + # Let's close off the remote end. If remote connection is fine, + # put it back in the pool to reuse it later. 
if prox_rec_res: prox_rec_res.close() - self._remote_server_sock.close() + if connection_is_fine: + self._conn_pool._put_conn(self._remote_conn) + else: + self._remote_server_conn.sock.close() return req, prox_rec_res diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5b42655..c66c33d 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -43,6 +43,7 @@ import warcprox import datetime import urlcanon import os +from urllib3 import PoolManager class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -173,7 +174,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): limits and block rules in the Warcprox-Meta request header, if any. Raises `warcprox.RequestBlockedByRule` if a rule has been enforced. Otherwise calls `MitmProxyHandler._connect_to_remote_server`, which - initializes `self._remote_server_sock`. + initializes `self._remote_server_conn`. ''' if 'Warcprox-Meta' in self.headers: warcprox_meta = json.loads(self.headers['Warcprox-Meta']) @@ -192,7 +193,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox_meta = json.loads(raw_warcprox_meta) del self.headers['Warcprox-Meta'] - remote_ip = self._remote_server_sock.getpeername()[0] + remote_ip = self._remote_server_conn.sock.getpeername()[0] timestamp = datetime.datetime.utcnow() extra_response_headers = {} if warcprox_meta and 'accept' in warcprox_meta and \ @@ -387,7 +388,7 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): self.status_callback = status_callback self.stats_db = stats_db self.options = options - + self.remote_connection_pool = PoolManager(num_pools=2000) server_address = ( options.address or 'localhost', options.port if options.port is not None else 8000) From 2df4fe305643026c2be5596059122db30e53bd78 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 11:58:07 +0000 Subject: [PATCH 02/14] Remove whitespace --- warcprox/mitmproxy.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c1be952..d92e416 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -65,8 +65,6 @@ import time import collections import cProfile - - class ProxyingRecorder(object): """ Wraps a socket._fileobject, recording the bytes as they are read, From 9a797fe612dda1dcdb048ec57e80019a08640566 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 12:34:52 +0000 Subject: [PATCH 03/14] Fix typo --- warcprox/mitmproxy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index d92e416..6cf12dc 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -282,7 +282,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): "upgrading to python >= 2.7.9 or python 3.4", self.hostname) raise - return self._remote_server_conn.sock def _transition_to_ssl(self): @@ -458,7 +457,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): if prox_rec_res: prox_rec_res.close() if connection_is_fine: - self._conn_pool._put_conn(self._remote_conn) + self._conn_pool._put_conn(self._remote_server_conn) else: self._remote_server_conn.sock.close() From 3bb93556628d71d9d44a7529a0bdee2ff84a7dcb Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 2 Mar 2018 13:26:26 +0000 Subject: [PATCH 04/14] Extra connection evaluation before putting it back to the pool Use `urllib3.util.is_connection_dropped` to check that the connection is fine before putting it back to the pool to be reused later. 
--- warcprox/mitmproxy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 6cf12dc..c75c1e2 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -64,6 +64,7 @@ import urlcanon import time import collections import cProfile +from urllib3.util import is_connection_dropped class ProxyingRecorder(object): """ @@ -456,7 +457,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): # put it back in the pool to reuse it later. if prox_rec_res: prox_rec_res.close() - if connection_is_fine: + if connection_is_fine and not is_connection_dropped(self._remote_server_conn): self._conn_pool._put_conn(self._remote_server_conn) else: self._remote_server_conn.sock.close() From 240b6da836eee198942932dd75d1fb99f813c8a9 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 5 Mar 2018 20:22:22 -0800 Subject: [PATCH 05/14] a minimal example a minimal example of a warcprox plugin --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 397b930..113099b 100644 --- a/README.rst +++ b/README.rst @@ -46,7 +46,7 @@ have a method `notify(self, recorded_url, records)` or should subclass `warcprox.BasePostfetchProcessor`. More than one plugin can be configured by specifying `--plugin` multiples times. -XXX example? 
+`A minimal example `__ Usage ~~~~~ From 435b0ec24be892ee06b74df5cd1dde4bd43a493e Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 6 Mar 2018 09:58:56 +0000 Subject: [PATCH 06/14] Address unit test failure in Python 3.4 --- warcprox/mitmproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index c75c1e2..9d076e3 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -250,7 +250,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): pool_kwargs={'maxsize': 100}) self._remote_server_conn = self._conn_pool._get_conn() - if self._remote_server_conn.sock is None: + if is_connection_dropped(self._remote_server_conn): if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'): self.logger.info( "using tor socks proxy at %s:%s to connect to %s", From eda0656737e2e3564165d3e615dc2f49a408a085 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 7 Mar 2018 08:00:18 +0000 Subject: [PATCH 07/14] Configurable tmp file max memory size We use `tempfile.SpooledTemporaryFile(max_size=512*1024)` to keep recorded data before writing them to WARC. Data are kept in memory when they are smaller than `max_size`, else they are written to disk. We add option `--tmp-file-max-memory-size` to make this configurable. A higher value means less /tmp disk I/O and higher overall performance but also increased memory usage. --- warcprox/main.py | 4 ++++ warcprox/mitmproxy.py | 14 +++++++++----- warcprox/warcproxy.py | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/warcprox/main.py b/warcprox/main.py index 64d01c7..8ff466b 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -166,6 +166,10 @@ def _build_arg_parser(prog='warcprox'): arg_parser.add_argument( '--socket-timeout', dest='socket_timeout', type=float, default=None, help=argparse.SUPPRESS) + # Increasing this value increases memory usage but reduces /tmp disk I/O. 
+ arg_parser.add_argument( + '--tmp-file-max-memory-size', dest='tmp_file_max_memory_size', + type=int, default=512*1024, help=argparse.SUPPRESS) arg_parser.add_argument( '--max-resource-size', dest='max_resource_size', type=int, default=None, help='maximum resource size limit in bytes') diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 1bbd930..14f26f9 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -73,10 +73,11 @@ class ProxyingRecorder(object): logger = logging.getLogger("warcprox.mitmproxy.ProxyingRecorder") - def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None): + def __init__(self, fp, proxy_client, digest_algorithm='sha1', url=None, + tmp_file_max_memory_size=524288): self.fp = fp # "The file has no name, and will cease to exist when it is closed." - self.tempfile = tempfile.SpooledTemporaryFile(max_size=512*1024) + self.tempfile = tempfile.SpooledTemporaryFile(max_size=tmp_file_max_memory_size) self.digest_algorithm = digest_algorithm self.block_digest = hashlib.new(digest_algorithm) self.payload_offset = None @@ -146,7 +147,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): ''' def __init__( self, sock, debuglevel=0, method=None, proxy_client=None, - digest_algorithm='sha1', url=None): + digest_algorithm='sha1', url=None, tmp_file_max_memory_size=None): http_client.HTTPResponse.__init__( self, sock, debuglevel=debuglevel, method=method) self.proxy_client = proxy_client @@ -156,7 +157,8 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): # Keep around extra reference to self.fp because HTTPResponse sets # self.fp=None after it finishes reading, but we still need it self.recorder = ProxyingRecorder( - self.fp, proxy_client, digest_algorithm, url=url) + self.fp, proxy_client, digest_algorithm, url=url, + tmp_file_max_memory_size=tmp_file_max_memory_size) self.fp = self.recorder self.payload_digest = None @@ -208,6 +210,7 @@ class 
MitmProxyHandler(http_server.BaseHTTPRequestHandler): logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") _socket_timeout = 60 _max_resource_size = None + _tmp_file_max_memory_size = 512 * 1024 def __init__(self, request, client_address, server): threading.current_thread().name = 'MitmProxyHandler(tid={},started={},client={}:{})'.format(warcprox.gettid(), datetime.datetime.utcnow().isoformat(), client_address[0], client_address[1]) @@ -425,7 +428,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): prox_rec_res = ProxyingRecordingHTTPResponse( self._remote_server_sock, proxy_client=self.connection, digest_algorithm=self.server.digest_algorithm, - url=self.url, method=self.command) + url=self.url, method=self.command, + tmp_file_max_memory_size=self._tmp_file_max_memory_size) prox_rec_res.begin(extra_response_headers=extra_response_headers) buf = prox_rec_res.read(65536) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5b42655..2aa171c 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -405,6 +405,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): WarcProxyHandler._socket_timeout = options.socket_timeout if options.max_resource_size: WarcProxyHandler._max_resource_size = options.max_resource_size + if options.tmp_file_max_memory_size: + WarcProxyHandler._tmp_file_max_memory_size = options.tmp_file_max_memory_size http_server.HTTPServer.__init__( self, server_address, WarcProxyHandler, bind_and_activate=True) From 2f84fa8dbf67c76f5cb1d9c1ad0e6d45f58c63a8 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 8 Mar 2018 08:01:54 +0000 Subject: [PATCH 08/14] Fix ListenerPostfetchProcessor typo Use `self.listener` instead of `listener`. 
--- warcprox/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/__init__.py b/warcprox/__init__.py index 33af61a..76abafa 100644 --- a/warcprox/__init__.py +++ b/warcprox/__init__.py @@ -235,7 +235,7 @@ class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): self.listener.stop() except: self.logger.error( - '%s raised exception', listener.stop, exc_info=True) + '%s raised exception', self.listener.stop, exc_info=True) def timestamp17(): now = datetime.datetime.utcnow() From 45c06eab584c8a560c5f72e0bd6b6392a53e3e63 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 8 Mar 2018 16:35:25 -0800 Subject: [PATCH 09/14] bump dev version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5169090..14e711c 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev154', + version='2.4b2.dev155', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 9bb2018fd2059f7ab5579964be28b0e7daa28390 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 12 Mar 2018 11:22:05 -0700 Subject: [PATCH 10/14] bump dev version after PR #75 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 14e711c..2457edc 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev155', + version='2.4b2.dev156', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 1d5692dd13f0a6ecbcc35adae7d4bab3f2798330 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 16 Mar 2018 13:10:29 +0000 Subject: [PATCH 11/14] Reduce the PoolManager num_pools size and fix bugs Define PoolManager num_pools size as `max(max_threads, 500)` and reduce each pool size from 100 to 30. 
The aim is to limit the total number of open connections. Fix remote SOCKS connection typo. Now that we reuse remote connections, it's better NOT to remove the `keep-alive` request header. We need to send it to the remote host to make it keep the connection open if possible. --- warcprox/mitmproxy.py | 6 +++--- warcprox/warcproxy.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 9d076e3..1482210 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -176,7 +176,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): for k,v in self.msg.items(): if k.lower() not in ( - 'connection', 'proxy-connection', 'keep-alive', + 'connection', 'proxy-connection', 'proxy-authenticate', 'proxy-authorization', 'upgrade', 'strict-transport-security'): status_and_headers += '{}: {}\r\n'.format(k, v) @@ -247,7 +247,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): ''' self._conn_pool = self.server.remote_connection_pool.connection_from_host( host=self.hostname, port=int(self.port), scheme='http', - pool_kwargs={'maxsize': 100}) + pool_kwargs={'maxsize': 30}) self._remote_server_conn = self._conn_pool._get_conn() if is_connection_dropped(self._remote_server_conn): @@ -257,7 +257,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self.onion_tor_socks_proxy_host, self.onion_tor_socks_proxy_port or 1080, self.hostname) self._remote_server_conn.sock = socks.socksocket() - self._remote_server_sock.set_proxy( + self._remote_server_conn.sock.set_proxy( socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, port=self.onion_tor_socks_proxy_port, rdns=True) else: diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index c66c33d..88a9c34 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -388,7 +388,8 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): self.status_callback = status_callback self.stats_db = stats_db self.options = options - 
self.remote_connection_pool = PoolManager(num_pools=2000) + self.remote_connection_pool = PoolManager( + num_pools=max(options.max_threads, 500) if options.max_threads else 500) server_address = ( options.address or 'localhost', options.port if options.port is not None else 8000) From 0002d29f0d734c5d1f230965f90f43db1d3eccb7 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 16 Mar 2018 21:06:34 +0000 Subject: [PATCH 12/14] Improve Connection Pool Set connection pool maxsize to 6 (borrowing from browser behavior). Set num_pools to `max_threads / 6` but set a minimum of 200 for the cases that we use a very low number of `max_threads`. Remove `connection_is_fine` variable from connection code. Fix http headers bug introduced in the previous commit. --- warcprox/mitmproxy.py | 16 +++++++--------- warcprox/warcproxy.py | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 1482210..e87975c 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -176,7 +176,7 @@ class ProxyingRecordingHTTPResponse(http_client.HTTPResponse): for k,v in self.msg.items(): if k.lower() not in ( - 'connection', 'proxy-connection', + 'connection', 'proxy-connection', 'keep-alive', 'proxy-authenticate', 'proxy-authorization', 'upgrade', 'strict-transport-security'): status_and_headers += '{}: {}\r\n'.format(k, v) @@ -247,7 +247,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): ''' self._conn_pool = self.server.remote_connection_pool.connection_from_host( host=self.hostname, port=int(self.port), scheme='http', - pool_kwargs={'maxsize': 30}) + pool_kwargs={'maxsize': 6}) self._remote_server_conn = self._conn_pool._get_conn() if is_connection_dropped(self._remote_server_conn): @@ -426,7 +426,6 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): req += self.rfile.read(int(self.headers['Content-Length'])) prox_rec_res = None - connection_is_fine = False try: self.logger.debug('sending to 
remote server req=%r', req) @@ -450,17 +449,16 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._max_resource_size, self.url) break - connection_is_fine = True self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) - finally: # Let's close off the remote end. If remote connection is fine, # put it back in the pool to reuse it later. + if not is_connection_dropped(self._remote_server_conn): + self._conn_pool._put_conn(self._remote_server_conn) + except: + self._remote_server_conn.sock.close() + finally: if prox_rec_res: prox_rec_res.close() - if connection_is_fine and not is_connection_dropped(self._remote_server_conn): - self._conn_pool._put_conn(self._remote_server_conn) - else: - self._remote_server_conn.sock.close() return req, prox_rec_res diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 88a9c34..97da984 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -389,7 +389,7 @@ class SingleThreadedWarcProxy(http_server.HTTPServer, object): self.stats_db = stats_db self.options = options self.remote_connection_pool = PoolManager( - num_pools=max(options.max_threads, 500) if options.max_threads else 500) + num_pools=max(round(options.max_threads / 6), 200) if options.max_threads else 200) server_address = ( options.address or 'localhost', options.port if options.port is not None else 8000) From 0404ad239ff8d67a8a3263719dd7e16f80d84c94 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 20 Mar 2018 07:35:49 +0000 Subject: [PATCH 13/14] Fix SOCKS connection error --- warcprox/mitmproxy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index e87975c..5a47398 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -260,6 +260,8 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): self._remote_server_conn.sock.set_proxy( socks.SOCKS5, addr=self.onion_tor_socks_proxy_host, port=self.onion_tor_socks_proxy_port, rdns=True) + 
self._remote_server_conn.timeout = self._socket_timeout + self._remote_server_conn.sock.connect((self.hostname, int(self.port))) else: self._remote_server_conn.timeout = self._socket_timeout self._remote_server_conn.connect() From c79b89108ad3ca6c35ac7de9aaf5717a889e0b37 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 20 Mar 2018 10:53:04 -0700 Subject: [PATCH 14/14] bump version number after PR #72 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2457edc..6a79e63 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ except: setuptools.setup( name='warcprox', - version='2.4b2.dev156', + version='2.4b2.dev157', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt',