From b1a8fecd9d15dc58bbc05b46ae7e369304cb019a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 7 Jul 2017 14:54:55 -0700 Subject: [PATCH 1/2] try to fix https://github.com/internetarchive/warcprox/issues/27 --- setup.py | 2 +- warcprox/mitmproxy.py | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index c06fed7..5c159a0 100755 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ except: setuptools.setup( name='warcprox', - version='2.1b1.dev92', + version='2.1b1.dev93', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index ec9dafc..6297dcc 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -52,7 +52,6 @@ try: import socketserver except ImportError: import SocketServer as socketserver -import resource import concurrent.futures import urlcanon import time @@ -440,15 +439,23 @@ class PooledMixIn(socketserver.ThreadingMixIn): # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or, # more precisely on Linux, threads) that can be created for the # real user ID of the calling process." - rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0] - rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2) - # resource.RLIM_INFINITY == -1 which can result in max_threads == 0 - if max_threads <= 0 or max_threads > 5000: - max_threads = 5000 - self.logger.info( - "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)", - max_threads, rlimit_nproc, rlimit_nofile) + try: + import resource + rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0] + rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0] + max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2) + # resource.RLIM_INFINITY == -1 which can result in max_threads == 0 + if max_threads <= 0 or max_threads > 5000: + max_threads = 5000 + self.logger.info( + "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)", + max_threads, rlimit_nproc, rlimit_nofile) + except Exception as e: + self.logger.warn( + "unable to calculate optimal number of threads based " + "on resource limits due to %s", e) + max_threads = 100 + self.logger.info("max_threads=%s", max_threads) self.max_threads = max_threads self.pool = concurrent.futures.ThreadPoolExecutor(max_threads) From 13ee68ce4acc06f5269c279e7471fe549644c5b7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 20 Jul 2017 12:53:59 -0700 Subject: [PATCH 2/2] hidden argument --rethinkdb-big-table-name --- setup.py | 2 +- warcprox/main.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5c159a0..e1b5bd8 100755 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ except: setuptools.setup( name='warcprox', - version='2.1b1.dev93', + version='2.1b1.dev94', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/main.py b/warcprox/main.py index 0940593..25fb3dc 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -113,6 +113,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser.add_argument('--rethinkdb-big-table', dest='rethinkdb_big_table', action='store_true', default=False, help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)') + arg_parser.add_argument( + '--rethinkdb-big-table-name', dest='rethinkdb_big_table_name', + default='captures', help=argparse.SUPPRESS) arg_parser.add_argument('--kafka-broker-list', dest='kafka_broker_list', default=None, help='kafka broker list for capture feed') arg_parser.add_argument('--kafka-capture-feed-topic', dest='kafka_capture_feed_topic', @@ -169,7 +172,8 @@ def init_controller(args): rr = doublethink.Rethinker( args.rethinkdb_servers.split(","), args.rethinkdb_db) if args.rethinkdb_big_table: - captures_db = warcprox.bigtable.RethinkCaptures(rr, options=options) + captures_db = warcprox.bigtable.RethinkCaptures( + rr, table=args.rethinkdb_big_table_name, options=options) dedup_db = warcprox.bigtable.RethinkCapturesDedup( captures_db, options=options) listeners.append(captures_db)