Merge branch 'master' into trough

* master:
  hidden argument --rethinkdb-big-table-name
  try to fix https://github.com/internetarchive/warcprox/issues/27
This commit is contained in:
Noah Levitt 2017-08-03 11:22:27 -07:00
commit c0cb59e5af
3 changed files with 23 additions and 12 deletions

View File

@@ -49,7 +49,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.1b1.dev92',
version='2.1b1.dev94',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@@ -114,6 +114,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser.add_argument('--rethinkdb-big-table',
dest='rethinkdb_big_table', action='store_true', default=False,
help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
arg_parser.add_argument(
'--rethinkdb-big-table-name', dest='rethinkdb_big_table_name',
default='captures', help=argparse.SUPPRESS)
arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
default=500, help=argparse.SUPPRESS)
arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
@@ -179,7 +182,8 @@ def init_controller(args):
rr = doublethink.Rethinker(
args.rethinkdb_servers.split(","), args.rethinkdb_db)
if args.rethinkdb_big_table:
captures_db = warcprox.bigtable.RethinkCaptures(rr, options=options)
captures_db = warcprox.bigtable.RethinkCaptures(
rr, table=args.rethinkdb_big_table_name, options=options)
dedup_db = warcprox.bigtable.RethinkCapturesDedup(
captures_db, options=options)
listeners.append(captures_db)

View File

@@ -52,7 +52,6 @@ try:
import socketserver
except ImportError:
import SocketServer as socketserver
import resource
import concurrent.futures
import urlcanon
import time
@@ -440,15 +439,23 @@ class PooledMixIn(socketserver.ThreadingMixIn):
# man getrlimit: "RLIMIT_NPROC The maximum number of processes (or,
# more precisely on Linux, threads) that can be created for the
# real user ID of the calling process."
rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
# resource.RLIM_INFINITY == -1 which can result in max_threads == 0
if max_threads <= 0 or max_threads > 5000:
max_threads = 5000
self.logger.info(
"max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
max_threads, rlimit_nproc, rlimit_nofile)
try:
import resource
rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
# resource.RLIM_INFINITY == -1 which can result in max_threads == 0
if max_threads <= 0 or max_threads > 5000:
max_threads = 5000
self.logger.info(
"max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
max_threads, rlimit_nproc, rlimit_nofile)
except Exception as e:
self.logger.warn(
"unable to calculate optimal number of threads based "
"on resource limits due to %s", e)
max_threads = 100
self.logger.info("max_threads=%s", max_threads)
self.max_threads = max_threads
self.pool = concurrent.futures.ThreadPoolExecutor(max_threads)