Merge branch 'master' into trough

* master:
  hidden argument --rethinkdb-big-table-name
  try to fix https://github.com/internetarchive/warcprox/issues/27
2025-01-18 13:22:09 +01:00 · 2017-08-03 11:22:27 -07:00 · 2017-08-03 11:22:27 -07:00 · c0cb59e5af
commit c0cb59e5af
parent ad3e6f405d 13ee68ce4a
3 changed files with 23 additions and 12 deletions
--- a/setup.py
+++ b/setup.py
@ -49,7 +49,7 @@ except:

 setuptools.setup(
        name='warcprox',
-        version='2.1b1.dev92',
+        version='2.1b1.dev94',
        description='WARC writing MITM HTTP/S proxy',
        url='https://github.com/internetarchive/warcprox',
        author='Noah Levitt',
--- a/warcprox/main.py
+++ b/warcprox/main.py
@ -114,6 +114,9 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
    arg_parser.add_argument('--rethinkdb-big-table',
            dest='rethinkdb_big_table', action='store_true', default=False,
            help='use a big rethinkdb table called "captures", instead of a small table called "dedup"; table is suitable for use as index for playback (ignored unless --rethinkdb-servers is specified)')
+    arg_parser.add_argument(
+            '--rethinkdb-big-table-name', dest='rethinkdb_big_table_name',
+            default='captures', help=argparse.SUPPRESS)
    arg_parser.add_argument('--queue-size', dest='queue_size', type=int,
            default=500, help=argparse.SUPPRESS)
    arg_parser.add_argument('--max-threads', dest='max_threads', type=int,
@ -179,7 +182,8 @@ def init_controller(args):
        rr = doublethink.Rethinker(
                args.rethinkdb_servers.split(","), args.rethinkdb_db)
        if args.rethinkdb_big_table:
-            captures_db = warcprox.bigtable.RethinkCaptures(rr, options=options)
+            captures_db = warcprox.bigtable.RethinkCaptures(
+                    rr, table=args.rethinkdb_big_table_name, options=options)
            dedup_db = warcprox.bigtable.RethinkCapturesDedup(
                    captures_db, options=options)
            listeners.append(captures_db)
--- a/warcprox/mitmproxy.py
+++ b/warcprox/mitmproxy.py
@ -52,7 +52,6 @@ try:
    import socketserver
 except ImportError:
    import SocketServer as socketserver
-import resource
 import concurrent.futures
 import urlcanon
 import time
@ -440,15 +439,23 @@ class PooledMixIn(socketserver.ThreadingMixIn):
            # man getrlimit: "RLIMIT_NPROC The maximum number of processes (or,
            # more precisely on Linux, threads) that can be created for the
            # real user ID of the calling process."
-            rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
-            rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
-            max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
-            # resource.RLIM_INFINITY == -1 which can result in max_threads == 0
-            if max_threads <= 0 or max_threads > 5000:
-                max_threads = 5000
-            self.logger.info(
-                    "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
-                    max_threads, rlimit_nproc, rlimit_nofile)
+            try:
+                import resource
+                rlimit_nproc = resource.getrlimit(resource.RLIMIT_NPROC)[0]
+                rlimit_nofile = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
+                max_threads = min(rlimit_nofile // 10, rlimit_nproc // 2)
+                # resource.RLIM_INFINITY == -1 which can result in max_threads == 0
+                if max_threads <= 0 or max_threads > 5000:
+                    max_threads = 5000
+                self.logger.info(
+                        "max_threads=%s (rlimit_nproc=%s, rlimit_nofile=%s)",
+                        max_threads, rlimit_nproc, rlimit_nofile)
+            except Exception as e:
+                self.logger.warn(
+                        "unable to calculate optimal number of threads based "
+                        "on resource limits due to %s", e)
+                max_threads = 100
+                self.logger.info("max_threads=%s", max_threads)
        self.max_threads = max_threads
        self.pool = concurrent.futures.ThreadPoolExecutor(max_threads)