From 25281376f637ac75dc023d6c87ea231a09c338b3 Mon Sep 17 00:00:00 2001
From: Vangelis Banos <vangelis@archive.org>
Date: Wed, 23 Jan 2019 11:07:46 +0000
Subject: [PATCH] Configurable max threads in CdxServerDedupLoader

`CdxServerDedupLoader` created its thread pool with a hard-coded
`max_workers=400`. Make the pool size configurable through a new hidden
CLI option, `--cdxserver-dedup-max-threads`, which defaults to 400.

We need to be able to tune this setting because, at 400 threads, the
loader issues too many CDX queries, which causes problems for our
production CDX servers.
---
 warcprox/dedup.py | 2 +-
 warcprox/main.py  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 5e26062..4c9f9f1 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -296,7 +296,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
     def __init__(self, cdx_dedup, options=warcprox.Options()):
         warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
         DedupableMixin.__init__(self, options)
-        self.pool = futures.ThreadPoolExecutor(max_workers=400)
+        self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
         self.batch = set()
         self.cdx_dedup = cdx_dedup
 
diff --git a/warcprox/main.py b/warcprox/main.py
index 8dab727..4b13479 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -168,6 +168,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
             help=suppress(
                 'value of Cookie header to include in requests to the cdx '
                 'server, when using --cdxserver-dedup'))
+    hidden.add_argument(
+            '--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
+            type=int, default=400, help=suppress(
+                'maximum number of cdx server dedup threads'))
     arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
                             type=int, default=0,
                             help=('try to dedup text resources with payload size over this limit in bytes'))