diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 5e26062..4c9f9f1 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -296,7 +296,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
     def __init__(self, cdx_dedup, options=warcprox.Options()):
         warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
         DedupableMixin.__init__(self, options)
-        self.pool = futures.ThreadPoolExecutor(max_workers=400)
+        self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
         self.batch = set()
         self.cdx_dedup = cdx_dedup
 
diff --git a/warcprox/main.py b/warcprox/main.py
index 8dab727..e38bb02 100644
--- a/warcprox/main.py
+++ b/warcprox/main.py
@@ -168,6 +168,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
             help=suppress(
                 'value of Cookie header to include in requests to the cdx '
                 'server, when using --cdxserver-dedup'))
+    hidden.add_argument(
+            '--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
+            type=int, default=50, help=suppress(
+                'maximum number of cdx server dedup threads'))
     arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
             type=int, default=0,
             help=('try to dedup text resources with payload size over this limit in bytes'))