Configurable max threads in CdxServerDedupLoader

`CdxServerDedupLoader` previously hard-coded `max_workers=400` for its
thread pool. This commit makes the value configurable through a new CLI
option, `--cdxserver-dedup-max-threads`, which still defaults to 400.

We need to be able to tune this setting because the default of 400 threads
creates too many CDX queries, which cause problems with our production CDX
servers.
This commit is contained in:
Vangelis Banos 2019-01-23 11:07:46 +00:00
parent cb72af015a
commit 25281376f6
2 changed files with 5 additions and 1 deletions

View File

@ -296,7 +296,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
def __init__(self, cdx_dedup, options=warcprox.Options()):
warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
DedupableMixin.__init__(self, options)
self.pool = futures.ThreadPoolExecutor(max_workers=400)
self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
self.batch = set()
self.cdx_dedup = cdx_dedup

View File

@ -168,6 +168,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
help=suppress(
'value of Cookie header to include in requests to the cdx '
'server, when using --cdxserver-dedup'))
hidden.add_argument(
'--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
type=int, default=400, help=suppress(
'maximum number of cdx server dedup threads'))
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
type=int, default=0,
help=('try to dedup text resources with payload size over this limit in bytes'))