Merge pull request #113 from vbanos/cdxserver-dedup-max-threads

Configurable max threads in CdxServerDedupLoader
This commit is contained in:
Noah Levitt 2019-01-23 10:44:04 -08:00 committed by GitHub
commit 98f50ca296
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 1 deletion

View File

@@ -296,7 +296,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
def __init__(self, cdx_dedup, options=warcprox.Options()):
warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
DedupableMixin.__init__(self, options)
-self.pool = futures.ThreadPoolExecutor(max_workers=400)
+self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
self.batch = set()
self.cdx_dedup = cdx_dedup

View File

@@ -168,6 +168,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
help=suppress(
'value of Cookie header to include in requests to the cdx '
'server, when using --cdxserver-dedup'))
+hidden.add_argument(
+'--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
+type=int, default=50, help=suppress(
+'maximum number of cdx server dedup threads'))
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
type=int, default=0,
help=('try to dedup text resources with payload size over this limit in bytes'))