mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Configurable max threads in CdxServerDedupLoader
`CdxServerDedupLoader` previously hard-coded `max_workers=400` for its thread pool. This commit makes the value configurable through a new CLI option, `--cdxserver-dedup-max-threads`, which defaults to 400 so existing behavior is unchanged. We need to be able to tune this setting because, with that many worker threads, the loader issues too many CDX queries, which causes problems for our production CDX servers.
This commit is contained in:
parent
cb72af015a
commit
25281376f6
@ -296,7 +296,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
|
||||
def __init__(self, cdx_dedup, options=warcprox.Options()):
|
||||
warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
|
||||
DedupableMixin.__init__(self, options)
|
||||
self.pool = futures.ThreadPoolExecutor(max_workers=400)
|
||||
self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
|
||||
self.batch = set()
|
||||
self.cdx_dedup = cdx_dedup
|
||||
|
||||
|
@ -168,6 +168,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
|
||||
help=suppress(
|
||||
'value of Cookie header to include in requests to the cdx '
|
||||
'server, when using --cdxserver-dedup'))
|
||||
hidden.add_argument(
|
||||
'--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
|
||||
type=int, default=400, help=suppress(
|
||||
'maximum number of cdx server dedup threads'))
|
||||
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
|
||||
type=int, default=0,
|
||||
help=('try to dedup text resources with payload size over this limit in bytes'))
|
||||
|
Loading…
x
Reference in New Issue
Block a user