mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #113 from vbanos/cdxserver-dedup-max-threads
Configurable max threads in CdxServerDedupLoader
This commit is contained in:
commit
98f50ca296
@@ -296,7 +296,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
|
|||||||
def __init__(self, cdx_dedup, options=warcprox.Options()):
|
def __init__(self, cdx_dedup, options=warcprox.Options()):
|
||||||
warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
|
warcprox.BaseBatchPostfetchProcessor.__init__(self, options)
|
||||||
DedupableMixin.__init__(self, options)
|
DedupableMixin.__init__(self, options)
|
||||||
self.pool = futures.ThreadPoolExecutor(max_workers=400)
|
self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
|
||||||
self.batch = set()
|
self.batch = set()
|
||||||
self.cdx_dedup = cdx_dedup
|
self.cdx_dedup = cdx_dedup
|
||||||
|
|
||||||
|
@@ -168,6 +168,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
|
|||||||
help=suppress(
|
help=suppress(
|
||||||
'value of Cookie header to include in requests to the cdx '
|
'value of Cookie header to include in requests to the cdx '
|
||||||
'server, when using --cdxserver-dedup'))
|
'server, when using --cdxserver-dedup'))
|
||||||
|
hidden.add_argument(
|
||||||
|
'--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
|
||||||
|
type=int, default=50, help=suppress(
|
||||||
|
'maximum number of cdx server dedup threads'))
|
||||||
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
|
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
|
||||||
type=int, default=0,
|
type=int, default=0,
|
||||||
help=('try to dedup text resources with payload size over this limit in bytes'))
|
help=('try to dedup text resources with payload size over this limit in bytes'))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user