From 0d8fe4a38fa46aa07eb4ff6ca0b86f6ffb0297a1 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 8 Feb 2018 22:24:20 +0000 Subject: [PATCH] Disable retries and set timeout=2.0 for CDX Dedup server It's better to skip CDX server dedup than slow down when it's unresponsive. Also increase the dedup loader thread pool size from 50 to 200. --- warcprox/dedup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 8b27874..8d63f96 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -206,9 +206,14 @@ class CdxServerDedup(DedupDb): def __init__(self, cdx_url="https://web.archive.org/cdx/search", maxsize=200, options=warcprox.Options()): + """Initialize cdx server connection pool and related parameters. + Use low timeout value and no retries to avoid blocking warcprox + operation by a slow CDX server. + """ self.cdx_url = cdx_url self.options = options - self.http_pool = urllib3.PoolManager(maxsize=maxsize) + self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0, + timeout=2.0) if options.cdxserver_dedup_cookies: self.cookies = options.cdxserver_dedup_cookies @@ -271,7 +276,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor): def __init__(self, cdx_dedup, options=warcprox.Options()): warcprox.BaseBatchPostfetchProcessor.__init__(self, options) - self.pool = futures.ThreadPoolExecutor(max_workers=50) + self.pool = futures.ThreadPoolExecutor(max_workers=200) self.batch = set() self.cdx_dedup = cdx_dedup