From 53f13d3536de2bf5dcb68bf8d54d3628b5db0127 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 7 Feb 2019 09:08:11 +0000 Subject: [PATCH 1/4] Use in-memory LRU cache in CDX Server dedup Add option `--cdxserver-dedup-lru-cache-size=N` (default None) to enable in-memory caching of CDX dedup requests using the stdlib `functools.lru_cache` decorator. Cache statistics are written to the log at `INFO` level, for example: ``` CacheInfo(hits=3172, misses=3293, maxsize=1024, currsize=1024) ``` --- warcprox/dedup.py | 10 +++++++++- warcprox/main.py | 4 ++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 4c9f9f1..d9e9335 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -34,6 +34,7 @@ import urllib3 from urllib3.exceptions import HTTPError import collections from concurrent import futures +from functools import lru_cache urllib3.disable_warnings() @@ -236,6 +237,8 @@ class CdxServerDedup(DedupDb): headers['Cookie'] = options.cdxserver_dedup_cookies self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0, timeout=2.0, headers=headers) + if options.cdxserver_dedup_lru_cache_size: + self.cached_lookup = lru_cache(maxsize=options.cdxserver_dedup_lru_cache_size)(self.lookup) def loader(self, *args, **kwargs): return CdxServerDedupLoader(self, self.options) @@ -299,6 +302,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin) self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads) self.batch = set() self.cdx_dedup = cdx_dedup + self.use_lru_cache = options.cdxserver_dedup_lru_cache_size != None def _get_process_put(self): recorded_url = self.inq.get(block=True, timeout=0.5) @@ -315,7 +319,11 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin) try: digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32) - dedup_info = self.cdx_dedup.lookup(digest_key, recorded_url.url) + if self.use_lru_cache: + dedup_info 
= self.cdx_dedup.cached_lookup(digest_key, recorded_url.url) + self.logger.info(self.cdx_dedup.cached_lookup.cache_info()) + else: + dedup_info = self.cdx_dedup.lookup(digest_key, recorded_url.url) if dedup_info: recorded_url.dedup_info = dedup_info except ValueError as exc: diff --git a/warcprox/main.py b/warcprox/main.py index e38bb02..4f9be61 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -172,6 +172,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False): '--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads', type=int, default=50, help=suppress( 'maximum number of cdx server dedup threads')) + hidden.add_argument( + '--cdxserver-dedup-lru-cache-size', dest='cdxserver_dedup_lru_cache_size', + type=int, help=suppress( + 'enable in-memory LRU cache to reduce duplicate CDX server requests')) arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size', type=int, default=0, help=('try to dedup text resources with payload size over this limit in bytes')) From 1133715331a69afe4e8421a77d5f64caf3cf2052 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 12 Feb 2019 08:28:15 +0000 Subject: [PATCH 2/4] Enable cdx dedup lru cache by default use default value 1024 --- warcprox/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/main.py b/warcprox/main.py index 4f9be61..e73170b 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -174,7 +174,7 @@ def _build_arg_parser(prog='warcprox', show_hidden=False): 'maximum number of cdx server dedup threads')) hidden.add_argument( '--cdxserver-dedup-lru-cache-size', dest='cdxserver_dedup_lru_cache_size', - type=int, help=suppress( + type=int, default=1024, help=suppress( 'enable in-memory LRU cache to reduce duplicate CDX server requests')) arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size', type=int, default=0, From 660989939efc9a98d3145ed3169716149ba75b6f Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 12 Feb 2019 
20:43:27 +0000 Subject: [PATCH 3/4] Remove cli option cdxserver-dedup-lru-cache-size LRU cache is always enabled for cdxserver dedup module with a default cache size of 1024. --- warcprox/dedup.py | 11 +++-------- warcprox/main.py | 4 ---- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index d9e9335..d86f4f8 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -237,8 +237,7 @@ class CdxServerDedup(DedupDb): headers['Cookie'] = options.cdxserver_dedup_cookies self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0, timeout=2.0, headers=headers) - if options.cdxserver_dedup_lru_cache_size: - self.cached_lookup = lru_cache(maxsize=options.cdxserver_dedup_lru_cache_size)(self.lookup) + self.cached_lookup = lru_cache(maxsize=1024)(self.lookup) def loader(self, *args, **kwargs): return CdxServerDedupLoader(self, self.options) @@ -302,7 +301,6 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin) self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads) self.batch = set() self.cdx_dedup = cdx_dedup - self.use_lru_cache = options.cdxserver_dedup_lru_cache_size != None def _get_process_put(self): recorded_url = self.inq.get(block=True, timeout=0.5) @@ -319,11 +317,8 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin) try: digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32) - if self.use_lru_cache: - dedup_info = self.cdx_dedup.cached_lookup(digest_key, recorded_url.url) - self.logger.info(self.cdx_dedup.cached_lookup.cache_info()) - else: - dedup_info = self.cdx_dedup.lookup(digest_key, recorded_url.url) + dedup_info = self.cdx_dedup.cached_lookup(digest_key, recorded_url.url) + self.logger.info(self.cdx_dedup.cached_lookup.cache_info()) if dedup_info: recorded_url.dedup_info = dedup_info except ValueError as exc: diff --git a/warcprox/main.py b/warcprox/main.py index e73170b..e38bb02 
100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -172,10 +172,6 @@ def _build_arg_parser(prog='warcprox', show_hidden=False): '--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads', type=int, default=50, help=suppress( 'maximum number of cdx server dedup threads')) - hidden.add_argument( - '--cdxserver-dedup-lru-cache-size', dest='cdxserver_dedup_lru_cache_size', - type=int, default=1024, help=suppress( - 'enable in-memory LRU cache to reduce duplicate CDX server requests')) arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size', type=int, default=0, help=('try to dedup text resources with payload size over this limit in bytes')) From 99fb998e1dc4e91da18182cfa59cf1201d0edc4f Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 12 Feb 2019 21:46:49 +0000 Subject: [PATCH 4/4] log LRU cache info every 1000 requests to avoid writing to the log too often. --- warcprox/dedup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index d86f4f8..0bb15f6 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -318,7 +318,9 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin) digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32) dedup_info = self.cdx_dedup.cached_lookup(digest_key, recorded_url.url) - self.logger.info(self.cdx_dedup.cached_lookup.cache_info()) + cache_info = self.cdx_dedup.cached_lookup.cache_info() + if (cache_info.hits + cache_info.misses) % 1000 == 0: + self.logger.info(self.cdx_dedup.cached_lookup.cache_info()) if dedup_info: recorded_url.dedup_info = dedup_info except ValueError as exc: