mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Use in-memory LRU cache in CDX Server dedup
Add option `--cdxserver-dedup-lru-cache-size=N` (default None) to enable in-memory caching of CDX dedup requests using the stdlib `functools.lru_cache` decorator. Cache statistics are written to the log at `INFO` level, for example: ``` CacheInfo(hits=3172, misses=3293, maxsize=1024, currsize=1024) ```
This commit is contained in:
parent
98f50ca296
commit
53f13d3536
@ -34,6 +34,7 @@ import urllib3
|
|||||||
from urllib3.exceptions import HTTPError
|
from urllib3.exceptions import HTTPError
|
||||||
import collections
|
import collections
|
||||||
from concurrent import futures
|
from concurrent import futures
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
urllib3.disable_warnings()
|
urllib3.disable_warnings()
|
||||||
|
|
||||||
@ -236,6 +237,8 @@ class CdxServerDedup(DedupDb):
|
|||||||
headers['Cookie'] = options.cdxserver_dedup_cookies
|
headers['Cookie'] = options.cdxserver_dedup_cookies
|
||||||
self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0,
|
self.http_pool = urllib3.PoolManager(maxsize=maxsize, retries=0,
|
||||||
timeout=2.0, headers=headers)
|
timeout=2.0, headers=headers)
|
||||||
|
if options.cdxserver_dedup_lru_cache_size:
|
||||||
|
self.cached_lookup = lru_cache(maxsize=options.cdxserver_dedup_lru_cache_size)(self.lookup)
|
||||||
|
|
||||||
def loader(self, *args, **kwargs):
|
def loader(self, *args, **kwargs):
|
||||||
return CdxServerDedupLoader(self, self.options)
|
return CdxServerDedupLoader(self, self.options)
|
||||||
@ -299,6 +302,7 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
|
|||||||
self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
|
self.pool = futures.ThreadPoolExecutor(max_workers=options.cdxserver_dedup_max_threads)
|
||||||
self.batch = set()
|
self.batch = set()
|
||||||
self.cdx_dedup = cdx_dedup
|
self.cdx_dedup = cdx_dedup
|
||||||
|
self.use_lru_cache = options.cdxserver_dedup_lru_cache_size != None
|
||||||
|
|
||||||
def _get_process_put(self):
|
def _get_process_put(self):
|
||||||
recorded_url = self.inq.get(block=True, timeout=0.5)
|
recorded_url = self.inq.get(block=True, timeout=0.5)
|
||||||
@ -315,7 +319,11 @@ class CdxServerDedupLoader(warcprox.BaseBatchPostfetchProcessor, DedupableMixin)
|
|||||||
try:
|
try:
|
||||||
digest_key = warcprox.digest_str(recorded_url.payload_digest,
|
digest_key = warcprox.digest_str(recorded_url.payload_digest,
|
||||||
self.options.base32)
|
self.options.base32)
|
||||||
dedup_info = self.cdx_dedup.lookup(digest_key, recorded_url.url)
|
if self.use_lru_cache:
|
||||||
|
dedup_info = self.cdx_dedup.cached_lookup(digest_key, recorded_url.url)
|
||||||
|
self.logger.info(self.cdx_dedup.cached_lookup.cache_info())
|
||||||
|
else:
|
||||||
|
dedup_info = self.cdx_dedup.lookup(digest_key, recorded_url.url)
|
||||||
if dedup_info:
|
if dedup_info:
|
||||||
recorded_url.dedup_info = dedup_info
|
recorded_url.dedup_info = dedup_info
|
||||||
except ValueError as exc:
|
except ValueError as exc:
|
||||||
|
@ -172,6 +172,10 @@ def _build_arg_parser(prog='warcprox', show_hidden=False):
|
|||||||
'--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
|
'--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads',
|
||||||
type=int, default=50, help=suppress(
|
type=int, default=50, help=suppress(
|
||||||
'maximum number of cdx server dedup threads'))
|
'maximum number of cdx server dedup threads'))
|
||||||
|
hidden.add_argument(
|
||||||
|
'--cdxserver-dedup-lru-cache-size', dest='cdxserver_dedup_lru_cache_size',
|
||||||
|
type=int, help=suppress(
|
||||||
|
'enable in-memory LRU cache to reduce duplicate CDX server requests'))
|
||||||
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
|
arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size',
|
||||||
type=int, default=0,
|
type=int, default=0,
|
||||||
help=('try to dedup text resources with payload size over this limit in bytes'))
|
help=('try to dedup text resources with payload size over this limit in bytes'))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user