Merge pull request #50 from vbanos/cdxserverdedup-maxsize

Configurable CdxServerDedup urllib3 connection pool size
This commit is contained in:
Noah Levitt 2018-01-15 16:46:37 -08:00 committed by GitHub
commit d7208d89c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 41 deletions

View File

@@ -4,43 +4,43 @@ from warcprox.dedup import CdxServerDedup
 def test_cdx_dedup():
     # Mock CDX Server responses to simulate found, not found and errors.
-    with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
-        url = "http://example.com"
-        # not found case
-        result = mock.Mock()
-        result.status = 200
-        result.data = b'20170101020405 test'
-        request.return_value = result
-        cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
-        res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
-                                url=url)
-        assert res is None
-        # found case
-        result = mock.Mock()
-        result.status = 200
-        result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
-        request.return_value = result
-        cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
-        res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
-                                url=url)
-        assert res["date"] == b"2017-02-03T04:05:03Z"
-        # invalid CDX result status code
-        result = mock.Mock()
-        result.status = 400
-        result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
-        request.return_value = result
-        cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
-        res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
-                                url=url)
-        assert res is None
-        # invalid CDX result content
-        result = mock.Mock()
-        result.status = 200
-        result.data = b'InvalidExceptionResult'
-        request.return_value = result
-        cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
-        res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
-                                url=url)
-        assert res is None
+    url = "http://example.com"
+    # not found case
+    result = mock.Mock()
+    result.status = 200
+    result.data = b'20170101020405 test'
+    cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
+    cdx_server.http_pool.request = mock.MagicMock(return_value=result)
+    res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
+                            url=url)
+    assert res is None
+
+    # found case
+    result = mock.Mock()
+    result.status = 200
+    result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
+    cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
+    cdx_server.http_pool.request = mock.MagicMock(return_value=result)
+    res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
+                            url=url)
+    assert res["date"] == b"2017-02-03T04:05:03Z"
+    # invalid CDX result status code
+    result = mock.Mock()
+    result.status = 400
+    result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
+    cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
+    cdx_server.http_pool.request = mock.MagicMock(return_value=result)
+    res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
+                            url=url)
+    assert res is None
+
+    # invalid CDX result content
+    result = mock.Mock()
+    result.status = 200
+    result.data = b'InvalidExceptionResult'
+    cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
+    cdx_server.http_pool.request = mock.MagicMock(return_value=result)
+    res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
+                            url=url)
+    assert res is None

View File

@@ -185,12 +185,12 @@ class CdxServerDedup(object):
     """Query a CDX server to perform deduplication.
     """
     logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
-    http_pool = urllib3.PoolManager()
 
     def __init__(self, cdx_url="https://web.archive.org/cdx/search",
-                 options=warcprox.Options()):
+                 maxsize=200, options=warcprox.Options()):
         self.cdx_url = cdx_url
         self.options = options
+        self.http_pool = urllib3.PoolManager(maxsize=maxsize)
 
     def start(self):
         pass

View File

@@ -225,7 +225,9 @@ def init_controller(args):
     elif args.rethinkdb_trough_db_url:
         dedup_db = warcprox.dedup.TroughDedupDb(options)
     elif args.cdxserver_dedup:
-        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
+        cdxserver_maxsize = args.writer_threads or 200
+        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup,
+                                                 maxsize=cdxserver_maxsize)
     elif args.dedup_db_file in (None, '', '/dev/null'):
         logging.info('deduplication disabled')
         dedup_db = None