mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge pull request #50 from vbanos/cdxserverdedup-maxsize
Configurable CdxServerDedup urllib3 connection pool size
This commit is contained in:
commit
d7208d89c6
@@ -4,43 +4,43 @@ from warcprox.dedup import CdxServerDedup
|
||||
|
||||
def test_cdx_dedup():
|
||||
# Mock CDX Server responses to simulate found, not found and errors.
|
||||
with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
|
||||
url = "http://example.com"
|
||||
# not found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170101020405 test'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
url = "http://example.com"
|
||||
# not found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170101020405 test'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
|
||||
# found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res["date"] == b"2017-02-03T04:05:03Z"
|
||||
# found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res["date"] == b"2017-02-03T04:05:03Z"
|
||||
|
||||
# invalid CDX result status code
|
||||
result = mock.Mock()
|
||||
result.status = 400
|
||||
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
# invalid CDX result content
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'InvalidExceptionResult'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
# invalid CDX result status code
|
||||
result = mock.Mock()
|
||||
result.status = 400
|
||||
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
|
||||
# invalid CDX result content
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'InvalidExceptionResult'
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
cdx_server.http_pool.request = mock.MagicMock(return_value=result)
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url=url)
|
||||
assert res is None
|
||||
|
@@ -185,12 +185,12 @@ class CdxServerDedup(object):
|
||||
"""Query a CDX server to perform deduplication.
|
||||
"""
|
||||
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
||||
http_pool = urllib3.PoolManager()
|
||||
|
||||
def __init__(self, cdx_url="https://web.archive.org/cdx/search",
|
||||
options=warcprox.Options()):
|
||||
maxsize=200, options=warcprox.Options()):
|
||||
self.cdx_url = cdx_url
|
||||
self.options = options
|
||||
self.http_pool = urllib3.PoolManager(maxsize=maxsize)
|
||||
|
||||
def start(self):
|
||||
pass
|
||||
|
@@ -225,7 +225,9 @@ def init_controller(args):
|
||||
elif args.rethinkdb_trough_db_url:
|
||||
dedup_db = warcprox.dedup.TroughDedupDb(options)
|
||||
elif args.cdxserver_dedup:
|
||||
dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
|
||||
cdxserver_maxsize = args.writer_threads or 200
|
||||
dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup,
|
||||
maxsize=cdxserver_maxsize)
|
||||
elif args.dedup_db_file in (None, '', '/dev/null'):
|
||||
logging.info('deduplication disabled')
|
||||
dedup_db = None
|
||||
|
Loading…
x
Reference in New Issue
Block a user