diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 124efb5..eea3ccd 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -4,43 +4,43 @@ from warcprox.dedup import CdxServerDedup def test_cdx_dedup(): # Mock CDX Server responses to simulate found, not found and errors. - with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request: - url = "http://example.com" - # not found case - result = mock.Mock() - result.status = 200 - result.data = b'20170101020405 test' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None + url = "http://example.com" + # not found case + result = mock.Mock() + result.status = 200 + result.data = b'20170101020405 test' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None - # found case - result = mock.Mock() - result.status = 200 - result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res["date"] == b"2017-02-03T04:05:03Z" + # found case + result = mock.Mock() + result.status = 200 + result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res["date"] == b"2017-02-03T04:05:03Z" - # invalid CDX result status code - result = mock.Mock() - result.status = 400 - result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None - # invalid CDX result content - result = mock.Mock() - result.status = 200 - result.data = b'InvalidExceptionResult' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None + # invalid CDX result status code + result = mock.Mock() + result.status = 400 + result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None + + # invalid CDX result content + result = mock.Mock() + result.status = 200 + result.data = b'InvalidExceptionResult' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None diff --git a/warcprox/dedup.py b/warcprox/dedup.py index f21e1df..45b8142 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -185,12 +185,12 @@ class CdxServerDedup(object): """Query a CDX server to perform deduplication. """ logger = logging.getLogger("warcprox.dedup.CdxServerDedup") - http_pool = urllib3.PoolManager() def __init__(self, cdx_url="https://web.archive.org/cdx/search", - options=warcprox.Options()): + maxsize=200, options=warcprox.Options()): self.cdx_url = cdx_url self.options = options + self.http_pool = urllib3.PoolManager(maxsize=maxsize) def start(self): pass diff --git a/warcprox/main.py b/warcprox/main.py index 8bfc3c4..065bd63 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -225,7 +225,9 @@ def init_controller(args): elif args.rethinkdb_trough_db_url: dedup_db = warcprox.dedup.TroughDedupDb(options) elif args.cdxserver_dedup: - dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup) + cdxserver_maxsize = args.writer_threads or 200 + dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup, + maxsize=cdxserver_maxsize) elif args.dedup_db_file in (None, '', '/dev/null'): logging.info('deduplication disabled') dedup_db = None