From e59fed2b6f1a588996c20d2fdd650a6a9465c1b4 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 15 Jan 2018 17:43:34 +0000 Subject: [PATCH 1/2] Configurable CdxServerDedup urllib3 connection pool size The urllib3 connection pool defaults to ``maxsize=1`` (see http://urllib3.readthedocs.io/en/latest/advanced-usage.html). We need to set a higher value because we get warnings like this: ``` 2018-01-15 20:04:10,044 18436 WARNING WarcWriterThread030(tid=18502) urllib3.connectionpool._put_conn(connectionpool.py:277) Connection pool is full, discarding connection: wwwb-dedup ``` We set the value with ```cdxserver_maxsize = args.writer_threads or 200```. Note that ideally we would use https://github.com/internetarchive/warcprox/blob/master/warcprox/main.py#L284 but it is initialized after dedup; because of that dependency we cannot use it here. --- warcprox/dedup.py | 4 ++-- warcprox/main.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index f21e1df..45b8142 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -185,12 +185,12 @@ class CdxServerDedup(object): """Query a CDX server to perform deduplication. 
""" logger = logging.getLogger("warcprox.dedup.CdxServerDedup") - http_pool = urllib3.PoolManager() def __init__(self, cdx_url="https://web.archive.org/cdx/search", - options=warcprox.Options()): + maxsize=200, options=warcprox.Options()): self.cdx_url = cdx_url self.options = options + self.http_pool = urllib3.PoolManager(maxsize=maxsize) def start(self): pass diff --git a/warcprox/main.py b/warcprox/main.py index 348dfbf..457ddb2 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -222,7 +222,9 @@ def init_controller(args): elif args.rethinkdb_trough_db_url: dedup_db = warcprox.dedup.TroughDedupDb(options) elif args.cdxserver_dedup: - dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup) + cdxserver_maxsize = args.writer_threads or 200 + dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup, + maxsize=cdxserver_maxsize) elif args.dedup_db_file in (None, '', '/dev/null'): logging.info('deduplication disabled') dedup_db = None From 4a165e5f779275793ab720ba000d1463777556aa Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 15 Jan 2018 20:58:36 +0000 Subject: [PATCH 2/2] Update CdxServerDedup unit test To work correctly with the new way we init the ``CdxServerDedup.http_pool``. Use ``mock.MagicMock`` instead of ``mock.patch``. The unit test logic remains entirely the same. --- tests/test_dedup.py | 76 ++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 124efb5..eea3ccd 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -4,43 +4,43 @@ from warcprox.dedup import CdxServerDedup def test_cdx_dedup(): # Mock CDX Server responses to simulate found, not found and errors. 
- with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request: - url = "http://example.com" - # not found case - result = mock.Mock() - result.status = 200 - result.data = b'20170101020405 test' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None + url = "http://example.com" + # not found case + result = mock.Mock() + result.status = 200 + result.data = b'20170101020405 test' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None - # found case - result = mock.Mock() - result.status = 200 - result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res["date"] == b"2017-02-03T04:05:03Z" + # found case + result = mock.Mock() + result.status = 200 + result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res["date"] == b"2017-02-03T04:05:03Z" - # invalid CDX result status code - result = mock.Mock() - result.status = 400 - result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' - request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None - # invalid CDX result content - result = mock.Mock() - result.status = 200 - result.data = b'InvalidExceptionResult' - 
request.return_value = result - cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url=url) - assert res is None + # invalid CDX result status code + result = mock.Mock() + result.status = 400 + result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None + + # invalid CDX result content + result = mock.Mock() + result.status = 200 + result.data = b'InvalidExceptionResult' + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + cdx_server.http_pool.request = mock.MagicMock(return_value=result) + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + url=url) + assert res is None