From 960dda4c319816cf9733367255e313e67c512e45 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 19 Oct 2017 22:11:22 +0000 Subject: [PATCH] Add CdxServerDedup unit tests and improve its exception handling Add multiple ``CdxServerDedup`` unit tests to simulate found, not found and invalid responses from the CDX server. Use a different file ``tests/test_dedup.py`` because we test the CdxServerDedup component individually and it belongs to the ``warcprox.dedup`` package. Add ``mock`` package to dev requirements. Rework the warcprox.dedup.CdxServerDedup class to have better exception handling. --- setup.py | 2 +- tests/test_dedup.py | 54 ++++++++++++++++++++++++++++++++++++++++----- warcprox/dedup.py | 26 +++++++++++++--------- 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index b9308e2..228ece7 100755 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ setuptools.setup( license='GPL', packages=['warcprox'], install_requires=deps, - tests_require=['requests>=2.0.1', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 + tests_require=['requests>=2.0.1', 'mock', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636 cmdclass = {'test': PyTest}, test_suite='warcprox.tests', entry_points={ diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 7836d27..e1b7482 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -1,10 +1,52 @@ +import mock import pytest from warcprox.dedup import CdxServerDedup -def test_cdx(): - # TODO add mocking of CDX Server response - # TODO check found and not found cases - cdx_server = CdxServerDedup(cdx_url="https://web.archive.org/cdx/search/cdx") - res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - url="http://example.com") +def test_cdx_dedup(): + # Mock CDX Server responses to simulate found, not found and errors. 
+ with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request: + recorded_url = mock.Mock(); + recorded_url.url = "http://example.com" + # not found case + result = mock.Mock() + result.status = 200 + result.data = b'20170101020405 test' + request.return_value = result + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + recorded_url=recorded_url) + assert res is None + + # found in the 2nd CDX line + result = mock.Mock() + result.status = 200 + result.data = b"""\ +20170101020304 xxx +20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A +20160505050505 yyyyyyyyyyyyyyyyyyyyyy""" + request.return_value = result + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + recorded_url=recorded_url) + assert res["url"] == "http://example.com" + assert res["date"] == "2017-02-03T04:05:03Z" + + # invalid CDX result status code + result = mock.Mock() + result.status = 400 + result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' + request.return_value = result + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + recorded_url=recorded_url) + assert res is None + # invalid CDX result content + result = mock.Mock() + result.status = 200 + result.data = b'InvalidExceptionResult' + request.return_value = result + cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") + res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", + recorded_url=recorded_url) + assert res is None diff --git a/warcprox/dedup.py b/warcprox/dedup.py index a3c89f7..8aa9c16 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -29,6 +29,7 @@ from hanzo import warctools import warcprox import sqlite3 import urllib3 +from urllib3.exceptions import HTTPError urllib3.disable_warnings() @@ -206,10 +207,10 @@ class 
CdxServerDedup(object): """Query a CDX server to perform deduplication. """ logger = logging.getLogger("warcprox.dedup.CdxServerDedup") + http_pool = urllib3.PoolManager() def __init__(self, cdx_url="https://web.archive.org/cdx/search/cdx", options=warcprox.Options()): - self.http_pool = urllib3.PoolManager() self.cdx_url = cdx_url self.options = options @@ -226,30 +227,33 @@ class CdxServerDedup(object): computed on the original content, after decoding Content-Encoding and Transfer-Encoding, if any), if they match, write a revisit record. - :param digest_key: b'sha1:<KEY-VALUE>'. + :param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional). Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' :param recorded_url: RecordedUrl object Result must contain: - {"url", "date": "%Y-%m-%dT%H:%M:%SZ", "id": "warc_id" if available} + {"url": <URL>, "date": "%Y-%m-%dT%H:%M:%SZ"} """ url = recorded_url.url u = url.decode("utf-8") if isinstance(url, bytes) else url try: result = self.http_pool.request('GET', self.cdx_url, fields=dict( url=u, fl="timestamp,digest", limit=-1)) - except urllib3.HTTPError as exc: - self.logger.error('CdxServerDedup request failed for url=%s %s', - url, exc) - if result.status == 200: - digest_key = digest_key[5:] # drop sha1: prefix + assert result.status == 200 + if isinstance(digest_key, bytes): + dkey = digest_key + else: + dkey = digest_key.encode('utf-8') + dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey for line in result.data.split(b'\n'): if line: (cdx_ts, cdx_digest) = line.split(b' ') - if cdx_digest == digest_key: + if cdx_digest == dkey: dt = datetime(*_split_timestamp(cdx_ts.decode('ascii'))) - # TODO find out id - return dict(id=url, url=url, + return dict(url=url, date=dt.strftime('%Y-%m-%dT%H:%M:%SZ')) + except (HTTPError, AssertionError, ValueError) as exc: + self.logger.error('CdxServerDedup request failed for url=%s %s', + url, exc) return None def notify(self, recorded_url, records):