Add CdxServerDedup unit tests and improve its exception handling

Add multiple ``CdxServerDedup`` unit tests that simulate found, not-found, and
invalid responses from the CDX server. Put them in a separate file,
``tests/test_dedup.py``, because they test the ``CdxServerDedup`` component in
isolation and it belongs to the ``warcprox.dedup`` package.

Add ``mock`` package to dev requirements.

Rework the warcprox.dedup.CdxServerDedup class to have better exception
handling.
This commit is contained in:
Vangelis Banos 2017-10-19 22:11:22 +00:00
parent fc5f39ffed
commit 960dda4c31
3 changed files with 64 additions and 18 deletions

View File

@ -60,7 +60,7 @@ setuptools.setup(
license='GPL',
packages=['warcprox'],
install_requires=deps,
tests_require=['requests>=2.0.1', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
tests_require=['requests>=2.0.1', 'mock', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
cmdclass = {'test': PyTest},
test_suite='warcprox.tests',
entry_points={

View File

@ -1,10 +1,52 @@
import mock
import pytest
from warcprox.dedup import CdxServerDedup
def test_cdx():
    """Run one ``CdxServerDedup`` lookup for a fixed digest and URL.

    The value returned by ``lookup`` is not inspected; the test only
    checks that the call completes.
    """
    # TODO add mocking of CDX Server response
    # TODO check found and not found cases
    dedup = CdxServerDedup(cdx_url="https://web.archive.org/cdx/search/cdx")
    dedup.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
                 url="http://example.com")
def test_cdx_dedup():
    """Exercise ``CdxServerDedup.lookup`` against canned CDX server replies.

    ``CdxServerDedup.http_pool.request`` is patched, so ``lookup`` receives
    the mocked reply configured here instead of a real HTTP response. Four
    replies are simulated: digest absent, digest present on the second CDX
    line, a non-200 status, and a body that is not valid CDX.
    """
    digest = "B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A"
    patch_target = 'warcprox.dedup.CdxServerDedup.http_pool.request'
    with mock.patch(patch_target) as request:
        recorded_url = mock.Mock()
        recorded_url.url = "http://example.com"

        def lookup_with_reply(status, data):
            # Install a fake reply, then query a freshly built deduper so
            # every scenario starts from a new CdxServerDedup instance.
            reply = mock.Mock()
            reply.status = status
            reply.data = data
            request.return_value = reply
            dedup = CdxServerDedup(cdx_url="dummy-cdx-server-url")
            return dedup.lookup(digest_key=digest, recorded_url=recorded_url)

        # Digest is not listed in the CDX reply.
        assert lookup_with_reply(200, b'20170101020405 test') is None

        # Digest is listed on the second CDX line.
        cdx_body = b"""\
20170101020304 xxx
20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
20160505050505 yyyyyyyyyyyyyyyyyyyyyy"""
        found = lookup_with_reply(200, cdx_body)
        assert found["url"] == "http://example.com"
        assert found["date"] == "2017-02-03T04:05:03Z"

        # Non-200 status code, even though the body carries the digest.
        error_body = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
        assert lookup_with_reply(400, error_body) is None

        # Body that cannot be parsed as "timestamp digest" lines.
        assert lookup_with_reply(200, b'InvalidExceptionResult') is None

View File

@ -29,6 +29,7 @@ from hanzo import warctools
import warcprox
import sqlite3
import urllib3
from urllib3.exceptions import HTTPError
urllib3.disable_warnings()
@ -206,10 +207,10 @@ class CdxServerDedup(object):
"""Query a CDX server to perform deduplication.
"""
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
http_pool = urllib3.PoolManager()
def __init__(self, cdx_url="https://web.archive.org/cdx/search/cdx",
options=warcprox.Options()):
self.http_pool = urllib3.PoolManager()
self.cdx_url = cdx_url
self.options = options
@ -226,30 +227,33 @@ class CdxServerDedup(object):
computed on the original content, after decoding Content-Encoding and
Transfer-Encoding, if any), if they match, write a revisit record.
:param digest_key: b'sha1:<KEY-VALUE>'.
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
:param recorded_url: RecordedUrl object
Result must contain:
{"url", "date": "%Y-%m-%dT%H:%M:%SZ", "id": "warc_id" if available}
{"url": <URL>, "date": "%Y-%m-%dT%H:%M:%SZ"}
"""
url = recorded_url.url
u = url.decode("utf-8") if isinstance(url, bytes) else url
try:
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
url=u, fl="timestamp,digest", limit=-1))
except urllib3.HTTPError as exc:
self.logger.error('CdxServerDedup request failed for url=%s %s',
url, exc)
if result.status == 200:
digest_key = digest_key[5:] # drop sha1: prefix
assert result.status == 200
if isinstance(digest_key, bytes):
dkey = digest_key
else:
dkey = digest_key.encode('utf-8')
dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey
for line in result.data.split(b'\n'):
if line:
(cdx_ts, cdx_digest) = line.split(b' ')
if cdx_digest == digest_key:
if cdx_digest == dkey:
dt = datetime(*_split_timestamp(cdx_ts.decode('ascii')))
# TODO find out id
return dict(id=url, url=url,
return dict(url=url,
date=dt.strftime('%Y-%m-%dT%H:%M:%SZ'))
except (HTTPError, AssertionError, ValueError) as exc:
self.logger.error('CdxServerDedup request failed for url=%s %s',
url, exc)
return None
def notify(self, recorded_url, records):