mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Add CdxServerDedup unit tests and improve its exception handling
Add several ``CdxServerDedup`` unit tests that simulate found, not-found, and invalid responses from the CDX server. Put them in a separate file, ``tests/test_dedup.py``, because they test the ``CdxServerDedup`` component in isolation and it belongs to the ``warcprox.dedup`` package. Add the ``mock`` package to the test requirements (``tests_require`` in ``setup.py``). Rework the ``warcprox.dedup.CdxServerDedup`` class to have better exception handling.
This commit is contained in:
parent
fc5f39ffed
commit
960dda4c31
2
setup.py
2
setup.py
@ -60,7 +60,7 @@ setuptools.setup(
|
||||
license='GPL',
|
||||
packages=['warcprox'],
|
||||
install_requires=deps,
|
||||
tests_require=['requests>=2.0.1', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
|
||||
tests_require=['requests>=2.0.1', 'mock', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
|
||||
cmdclass = {'test': PyTest},
|
||||
test_suite='warcprox.tests',
|
||||
entry_points={
|
||||
|
@ -1,10 +1,52 @@
|
||||
import mock
|
||||
import pytest
|
||||
from warcprox.dedup import CdxServerDedup
|
||||
|
||||
|
||||
def test_cdx():
|
||||
# TODO add mocking of CDX Server response
|
||||
# TODO check found and not found cases
|
||||
cdx_server = CdxServerDedup(cdx_url="https://web.archive.org/cdx/search/cdx")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
url="http://example.com")
|
||||
def test_cdx_dedup():
|
||||
# Mock CDX Server responses to simulate found, not found and errors.
|
||||
with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
|
||||
recorded_url = mock.Mock();
|
||||
recorded_url.url = "http://example.com"
|
||||
# not found case
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'20170101020405 test'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
recorded_url=recorded_url)
|
||||
assert res is None
|
||||
|
||||
# found in the 2nd CDX line
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b"""\
|
||||
20170101020304 xxx
|
||||
20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
|
||||
20160505050505 yyyyyyyyyyyyyyyyyyyyyy"""
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
recorded_url=recorded_url)
|
||||
assert res["url"] == "http://example.com"
|
||||
assert res["date"] == "2017-02-03T04:05:03Z"
|
||||
|
||||
# invalid CDX result status code
|
||||
result = mock.Mock()
|
||||
result.status = 400
|
||||
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
recorded_url=recorded_url)
|
||||
assert res is None
|
||||
# invalid CDX result content
|
||||
result = mock.Mock()
|
||||
result.status = 200
|
||||
result.data = b'InvalidExceptionResult'
|
||||
request.return_value = result
|
||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||
recorded_url=recorded_url)
|
||||
assert res is None
|
||||
|
@ -29,6 +29,7 @@ from hanzo import warctools
|
||||
import warcprox
|
||||
import sqlite3
|
||||
import urllib3
|
||||
from urllib3.exceptions import HTTPError
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
@ -206,10 +207,10 @@ class CdxServerDedup(object):
|
||||
"""Query a CDX server to perform deduplication.
|
||||
"""
|
||||
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
||||
http_pool = urllib3.PoolManager()
|
||||
|
||||
def __init__(self, cdx_url="https://web.archive.org/cdx/search/cdx",
|
||||
options=warcprox.Options()):
|
||||
self.http_pool = urllib3.PoolManager()
|
||||
self.cdx_url = cdx_url
|
||||
self.options = options
|
||||
|
||||
@ -226,30 +227,33 @@ class CdxServerDedup(object):
|
||||
computed on the original content, after decoding Content-Encoding and
|
||||
Transfer-Encoding, if any), if they match, write a revisit record.
|
||||
|
||||
:param digest_key: b'sha1:<KEY-VALUE>'.
|
||||
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
|
||||
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||
:param recorded_url: RecordedUrl object
|
||||
Result must contain:
|
||||
{"url", "date": "%Y-%m-%dT%H:%M:%SZ", "id": "warc_id" if available}
|
||||
{"url": <URL>, "date": "%Y-%m-%dT%H:%M:%SZ"}
|
||||
"""
|
||||
url = recorded_url.url
|
||||
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
||||
try:
|
||||
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
||||
url=u, fl="timestamp,digest", limit=-1))
|
||||
except urllib3.HTTPError as exc:
|
||||
self.logger.error('CdxServerDedup request failed for url=%s %s',
|
||||
url, exc)
|
||||
if result.status == 200:
|
||||
digest_key = digest_key[5:] # drop sha1: prefix
|
||||
assert result.status == 200
|
||||
if isinstance(digest_key, bytes):
|
||||
dkey = digest_key
|
||||
else:
|
||||
dkey = digest_key.encode('utf-8')
|
||||
dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey
|
||||
for line in result.data.split(b'\n'):
|
||||
if line:
|
||||
(cdx_ts, cdx_digest) = line.split(b' ')
|
||||
if cdx_digest == digest_key:
|
||||
if cdx_digest == dkey:
|
||||
dt = datetime(*_split_timestamp(cdx_ts.decode('ascii')))
|
||||
# TODO find out id
|
||||
return dict(id=url, url=url,
|
||||
return dict(url=url,
|
||||
date=dt.strftime('%Y-%m-%dT%H:%M:%SZ'))
|
||||
except (HTTPError, AssertionError, ValueError) as exc:
|
||||
self.logger.error('CdxServerDedup request failed for url=%s %s',
|
||||
url, exc)
|
||||
return None
|
||||
|
||||
def notify(self, recorded_url, records):
|
||||
|
Loading…
x
Reference in New Issue
Block a user