mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Add CdxServerDedup unit tests and improve its exception handling
Add several ``CdxServerDedup`` unit tests that simulate found, not-found, and invalid responses from the CDX server. Put them in a separate file, ``tests/test_dedup.py``, because they test the ``CdxServerDedup`` component in isolation and it belongs to the ``warcprox.dedup`` package. Add the ``mock`` package to the dev requirements. Rework the ``warcprox.dedup.CdxServerDedup`` class to have better exception handling.
This commit is contained in:
parent
fc5f39ffed
commit
960dda4c31
2
setup.py
2
setup.py
@ -60,7 +60,7 @@ setuptools.setup(
|
|||||||
license='GPL',
|
license='GPL',
|
||||||
packages=['warcprox'],
|
packages=['warcprox'],
|
||||||
install_requires=deps,
|
install_requires=deps,
|
||||||
tests_require=['requests>=2.0.1', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
|
tests_require=['requests>=2.0.1', 'mock', 'pytest', 'warcio'], # >=2.0.1 for https://github.com/kennethreitz/requests/pull/1636
|
||||||
cmdclass = {'test': PyTest},
|
cmdclass = {'test': PyTest},
|
||||||
test_suite='warcprox.tests',
|
test_suite='warcprox.tests',
|
||||||
entry_points={
|
entry_points={
|
||||||
|
@ -1,10 +1,52 @@
|
|||||||
|
import mock
|
||||||
import pytest
|
import pytest
|
||||||
from warcprox.dedup import CdxServerDedup
|
from warcprox.dedup import CdxServerDedup
|
||||||
|
|
||||||
|
|
||||||
def test_cdx():
|
def test_cdx_dedup():
|
||||||
# TODO add mocking of CDX Server response
|
# Mock CDX Server responses to simulate found, not found and errors.
|
||||||
# TODO check found and not found cases
|
with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
|
||||||
cdx_server = CdxServerDedup(cdx_url="https://web.archive.org/cdx/search/cdx")
|
recorded_url = mock.Mock();
|
||||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
recorded_url.url = "http://example.com"
|
||||||
url="http://example.com")
|
# not found case
|
||||||
|
result = mock.Mock()
|
||||||
|
result.status = 200
|
||||||
|
result.data = b'20170101020405 test'
|
||||||
|
request.return_value = result
|
||||||
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
|
recorded_url=recorded_url)
|
||||||
|
assert res is None
|
||||||
|
|
||||||
|
# found in the 2nd CDX line
|
||||||
|
result = mock.Mock()
|
||||||
|
result.status = 200
|
||||||
|
result.data = b"""\
|
||||||
|
20170101020304 xxx
|
||||||
|
20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
|
||||||
|
20160505050505 yyyyyyyyyyyyyyyyyyyyyy"""
|
||||||
|
request.return_value = result
|
||||||
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
|
recorded_url=recorded_url)
|
||||||
|
assert res["url"] == "http://example.com"
|
||||||
|
assert res["date"] == "2017-02-03T04:05:03Z"
|
||||||
|
|
||||||
|
# invalid CDX result status code
|
||||||
|
result = mock.Mock()
|
||||||
|
result.status = 400
|
||||||
|
result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||||
|
request.return_value = result
|
||||||
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
|
recorded_url=recorded_url)
|
||||||
|
assert res is None
|
||||||
|
# invalid CDX result content
|
||||||
|
result = mock.Mock()
|
||||||
|
result.status = 200
|
||||||
|
result.data = b'InvalidExceptionResult'
|
||||||
|
request.return_value = result
|
||||||
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
|
recorded_url=recorded_url)
|
||||||
|
assert res is None
|
||||||
|
@ -29,6 +29,7 @@ from hanzo import warctools
|
|||||||
import warcprox
|
import warcprox
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import urllib3
|
import urllib3
|
||||||
|
from urllib3.exceptions import HTTPError
|
||||||
|
|
||||||
urllib3.disable_warnings()
|
urllib3.disable_warnings()
|
||||||
|
|
||||||
@ -206,10 +207,10 @@ class CdxServerDedup(object):
|
|||||||
"""Query a CDX server to perform deduplication.
|
"""Query a CDX server to perform deduplication.
|
||||||
"""
|
"""
|
||||||
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
logger = logging.getLogger("warcprox.dedup.CdxServerDedup")
|
||||||
|
http_pool = urllib3.PoolManager()
|
||||||
|
|
||||||
def __init__(self, cdx_url="https://web.archive.org/cdx/search/cdx",
|
def __init__(self, cdx_url="https://web.archive.org/cdx/search/cdx",
|
||||||
options=warcprox.Options()):
|
options=warcprox.Options()):
|
||||||
self.http_pool = urllib3.PoolManager()
|
|
||||||
self.cdx_url = cdx_url
|
self.cdx_url = cdx_url
|
||||||
self.options = options
|
self.options = options
|
||||||
|
|
||||||
@ -226,30 +227,33 @@ class CdxServerDedup(object):
|
|||||||
computed on the original content, after decoding Content-Encoding and
|
computed on the original content, after decoding Content-Encoding and
|
||||||
Transfer-Encoding, if any), if they match, write a revisit record.
|
Transfer-Encoding, if any), if they match, write a revisit record.
|
||||||
|
|
||||||
:param digest_key: b'sha1:<KEY-VALUE>'.
|
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
|
||||||
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||||
:param recorded_url: RecordedUrl object
|
:param recorded_url: RecordedUrl object
|
||||||
Result must contain:
|
Result must contain:
|
||||||
{"url", "date": "%Y-%m-%dT%H:%M:%SZ", "id": "warc_id" if available}
|
{"url": <URL>, "date": "%Y-%m-%dT%H:%M:%SZ"}
|
||||||
"""
|
"""
|
||||||
url = recorded_url.url
|
url = recorded_url.url
|
||||||
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
||||||
try:
|
try:
|
||||||
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
||||||
url=u, fl="timestamp,digest", limit=-1))
|
url=u, fl="timestamp,digest", limit=-1))
|
||||||
except urllib3.HTTPError as exc:
|
assert result.status == 200
|
||||||
self.logger.error('CdxServerDedup request failed for url=%s %s',
|
if isinstance(digest_key, bytes):
|
||||||
url, exc)
|
dkey = digest_key
|
||||||
if result.status == 200:
|
else:
|
||||||
digest_key = digest_key[5:] # drop sha1: prefix
|
dkey = digest_key.encode('utf-8')
|
||||||
|
dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey
|
||||||
for line in result.data.split(b'\n'):
|
for line in result.data.split(b'\n'):
|
||||||
if line:
|
if line:
|
||||||
(cdx_ts, cdx_digest) = line.split(b' ')
|
(cdx_ts, cdx_digest) = line.split(b' ')
|
||||||
if cdx_digest == digest_key:
|
if cdx_digest == dkey:
|
||||||
dt = datetime(*_split_timestamp(cdx_ts.decode('ascii')))
|
dt = datetime(*_split_timestamp(cdx_ts.decode('ascii')))
|
||||||
# TODO find out id
|
return dict(url=url,
|
||||||
return dict(id=url, url=url,
|
|
||||||
date=dt.strftime('%Y-%m-%dT%H:%M:%SZ'))
|
date=dt.strftime('%Y-%m-%dT%H:%M:%SZ'))
|
||||||
|
except (HTTPError, AssertionError, ValueError) as exc:
|
||||||
|
self.logger.error('CdxServerDedup request failed for url=%s %s',
|
||||||
|
url, exc)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def notify(self, recorded_url, records):
|
def notify(self, recorded_url, records):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user