Update CdxServerDedup lookup algorithm

Get only one item from CDX (``limit=-1`` instead of ``limit=-10``) and compare the requested digest against that single returned line only.

Update the unit tests accordingly: the mocked CDX response now contains a single line.
This commit is contained in:
Vangelis Banos 2017-10-21 20:45:46 +00:00
parent 4fb44a7e9d
commit f6b1d6f408
2 changed files with 11 additions and 14 deletions

View File

@ -16,13 +16,10 @@ def test_cdx_dedup():
url=url)
assert res is None
# found in the 2nd CDX line
# found case
result = mock.Mock()
result.status = 200
result.data = b"""\
20170101020304 xxx
20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
20160505050505 yyyyyyyyyyyyyyyyyyyyyy"""
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
request.return_value = result
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",

View File

@ -216,21 +216,21 @@ class CdxServerDedup(object):
try:
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
limit=-10))
limit=-1))
assert result.status == 200
if isinstance(digest_key, bytes):
dkey = digest_key
else:
dkey = digest_key.encode('utf-8')
dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey
for line in result.data.split(b'\n'):
if line:
(cdx_ts, cdx_digest) = line.split(b' ')
if cdx_digest == dkey:
dt = datetime.strptime(cdx_ts.decode('ascii'),
'%Y%m%d%H%M%S')
date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
return dict(url=url, date=date)
line = result.data.split(b'\n')
if line:
(cdx_ts, cdx_digest) = line[0].split(b' ')
if cdx_digest == dkey:
dt = datetime.strptime(cdx_ts.decode('ascii'),
'%Y%m%d%H%M%S')
date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
return dict(url=url, date=date)
except (HTTPError, AssertionError, ValueError) as exc:
self.logger.error('CdxServerDedup request failed for url=%s %s',
url, exc)