mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Update CdxServerDedup lookup algorithm
Get only one item from CDX (``limit=-1``). Update unit tests.
This commit is contained in:
parent
4fb44a7e9d
commit
f6b1d6f408
@ -16,13 +16,10 @@ def test_cdx_dedup():
|
|||||||
url=url)
|
url=url)
|
||||||
assert res is None
|
assert res is None
|
||||||
|
|
||||||
# found in the 2nd CDX line
|
# found case
|
||||||
result = mock.Mock()
|
result = mock.Mock()
|
||||||
result.status = 200
|
result.status = 200
|
||||||
result.data = b"""\
|
result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||||
20170101020304 xxx
|
|
||||||
20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A
|
|
||||||
20160505050505 yyyyyyyyyyyyyyyyyyyyyy"""
|
|
||||||
request.return_value = result
|
request.return_value = result
|
||||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
|
@ -216,21 +216,21 @@ class CdxServerDedup(object):
|
|||||||
try:
|
try:
|
||||||
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
||||||
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
|
url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit",
|
||||||
limit=-10))
|
limit=-1))
|
||||||
assert result.status == 200
|
assert result.status == 200
|
||||||
if isinstance(digest_key, bytes):
|
if isinstance(digest_key, bytes):
|
||||||
dkey = digest_key
|
dkey = digest_key
|
||||||
else:
|
else:
|
||||||
dkey = digest_key.encode('utf-8')
|
dkey = digest_key.encode('utf-8')
|
||||||
dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey
|
dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey
|
||||||
for line in result.data.split(b'\n'):
|
line = result.data.split(b'\n')
|
||||||
if line:
|
if line:
|
||||||
(cdx_ts, cdx_digest) = line.split(b' ')
|
(cdx_ts, cdx_digest) = line[0].split(b' ')
|
||||||
if cdx_digest == dkey:
|
if cdx_digest == dkey:
|
||||||
dt = datetime.strptime(cdx_ts.decode('ascii'),
|
dt = datetime.strptime(cdx_ts.decode('ascii'),
|
||||||
'%Y%m%d%H%M%S')
|
'%Y%m%d%H%M%S')
|
||||||
date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
|
date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
|
||||||
return dict(url=url, date=date)
|
return dict(url=url, date=date)
|
||||||
except (HTTPError, AssertionError, ValueError) as exc:
|
except (HTTPError, AssertionError, ValueError) as exc:
|
||||||
self.logger.error('CdxServerDedup request failed for url=%s %s',
|
self.logger.error('CdxServerDedup request failed for url=%s %s',
|
||||||
url, exc)
|
url, exc)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user