From f6b1d6f40879642c754a20be5504574667e7bf06 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sat, 21 Oct 2017 20:45:46 +0000 Subject: [PATCH] Update CdxServerDedup lookup algorithm Get only one item from CDX (``limit=-1``). Update unit tests --- tests/test_dedup.py | 7 ++----- warcprox/dedup.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 591337e..124efb5 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -16,13 +16,10 @@ def test_cdx_dedup(): url=url) assert res is None - # found in the 2nd CDX line + # found case result = mock.Mock() result.status = 200 - result.data = b"""\ -20170101020304 xxx -20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A -20160505050505 yyyyyyyyyyyyyyyyyyyyyy""" + result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' request.return_value = result cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 1513946..08bbf23 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -216,21 +216,21 @@ class CdxServerDedup(object): try: result = self.http_pool.request('GET', self.cdx_url, fields=dict( url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit", - limit=-10)) + limit=-1)) assert result.status == 200 if isinstance(digest_key, bytes): dkey = digest_key else: dkey = digest_key.encode('utf-8') dkey = dkey[5:] if dkey.startswith(b'sha1:') else dkey - for line in result.data.split(b'\n'): - if line: - (cdx_ts, cdx_digest) = line.split(b' ') - if cdx_digest == dkey: - dt = datetime.strptime(cdx_ts.decode('ascii'), - '%Y%m%d%H%M%S') - date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8') - return dict(url=url, date=date) + line = result.data.split(b'\n') + if line: + (cdx_ts, cdx_digest) = line[0].split(b' ') + if cdx_digest == dkey: + dt = datetime.strptime(cdx_ts.decode('ascii'), + '%Y%m%d%H%M%S') + date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8') + return dict(url=url, date=date) except (HTTPError, AssertionError, ValueError) as exc: self.logger.error('CdxServerDedup request failed for url=%s %s', url, exc)