diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 5a0ca3b..591337e 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -1,13 +1,11 @@ import mock -import pytest from warcprox.dedup import CdxServerDedup def test_cdx_dedup(): # Mock CDX Server responses to simulate found, not found and errors. with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request: - recorded_url = mock.Mock(); - recorded_url.url = "http://example.com" + url = "http://example.com" # not found case result = mock.Mock() result.status = 200 @@ -15,7 +13,7 @@ def test_cdx_dedup(): request.return_value = result cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - recorded_url=recorded_url) + url=url) assert res is None # found in the 2nd CDX line @@ -28,7 +26,7 @@ def test_cdx_dedup(): request.return_value = result cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - recorded_url=recorded_url) + url=url) assert res["date"] == b"2017-02-03T04:05:03Z" # invalid CDX result status code @@ -38,7 +36,7 @@ def test_cdx_dedup(): request.return_value = result cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - recorded_url=recorded_url) + url=url) assert res is None # invalid CDX result content result = mock.Mock() @@ -47,5 +45,5 @@ def test_cdx_dedup(): request.return_value = result cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", - recorded_url=recorded_url) + url=url) assert res is None diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 53b27c9..1513946 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -77,7 +77,7 @@ class DedupDb(object): conn.close() self.logger.debug('dedup db saved %s:%s', key, json_value) - def lookup(self, digest_key, bucket="", recorded_url=None): + def lookup(self, digest_key, bucket="", url=None): result = None key = digest_key.decode('utf-8') + '|' + bucket conn = sqlite3.connect(self.file) @@ -113,9 +113,10 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"], - recorded_url) + recorded_url.url) else: - recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url=recorded_url) + recorded_url.dedup_info = dedup_db.lookup(digest_key, + url=recorded_url.url) class RethinkDedupDb: logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") @@ -160,7 +161,7 @@ class RethinkDedupDb: raise Exception("unexpected result %s saving %s", result, record) self.logger.debug('dedup db saved %s:%s', k, record) - def lookup(self, digest_key, bucket="", recorded_url=None): + def lookup(self, digest_key, bucket="", url=None): k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key k = "{}|{}".format(k, bucket) result = self.rr.table(self.table).get(k).run() @@ -200,18 +201,17 @@ class CdxServerDedup(object): """ pass - def lookup(self, digest_key, recorded_url): + def lookup(self, digest_key, url): """Compare `sha1` with SHA1 hash of fetched content (note SHA1 must be computed on the original content, after decoding Content-Encoding and Transfer-Encoding, if any), if they match, write a revisit record. :param digest_key: b'sha1:' (prefix is optional). Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' - :param recorded_url: RecordedUrl object + :param url: Target URL string Result must contain: {"url": , "date": "%Y-%m-%dT%H:%M:%SZ"} """ - url = recorded_url.url u = url.decode("utf-8") if isinstance(url, bytes) else url try: result = self.http_pool.request('GET', self.cdx_url, fields=dict(