mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Pass the URL string instead of the recorded_url object to dedup lookup methods
This commit is contained in:
parent
f77aef9110
commit
4fb44a7e9d
@ -1,13 +1,11 @@
|
|||||||
import mock
|
import mock
|
||||||
import pytest
|
|
||||||
from warcprox.dedup import CdxServerDedup
|
from warcprox.dedup import CdxServerDedup
|
||||||
|
|
||||||
|
|
||||||
def test_cdx_dedup():
|
def test_cdx_dedup():
|
||||||
# Mock CDX Server responses to simulate found, not found and errors.
|
# Mock CDX Server responses to simulate found, not found and errors.
|
||||||
with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
|
with mock.patch('warcprox.dedup.CdxServerDedup.http_pool.request') as request:
|
||||||
recorded_url = mock.Mock();
|
url = "http://example.com"
|
||||||
recorded_url.url = "http://example.com"
|
|
||||||
# not found case
|
# not found case
|
||||||
result = mock.Mock()
|
result = mock.Mock()
|
||||||
result.status = 200
|
result.status = 200
|
||||||
@ -15,7 +13,7 @@ def test_cdx_dedup():
|
|||||||
request.return_value = result
|
request.return_value = result
|
||||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
recorded_url=recorded_url)
|
url=url)
|
||||||
assert res is None
|
assert res is None
|
||||||
|
|
||||||
# found in the 2nd CDX line
|
# found in the 2nd CDX line
|
||||||
@ -28,7 +26,7 @@ def test_cdx_dedup():
|
|||||||
request.return_value = result
|
request.return_value = result
|
||||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
recorded_url=recorded_url)
|
url=url)
|
||||||
assert res["date"] == b"2017-02-03T04:05:03Z"
|
assert res["date"] == b"2017-02-03T04:05:03Z"
|
||||||
|
|
||||||
# invalid CDX result status code
|
# invalid CDX result status code
|
||||||
@ -38,7 +36,7 @@ def test_cdx_dedup():
|
|||||||
request.return_value = result
|
request.return_value = result
|
||||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
recorded_url=recorded_url)
|
url=url)
|
||||||
assert res is None
|
assert res is None
|
||||||
# invalid CDX result content
|
# invalid CDX result content
|
||||||
result = mock.Mock()
|
result = mock.Mock()
|
||||||
@ -47,5 +45,5 @@ def test_cdx_dedup():
|
|||||||
request.return_value = result
|
request.return_value = result
|
||||||
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url")
|
||||||
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A",
|
||||||
recorded_url=recorded_url)
|
url=url)
|
||||||
assert res is None
|
assert res is None
|
||||||
|
@ -77,7 +77,7 @@ class DedupDb(object):
|
|||||||
conn.close()
|
conn.close()
|
||||||
self.logger.debug('dedup db saved %s:%s', key, json_value)
|
self.logger.debug('dedup db saved %s:%s', key, json_value)
|
||||||
|
|
||||||
def lookup(self, digest_key, bucket="", recorded_url=None):
|
def lookup(self, digest_key, bucket="", url=None):
|
||||||
result = None
|
result = None
|
||||||
key = digest_key.decode('utf-8') + '|' + bucket
|
key = digest_key.decode('utf-8') + '|' + bucket
|
||||||
conn = sqlite3.connect(self.file)
|
conn = sqlite3.connect(self.file)
|
||||||
@ -113,9 +113,10 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
|||||||
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
|
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
|
||||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
||||||
recorded_url)
|
recorded_url.url)
|
||||||
else:
|
else:
|
||||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url=recorded_url)
|
recorded_url.dedup_info = dedup_db.lookup(digest_key,
|
||||||
|
url=recorded_url.url)
|
||||||
|
|
||||||
class RethinkDedupDb:
|
class RethinkDedupDb:
|
||||||
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
||||||
@ -160,7 +161,7 @@ class RethinkDedupDb:
|
|||||||
raise Exception("unexpected result %s saving %s", result, record)
|
raise Exception("unexpected result %s saving %s", result, record)
|
||||||
self.logger.debug('dedup db saved %s:%s', k, record)
|
self.logger.debug('dedup db saved %s:%s', k, record)
|
||||||
|
|
||||||
def lookup(self, digest_key, bucket="", recorded_url=None):
|
def lookup(self, digest_key, bucket="", url=None):
|
||||||
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
|
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
|
||||||
k = "{}|{}".format(k, bucket)
|
k = "{}|{}".format(k, bucket)
|
||||||
result = self.rr.table(self.table).get(k).run()
|
result = self.rr.table(self.table).get(k).run()
|
||||||
@ -200,18 +201,17 @@ class CdxServerDedup(object):
|
|||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def lookup(self, digest_key, recorded_url):
|
def lookup(self, digest_key, url):
|
||||||
"""Compare `sha1` with SHA1 hash of fetched content (note SHA1 must be
|
"""Compare `sha1` with SHA1 hash of fetched content (note SHA1 must be
|
||||||
computed on the original content, after decoding Content-Encoding and
|
computed on the original content, after decoding Content-Encoding and
|
||||||
Transfer-Encoding, if any), if they match, write a revisit record.
|
Transfer-Encoding, if any), if they match, write a revisit record.
|
||||||
|
|
||||||
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
|
:param digest_key: b'sha1:<KEY-VALUE>' (prefix is optional).
|
||||||
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
|
||||||
:param recorded_url: RecordedUrl object
|
:param url: Target URL string
|
||||||
Result must contain:
|
Result must contain:
|
||||||
{"url": <URL>, "date": "%Y-%m-%dT%H:%M:%SZ"}
|
{"url": <URL>, "date": "%Y-%m-%dT%H:%M:%SZ"}
|
||||||
"""
|
"""
|
||||||
url = recorded_url.url
|
|
||||||
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
u = url.decode("utf-8") if isinstance(url, bytes) else url
|
||||||
try:
|
try:
|
||||||
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
result = self.http_pool.request('GET', self.cdx_url, fields=dict(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user