mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Improve CdxServerDedup implementation
Replace ``_split_timestamp`` with ``datetime.strptime`` in ``warcprox.dedup``. Remove the ``isinstance()`` checks and add an optional ``recorded_url`` parameter to the rest of the dedup ``lookup`` methods. Make the ``--cdxserver-dedup`` option help more explanatory.
This commit is contained in:
parent
bc3d0cb4f6
commit
202d664f39
@ -77,7 +77,7 @@ class DedupDb(object):
|
||||
conn.close()
|
||||
self.logger.debug('dedup db saved %s:%s', key, json_value)
|
||||
|
||||
def lookup(self, digest_key, bucket=""):
|
||||
def lookup(self, digest_key, bucket="", recorded_url=None):
|
||||
result = None
|
||||
key = digest_key.decode('utf-8') + '|' + bucket
|
||||
conn = sqlite3.connect(self.file)
|
||||
@ -112,16 +112,10 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
||||
and recorded_url.response_recorder.payload_size() > 0):
|
||||
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
|
||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||
if isinstance(dedup_db, CdxServerDedup):
|
||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
||||
recorded_url)
|
||||
else:
|
||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"])
|
||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
||||
recorded_url)
|
||||
else:
|
||||
if isinstance(dedup_db, CdxServerDedup):
|
||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url)
|
||||
else:
|
||||
recorded_url.dedup_info = dedup_db.lookup(digest_key)
|
||||
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url=recorded_url)
|
||||
|
||||
class RethinkDedupDb:
|
||||
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
||||
@ -166,7 +160,7 @@ class RethinkDedupDb:
|
||||
raise Exception("unexpected result %s saving %s", result, record)
|
||||
self.logger.debug('dedup db saved %s:%s', k, record)
|
||||
|
||||
def lookup(self, digest_key, bucket=""):
|
||||
def lookup(self, digest_key, bucket="", recorded_url=None):
|
||||
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
|
||||
k = "{}|{}".format(k, bucket)
|
||||
result = self.rr.table(self.table).get(k).run()
|
||||
@ -187,22 +181,6 @@ class RethinkDedupDb:
|
||||
self.save(digest_key, records[0])
|
||||
|
||||
|
||||
def _split_timestamp(timestamp):
|
||||
"""split `timestamp` into a tuple of 6 integers.
|
||||
|
||||
:param timestamp: full-length timestamp.
|
||||
:type timestamp: bytes
|
||||
"""
|
||||
return (
|
||||
int(timestamp[:-10]),
|
||||
int(timestamp[-10:-8]),
|
||||
int(timestamp[-8:-6]),
|
||||
int(timestamp[-6:-4]),
|
||||
int(timestamp[-4:-2]),
|
||||
int(timestamp[-2:])
|
||||
)
|
||||
|
||||
|
||||
class CdxServerDedup(object):
|
||||
"""Query a CDX server to perform deduplication.
|
||||
"""
|
||||
@ -248,7 +226,8 @@ class CdxServerDedup(object):
|
||||
if line:
|
||||
(cdx_ts, cdx_digest) = line.split(b' ')
|
||||
if cdx_digest == dkey:
|
||||
dt = datetime(*_split_timestamp(cdx_ts.decode('ascii')))
|
||||
dt = datetime.strptime(cdx_ts.decode('ascii'),
|
||||
'%Y%m%d%H%M%S')
|
||||
date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
|
||||
return dict(url=url, date=date)
|
||||
except (HTTPError, AssertionError, ValueError) as exc:
|
||||
|
@ -107,7 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
|
||||
default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
|
||||
group.add_argument('--cdxserver-dedup', dest='cdxserver_dedup',
|
||||
help='use a CDX Server for deduplication')
|
||||
help='use a CDX Server URL for deduplication; e.g. https://web.archive.org/cdx/search')
|
||||
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
|
||||
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
|
||||
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
|
||||
|
Loading…
x
Reference in New Issue
Block a user