Improve CdxServerDedup implementation

Replace ``_split_timestamp`` with ``datetime.strptime`` in
``warcprox.dedup``.

Remove the ``isinstance()`` checks and add an optional ``recorded_url``
parameter to the rest of the dedup ``lookup`` methods.

Make the ``--cdxserver-dedup`` option's help text more explanatory.
This commit is contained in:
Vangelis Banos 2017-10-20 20:00:02 +00:00
parent bc3d0cb4f6
commit 202d664f39
2 changed files with 8 additions and 29 deletions

View File

@ -77,7 +77,7 @@ class DedupDb(object):
conn.close()
self.logger.debug('dedup db saved %s:%s', key, json_value)
def lookup(self, digest_key, bucket=""):
def lookup(self, digest_key, bucket="", recorded_url=None):
result = None
key = digest_key.decode('utf-8') + '|' + bucket
conn = sqlite3.connect(self.file)
@ -112,16 +112,10 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
and recorded_url.response_recorder.payload_size() > 0):
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
if isinstance(dedup_db, CdxServerDedup):
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
recorded_url)
else:
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"])
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"],
recorded_url)
else:
if isinstance(dedup_db, CdxServerDedup):
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url)
else:
recorded_url.dedup_info = dedup_db.lookup(digest_key)
recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url=recorded_url)
class RethinkDedupDb:
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
@ -166,7 +160,7 @@ class RethinkDedupDb:
raise Exception("unexpected result %s saving %s", result, record)
self.logger.debug('dedup db saved %s:%s', k, record)
def lookup(self, digest_key, bucket=""):
def lookup(self, digest_key, bucket="", recorded_url=None):
k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
k = "{}|{}".format(k, bucket)
result = self.rr.table(self.table).get(k).run()
@ -187,22 +181,6 @@ class RethinkDedupDb:
self.save(digest_key, records[0])
def _split_timestamp(timestamp):
"""split `timestamp` into a tuple of 6 integers.
:param timestamp: full-length timestamp.
:type timestamp: bytes
"""
return (
int(timestamp[:-10]),
int(timestamp[-10:-8]),
int(timestamp[-8:-6]),
int(timestamp[-6:-4]),
int(timestamp[-4:-2]),
int(timestamp[-2:])
)
class CdxServerDedup(object):
"""Query a CDX server to perform deduplication.
"""
@ -248,7 +226,8 @@ class CdxServerDedup(object):
if line:
(cdx_ts, cdx_digest) = line.split(b' ')
if cdx_digest == dkey:
dt = datetime(*_split_timestamp(cdx_ts.decode('ascii')))
dt = datetime.strptime(cdx_ts.decode('ascii'),
'%Y%m%d%H%M%S')
date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')
return dict(url=url, date=date)
except (HTTPError, AssertionError, ValueError) as exc:

View File

@ -107,7 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file',
default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication')
group.add_argument('--cdxserver-dedup', dest='cdxserver_dedup',
help='use a CDX Server for deduplication')
help='use a CDX Server URL for deduplication; e.g. https://web.archive.org/cdx/search')
group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers',
help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',