From 202d664f3906716f15b52833a43a0e0c5eae9226 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 20 Oct 2017 20:00:02 +0000 Subject: [PATCH] Improve CdxServerDedup implementation Replace ``_split_timestamp`` with ``datetime.strptime`` in ``warcprox.dedup``. Remove ``isinstance()`` and add optional ``recorded_url`` in the rest of the dedup ``lookup`` methods. Make `--cdxserver-dedup` option help more explanatory. --- warcprox/dedup.py | 35 +++++++---------------------------- warcprox/main.py | 2 +- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 6258860..41b9249 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -77,7 +77,7 @@ class DedupDb(object): conn.close() self.logger.debug('dedup db saved %s:%s', key, json_value) - def lookup(self, digest_key, bucket=""): + def lookup(self, digest_key, bucket="", recorded_url=None): result = None key = digest_key.decode('utf-8') + '|' + bucket conn = sqlite3.connect(self.file) @@ -112,16 +112,10 @@ def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: - if isinstance(dedup_db, CdxServerDedup): - recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"], - recorded_url) - else: - recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"]) + recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url.warcprox_meta["captures-bucket"], + recorded_url) else: - if isinstance(dedup_db, CdxServerDedup): - recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url) - else: - recorded_url.dedup_info = dedup_db.lookup(digest_key) + recorded_url.dedup_info = dedup_db.lookup(digest_key, recorded_url=recorded_url) class RethinkDedupDb: 
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb") @@ -166,7 +160,7 @@ class RethinkDedupDb: raise Exception("unexpected result %s saving %s", result, record) self.logger.debug('dedup db saved %s:%s', k, record) - def lookup(self, digest_key, bucket=""): + def lookup(self, digest_key, bucket="", recorded_url=None): k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key k = "{}|{}".format(k, bucket) result = self.rr.table(self.table).get(k).run() @@ -187,22 +181,6 @@ class RethinkDedupDb: self.save(digest_key, records[0]) -def _split_timestamp(timestamp): - """split `timestamp` into a tuple of 6 integers. - - :param timestamp: full-length timestamp. - :type timestamp: bytes - """ - return ( - int(timestamp[:-10]), - int(timestamp[-10:-8]), - int(timestamp[-8:-6]), - int(timestamp[-6:-4]), - int(timestamp[-4:-2]), - int(timestamp[-2:]) - ) - - class CdxServerDedup(object): """Query a CDX server to perform deduplication. """ @@ -248,7 +226,8 @@ class CdxServerDedup(object): if line: (cdx_ts, cdx_digest) = line.split(b' ') if cdx_digest == dkey: - dt = datetime(*_split_timestamp(cdx_ts.decode('ascii'))) + dt = datetime.strptime(cdx_ts.decode('ascii'), + '%Y%m%d%H%M%S') date = dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8') return dict(url=url, date=date) except (HTTPError, AssertionError, ValueError) as exc: diff --git a/warcprox/main.py b/warcprox/main.py index 2d0414b..76e194a 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -107,7 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication') group.add_argument('--cdxserver-dedup', dest='cdxserver_dedup', - help='use a CDX Server for deduplication') + help='use a CDX Server URL for deduplication; e.g. 
https://web.archive.org/cdx/search') group.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', help='rethinkdb servers, used for dedup and stats if specified; e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default='warcprox',