diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py index 4feeb7d..df4ec1a 100644 --- a/warcprox/bigtable.py +++ b/warcprox/bigtable.py @@ -10,6 +10,7 @@ import os import hashlib import threading import datetime +import rethinkstuff class RethinkCaptures: """Inserts in batches every 0.5 seconds""" @@ -97,8 +98,7 @@ class RethinkCaptures: "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), "abbr_canon_surt": canon_surt[:150], "canon_surt": canon_surt, - # "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")), - "timestamp": records[0].date.decode("utf-8"), + "timestamp": recorded_url.timestamp.replace(tzinfo=rethinkstuff.UTC), "url": recorded_url.url.decode("utf-8"), "offset": records[0].offset, "filename": os.path.basename(records[0].warc_filename), @@ -140,7 +140,11 @@ class RethinkCapturesDedup: raw_digest = base64.b16decode(value_str, casefold=True) entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket) if entry: - dedup_info = {"url":entry["url"].encode("utf-8"), "date":entry["timestamp"].encode("utf-8"), "id":entry["warc_id"].encode("utf-8")} + dedup_info = { + "url": entry["url"].encode("utf-8"), + "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"), + "id": entry["warc_id"].encode("utf-8") + } return dedup_info else: return None diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 17735ed..7ac99d8 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -64,8 +64,8 @@ class DedupDb(object): def notify(self, recorded_url, records): if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, - self.options.base32) + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) else: @@ -73,8 +73,8 @@ class DedupDb(object): def decorate_with_dedup_info(dedup_db, recorded_url, base32=False): - if (recorded_url.response_recorder - and recorded_url.response_recorder.payload_digest + if (recorded_url.response_recorder + and recorded_url.response_recorder.payload_digest and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: @@ -100,7 +100,7 @@ class RethinkDedupDb: self.r.db_create(self.r.dbname).run() tables = self.r.table_list().run() if not self.table in tables: - self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", + self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", repr(self.table), repr(self.r.dbname), self.shards, self.replicas) self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run() @@ -135,7 +135,7 @@ class RethinkDedupDb: def notify(self, recorded_url, records): if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE and recorded_url.response_recorder.payload_size() > 0): - digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, + digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, self.options.base32) if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta: self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"]) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 25beff8..3f1642c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,7 +81,7 @@ class WarcWriterThread(threading.Thread): payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8") except: payload_digest = "-" - + # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format( recorded_url.client_ip, recorded_url.status, recorded_url.method,