From 927419645b221d15c9518b0c63ee462743ee0c98 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Wed, 18 Nov 2015 02:00:48 +0000
Subject: [PATCH] use rethinkdb native time type for captures table timestamp

---
 warcprox/bigtable.py     | 10 +++++++---
 warcprox/dedup.py        | 12 ++++++------
 warcprox/writerthread.py |  2 +-
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py
index 4feeb7d..df4ec1a 100644
--- a/warcprox/bigtable.py
+++ b/warcprox/bigtable.py
@@ -10,6 +10,7 @@ import os
 import hashlib
 import threading
 import datetime
+import rethinkstuff
 
 class RethinkCaptures:
     """Inserts in batches every 0.5 seconds"""
@@ -97,8 +98,7 @@ class RethinkCaptures:
             "id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
             "abbr_canon_surt": canon_surt[:150],
             "canon_surt": canon_surt,
-            # "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")),
-            "timestamp": records[0].date.decode("utf-8"),
+            "timestamp": recorded_url.timestamp.replace(tzinfo=rethinkstuff.UTC),
             "url": recorded_url.url.decode("utf-8"),
             "offset": records[0].offset,
             "filename": os.path.basename(records[0].warc_filename),
@@ -140,7 +140,11 @@ class RethinkCapturesDedup:
             raw_digest = base64.b16decode(value_str, casefold=True)
         entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket)
         if entry:
-            dedup_info = {"url":entry["url"].encode("utf-8"), "date":entry["timestamp"].encode("utf-8"), "id":entry["warc_id"].encode("utf-8")}
+            dedup_info = {
+                "url": entry["url"].encode("utf-8"),
+                "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"),
+                "id": entry["warc_id"].encode("utf-8")
+            }
             return dedup_info
         else:
             return None
diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 17735ed..7ac99d8 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -64,8 +64,8 @@ class DedupDb(object):
     def notify(self, recorded_url, records):
         if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
                 and recorded_url.response_recorder.payload_size() > 0):
-            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, 
-                    self.options.base32)
+            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
+                self.options.base32)
             if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                 self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
             else:
@@ -73,8 +73,8 @@ class DedupDb(object):
 
 
 def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
-    if (recorded_url.response_recorder 
-            and recorded_url.response_recorder.payload_digest 
+    if (recorded_url.response_recorder
+            and recorded_url.response_recorder.payload_digest
             and recorded_url.response_recorder.payload_size() > 0):
         digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
         if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
@@ -100,7 +100,7 @@ class RethinkDedupDb:
             self.r.db_create(self.r.dbname).run()
         tables = self.r.table_list().run()
         if not self.table in tables:
-            self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s", 
+            self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
                              repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
             self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run()
 
@@ -135,7 +135,7 @@ class RethinkDedupDb:
     def notify(self, recorded_url, records):
         if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
                 and recorded_url.response_recorder.payload_size() > 0):
-            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, 
+            digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
                     self.options.base32)
             if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
                 self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py
index 25beff8..3f1642c 100644
--- a/warcprox/writerthread.py
+++ b/warcprox/writerthread.py
@@ -81,7 +81,7 @@ class WarcWriterThread(threading.Thread):
             payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
         except:
             payload_digest = "-"
-        
+
         # 2015-07-17T22:32:23.672Z     1         58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
         self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
             recorded_url.client_ip, recorded_url.status, recorded_url.method,