mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
use rethinkdb native time type for captures table timestamp
This commit is contained in:
parent
df31068c80
commit
927419645b
@ -10,6 +10,7 @@ import os
|
|||||||
import hashlib
|
import hashlib
|
||||||
import threading
|
import threading
|
||||||
import datetime
|
import datetime
|
||||||
|
import rethinkstuff
|
||||||
|
|
||||||
class RethinkCaptures:
|
class RethinkCaptures:
|
||||||
"""Inserts in batches every 0.5 seconds"""
|
"""Inserts in batches every 0.5 seconds"""
|
||||||
@ -97,8 +98,7 @@ class RethinkCaptures:
|
|||||||
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
|
"id": "{} {}".format(canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
|
||||||
"abbr_canon_surt": canon_surt[:150],
|
"abbr_canon_surt": canon_surt[:150],
|
||||||
"canon_surt": canon_surt,
|
"canon_surt": canon_surt,
|
||||||
# "timestamp": re.sub(r"[^0-9]", "", records[0].date.decode("utf-8")),
|
"timestamp": recorded_url.timestamp.replace(tzinfo=rethinkstuff.UTC),
|
||||||
"timestamp": records[0].date.decode("utf-8"),
|
|
||||||
"url": recorded_url.url.decode("utf-8"),
|
"url": recorded_url.url.decode("utf-8"),
|
||||||
"offset": records[0].offset,
|
"offset": records[0].offset,
|
||||||
"filename": os.path.basename(records[0].warc_filename),
|
"filename": os.path.basename(records[0].warc_filename),
|
||||||
@ -140,7 +140,11 @@ class RethinkCapturesDedup:
|
|||||||
raw_digest = base64.b16decode(value_str, casefold=True)
|
raw_digest = base64.b16decode(value_str, casefold=True)
|
||||||
entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket)
|
entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket)
|
||||||
if entry:
|
if entry:
|
||||||
dedup_info = {"url":entry["url"].encode("utf-8"), "date":entry["timestamp"].encode("utf-8"), "id":entry["warc_id"].encode("utf-8")}
|
dedup_info = {
|
||||||
|
"url": entry["url"].encode("utf-8"),
|
||||||
|
"date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"),
|
||||||
|
"id": entry["warc_id"].encode("utf-8")
|
||||||
|
}
|
||||||
return dedup_info
|
return dedup_info
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
@ -64,8 +64,8 @@ class DedupDb(object):
|
|||||||
def notify(self, recorded_url, records):
|
def notify(self, recorded_url, records):
|
||||||
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
||||||
and recorded_url.response_recorder.payload_size() > 0):
|
and recorded_url.response_recorder.payload_size() > 0):
|
||||||
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
|
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
|
||||||
self.options.base32)
|
self.options.base32)
|
||||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||||
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
|
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
|
||||||
else:
|
else:
|
||||||
@ -73,8 +73,8 @@ class DedupDb(object):
|
|||||||
|
|
||||||
|
|
||||||
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
||||||
if (recorded_url.response_recorder
|
if (recorded_url.response_recorder
|
||||||
and recorded_url.response_recorder.payload_digest
|
and recorded_url.response_recorder.payload_digest
|
||||||
and recorded_url.response_recorder.payload_size() > 0):
|
and recorded_url.response_recorder.payload_size() > 0):
|
||||||
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
|
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest, base32)
|
||||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||||
@ -100,7 +100,7 @@ class RethinkDedupDb:
|
|||||||
self.r.db_create(self.r.dbname).run()
|
self.r.db_create(self.r.dbname).run()
|
||||||
tables = self.r.table_list().run()
|
tables = self.r.table_list().run()
|
||||||
if not self.table in tables:
|
if not self.table in tables:
|
||||||
self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
|
self.logger.info("creating rethinkdb table %s in database %s shards=%s replicas=%s",
|
||||||
repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
|
repr(self.table), repr(self.r.dbname), self.shards, self.replicas)
|
||||||
self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run()
|
self.r.table_create(self.table, primary_key="key", shards=self.shards, replicas=self.replicas).run()
|
||||||
|
|
||||||
@ -135,7 +135,7 @@ class RethinkDedupDb:
|
|||||||
def notify(self, recorded_url, records):
|
def notify(self, recorded_url, records):
|
||||||
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE
|
||||||
and recorded_url.response_recorder.payload_size() > 0):
|
and recorded_url.response_recorder.payload_size() > 0):
|
||||||
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
|
digest_key = warcprox.digest_str(recorded_url.response_recorder.payload_digest,
|
||||||
self.options.base32)
|
self.options.base32)
|
||||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||||
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
|
self.save(digest_key, records[0], bucket=recorded_url.warcprox_meta["captures-bucket"])
|
||||||
|
@ -81,7 +81,7 @@ class WarcWriterThread(threading.Thread):
|
|||||||
payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
|
payload_digest = records[0].get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode("utf-8")
|
||||||
except:
|
except:
|
||||||
payload_digest = "-"
|
payload_digest = "-"
|
||||||
|
|
||||||
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
|
# 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"}
|
||||||
self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
|
self.logger.info("{} {} {} {} {} size={} {} {} {} offset={}".format(
|
||||||
recorded_url.client_ip, recorded_url.status, recorded_url.method,
|
recorded_url.client_ip, recorded_url.status, recorded_url.method,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user