Record a sha1base32 for captures that have no response recorder, and add a
PAYLOAD_DIGEST header to WARC records built from in-memory data.

RethinkCaptures.notify() no longer returns early when
recorded_url.response_recorder is unset; it hashes records[0].content[1] with
sha1 instead. sha1base32 is bound on every path: a recorder whose payload
digest is not sha1 still only triggers a warning and is indexed by the digest
it produced, as before this patch, rather than leaving the name unbound.

NOTE(review): leading whitespace and blank context lines were reconstructed
from a whitespace-collapsed copy of this patch, and hunk offsets were
recomputed by hand. Confirm against the target files before applying. The
blob ids on the "index" lines were not regenerated.

diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py
index a799d78..0587cf9 100644
--- a/warcprox/bigtable.py
+++ b/warcprox/bigtable.py
@@ -11,6 +11,7 @@ import warcprox
 import base64
 import surt
 import os
+import hashlib
 
 class RethinkCaptures:
     logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
@@ -51,11 +52,16 @@ class RethinkCaptures:
         return result
 
     def notify(self, recorded_url, records):
-        if not recorded_url.response_recorder:
-            return
-
-        if recorded_url.response_recorder.payload_digest.name != "sha1":
-            self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name)
+        if recorded_url.response_recorder:
+            payload_digest = recorded_url.response_recorder.payload_digest
+            if payload_digest.name != "sha1":
+                self.logger.warning("digest type is %s but big capture table is indexed by sha1", payload_digest.name)
+            # bind sha1base32 on every path; the entry built below always reads it
+            sha1base32 = base64.b32encode(payload_digest.digest()).decode("utf-8")
+        else:
+            # no recorded response: hash the first record's content directly
+            digest = hashlib.new("sha1", records[0].content[1])
+            sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")
 
         if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
             bucket = recorded_url.warcprox_meta["captures-bucket"]
@@ -75,7 +81,7 @@ class RethinkCaptures:
             "filename": os.path.basename(records[0].warc_filename),
             "warc_type": records[0].type.decode("utf-8"),
             "warc_id": records[0].id.decode("utf-8"),
-            "sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"),
+            "sha1base32": sha1base32,
             "content_type": recorded_url.mimetype,
             "response_code": recorded_url.status,
             "http_method": recorded_url.method,
diff --git a/warcprox/warc.py b/warcprox/warc.py
index bea4a89..9391890 100644
--- a/warcprox/warc.py
+++ b/warcprox/warc.py
@@ -113,9 +113,11 @@ class WarcRecordBuilder:
 
         else:
             headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
-            block_digest = hashlib.new(self.digest_algorithm, data)
+            digest = hashlib.new(self.digest_algorithm, data)
             headers.append((warctools.WarcRecord.BLOCK_DIGEST,
-                warcprox.digest_str(block_digest, self.base32)))
+                warcprox.digest_str(digest, self.base32)))
+            headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
+                warcprox.digest_str(digest, self.base32)))
 
             content_tuple = content_type, data
             record = warctools.WarcRecord(headers=headers, content=content_tuple)