mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
fixes to let screenshot records be saved in big capture tables for wayback playback
This commit is contained in:
parent
c02c98e369
commit
686a297f98
@ -11,6 +11,7 @@ import warcprox
|
|||||||
import base64
|
import base64
|
||||||
import surt
|
import surt
|
||||||
import os
|
import os
|
||||||
|
import hashlib
|
||||||
|
|
||||||
class RethinkCaptures:
|
class RethinkCaptures:
|
||||||
logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
|
logger = logging.getLogger("warcprox.bigtables.RethinkCaptures")
|
||||||
@ -51,11 +52,14 @@ class RethinkCaptures:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def notify(self, recorded_url, records):
|
def notify(self, recorded_url, records):
|
||||||
if not recorded_url.response_recorder:
|
if recorded_url.response_recorder:
|
||||||
return
|
if recorded_url.response_recorder.payload_digest.name == "sha1":
|
||||||
|
sha1base32 = base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8")
|
||||||
if recorded_url.response_recorder.payload_digest.name != "sha1":
|
else:
|
||||||
self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name)
|
self.logger.warn("digest type is %s but big capture table is indexed by sha1", recorded_url.response_recorder.payload_digest.name)
|
||||||
|
else:
|
||||||
|
digest = hashlib.new("sha1", records[0].content[1])
|
||||||
|
sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")
|
||||||
|
|
||||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||||
bucket = recorded_url.warcprox_meta["captures-bucket"]
|
bucket = recorded_url.warcprox_meta["captures-bucket"]
|
||||||
@ -75,7 +79,7 @@ class RethinkCaptures:
|
|||||||
"filename": os.path.basename(records[0].warc_filename),
|
"filename": os.path.basename(records[0].warc_filename),
|
||||||
"warc_type": records[0].type.decode("utf-8"),
|
"warc_type": records[0].type.decode("utf-8"),
|
||||||
"warc_id": records[0].id.decode("utf-8"),
|
"warc_id": records[0].id.decode("utf-8"),
|
||||||
"sha1base32": base64.b32encode(recorded_url.response_recorder.payload_digest.digest()).decode("utf-8"),
|
"sha1base32": sha1base32,
|
||||||
"content_type": recorded_url.mimetype,
|
"content_type": recorded_url.mimetype,
|
||||||
"response_code": recorded_url.status,
|
"response_code": recorded_url.status,
|
||||||
"http_method": recorded_url.method,
|
"http_method": recorded_url.method,
|
||||||
|
@ -113,9 +113,11 @@ class WarcRecordBuilder:
|
|||||||
|
|
||||||
else:
|
else:
|
||||||
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
|
headers.append((warctools.WarcRecord.CONTENT_LENGTH, str(len(data)).encode('latin1')))
|
||||||
block_digest = hashlib.new(self.digest_algorithm, data)
|
digest = hashlib.new(self.digest_algorithm, data)
|
||||||
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
headers.append((warctools.WarcRecord.BLOCK_DIGEST,
|
||||||
warcprox.digest_str(block_digest, self.base32)))
|
warcprox.digest_str(digest, self.base32)))
|
||||||
|
headers.append((warctools.WarcRecord.PAYLOAD_DIGEST,
|
||||||
|
warcprox.digest_str(digest, self.base32)))
|
||||||
|
|
||||||
content_tuple = content_type, data
|
content_tuple = content_type, data
|
||||||
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
record = warctools.WarcRecord(headers=headers, content=content_tuple)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user