handle old dedup entries missing "warc_id"

This commit is contained in:
Noah Levitt 2016-03-08 22:52:02 +00:00
parent 422672408a
commit 2bec9db7df

View File

@@ -143,8 +143,9 @@ class RethinkCapturesDedup:
 dedup_info = {
 "url": entry["url"].encode("utf-8"),
 "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"),
-"id": entry["warc_id"].encode("utf-8")
 }
+if "warc_id" in entry:
+dedup_info["id"] = entry["warc_id"].encode("utf-8")
 return dedup_info
 else:
 return None