From 2bec9db7dfd99788c3c91bca0545536666f12e88 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 8 Mar 2016 22:52:02 +0000
Subject: [PATCH] handle old dedup entries missing "warc_id"

---
 warcprox/bigtable.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/warcprox/bigtable.py b/warcprox/bigtable.py
index 9f2bcae..7e6670c 100644
--- a/warcprox/bigtable.py
+++ b/warcprox/bigtable.py
@@ -143,8 +143,9 @@ class RethinkCapturesDedup:
             dedup_info = {
                 "url": entry["url"].encode("utf-8"),
                 "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"),
-                "id": entry["warc_id"].encode("utf-8")
             }
+            if "warc_id" in entry:
+                dedup_info["id"] = entry["warc_id"].encode("utf-8")
             return dedup_info
         else:
             return None