From 432e42803cba7e197c86aff4b5b2091986abb10a Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 4 May 2018 14:27:42 +0000 Subject: [PATCH] dedup-bucket is required in Warcprox-Meta to do dedup Modify `DedupableMixin.should_dedup` to check Warcprox-Meta for `dedup-bucket` in order to perform dedup. --- warcprox/dedup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 33eab16..ec03c7c 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -44,8 +44,12 @@ class DedupableMixin(object): def should_dedup(self, recorded_url): """Check if we should try to run dedup on resource based on payload - size compared with min text/binary dedup size options. Return Boolean. + size compared with min text/binary dedup size options. + `dedup-bucket` is required in Warcprox-Meta to perform dedup. + Return Boolean. """ + if "dedup-bucket" not in recorded_url.warcprox_meta: + return False if recorded_url.is_text(): return recorded_url.response_recorder.payload_size() > self.min_text_size else: