From 8078ee7af91bbb1721be3377286889683ad141a6 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sat, 15 Aug 2020 09:17:39 +0000 Subject: [PATCH] DedupableMixin.should_dedup() improvement When a recorded URL has `recorded_url.do_not_archive = True`, it is not written to WARC. This is checked in `WarcWriterProcessor._should_archive`. We shouldn't waste time on deduping something that is not going to be written to WARC anyway. --- warcprox/dedup.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 0181019..9d77acd 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -48,8 +48,12 @@ class DedupableMixin(object): size compared with min text/binary dedup size options. When we use option --dedup-only-with-bucket, `dedup-buckets` is required in Warcprox-Meta to perform dedup. + If recorded_url.do_not_archive is True, we skip dedup. This record will + not be written to WARC anyway. Return Boolean. """ + if recorded_url.do_not_archive: + return False if self.dedup_only_with_bucket and "dedup-buckets" not in recorded_url.warcprox_meta: return False if recorded_url.is_text():