From 7d4c8dcb4ec8c8afd8f65753f38630cc2776e065 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 8 Dec 2021 11:04:09 -0800 Subject: [PATCH] recorded_url.do_not_archive = True --- warcprox/dedup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 44319ea..2ceb876 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -435,7 +435,8 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate {}'.format(hash_plus_url)) + 'discarding duplicate {}, setting do_not_archive'.format(hash_plus_url)) + recorded_url.do_not_archive = True discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32)