From 1476bfec8c71854ad92083295270240fc4e7d93d Mon Sep 17 00:00:00 2001
From: Barbara Miller <galgeek@me.com>
Date: Thu, 2 Dec 2021 11:09:17 -0800
Subject: [PATCH] discard batch hash+url match

---
 warcprox/dedup.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/warcprox/dedup.py b/warcprox/dedup.py
index 77bc7c6..2648bf1 100644
--- a/warcprox/dedup.py
+++ b/warcprox/dedup.py
@@ -1,7 +1,7 @@
 '''
 warcprox/dedup.py - identical payload digest deduplication using sqlite db
 
-Copyright (C) 2013-2018 Internet Archive
+Copyright (C) 2013-2021 Internet Archive
 
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -412,10 +412,14 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
         '''
         buckets = collections.defaultdict(list)
         discards = []
+        # for duplicate checks, see https://webarchive.jira.com/browse/WT-31
+        hash_plus_urls = set()
         for recorded_url in batch:
             if (recorded_url.response_recorder
                     and recorded_url.payload_digest
-                    and self.trough_dedup_db.should_dedup(recorded_url)):
+                    and self.trough_dedup_db.should_dedup(recorded_url)
+                    and f'{recorded_url.payload_digest}{recorded_url.url}' not in hash_plus_urls):
+                hash_plus_urls.add(f'{recorded_url.payload_digest}{recorded_url.url}')
                 if (recorded_url.warcprox_meta
                         and 'dedup-buckets' in recorded_url.warcprox_meta):
                     for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
@@ -423,6 +427,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
                 else:
                     buckets['__unspecified__'].append(recorded_url)
             else:
+                if f'{recorded_url.payload_digest}{recorded_url.url}' in hash_plus_urls:
+                    self.logger.debug(
+                        f'discarding duplicate {recorded_url.payload_digest} {recorded_url.url}')
                 discards.append(
                         warcprox.digest_str(
                             recorded_url.payload_digest, self.options.base32)