diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 09f5996..336747f 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -424,7 +424,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): if (recorded_url.response_recorder and recorded_url.payload_digest and self.trough_dedup_db.should_dedup(recorded_url) - and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls): + and hash_plus_url not in hash_plus_urls): hash_plus_urls.add(hash_plus_url) if (recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta): @@ -435,13 +435,13 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate {}'.format(hash_plus_url) + 'discarding duplicate {}'.format(hash_plus_url)) discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) if recorded_url.payload_digest else 'n/a') self.logger.debug( - 'hash_plus_urls: {}'.format(hash_plus_urls)) + 'hash_plus_urls: {}...'.format(hash_plus_urls[0])) self.logger.debug( 'len(batch)=%s len(discards)=%s buckets=%s', len(batch), len(discards),