Merge branch 'WT-31' into qa

This commit is contained in:
Barbara Miller 2021-12-06 19:43:37 -08:00
commit 2ceb0f69f1

View File

@@ -424,7 +424,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
if (recorded_url.response_recorder
and recorded_url.payload_digest
and self.trough_dedup_db.should_dedup(recorded_url)
and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls):
and hash_plus_url not in hash_plus_urls):
hash_plus_urls.add(hash_plus_url)
if (recorded_url.warcprox_meta
and 'dedup-buckets' in recorded_url.warcprox_meta):
@@ -435,13 +435,13 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
else:
if hash_plus_url in hash_plus_urls:
self.logger.debug(
'discarding duplicate {}'.format(hash_plus_url)
'discarding duplicate {}'.format(hash_plus_url))
discards.append(
warcprox.digest_str(
recorded_url.payload_digest, self.options.base32)
if recorded_url.payload_digest else 'n/a')
self.logger.debug(
'hash_plus_urls: {}'.format(hash_plus_urls))
'hash_plus_urls: {}...'.format(hash_plus_urls[0]))
self.logger.debug(
'len(batch)=%s len(discards)=%s buckets=%s',
len(batch), len(discards),