mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge branch 'WT-31' into qa
This commit is contained in:
commit
2ceb0f69f1
@ -424,7 +424,7 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
||||
if (recorded_url.response_recorder
|
||||
and recorded_url.payload_digest
|
||||
and self.trough_dedup_db.should_dedup(recorded_url)
|
||||
and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls):
|
||||
and hash_plus_url not in hash_plus_urls):
|
||||
hash_plus_urls.add(hash_plus_url)
|
||||
if (recorded_url.warcprox_meta
|
||||
and 'dedup-buckets' in recorded_url.warcprox_meta):
|
||||
@ -435,13 +435,13 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
||||
else:
|
||||
if hash_plus_url in hash_plus_urls:
|
||||
self.logger.debug(
|
||||
'discarding duplicate {}'.format(hash_plus_url)
|
||||
'discarding duplicate {}'.format(hash_plus_url))
|
||||
discards.append(
|
||||
warcprox.digest_str(
|
||||
recorded_url.payload_digest, self.options.base32)
|
||||
if recorded_url.payload_digest else 'n/a')
|
||||
self.logger.debug(
|
||||
'hash_plus_urls: {}'.format(hash_plus_urls))
|
||||
'hash_plus_urls: {}...'.format(hash_plus_urls[0]))
|
||||
self.logger.debug(
|
||||
'len(batch)=%s len(discards)=%s buckets=%s',
|
||||
len(batch), len(discards),
|
||||
|
Loading…
x
Reference in New Issue
Block a user