mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
str, not object
This commit is contained in:
parent
b67f1ad0f3
commit
5e5a74f204
@ -418,11 +418,14 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
# for duplicate checks, see https://webarchive.jira.com/browse/WT-31
|
# for duplicate checks, see https://webarchive.jira.com/browse/WT-31
|
||||||
hash_plus_urls = set()
|
hash_plus_urls = set()
|
||||||
for recorded_url in batch:
|
for recorded_url in batch:
|
||||||
|
if recorded_url.payload_digest:
|
||||||
|
hash_plus_url = ''.join((warcprox.digest_str(
|
||||||
|
recorded_url.payload_digest, self.options.base32), recorded_url.url.decode()))
|
||||||
if (recorded_url.response_recorder
|
if (recorded_url.response_recorder
|
||||||
and recorded_url.payload_digest
|
and recorded_url.payload_digest
|
||||||
and self.trough_dedup_db.should_dedup(recorded_url)
|
and self.trough_dedup_db.should_dedup(recorded_url)
|
||||||
and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls):
|
and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls):
|
||||||
hash_plus_urls.add('{}{}'.format(recorded_url.payload_digest, recorded_url.url))
|
hash_plus_urls.add(hash_plus_url)
|
||||||
if (recorded_url.warcprox_meta
|
if (recorded_url.warcprox_meta
|
||||||
and 'dedup-buckets' in recorded_url.warcprox_meta):
|
and 'dedup-buckets' in recorded_url.warcprox_meta):
|
||||||
for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
|
for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items():
|
||||||
@ -430,9 +433,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
else:
|
else:
|
||||||
buckets['__unspecified__'].append(recorded_url)
|
buckets['__unspecified__'].append(recorded_url)
|
||||||
else:
|
else:
|
||||||
if recorded_url.payload_digest and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) in hash_plus_urls:
|
if hash_plus_url in hash_plus_urls:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
'discarding duplicate {} {}'.format(recorded_url.payload_digest, recorded_url.url))
|
'discarding duplicate {}'.format(hash_plus_url)
|
||||||
discards.append(
|
discards.append(
|
||||||
warcprox.digest_str(
|
warcprox.digest_str(
|
||||||
recorded_url.payload_digest, self.options.base32)
|
recorded_url.payload_digest, self.options.base32)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user