mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
limit revisits mixin
This commit is contained in:
parent
ca02c22ff7
commit
5075920415
@ -1,7 +1,7 @@
|
|||||||
'''
|
'''
|
||||||
warcprox/dedup.py - identical payload digest deduplication using sqlite db
|
warcprox/dedup.py - identical payload digest deduplication using sqlite db
|
||||||
|
|
||||||
Copyright (C) 2013-2021 Internet Archive
|
Copyright (C) 2013-2023 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -61,6 +61,35 @@ class DedupableMixin(object):
|
|||||||
else:
|
else:
|
||||||
return recorded_url.response_recorder.payload_size() > self.min_binary_size
|
return recorded_url.response_recorder.payload_size() > self.min_binary_size
|
||||||
|
|
||||||
|
class LimitRevisitsMixin(object):
|
||||||
|
"""
|
||||||
|
Limit revisits recorded to one per revisit_key
|
||||||
|
"""
|
||||||
|
def __init__(self, options=warcprox.Options()):
|
||||||
|
self.revisits = collections.defaultdict(set)
|
||||||
|
# TODO: define logger?
|
||||||
|
|
||||||
|
def limit_revisits(self, recorded_url, hash=None, revisit_key=None):
|
||||||
|
if not hash:
|
||||||
|
hash = warcprox.digest_str(recorded_url.payload_digest, self.options.base32)
|
||||||
|
if not revisit_key:
|
||||||
|
if (
|
||||||
|
recorded_url.warcprox_meta
|
||||||
|
and "metadata" in recorded_url.warcprox_meta
|
||||||
|
and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
|
||||||
|
):
|
||||||
|
revisit_key = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
|
||||||
|
else:
|
||||||
|
revisit_key = 'all'
|
||||||
|
|
||||||
|
hash_plus_url = b''.join((hash, recorded_url.url))
|
||||||
|
if hash_plus_url in self.revisits[revisit_key]:
|
||||||
|
logging.info("skipping revisit for url %s and hash %s", recorded_url.url, hash)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
self.revisits[revisit_key].add(hash_plus_url)
|
||||||
|
return False
|
||||||
|
|
||||||
class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
|
class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
|
||||||
def __init__(self, dedup_db, options=warcprox.Options()):
|
def __init__(self, dedup_db, options=warcprox.Options()):
|
||||||
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
||||||
@ -398,7 +427,7 @@ class BatchTroughStorer(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
logging.warning(
|
logging.warning(
|
||||||
'timed out saving dedup info to trough', exc_info=True)
|
'timed out saving dedup info to trough', exc_info=True)
|
||||||
|
|
||||||
class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor, LimitRevisitsMixin):
|
||||||
logger = logging.getLogger("warcprox.dedup.BatchTroughLoader")
|
logger = logging.getLogger("warcprox.dedup.BatchTroughLoader")
|
||||||
|
|
||||||
def __init__(self, trough_dedup_db, options=warcprox.Options()):
|
def __init__(self, trough_dedup_db, options=warcprox.Options()):
|
||||||
@ -488,6 +517,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor):
|
|||||||
for entry in future.result():
|
for entry in future.result():
|
||||||
for recorded_url in key_index[entry['digest_key']]:
|
for recorded_url in key_index[entry['digest_key']]:
|
||||||
recorded_url.dedup_info = entry
|
recorded_url.dedup_info = entry
|
||||||
|
if recorded_url.dedup_info:
|
||||||
|
recorded_url.do_not_archive = \
|
||||||
|
self.limit_revisits(recorded_url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# batch_lookup raised exception or something
|
# batch_lookup raised exception or something
|
||||||
logging.warning(
|
logging.warning(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user