mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Remove method decorate_with_dedup_info
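The function `warcprox.dedup.decorate_with_dedup_info` is used only in `DedupLoader._process_url` and nowhere else. The problem is that `decorate_with_dedup_info` has no access to the warcprox CLI options, so the custom minimum-size limits cannot be passed to it. This commit therefore removes the function and inlines its logic into `DedupLoader._process_url`, which now inherits from `DedupableMixin` and calls `should_dedup` to apply those limits.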
This commit is contained in:
parent
9057fbdf36
commit
6dce8cc644
@ -51,14 +51,24 @@ class DedupableMixin(object):
|
|||||||
else:
|
else:
|
||||||
return recorded_url.response_recorder.payload_size() > self.min_binary_size
|
return recorded_url.response_recorder.payload_size() > self.min_binary_size
|
||||||
|
|
||||||
class DedupLoader(warcprox.BaseStandardPostfetchProcessor):
|
class DedupLoader(warcprox.BaseStandardPostfetchProcessor, DedupableMixin):
|
||||||
def __init__(self, dedup_db, options=warcprox.Options()):
|
def __init__(self, dedup_db, options=warcprox.Options()):
|
||||||
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options)
|
||||||
|
DedupableMixin.__init__(self, options)
|
||||||
self.dedup_db = dedup_db
|
self.dedup_db = dedup_db
|
||||||
|
|
||||||
def _process_url(self, recorded_url):
|
def _process_url(self, recorded_url):
|
||||||
decorate_with_dedup_info(
|
if (recorded_url.response_recorder
|
||||||
self.dedup_db, recorded_url, self.options.base32)
|
and recorded_url.payload_digest
|
||||||
|
and self.should_dedup(recorded_url)):
|
||||||
|
digest_key = warcprox.digest_str(recorded_url.payload_digest, self.options.base32)
|
||||||
|
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
||||||
|
recorded_url.dedup_info = self.dedup_db.lookup(
|
||||||
|
digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
||||||
|
recorded_url.url)
|
||||||
|
else:
|
||||||
|
recorded_url.dedup_info = self.dedup_db.lookup(
|
||||||
|
digest_key, url=recorded_url.url)
|
||||||
|
|
||||||
class DedupDb(DedupableMixin):
|
class DedupDb(DedupableMixin):
|
||||||
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
logger = logging.getLogger("warcprox.dedup.DedupDb")
|
||||||
@ -138,19 +148,6 @@ class DedupDb(DedupableMixin):
|
|||||||
else:
|
else:
|
||||||
self.save(digest_key, records[0])
|
self.save(digest_key, records[0])
|
||||||
|
|
||||||
def decorate_with_dedup_info(dedup_db, recorded_url, base32=False):
|
|
||||||
if (recorded_url.response_recorder
|
|
||||||
and recorded_url.payload_digest
|
|
||||||
and recorded_url.response_recorder.payload_size() > 0):
|
|
||||||
digest_key = warcprox.digest_str(recorded_url.payload_digest, base32)
|
|
||||||
if recorded_url.warcprox_meta and "captures-bucket" in recorded_url.warcprox_meta:
|
|
||||||
recorded_url.dedup_info = dedup_db.lookup(
|
|
||||||
digest_key, recorded_url.warcprox_meta["captures-bucket"],
|
|
||||||
recorded_url.url)
|
|
||||||
else:
|
|
||||||
recorded_url.dedup_info = dedup_db.lookup(
|
|
||||||
digest_key, url=recorded_url.url)
|
|
||||||
|
|
||||||
class RethinkDedupDb(DedupDb, DedupableMixin):
|
class RethinkDedupDb(DedupDb, DedupableMixin):
|
||||||
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
logger = logging.getLogger("warcprox.dedup.RethinkDedupDb")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user