mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
backout skip_revisits
This commit is contained in:
parent
65d7776ec4
commit
ad458ddb6a
@ -104,28 +104,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
# special warc name prefix '-' means "don't archive"
|
# special warc name prefix '-' means "don't archive"
|
||||||
return (prefix != '-' and not recorded_url.do_not_archive
|
return (prefix != '-' and not recorded_url.do_not_archive
|
||||||
and self._filter_accepts(recorded_url)
|
and self._filter_accepts(recorded_url)
|
||||||
and not self._skip_revisit(recorded_url)
|
|
||||||
and not self._in_blackout(recorded_url))
|
and not self._in_blackout(recorded_url))
|
||||||
|
|
||||||
# maintain a set of revisit hashes seen, per ait crawl id
|
|
||||||
revisits = defaultdict(set)
|
|
||||||
def _skip_revisit(self, recorded_url):
|
|
||||||
if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
|
|
||||||
if (
|
|
||||||
recorded_url.warcprox_meta
|
|
||||||
and "metadata" in recorded_url.warcprox_meta
|
|
||||||
and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
|
|
||||||
):
|
|
||||||
crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
|
|
||||||
hash = warcprox.digest_str( recorded_url.payload_digest, self.options.base32)
|
|
||||||
if hash in self.revisits[crawl_id]:
|
|
||||||
self._log(recorded_url, None, annotation="_skip_revisit")
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
self._log(recorded_url, None, annotation="_keep_revisit")
|
|
||||||
self.revisits[crawl_id].add(hash)
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _in_blackout(self, recorded_url):
|
def _in_blackout(self, recorded_url):
|
||||||
"""If --blackout-period=N (sec) is set, check if duplicate record
|
"""If --blackout-period=N (sec) is set, check if duplicate record
|
||||||
datetime is close to the original. If yes, we don't write it to WARC.
|
datetime is close to the original. If yes, we don't write it to WARC.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user