skip duplicate revisits, per ait-job-id

Barbara Miller 2023-06-05 13:40:21 -07:00
parent 50a4f35e5f
commit 1dc7de7dd8


@@ -32,6 +32,7 @@ import time
 import warcprox
 from concurrent import futures
 from datetime import datetime
+from collections import defaultdict
 import threading
 try:
     import queue
@@ -103,8 +104,31 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
         # special warc name prefix '-' means "don't archive"
         return (prefix != '-' and not recorded_url.do_not_archive
                 and self._filter_accepts(recorded_url)
+                and not self._skip_revisit(recorded_url)
                 and not self._in_blackout(recorded_url))
 
+    # maintain a set of revisit hashes seen, per ait crawl id
+    # (class-level cache, accessed through self below)
+    revisits = defaultdict(set)
+
+    def _skip_revisit(self, recorded_url):
+        if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
+            if (
+                recorded_url.warcprox_meta
+                and "metadata" in recorded_url.warcprox_meta
+                and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
+            ):
+                crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
+                if recorded_url.payload_digest in self.revisits[crawl_id]:
+                    self.logger.info(
+                        "Found duplicate revisit, skipping: %s, hash: %s",
+                        recorded_url.url,
+                        recorded_url.payload_digest,
+                    )
+                    return True
+                else:
+                    self.revisits[crawl_id].add(recorded_url.payload_digest)
+        return False
+
     def _in_blackout(self, recorded_url):
         """If --blackout-period=N (sec) is set, check if duplicate record
         datetime is close to the original. If yes, we don't write it to WARC.
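
For context, here is a minimal standalone sketch of the logic the new _skip_revisit method implements. This is not warcprox code: FakeRecordedUrl is a hypothetical stand-in for the recorded_url objects warcprox's proxy pipeline produces, and skip_revisit mirrors the method above as a free function.

from collections import defaultdict

class FakeRecordedUrl:
    # Hypothetical stand-in; real recorded_url objects come from the proxy.
    def __init__(self, url, payload_digest, job_id, dedup_info=True):
        self.url = url
        self.payload_digest = payload_digest
        self.dedup_info = dedup_info  # truthy = dedup flagged this as a revisit
        self.warcprox_meta = {"metadata": {"ait-job-id": job_id}}

revisits = defaultdict(set)  # per-crawl-id sets of revisit payload hashes

def skip_revisit(recorded_url):
    # Only consider records that dedup already marked as revisits.
    if getattr(recorded_url, "dedup_info", None):
        meta = recorded_url.warcprox_meta or {}
        crawl_id = meta.get("metadata", {}).get("ait-job-id")
        if crawl_id is not None:
            if recorded_url.payload_digest in revisits[crawl_id]:
                return True  # this payload already has a revisit in this crawl
            revisits[crawl_id].add(recorded_url.payload_digest)
    return False

print(skip_revisit(FakeRecordedUrl("http://example.com/a", "sha1:X", 1)))  # False: first revisit for job 1
print(skip_revisit(FakeRecordedUrl("http://example.com/b", "sha1:X", 1)))  # True: duplicate within job 1, skipped
print(skip_revisit(FakeRecordedUrl("http://example.com/c", "sha1:X", 2)))  # False: job 2 keeps its own set

The design choice this illustrates: only the first revisit record per payload digest per crawl is written; later revisits with the same digest carry no additional information, so they are dropped before reaching the WARC writer.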