Merge branch 'skip_revisits' into qa

This commit is contained in:
Barbara Miller 2023-06-23 11:30:01 -07:00
commit 0da822a555

View File

@@ -2,7 +2,7 @@
warcprox/writerthread.py - warc writer thread, reads from the recorded url warcprox/writerthread.py - warc writer thread, reads from the recorded url
queue, writes warc records, runs final tasks after warc records are written queue, writes warc records, runs final tasks after warc records are written
Copyright (C) 2013-2019 Internet Archive Copyright (C) 2013-2023 Internet Archive
This program is free software; you can redistribute it and/or This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License modify it under the terms of the GNU General Public License
@@ -117,14 +117,15 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
and "ait-job-id" in recorded_url.warcprox_meta["metadata"] and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
): ):
crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"] crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
if recorded_url.payload_digest in self.revisits[crawl_id]: hash = warcprox.digest_str( recorded_url.payload_digest, self.options.base32)
if hash in self.revisits[crawl_id]:
self._log(recorded_url, None, annotation="_skip_revisit") self._log(recorded_url, None, annotation="_skip_revisit")
return True return True
else: else:
self.revisits[crawl_id].add(recorded_url.payload_digest) self._log(recorded_url, None, annotation="_keep_revisit")
self.revisits[crawl_id].add(hash)
return False return False
def _in_blackout(self, recorded_url): def _in_blackout(self, recorded_url):
"""If --blackout-period=N (sec) is set, check if duplicate record """If --blackout-period=N (sec) is set, check if duplicate record
datetime is close to the original. If yes, we don't write it to WARC. datetime is close to the original. If yes, we don't write it to WARC.