mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
Merge branch 'skip_revisits' into qa
This commit is contained in:
commit
0da822a555
@@ -2,7 +2,7 @@
|
|||||||
warcprox/writerthread.py - warc writer thread, reads from the recorded url
|
warcprox/writerthread.py - warc writer thread, reads from the recorded url
|
||||||
queue, writes warc records, runs final tasks after warc records are written
|
queue, writes warc records, runs final tasks after warc records are written
|
||||||
|
|
||||||
Copyright (C) 2013-2019 Internet Archive
|
Copyright (C) 2013-2023 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@@ -117,14 +117,15 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
|
and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
|
||||||
):
|
):
|
||||||
crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
|
crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
|
||||||
if recorded_url.payload_digest in self.revisits[crawl_id]:
|
hash = warcprox.digest_str( recorded_url.payload_digest, self.options.base32)
|
||||||
|
if hash in self.revisits[crawl_id]:
|
||||||
self._log(recorded_url, None, annotation="_skip_revisit")
|
self._log(recorded_url, None, annotation="_skip_revisit")
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
self.revisits[crawl_id].add(recorded_url.payload_digest)
|
self._log(recorded_url, None, annotation="_keep_revisit")
|
||||||
|
self.revisits[crawl_id].add(hash)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _in_blackout(self, recorded_url):
|
def _in_blackout(self, recorded_url):
|
||||||
"""If --blackout-period=N (sec) is set, check if duplicate record
|
"""If --blackout-period=N (sec) is set, check if duplicate record
|
||||||
datetime is close to the original. If yes, we don't write it to WARC.
|
datetime is close to the original. If yes, we don't write it to WARC.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user