Merge pull request #183 from galgeek/qa

skip revisits
This commit is contained in:
Barbara Miller 2023-06-05 13:51:43 -07:00 committed by GitHub
commit 2acaba19df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 2 deletions

View File

@ -2,7 +2,7 @@
'''
setup.py - setuptools installation configuration for warcprox
Copyright (C) 2013-2021 Internet Archive
Copyright (C) 2013-2023 Internet Archive
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
@ -44,7 +44,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.4.29-qa-220804',
version='2.4.32.qa',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -32,6 +32,7 @@ import time
import warcprox
from concurrent import futures
from datetime import datetime
from collections import defaultdict
import threading
try:
import queue
@ -103,8 +104,31 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
# special warc name prefix '-' means "don't archive"
return (prefix != '-' and not recorded_url.do_not_archive
and self._filter_accepts(recorded_url)
and not self._skip_revisit(recorded_url)
and not self._in_blackout(recorded_url))
# maintain a set of revisit payload digests already seen, keyed by the
# "ait-job-id" value from the warcprox-meta metadata
revisits = defaultdict(set)
def _skip_revisit(self, recorded_url):
    """Tell whether ``recorded_url`` repeats a revisit already seen for its job.

    Only records that carry a truthy ``dedup_info`` and whose
    ``warcprox_meta["metadata"]`` contains an ``"ait-job-id"`` entry are
    tracked. The first time a payload digest is seen for a given job id it
    is remembered and the record is kept; any later record with the same
    digest and job id is reported as a duplicate.

    Args:
        recorded_url: object exposing ``warcprox_meta``, ``payload_digest``
            and ``url``, and optionally ``dedup_info``.

    Returns:
        True if the record should be skipped (duplicate revisit),
        False otherwise.
    """
    # Records without dedup info are never candidates for skipping.
    if not getattr(recorded_url, "dedup_info", None):
        return False

    meta = recorded_url.warcprox_meta
    metadata = meta.get("metadata") if meta else None
    # Without a job id there is nothing to key the bookkeeping on.
    if not metadata or "ait-job-id" not in metadata:
        return False

    crawl_id = metadata["ait-job-id"]
    digest = recorded_url.payload_digest

    # ``revisits`` is bound in the class body, so it has to be reached through
    # ``self``; a bare ``revisits`` is not resolvable from inside a method and
    # raises NameError.
    # NOTE(review): nothing in this method evicts entries, so the per-job sets
    # only grow -- confirm that is acceptable for long-running processes.
    seen_digests = self.revisits[crawl_id]
    if digest in seen_digests:
        self.logger.info(
            "Found duplicate revisit, skipping: %s, hash: %s",
            recorded_url.url,
            digest,
        )
        return True

    seen_digests.add(digest)
    return False
def _in_blackout(self, recorded_url):
"""If --blackout-period=N (sec) is set, check if duplicate record
datetime is close to the original. If yes, we don't write it to WARC.