Merge pull request #183 from galgeek/qa

skip revisits
Barbara Miller, 2023-06-05 13:51:43 -07:00 (committed by GitHub)
commit 2acaba19df
2 changed files with 26 additions and 2 deletions
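
In short: when deduplication finds an earlier capture with the same payload digest, warcprox writes a revisit record. This change keeps an in-memory set of revisit payload digests per Archive-It crawl id (warcprox_meta["metadata"]["ait-job-id"]) and skips writing any revisit whose digest has already been seen for that crawl. A minimal standalone sketch of that bookkeeping, separate from the patch itself (the function name and sample values below are made up for illustration):

from collections import defaultdict

# one set of already-seen revisit payload digests per crawl id
revisits = defaultdict(set)

def skip_revisit(crawl_id, payload_digest):
    # True if this digest was already seen as a revisit for this crawl;
    # otherwise remember it and let the record be written
    if payload_digest in revisits[crawl_id]:
        return True
    revisits[crawl_id].add(payload_digest)
    return False

# hypothetical sample values
assert skip_revisit(1234, "sha1:DEADBEEF") is False  # first revisit: write it
assert skip_revisit(1234, "sha1:DEADBEEF") is True   # duplicate: skip
assert skip_revisit(5678, "sha1:DEADBEEF") is False  # different crawl: write it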

setup.py

@@ -2,7 +2,7 @@
 '''
 setup.py - setuptools installation configuration for warcprox
 
-Copyright (C) 2013-2021 Internet Archive
+Copyright (C) 2013-2023 Internet Archive
 
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -44,7 +44,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.4.29-qa-220804',
+        version='2.4.32.qa',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

warcprox/writerthread.py

@@ -32,6 +32,7 @@ import time
 import warcprox
 from concurrent import futures
 from datetime import datetime
+from collections import defaultdict
 import threading
 try:
     import queue
@@ -103,8 +104,31 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
         # special warc name prefix '-' means "don't archive"
         return (prefix != '-' and not recorded_url.do_not_archive
                 and self._filter_accepts(recorded_url)
+                and not self._skip_revisit(recorded_url)
                 and not self._in_blackout(recorded_url))
 
+    # maintain a set of revisit hashes seen, per ait crawl id
+    revisits = defaultdict(set)
+
+    def _skip_revisit(self, recorded_url):
+        if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
+            if (
+                recorded_url.warcprox_meta
+                and "metadata" in recorded_url.warcprox_meta
+                and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
+            ):
+                crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
+                if recorded_url.payload_digest in self.revisits[crawl_id]:
+                    self.logger.info(
+                        "Found duplicate revisit, skipping: %s, hash: %s",
+                        recorded_url.url,
+                        recorded_url.payload_digest,
+                    )
+                    return True
+                else:
+                    self.revisits[crawl_id].add(recorded_url.payload_digest)
+        return False
+
     def _in_blackout(self, recorded_url):
         """If --blackout-period=N (sec) is set, check if duplicate record
         datetime is close to the original. If yes, we don't write it to WARC.
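
To see the new check in action, here is a rough usage sketch, assuming the patched warcprox is importable; fake_revisit, the URLs, and the digests are hypothetical stand-ins, and the processor is created without its normal __init__ arguments purely to exercise _skip_revisit:

import logging
from types import SimpleNamespace

from warcprox.writerthread import WarcWriterProcessor

# hypothetical stand-in for a recorded_url that dedup already matched
def fake_revisit(url, digest, job_id):
    return SimpleNamespace(
        url=url,
        dedup_info={"id": b"fake"},
        payload_digest=digest,
        warcprox_meta={"metadata": {"ait-job-id": job_id}},
    )

proc = WarcWriterProcessor.__new__(WarcWriterProcessor)  # skip full __init__ for the sketch
proc.logger = logging.getLogger("sketch")

first = fake_revisit("https://example.com/", "sha1:ABC123", 42)
dupe = fake_revisit("https://example.com/dup", "sha1:ABC123", 42)

assert proc._skip_revisit(first) is False  # first sighting: record it
assert proc._skip_revisit(dupe) is True    # same digest, same crawl: skipped

Note that revisits is class-level, in-memory state: it is shared by all WarcWriterProcessor instances in the process and is never pruned, so entries accumulate for the lifetime of the warcprox process.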