From 9973d28de94e0a98b884ad6bf5caa7299f8297b4 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Thu, 4 Aug 2022 17:28:33 -0700
Subject: [PATCH 1/3] bump version

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 52af206..dee2d6d 100755
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 '''
 setup.py - setuptools installation configuration for warcprox
 
-Copyright (C) 2013-2021 Internet Archive
+Copyright (C) 2013-2022 Internet Archive
 
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -44,7 +44,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.4.29',
+        version='2.4.31',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',

From 1dc7de7dd86445543d6ac028dca68e1b239d62dc Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Mon, 5 Jun 2023 13:40:21 -0700
Subject: [PATCH 2/3] skip duplicate revisits, per ait-job-id

---
 warcprox/writerthread.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py
index 3cd6bc6..e0628c8 100644
--- a/warcprox/writerthread.py
+++ b/warcprox/writerthread.py
@@ -32,6 +32,7 @@ import time
 import warcprox
 from concurrent import futures
 from datetime import datetime
+from collections import defaultdict
 import threading
 try:
     import queue
@@ -103,8 +104,35 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
         # special warc name prefix '-' means "don't archive"
         return (prefix != '-' and not recorded_url.do_not_archive
                 and self._filter_accepts(recorded_url)
+                and not self._skip_revisit(recorded_url)
                 and not self._in_blackout(recorded_url))
 
+    # maintain a set of revisit hashes seen, per ait crawl id
+    revisits = defaultdict(set)
+    def _skip_revisit(self, recorded_url):
+        """Return True if recorded_url has dedup_info and its payload
+        digest was already seen for the same "ait-job-id". Otherwise return
+        False, first remembering the digest when an "ait-job-id" is present.
+        """
+        if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info:
+            if (
+                recorded_url.warcprox_meta
+                and "metadata" in recorded_url.warcprox_meta
+                and "ait-job-id" in recorded_url.warcprox_meta["metadata"]
+            ):
+                crawl_id = recorded_url.warcprox_meta["metadata"]["ait-job-id"]
+                if recorded_url.payload_digest in self.revisits[crawl_id]:
+                    self.logger.info(
+                        "Found duplicate revisit, skipping: %s, hash: %s",
+                        recorded_url.url,
+                        recorded_url.payload_digest,
+                    )
+                    return True
+                else:
+                    self.revisits[crawl_id].add(recorded_url.payload_digest)
+        return False
+
+
     def _in_blackout(self, recorded_url):
         """If --blackout-period=N (sec) is set, check if duplicate record
         datetime is close to the original. If yes, we don't write it to WARC.

From d98896094a562e03f390ba5282008d5be978a106 Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Mon, 5 Jun 2023 13:47:14 -0700
Subject: [PATCH 3/3] bump qa version

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index dee2d6d..4f9ad36 100755
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 '''
 setup.py - setuptools installation configuration for warcprox
 
-Copyright (C) 2013-2022 Internet Archive
+Copyright (C) 2013-2023 Internet Archive
 
 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
@@ -44,7 +44,7 @@ except:
 
 setuptools.setup(
         name='warcprox',
-        version='2.4.31',
+        version='2.4.32.qa',
         description='WARC writing MITM HTTP/S proxy',
         url='https://github.com/internetarchive/warcprox',
         author='Noah Levitt',