mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
commit
2acaba19df
4
setup.py
4
setup.py
@ -2,7 +2,7 @@
|
|||||||
'''
|
'''
|
||||||
setup.py - setuptools installation configuration for warcprox
|
setup.py - setuptools installation configuration for warcprox
|
||||||
|
|
||||||
Copyright (C) 2013-2021 Internet Archive
|
Copyright (C) 2013-2023 Internet Archive
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or
|
This program is free software; you can redistribute it and/or
|
||||||
modify it under the terms of the GNU General Public License
|
modify it under the terms of the GNU General Public License
|
||||||
@ -44,7 +44,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.4.29-qa-220804',
|
version='2.4.32.qa',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -32,6 +32,7 @@ import time
|
|||||||
import warcprox
|
import warcprox
|
||||||
from concurrent import futures
|
from concurrent import futures
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from collections import defaultdict
|
||||||
import threading
|
import threading
|
||||||
try:
|
try:
|
||||||
import queue
|
import queue
|
||||||
@ -103,8 +104,31 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor):
|
|||||||
# special warc name prefix '-' means "don't archive"
|
# special warc name prefix '-' means "don't archive"
|
||||||
return (prefix != '-' and not recorded_url.do_not_archive
|
return (prefix != '-' and not recorded_url.do_not_archive
|
||||||
and self._filter_accepts(recorded_url)
|
and self._filter_accepts(recorded_url)
|
||||||
|
and not self._skip_revisit(recorded_url)
|
||||||
and not self._in_blackout(recorded_url))
|
and not self._in_blackout(recorded_url))
|
||||||
|
|
||||||
|
# maintain a set of revisit hashes seen, per ait crawl id
|
||||||
|
revisits = defaultdict(set)
|
||||||
|
def _skip_revisit(self, recorded_url):
    """Return True if this revisit duplicates one already seen for its crawl.

    A URL is only considered when it carries truthy ``dedup_info`` and its
    ``warcprox_meta`` contains ``metadata["ait-job-id"]``. The first time a
    payload digest is seen for a given crawl id it is remembered and False
    is returned; any later URL with the same digest and crawl id is logged
    and reported as skippable.

    Args:
        recorded_url: object exposing ``warcprox_meta``, ``payload_digest``
            and ``url``; ``dedup_info`` is optional.

    Returns:
        bool: True when the digest was already recorded for the crawl id,
        False otherwise (including when the URL is not tracked at all).
    """
    # Not a revisit candidate: attribute missing or falsy.
    if not getattr(recorded_url, "dedup_info", None):
        return False

    meta = recorded_url.warcprox_meta
    if not (meta and "metadata" in meta and "ait-job-id" in meta["metadata"]):
        return False

    crawl_id = meta["metadata"]["ait-job-id"]
    digest = recorded_url.payload_digest

    # `revisits` is assigned alongside the methods rather than at module
    # level, so it has to be reached through `self`; a bare `revisits`
    # lookup inside a method body raises NameError.
    # NOTE(review): nothing visible here ever prunes this mapping -- confirm
    # that growth over a long-running process is acceptable.
    seen = self.revisits[crawl_id]
    if digest in seen:
        self.logger.info(
            "Found duplicate revisit, skipping: %s, hash: %s",
            recorded_url.url,
            digest,
        )
        return True

    seen.add(digest)
    return False
|
||||||
|
|
||||||
|
|
||||||
def _in_blackout(self, recorded_url):
|
def _in_blackout(self, recorded_url):
|
||||||
"""If --blackout-period=N (sec) is set, check if duplicate record
|
"""If --blackout-period=N (sec) is set, check if duplicate record
|
||||||
datetime is close to the original. If yes, we don't write it to WARC.
|
datetime is close to the original. If yes, we don't write it to WARC.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user