mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
implementation of special prefix "-" which means "do not archive"
This commit is contained in:
parent
9784c91459
commit
500ffad7e4
2
setup.py
2
setup.py
@ -52,7 +52,7 @@ except:
|
||||
|
||||
setuptools.setup(
|
||||
name='warcprox',
|
||||
version='2.3.1b4.dev129',
|
||||
version='2.3.1b4.dev130',
|
||||
description='WARC writing MITM HTTP/S proxy',
|
||||
url='https://github.com/internetarchive/warcprox',
|
||||
author='Noah Levitt',
|
||||
|
@ -81,8 +81,11 @@ def _build_arg_parser(prog):
|
||||
help='write gzip-compressed warc records')
|
||||
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
||||
default=False, action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument('-n', '--prefix', dest='prefix',
|
||||
default='WARCPROX', help='WARC filename prefix')
|
||||
# not mentioned in --help: the special value '-' for --prefix means don't
|
||||
# archive the capture, unless prefix set in warcprox-meta header
|
||||
arg_parser.add_argument(
|
||||
'-n', '--prefix', dest='prefix', default='WARCPROX',
|
||||
help='default WARC filename prefix')
|
||||
arg_parser.add_argument(
|
||||
'-s', '--size', dest='rollover_size', default=1000*1000*1000,
|
||||
type=int, help='WARC file rollover size threshold in bytes')
|
||||
|
@ -79,7 +79,7 @@ class ProxyingRecorder(object):
|
||||
self.block_digest = hashlib.new(digest_algorithm)
|
||||
self.payload_offset = None
|
||||
self.proxy_client = proxy_client
|
||||
self._proxy_client_conn_open = True
|
||||
self._proxy_client_conn_open = bool(self.proxy_client)
|
||||
self.len = 0
|
||||
self.url = url
|
||||
|
||||
|
@ -39,9 +39,8 @@ class WarcWriterThread(threading.Thread):
|
||||
logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")
|
||||
|
||||
def __init__(
|
||||
self, name='WarcWriterThread', recorded_url_q=None,
|
||||
writer_pool=None, dedup_db=None, listeners=[],
|
||||
options=warcprox.Options()):
|
||||
self, recorded_url_q, name='WarcWriterThread', writer_pool=None,
|
||||
dedup_db=None, listeners=[], options=warcprox.Options()):
|
||||
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
|
||||
threading.Thread.__init__(self, name=name)
|
||||
self.recorded_url_q = recorded_url_q
|
||||
@ -73,6 +72,15 @@ class WarcWriterThread(threading.Thread):
|
||||
meth = recorded_url.method.upper()
|
||||
return meth in self._ALWAYS_ACCEPT or meth in self.method_filter
|
||||
|
||||
# XXX optimize handling of urls not to be archived throughout warcprox
|
||||
def _should_archive(self, recorded_url):
|
||||
prefix = (recorded_url.warcprox_meta['warc-prefix']
|
||||
if recorded_url.warcprox_meta
|
||||
and 'warc-prefix' in recorded_url.warcprox_meta
|
||||
else self.options.prefix)
|
||||
# special warc name prefix '-' means "don't archive"
|
||||
return prefix != '-' and self._filter_accepts(recorded_url)
|
||||
|
||||
def _run(self):
|
||||
self.name = '%s(tid=%s)'% (self.name, warcprox.gettid())
|
||||
while not self.stop.is_set():
|
||||
@ -87,7 +95,7 @@ class WarcWriterThread(threading.Thread):
|
||||
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
|
||||
records = []
|
||||
self.idle = None
|
||||
if self._filter_accepts(recorded_url):
|
||||
if self._should_archive(recorded_url):
|
||||
if self.dedup_db:
|
||||
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
|
||||
recorded_url, base32=self.options.base32)
|
||||
|
Loading…
x
Reference in New Issue
Block a user