implementation of special prefix "-" which means "do not archive"

This commit is contained in:
Noah Levitt 2017-12-21 14:33:30 -08:00
parent 9784c91459
commit 500ffad7e4
4 changed files with 19 additions and 8 deletions

View File

@ -52,7 +52,7 @@ except:
setuptools.setup(
name='warcprox',
version='2.3.1b4.dev129',
version='2.3.1b4.dev130',
description='WARC writing MITM HTTP/S proxy',
url='https://github.com/internetarchive/warcprox',
author='Noah Levitt',

View File

@ -81,8 +81,11 @@ def _build_arg_parser(prog):
help='write gzip-compressed warc records')
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
default=False, action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument('-n', '--prefix', dest='prefix',
default='WARCPROX', help='WARC filename prefix')
# not mentioned in --help: the special value '-' for --prefix means don't
# archive the capture, unless a prefix is set in the warcprox-meta header
arg_parser.add_argument(
'-n', '--prefix', dest='prefix', default='WARCPROX',
help='default WARC filename prefix')
arg_parser.add_argument(
'-s', '--size', dest='rollover_size', default=1000*1000*1000,
type=int, help='WARC file rollover size threshold in bytes')

View File

@ -79,7 +79,7 @@ class ProxyingRecorder(object):
self.block_digest = hashlib.new(digest_algorithm)
self.payload_offset = None
self.proxy_client = proxy_client
self._proxy_client_conn_open = True
self._proxy_client_conn_open = bool(self.proxy_client)
self.len = 0
self.url = url

View File

@ -39,9 +39,8 @@ class WarcWriterThread(threading.Thread):
logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")
def __init__(
self, name='WarcWriterThread', recorded_url_q=None,
writer_pool=None, dedup_db=None, listeners=[],
options=warcprox.Options()):
self, recorded_url_q, name='WarcWriterThread', writer_pool=None,
dedup_db=None, listeners=[], options=warcprox.Options()):
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
threading.Thread.__init__(self, name=name)
self.recorded_url_q = recorded_url_q
@ -73,6 +72,15 @@ class WarcWriterThread(threading.Thread):
meth = recorded_url.method.upper()
return meth in self._ALWAYS_ACCEPT or meth in self.method_filter
# XXX optimize handling of urls not to be archived throughout warcprox
def _should_archive(self, recorded_url):
    """
    Decides whether `recorded_url` should be archived.

    The effective warc filename prefix is the 'warc-prefix' value from
    `recorded_url.warcprox_meta` when that key is present, otherwise
    `self.options.prefix`. If the effective prefix is the special value
    '-', returns False; otherwise returns the result of
    `self._filter_accepts(recorded_url)`.
    """
    meta = recorded_url.warcprox_meta
    if meta and 'warc-prefix' in meta:
        effective_prefix = meta['warc-prefix']
    else:
        effective_prefix = self.options.prefix

    # special warc name prefix '-' means "don't archive"
    if effective_prefix != '-':
        return self._filter_accepts(recorded_url)
    return False
def _run(self):
self.name = '%s(tid=%s)'% (self.name, warcprox.gettid())
while not self.stop.is_set():
@ -87,7 +95,7 @@ class WarcWriterThread(threading.Thread):
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
records = []
self.idle = None
if self._filter_accepts(recorded_url):
if self._should_archive(recorded_url):
if self.dedup_db:
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
recorded_url, base32=self.options.base32)