mirror of
https://github.com/internetarchive/warcprox.git
synced 2025-01-18 13:22:09 +01:00
implementation of special prefix "-" which means "do not archive"
This commit is contained in:
parent
9784c91459
commit
500ffad7e4
2
setup.py
2
setup.py
@@ -52,7 +52,7 @@ except:
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='warcprox',
|
name='warcprox',
|
||||||
version='2.3.1b4.dev129',
|
version='2.3.1b4.dev130',
|
||||||
description='WARC writing MITM HTTP/S proxy',
|
description='WARC writing MITM HTTP/S proxy',
|
||||||
url='https://github.com/internetarchive/warcprox',
|
url='https://github.com/internetarchive/warcprox',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@@ -81,8 +81,11 @@ def _build_arg_parser(prog):
|
|||||||
help='write gzip-compressed warc records')
|
help='write gzip-compressed warc records')
|
||||||
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix',
|
||||||
default=False, action='store_true', help=argparse.SUPPRESS)
|
default=False, action='store_true', help=argparse.SUPPRESS)
|
||||||
arg_parser.add_argument('-n', '--prefix', dest='prefix',
|
# not mentioned in --help: special value for '-' for --prefix means don't
|
||||||
default='WARCPROX', help='WARC filename prefix')
|
# archive the capture, unless prefix set in warcprox-meta header
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'-n', '--prefix', dest='prefix', default='WARCPROX',
|
||||||
|
help='default WARC filename prefix')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-s', '--size', dest='rollover_size', default=1000*1000*1000,
|
'-s', '--size', dest='rollover_size', default=1000*1000*1000,
|
||||||
type=int, help='WARC file rollover size threshold in bytes')
|
type=int, help='WARC file rollover size threshold in bytes')
|
||||||
|
@@ -79,7 +79,7 @@ class ProxyingRecorder(object):
|
|||||||
self.block_digest = hashlib.new(digest_algorithm)
|
self.block_digest = hashlib.new(digest_algorithm)
|
||||||
self.payload_offset = None
|
self.payload_offset = None
|
||||||
self.proxy_client = proxy_client
|
self.proxy_client = proxy_client
|
||||||
self._proxy_client_conn_open = True
|
self._proxy_client_conn_open = bool(self.proxy_client)
|
||||||
self.len = 0
|
self.len = 0
|
||||||
self.url = url
|
self.url = url
|
||||||
|
|
||||||
|
@@ -39,9 +39,8 @@ class WarcWriterThread(threading.Thread):
|
|||||||
logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")
|
logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread")
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, name='WarcWriterThread', recorded_url_q=None,
|
self, recorded_url_q, name='WarcWriterThread', writer_pool=None,
|
||||||
writer_pool=None, dedup_db=None, listeners=[],
|
dedup_db=None, listeners=[], options=warcprox.Options()):
|
||||||
options=warcprox.Options()):
|
|
||||||
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
|
"""recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl."""
|
||||||
threading.Thread.__init__(self, name=name)
|
threading.Thread.__init__(self, name=name)
|
||||||
self.recorded_url_q = recorded_url_q
|
self.recorded_url_q = recorded_url_q
|
||||||
@@ -73,6 +72,15 @@ class WarcWriterThread(threading.Thread):
|
|||||||
meth = recorded_url.method.upper()
|
meth = recorded_url.method.upper()
|
||||||
return meth in self._ALWAYS_ACCEPT or meth in self.method_filter
|
return meth in self._ALWAYS_ACCEPT or meth in self.method_filter
|
||||||
|
|
||||||
|
# XXX optimize handling of urls not to be archived throughout warcprox
|
||||||
|
def _should_archive(self, recorded_url):
|
||||||
|
prefix = (recorded_url.warcprox_meta['warc-prefix']
|
||||||
|
if recorded_url.warcprox_meta
|
||||||
|
and 'warc-prefix' in recorded_url.warcprox_meta
|
||||||
|
else self.options.prefix)
|
||||||
|
# special warc name prefix '-' means "don't archive"
|
||||||
|
return prefix != '-' and self._filter_accepts(recorded_url)
|
||||||
|
|
||||||
def _run(self):
|
def _run(self):
|
||||||
self.name = '%s(tid=%s)'% (self.name, warcprox.gettid())
|
self.name = '%s(tid=%s)'% (self.name, warcprox.gettid())
|
||||||
while not self.stop.is_set():
|
while not self.stop.is_set():
|
||||||
@@ -87,7 +95,7 @@ class WarcWriterThread(threading.Thread):
|
|||||||
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
|
recorded_url = self.recorded_url_q.get(block=True, timeout=0.5)
|
||||||
records = []
|
records = []
|
||||||
self.idle = None
|
self.idle = None
|
||||||
if self._filter_accepts(recorded_url):
|
if self._should_archive(recorded_url):
|
||||||
if self.dedup_db:
|
if self.dedup_db:
|
||||||
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
|
warcprox.dedup.decorate_with_dedup_info(self.dedup_db,
|
||||||
recorded_url, base32=self.options.base32)
|
recorded_url, base32=self.options.base32)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user