diff --git a/setup.py b/setup.py index 4d1bc35..007c681 100755 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ except: setuptools.setup( name='warcprox', - version='2.3.1b4.dev129', + version='2.3.1b4.dev130', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/warcprox/main.py b/warcprox/main.py index 6140f72..47c4bb4 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -81,8 +81,11 @@ def _build_arg_parser(prog): help='write gzip-compressed warc records') arg_parser.add_argument('--no-warc-open-suffix', dest='no_warc_open_suffix', default=False, action='store_true', help=argparse.SUPPRESS) - arg_parser.add_argument('-n', '--prefix', dest='prefix', - default='WARCPROX', help='WARC filename prefix') + # not mentioned in --help: the special value '-' for --prefix means don't + # archive the capture, unless a prefix is set in the warcprox-meta header + arg_parser.add_argument( + '-n', '--prefix', dest='prefix', default='WARCPROX', + help='default WARC filename prefix') arg_parser.add_argument( '-s', '--size', dest='rollover_size', default=1000*1000*1000, type=int, help='WARC file rollover size threshold in bytes') diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 8d950fa..bfe3a7d 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -79,7 +79,7 @@ class ProxyingRecorder(object): self.block_digest = hashlib.new(digest_algorithm) self.payload_offset = None self.proxy_client = proxy_client - self._proxy_client_conn_open = True + self._proxy_client_conn_open = bool(self.proxy_client) self.len = 0 self.url = url diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 3f42fc1..fb1f7a6 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -39,9 +39,8 @@ class WarcWriterThread(threading.Thread): logger = logging.getLogger("warcprox.warcproxwriter.WarcWriterThread") def __init__( - self, name='WarcWriterThread', recorded_url_q=None, - 
writer_pool=None, dedup_db=None, listeners=[], - options=warcprox.Options()): + self, recorded_url_q, name='WarcWriterThread', writer_pool=None, + dedup_db=None, listeners=[], options=warcprox.Options()): """recorded_url_q is a queue.Queue of warcprox.warcprox.RecordedUrl.""" threading.Thread.__init__(self, name=name) self.recorded_url_q = recorded_url_q @@ -73,6 +72,15 @@ class WarcWriterThread(threading.Thread): meth = recorded_url.method.upper() return meth in self._ALWAYS_ACCEPT or meth in self.method_filter + # XXX optimize handling of urls not to be archived throughout warcprox + def _should_archive(self, recorded_url): + prefix = (recorded_url.warcprox_meta['warc-prefix'] + if recorded_url.warcprox_meta + and 'warc-prefix' in recorded_url.warcprox_meta + else self.options.prefix) + # special warc name prefix '-' means "don't archive" + return prefix != '-' and self._filter_accepts(recorded_url) + def _run(self): self.name = '%s(tid=%s)'% (self.name, warcprox.gettid()) while not self.stop.is_set(): @@ -87,7 +95,7 @@ class WarcWriterThread(threading.Thread): recorded_url = self.recorded_url_q.get(block=True, timeout=0.5) records = [] self.idle = None - if self._filter_accepts(recorded_url): + if self._should_archive(recorded_url): if self.dedup_db: warcprox.dedup.decorate_with_dedup_info(self.dedup_db, recorded_url, base32=self.options.base32)