diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index f054d69..9d6db7d 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,11 +1426,19 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + +class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor): + CHAIN_POSITION = 'early' + def _process_url(self): + pass + + def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor',]) + 'warcprox.BaseBatchPostfetchProcessor', + '%s.%s' % (__name__, EarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1451,6 +1459,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) + assert isinstance( + controller._postfetch_chain[0], + EarlyPlugin) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() diff --git a/tests/test_writer.py b/tests/test_writer.py index fb39378..126932a 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -163,6 +163,47 @@ def test_special_dont_write_prefix(): wwt.join() +def test_do_not_archive(): + with tempfile.TemporaryDirectory() as tmpdir: + logging.debug('cd %s', tmpdir) + os.chdir(tmpdir) + + wwt = warcprox.writerthread.WarcWriterProcessor( + Options(writer_threads=1)) + wwt.inq = warcprox.TimestampedQueue(maxsize=1) + wwt.outq = warcprox.TimestampedQueue(maxsize=1) + try: + wwt.start() + # to be written -- default do_not_archive False + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/yes', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest)) + # not to be written -- do_not_archive set True + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/no', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest, + warcprox_meta={'warc-prefix': '-'}, + do_not_archive=True)) + recorded_url = wwt.outq.get(timeout=10) + assert recorded_url.warc_records + recorded_url = wwt.outq.get(timeout=10) + assert not recorded_url.warc_records + assert wwt.outq.empty() + finally: + wwt.stop.set() + wwt.join() + + def test_warc_writer_filename(tmpdir): """Test if WarcWriter is writing WARC files with custom filenames. """ diff --git a/warcprox/controller.py b/warcprox/controller.py index 30446c3..ff63657 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -212,6 +212,8 @@ class WarcproxController(object): self._postfetch_chain.append( warcprox.ListenerPostfetchProcessor( plugin, self.options)) + elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early': + self._postfetch_chain.insert(0, plugin) else: self._postfetch_chain.append(plugin) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5793057..5b42655 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -331,7 +331,8 @@ class RecordedUrl: warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, - payload_digest=None, truncated=None, warc_records=None): + payload_digest=None, truncated=None, warc_records=None, + do_not_archive=False): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -372,6 +373,7 @@ class RecordedUrl: self.payload_digest = payload_digest self.truncated = truncated self.warc_records = warc_records + self.do_not_archive = do_not_archive # inherit from object so that multiple inheritance from this class works # properly in python 2 diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index b3dceab..a3eeedb 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -84,7 +84,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): and 'warc-prefix' in recorded_url.warcprox_meta else self.options.prefix) # special warc name prefix '-' means "don't archive" - return prefix != '-' and self._filter_accepts(recorded_url) + return (prefix != '-' and not recorded_url.do_not_archive + and self._filter_accepts(recorded_url)) def _log(self, recorded_url, records): try: