From f5dd2fe03bbeafbdffd50165b1728c7de633fc5b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 15:20:24 -0800 Subject: [PATCH 1/6] add test_do_not_archive, tweak early plugin name --- tests/test_warcprox.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 1a3a3e7..0a9d560 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1427,11 +1427,10 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class TestEarlyPlugin(warcprox.BasePostfetchProcessor): +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' def _get_process_put(self): - recorded_url = self.inq.get(block=True, timeout=0.5) pass @@ -1440,7 +1439,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '__main__.TestEarlyPlugin',]) + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1461,9 +1460,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert isinstance( + assert issubclass( controller._postfetch_chain[-5], - __main__.TestEarlyPlugin) + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 1334b4a546cbba56fcb728cbcb78ccf1e3f32aef Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 19:49:12 -0800 Subject: [PATCH 2/6] restore master test_warcprox.py --- tests/test_warcprox.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0a9d560..f054d69 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,20 +1426,11 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' - -class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): - CHAIN_POSITION = 'early' - - def _get_process_put(self): - pass - - def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor', - '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) + 'warcprox.BaseBatchPostfetchProcessor',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1460,9 +1451,6 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert issubclass( - controller._postfetch_chain[-5], - warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 7f50ecab0a7c1d15eeef9fef02a4ef2c8e9b756d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 21:36:00 -0800 Subject: [PATCH 3/6] [0] isinstance of parent class --- tests/test_warcprox.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index f054d69..2f4f8e2 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,11 +1426,19 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): + CHAIN_POSITION = 'early' + def _process_put(self): + pass + + def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor',]) + 'warcprox.BaseBatchPostfetchProcessor', + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1451,6 +1459,10 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) + # MyEarlyPlugin + assert isinstance( + controller._postfetch_chain[0], + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 6ce5119a4860ca893d43b596feddae216ef2f217 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 22:23:40 -0800 Subject: [PATCH 4/6] add test_do_not_archive --- tests/test_writer.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index fb39378..126932a 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -163,6 +163,47 @@ def test_special_dont_write_prefix(): wwt.join() +def test_do_not_archive(): + with tempfile.TemporaryDirectory() as tmpdir: + logging.debug('cd %s', tmpdir) + os.chdir(tmpdir) + + wwt = warcprox.writerthread.WarcWriterProcessor( + Options(writer_threads=1)) + wwt.inq = warcprox.TimestampedQueue(maxsize=1) + wwt.outq = warcprox.TimestampedQueue(maxsize=1) + try: + wwt.start() + # to be written -- default do_not_archive False + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/yes', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest)) + # not to be written -- do_not_archive set True + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/no', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest, + warcprox_meta={'warc-prefix': '-'}, + do_not_archive=True)) + recorded_url = wwt.outq.get(timeout=10) + assert recorded_url.warc_records + recorded_url = wwt.outq.get(timeout=10) + assert not recorded_url.warc_records + assert wwt.outq.empty() + finally: + wwt.stop.set() + wwt.join() + + def test_warc_writer_filename(tmpdir): """Test if WarcWriter is writing WARC files with custom filenames. """ From e65dee57d414a6bc34815007691ced7e99958c3e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 28 Feb 2018 12:15:12 -0800 Subject: [PATCH 5/6] minor test edits --- tests/test_warcprox.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 2f4f8e2..53b31eb 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1427,9 +1427,9 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): +class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' - def _process_put(self): + def _process_url(self): pass @@ -1438,7 +1438,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) + '%s.%s' % (__name__, EarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1459,7 +1459,7 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - # MyEarlyPlugin + # EarlyPlugin assert isinstance( controller._postfetch_chain[0], warcprox.BaseStandardPostfetchProcessor) From 289f4335ef769e988b1a7ee1431d78d81bb68243 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 28 Feb 2018 12:20:53 -0800 Subject: [PATCH 6/6] isinstance(controller._postfetch_chain[0], EarlyPlugin) --- tests/test_warcprox.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 53b31eb..9d6db7d 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1459,10 +1459,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - # EarlyPlugin assert isinstance( controller._postfetch_chain[0], - warcprox.BaseStandardPostfetchProcessor) + EarlyPlugin) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options()