From 01fe728676f556e4633d0bb449418baa563aedc8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 10:47:01 -0800 Subject: [PATCH 01/13] rm mistake --- warcprox/version.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 warcprox/version.txt diff --git a/warcprox/version.txt b/warcprox/version.txt deleted file mode 100644 index 5c2dcd5..0000000 --- a/warcprox/version.txt +++ /dev/null @@ -1 +0,0 @@ -1.4-20160105052702-f79e744 From eaed835275b6f2d7dd9aa53eef62d7fdbaca42f6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 14:45:58 -0800 Subject: [PATCH 02/13] omit comment --- warcprox/controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 644fdec..ff63657 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -213,7 +213,7 @@ class WarcproxController(object): warcprox.ListenerPostfetchProcessor( plugin, self.options)) elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early': - self._postfetch_chain.insert(0, plugin) # or insert early but later than 0? + self._postfetch_chain.insert(0, plugin) else: self._postfetch_chain.append(plugin) From 39b2fe86d943e20746b3b71f485f977ff8412b38 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 14:46:25 -0800 Subject: [PATCH 03/13] test early plugin --- tests/test_warcprox.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index f054d69..1a3a3e7 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,11 +1426,21 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + +class TestEarlyPlugin(warcprox.BasePostfetchProcessor): + CHAIN_POSITION = 'early' + + def _get_process_put(self): + recorded_url = self.inq.get(block=True, timeout=0.5) + pass + + def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor',]) + 'warcprox.BaseBatchPostfetchProcessor', + '__main__.TestEarlyPlugin',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1451,6 +1461,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) + assert isinstance( + controller._postfetch_chain[-5], + __main__.TestEarlyPlugin) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From b5548311791c028f4899ed4f6da578b5f8d30ad8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 15:20:24 -0800 Subject: [PATCH 04/13] add test_do_not_archive, tweak early plugin name --- tests/test_warcprox.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 1a3a3e7..8353219 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1176,6 +1176,13 @@ def test_tor_onion(archiving_proxies, warcprox_): # wait for postfetch chain wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) +def test_do_not_archive(warcprox_): + recorded_url = warcprox.RecordedUrl + assert warcprox_._should_archive(recorded_url) == True + + recorded_url.do_not_archive = True + assert warcprox_._should_archive(recorded_url) == False + def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, warcprox_): urls_before = warcprox_.proxy.running_stats.urls @@ -1427,7 +1434,7 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class TestEarlyPlugin(warcprox.BasePostfetchProcessor): +class MyEarlyPlugin(warcprox.BasePostfetchProcessor): CHAIN_POSITION = 'early' def _get_process_put(self): @@ -1440,7 +1447,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '__main__.TestEarlyPlugin',]) + '__main__.MyEarlyPlugin',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1463,7 +1470,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - __main__.TestEarlyPlugin) + __main__.MyEarlyPlugin) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From b2672ab2f4b37d7bec9a3a38d7dcbea9b021b503 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 15:38:55 -0800 Subject: [PATCH 05/13] move test_do_not_archive --- tests/test_warcprox.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 8353219..c251b54 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1176,13 +1176,6 @@ def test_tor_onion(archiving_proxies, warcprox_): # wait for postfetch chain wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) -def test_do_not_archive(warcprox_): - recorded_url = warcprox.RecordedUrl - assert warcprox_._should_archive(recorded_url) == True - - recorded_url.do_not_archive = True - assert warcprox_._should_archive(recorded_url) == False - def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, warcprox_): urls_before = warcprox_.proxy.running_stats.urls From 0c650e115882f34c330318c15d6ce7f0a369f534 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 16:02:53 -0800 Subject: [PATCH 06/13] try __name__... --- tests/test_warcprox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index c251b54..4ffa43c 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1440,7 +1440,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '__main__.MyEarlyPlugin',]) + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - __main__.MyEarlyPlugin) + '%s.%s' % (__name__, Foo.__name__)) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 3ed551c3beef32f434cf316bb9133112331dad0f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 16:22:38 -0800 Subject: [PATCH 07/13] try not Foo --- tests/test_warcprox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 4ffa43c..5609026 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - '%s.%s' % (__name__, Foo.__name__)) + '%s.%s' % (__name__, MyEarlyPlugin.__name__)) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 97f7b2f3fddbeb05b456fdc1ad9d4a1bf8e3df84 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 16:44:36 -0800 Subject: [PATCH 08/13] type? --- tests/test_warcprox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 5609026..e3cc270 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - '%s.%s' % (__name__, MyEarlyPlugin.__name__)) + MyEarlyPlugin.__name__) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From f30fb4039372c30164fe07eba5e2339267c39181 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 17:00:08 -0800 Subject: [PATCH 09/13] try tuple --- tests/test_warcprox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index e3cc270..62326c5 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - MyEarlyPlugin.__name__) + (__name__, MyEarlyPlugin.__name__)) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From cb05fc0e0997d88b5bc4f19eb8bbb4f7bdd63205 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 18:31:00 -0800 Subject: [PATCH 10/13] test issubclass --- tests/test_warcprox.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 62326c5..0a9d560 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1427,11 +1427,10 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class MyEarlyPlugin(warcprox.BasePostfetchProcessor): +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' def _get_process_put(self): - recorded_url = self.inq.get(block=True, timeout=0.5) pass @@ -1461,9 +1460,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert isinstance( + assert issubclass( controller._postfetch_chain[-5], - (__name__, MyEarlyPlugin.__name__)) + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 9e2f357bab7f5b594acb63923f1bebd114214f63 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 19:49:12 -0800 Subject: [PATCH 11/13] restore master test_warcprox.py --- tests/test_warcprox.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0a9d560..f054d69 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,20 +1426,11 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' - -class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): - CHAIN_POSITION = 'early' - - def _get_process_put(self): - pass - - def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor', - '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) + 'warcprox.BaseBatchPostfetchProcessor',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1460,9 +1451,6 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert issubclass( - controller._postfetch_chain[-5], - warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 84e5110bcb1e70a8224ffed98d0d4b86b95d4a97 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 21:36:00 -0800 Subject: [PATCH 12/13] [0] isinstance of parent class --- tests/test_warcprox.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index f054d69..2f4f8e2 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,11 +1426,19 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): + CHAIN_POSITION = 'early' + def _process_put(self): + pass + + def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor',]) + 'warcprox.BaseBatchPostfetchProcessor', + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1451,6 +1459,10 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) + # MyEarlyPlugin + assert isinstance( + controller._postfetch_chain[0], + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 3161793c5c1077141e6e0ff7e7a46b5403d1243d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 22:23:40 -0800 Subject: [PATCH 13/13] add test_do_not_archive --- tests/test_writer.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index fb39378..126932a 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -163,6 +163,47 @@ def test_special_dont_write_prefix(): wwt.join() +def test_do_not_archive(): + with tempfile.TemporaryDirectory() as tmpdir: + logging.debug('cd %s', tmpdir) + os.chdir(tmpdir) + + wwt = warcprox.writerthread.WarcWriterProcessor( + Options(writer_threads=1)) + wwt.inq = warcprox.TimestampedQueue(maxsize=1) + wwt.outq = warcprox.TimestampedQueue(maxsize=1) + try: + wwt.start() + # to be written -- default do_not_archive False + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/yes', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest)) + # not to be written -- do_not_archive set True + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/no', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest, + warcprox_meta={'warc-prefix': '-'}, + do_not_archive=True)) + recorded_url = wwt.outq.get(timeout=10) + assert recorded_url.warc_records + recorded_url = wwt.outq.get(timeout=10) + assert not recorded_url.warc_records + assert wwt.outq.empty() + finally: + wwt.stop.set() + wwt.join() + + def test_warc_writer_filename(tmpdir): """Test if WarcWriter is writing WARC files with custom filenames. """