From 41fb7b52933edada7538ac18f354f2c008dd5d44 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 15:48:21 -0800 Subject: [PATCH 01/13] add do_not_archive check to should_archive --- warcprox/writerthread.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 1010161..27c5eea 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,8 +81,12 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.warcprox_meta and 'warc-prefix' in recorded_url.warcprox_meta else self.options.prefix) + do_not_archive = (recorded_url.do_not_archive + if recorded_url.do_not_archive + else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and self._filter_accepts(recorded_url) + return prefix != '-' and (not do_not_archive) and + self._filter_accepts(recorded_url) def _log(self, recorded_url, records): try: From 7d4ba1f596cd04cef99785a977cfd5e458c3b169 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 17:55:09 -0800 Subject: [PATCH 02/13] add CHAIN_POSITION support --- warcprox/controller.py | 2 ++ warcprox/writerthread.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 30446c3..644fdec 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -212,6 +212,8 @@ class WarcproxController(object): self._postfetch_chain.append( warcprox.ListenerPostfetchProcessor( plugin, self.options)) + elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early': + self._postfetch_chain.insert(0, plugin) # or insert early but later than 0? else: self._postfetch_chain.append(plugin) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 27c5eea..854319c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -85,8 +85,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.do_not_archive else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and (not do_not_archive) and - self._filter_accepts(recorded_url) + return (prefix != '-' and (not do_not_archive) + and self._filter_accepts(recorded_url)) def _log(self, recorded_url, records): try: From 0ae4da264d8af1570ac2d90014d7320e9a3a16a7 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 15 Feb 2018 13:56:14 -0800 Subject: [PATCH 03/13] add do_not_archive to class --- warcprox/version.txt | 1 + warcprox/warcproxy.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 warcprox/version.txt diff --git a/warcprox/version.txt b/warcprox/version.txt new file mode 100644 index 0000000..5c2dcd5 --- /dev/null +++ b/warcprox/version.txt @@ -0,0 +1 @@ +1.4-20160105052702-f79e744 diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5793057..5b42655 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -331,7 +331,8 @@ class RecordedUrl: warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, - payload_digest=None, truncated=None, warc_records=None): + payload_digest=None, truncated=None, warc_records=None, + do_not_archive=False): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -372,6 +373,7 @@ class RecordedUrl: self.payload_digest = payload_digest self.truncated = truncated self.warc_records = warc_records + self.do_not_archive = do_not_archive # inherit from object so that multiple inheritance from this class works # properly in python 2 From a6acc9cf5e4bffaaccf89d85d4d2fbc0e1b91f69 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 20 Feb 2018 15:49:58 -0800 Subject: [PATCH 04/13] no need for local var --- warcprox/writerthread.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 854319c..4492965 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,11 +81,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.warcprox_meta and 'warc-prefix' in recorded_url.warcprox_meta else self.options.prefix) - do_not_archive = (recorded_url.do_not_archive - if recorded_url.do_not_archive - else False) # special warc name prefix '-' means "don't archive" - return (prefix != '-' and (not do_not_archive) + return (prefix != '-' and not recorded_url.do_not_archive and self._filter_accepts(recorded_url)) def _log(self, recorded_url, records): From 01fe728676f556e4633d0bb449418baa563aedc8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 10:47:01 -0800 Subject: [PATCH 05/13] rm mistake --- warcprox/version.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 warcprox/version.txt diff --git a/warcprox/version.txt b/warcprox/version.txt deleted file mode 100644 index 5c2dcd5..0000000 --- a/warcprox/version.txt +++ /dev/null @@ -1 +0,0 @@ -1.4-20160105052702-f79e744 From eaed835275b6f2d7dd9aa53eef62d7fdbaca42f6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 14:45:58 -0800 Subject: [PATCH 06/13] omit comment --- warcprox/controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 644fdec..ff63657 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -213,7 +213,7 @@ class WarcproxController(object): warcprox.ListenerPostfetchProcessor( plugin, self.options)) elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early': - self._postfetch_chain.insert(0, plugin) # or insert early but later than 0? + self._postfetch_chain.insert(0, plugin) else: self._postfetch_chain.append(plugin) From 39b2fe86d943e20746b3b71f485f977ff8412b38 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 14:46:25 -0800 Subject: [PATCH 07/13] test early plugin --- tests/test_warcprox.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index f054d69..1a3a3e7 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,11 +1426,21 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + +class TestEarlyPlugin(warcprox.BasePostfetchProcessor): + CHAIN_POSITION = 'early' + + def _get_process_put(self): + recorded_url = self.inq.get(block=True, timeout=0.5) + pass + + def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor',]) + 'warcprox.BaseBatchPostfetchProcessor', + '__main__.TestEarlyPlugin',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1451,6 +1461,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) + assert isinstance( + controller._postfetch_chain[-5], + __main__.TestEarlyPlugin) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From f5dd2fe03bbeafbdffd50165b1728c7de633fc5b Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 15:20:24 -0800 Subject: [PATCH 08/13] add test_do_not_archive, tweak early plugin name --- tests/test_warcprox.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 1a3a3e7..0a9d560 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1427,11 +1427,10 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class TestEarlyPlugin(warcprox.BasePostfetchProcessor): +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' def _get_process_put(self): - recorded_url = self.inq.get(block=True, timeout=0.5) pass @@ -1440,7 +1439,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '__main__.TestEarlyPlugin',]) + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1461,9 +1460,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert isinstance( + assert issubclass( controller._postfetch_chain[-5], - __main__.TestEarlyPlugin) + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 1334b4a546cbba56fcb728cbcb78ccf1e3f32aef Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 19:49:12 -0800 Subject: [PATCH 09/13] restore master test_warcprox.py --- tests/test_warcprox.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0a9d560..f054d69 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,20 +1426,11 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' - -class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): - CHAIN_POSITION = 'early' - - def _get_process_put(self): - pass - - def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor', - '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) + 'warcprox.BaseBatchPostfetchProcessor',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1460,9 +1451,6 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert issubclass( - controller._postfetch_chain[-5], - warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 7f50ecab0a7c1d15eeef9fef02a4ef2c8e9b756d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 21:36:00 -0800 Subject: [PATCH 10/13] [0] isinstance of parent class --- tests/test_warcprox.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index f054d69..2f4f8e2 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,11 +1426,19 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): + CHAIN_POSITION = 'early' + def _process_put(self): + pass + + def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor',]) + 'warcprox.BaseBatchPostfetchProcessor', + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1451,6 +1459,10 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) + # MyEarlyPlugin + assert isinstance( + controller._postfetch_chain[0], + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 6ce5119a4860ca893d43b596feddae216ef2f217 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 22:23:40 -0800 Subject: [PATCH 11/13] add test_do_not_archive --- tests/test_writer.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index fb39378..126932a 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -163,6 +163,47 @@ def test_special_dont_write_prefix(): wwt.join() +def test_do_not_archive(): + with tempfile.TemporaryDirectory() as tmpdir: + logging.debug('cd %s', tmpdir) + os.chdir(tmpdir) + + wwt = warcprox.writerthread.WarcWriterProcessor( + Options(writer_threads=1)) + wwt.inq = warcprox.TimestampedQueue(maxsize=1) + wwt.outq = warcprox.TimestampedQueue(maxsize=1) + try: + wwt.start() + # to be written -- default do_not_archive False + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/yes', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest)) + # not to be written -- do_not_archive set True + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/no', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest, + warcprox_meta={'warc-prefix': '-'}, + do_not_archive=True)) + recorded_url = wwt.outq.get(timeout=10) + assert recorded_url.warc_records + recorded_url = wwt.outq.get(timeout=10) + assert not recorded_url.warc_records + assert wwt.outq.empty() + finally: + wwt.stop.set() + wwt.join() + + def test_warc_writer_filename(tmpdir): """Test if WarcWriter is writing WARC files with custom filenames. """ From e65dee57d414a6bc34815007691ced7e99958c3e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 28 Feb 2018 12:15:12 -0800 Subject: [PATCH 12/13] minor test edits --- tests/test_warcprox.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 2f4f8e2..53b31eb 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1427,9 +1427,9 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): +class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' - def _process_put(self): + def _process_url(self): pass @@ -1438,7 +1438,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) + '%s.%s' % (__name__, EarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1459,7 +1459,7 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - # MyEarlyPlugin + # EarlyPlugin assert isinstance( controller._postfetch_chain[0], warcprox.BaseStandardPostfetchProcessor) From 289f4335ef769e988b1a7ee1431d78d81bb68243 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 28 Feb 2018 12:20:53 -0800 Subject: [PATCH 13/13] isinstance(controller._postfetch_chain[0], EarlyPlugin) --- tests/test_warcprox.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 53b31eb..9d6db7d 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1459,10 +1459,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - # EarlyPlugin assert isinstance( controller._postfetch_chain[0], - warcprox.BaseStandardPostfetchProcessor) + EarlyPlugin) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options()