From ddb7ecbe06406e16c9785c63541195fe54b0cda3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 15 Nov 2017 17:37:19 -0800 Subject: [PATCH 01/37] deal with case of case of no warc records written in trough dedup --- warcprox/dedup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index d1e456d..9774c6a 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -292,7 +292,7 @@ class TroughDedupDb(object): return None def notify(self, recorded_url, records): - if (records[0].get_header(warctools.WarcRecord.TYPE) == warctools.WarcRecord.RESPONSE + if (records and records[0].type == b'response' and recorded_url.response_recorder.payload_size() > 0): digest_key = warcprox.digest_str( recorded_url.payload_digest, From d5bf49e44810e1b62bc3f40054f8c87461fe4185 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 15:48:21 -0800 Subject: [PATCH 02/37] add do_not_archive check to should_archive --- warcprox/writerthread.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 1010161..27c5eea 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,8 +81,12 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.warcprox_meta and 'warc-prefix' in recorded_url.warcprox_meta else self.options.prefix) + do_not_archive = (recorded_url.do_not_archive + if recorded_url.do_not_archive + else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and self._filter_accepts(recorded_url) + return prefix != '-' and (not do_not_archive) and + self._filter_accepts(recorded_url) def _log(self, recorded_url, records): try: From b3f08359e872f46c028ca5649d79a8e25ff81be6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 17:55:09 -0800 Subject: [PATCH 03/37] add CHAIN_POSITION support --- warcprox/controller.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/warcprox/controller.py b/warcprox/controller.py index 30446c3..644fdec 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -212,6 +212,8 @@ class WarcproxController(object): self._postfetch_chain.append( warcprox.ListenerPostfetchProcessor( plugin, self.options)) + elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early': + self._postfetch_chain.insert(0, plugin) # or insert early but later than 0? else: self._postfetch_chain.append(plugin) From 1e432fb54e830e9780e5224fb07fa4cb5bbe9cf1 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 15 Feb 2018 13:55:42 -0800 Subject: [PATCH 04/37] no indent errors, please --- warcprox/writerthread.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 27c5eea..854319c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -85,8 +85,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.do_not_archive else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and (not do_not_archive) and - self._filter_accepts(recorded_url) + return (prefix != '-' and (not do_not_archive) + and self._filter_accepts(recorded_url)) def _log(self, recorded_url, records): try: From e37693df5aac5af0c1ae2dcd6aaede0f421042d1 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 15 Feb 2018 13:56:14 -0800 Subject: [PATCH 05/37] add do_not_archive to class --- warcprox/warcproxy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5b36300..e55b295 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -330,7 +330,7 @@ class RecordedUrl: warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, - payload_digest=None, warc_records=None): + payload_digest=None, warc_records=None, do_not_archive=False): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -370,6 +370,7 @@ class RecordedUrl: self.referer = referer self.payload_digest = payload_digest self.warc_records = warc_records + self.do_not_archive = do_not_archive # inherit from object so that multiple inheritance from this class works # properly in python 2 From 46dd01de892215ea08a822e3188097b030204f64 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 15:48:21 -0800 Subject: [PATCH 06/37] add do_not_archive check to should_archive --- warcprox/writerthread.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 1010161..27c5eea 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,8 +81,12 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.warcprox_meta and 'warc-prefix' in recorded_url.warcprox_meta else self.options.prefix) + do_not_archive = (recorded_url.do_not_archive + if recorded_url.do_not_archive + else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and self._filter_accepts(recorded_url) + return prefix != '-' and (not do_not_archive) and + self._filter_accepts(recorded_url) def _log(self, recorded_url, records): try: From 982700d503d38ce95ad09f18f6a87314a99e0a6f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 17:55:09 -0800 Subject: [PATCH 07/37] add CHAIN_POSITION support --- warcprox/controller.py | 2 ++ warcprox/writerthread.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 30446c3..644fdec 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -212,6 +212,8 @@ class WarcproxController(object): self._postfetch_chain.append( warcprox.ListenerPostfetchProcessor( plugin, self.options)) + elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early': + self._postfetch_chain.insert(0, plugin) # or insert early but later than 0? else: self._postfetch_chain.append(plugin) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 27c5eea..854319c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -85,8 +85,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.do_not_archive else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and (not do_not_archive) and - self._filter_accepts(recorded_url) + return (prefix != '-' and (not do_not_archive) + and self._filter_accepts(recorded_url)) def _log(self, recorded_url, records): try: From 483ed8016e84cefd704b63b6cae355dd271a90fc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 15 Feb 2018 13:56:14 -0800 Subject: [PATCH 08/37] add do_not_archive to class --- warcprox/warcproxy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5b36300..e55b295 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -330,7 +330,7 @@ class RecordedUrl: warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, - payload_digest=None, warc_records=None): + payload_digest=None, warc_records=None, do_not_archive=False): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -370,6 +370,7 @@ class RecordedUrl: self.referer = referer self.payload_digest = payload_digest self.warc_records = warc_records + self.do_not_archive = do_not_archive # inherit from object so that multiple inheritance from this class works # properly in python 2 From 2bbe60a4cb82cec9b51e584534e75aeb7213701d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 20 Feb 2018 15:49:58 -0800 Subject: [PATCH 09/37] no need for local var --- warcprox/writerthread.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 854319c..4492965 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,11 +81,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.warcprox_meta and 'warc-prefix' in recorded_url.warcprox_meta else self.options.prefix) - do_not_archive = (recorded_url.do_not_archive - if recorded_url.do_not_archive - else False) # special warc name prefix '-' means "don't archive" - return (prefix != '-' and (not do_not_archive) + return (prefix != '-' and not recorded_url.do_not_archive and self._filter_accepts(recorded_url)) def _log(self, recorded_url, records): From b5548311791c028f4899ed4f6da578b5f8d30ad8 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 15:20:24 -0800 Subject: [PATCH 10/37] add test_do_not_archive, tweak early plugin name --- tests/test_warcprox.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 1a3a3e7..8353219 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1176,6 +1176,13 @@ def test_tor_onion(archiving_proxies, warcprox_): # wait for postfetch chain wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) +def test_do_not_archive(warcprox_): + recorded_url = warcprox.RecordedUrl + assert warcprox_._should_archive(recorded_url) == True + + recorded_url.do_not_archive = True + assert warcprox_._should_archive(recorded_url) == False + def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, warcprox_): urls_before = warcprox_.proxy.running_stats.urls @@ -1427,7 +1434,7 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class TestEarlyPlugin(warcprox.BasePostfetchProcessor): +class MyEarlyPlugin(warcprox.BasePostfetchProcessor): CHAIN_POSITION = 'early' def _get_process_put(self): @@ -1440,7 +1447,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '__main__.TestEarlyPlugin',]) + '__main__.MyEarlyPlugin',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1463,7 +1470,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - __main__.TestEarlyPlugin) + __main__.MyEarlyPlugin) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From b2672ab2f4b37d7bec9a3a38d7dcbea9b021b503 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 15:38:55 -0800 Subject: [PATCH 11/37] move test_do_not_archive --- tests/test_warcprox.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 8353219..c251b54 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1176,13 +1176,6 @@ def test_tor_onion(archiving_proxies, warcprox_): # wait for postfetch chain wait(lambda: warcprox_.proxy.running_stats.urls - urls_before == 2) -def test_do_not_archive(warcprox_): - recorded_url = warcprox.RecordedUrl - assert warcprox_._should_archive(recorded_url) == True - - recorded_url.do_not_archive = True - assert warcprox_._should_archive(recorded_url) == False - def test_missing_content_length(archiving_proxies, http_daemon, https_daemon, warcprox_): urls_before = warcprox_.proxy.running_stats.urls From 0c650e115882f34c330318c15d6ce7f0a369f534 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 16:02:53 -0800 Subject: [PATCH 12/37] try __name__... --- tests/test_warcprox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index c251b54..4ffa43c 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1440,7 +1440,7 @@ def test_load_plugin(): 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', 'warcprox.BaseBatchPostfetchProcessor', - '__main__.MyEarlyPlugin',]) + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - __main__.MyEarlyPlugin) + '%s.%s' % (__name__, Foo.__name__)) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 3ed551c3beef32f434cf316bb9133112331dad0f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 16:22:38 -0800 Subject: [PATCH 13/37] try not Foo --- tests/test_warcprox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 4ffa43c..5609026 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - '%s.%s' % (__name__, Foo.__name__)) + '%s.%s' % (__name__, MyEarlyPlugin.__name__)) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 97f7b2f3fddbeb05b456fdc1ad9d4a1bf8e3df84 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 16:44:36 -0800 Subject: [PATCH 14/37] type? --- tests/test_warcprox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 5609026..e3cc270 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - '%s.%s' % (__name__, MyEarlyPlugin.__name__)) + MyEarlyPlugin.__name__) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From f30fb4039372c30164fe07eba5e2339267c39181 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 17:00:08 -0800 Subject: [PATCH 15/37] try tuple --- tests/test_warcprox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index e3cc270..62326c5 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1463,7 +1463,7 @@ def test_load_plugin(): warcprox.stats.RunningStats) assert isinstance( controller._postfetch_chain[-5], - MyEarlyPlugin.__name__) + (__name__, MyEarlyPlugin.__name__)) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From cb05fc0e0997d88b5bc4f19eb8bbb4f7bdd63205 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 18:31:00 -0800 Subject: [PATCH 16/37] test issubclass --- tests/test_warcprox.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 62326c5..0a9d560 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1427,11 +1427,10 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -class MyEarlyPlugin(warcprox.BasePostfetchProcessor): +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' def _get_process_put(self): - recorded_url = self.inq.get(block=True, timeout=0.5) pass @@ -1461,9 +1460,9 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert isinstance( + assert issubclass( controller._postfetch_chain[-5], - (__name__, MyEarlyPlugin.__name__)) + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 9e2f357bab7f5b594acb63923f1bebd114214f63 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 19:49:12 -0800 Subject: [PATCH 17/37] restore master test_warcprox.py --- tests/test_warcprox.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 0a9d560..f054d69 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,20 +1426,11 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' - -class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): - CHAIN_POSITION = 'early' - - def _get_process_put(self): - pass - - def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor', - '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) + 'warcprox.BaseBatchPostfetchProcessor',]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1460,9 +1451,6 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - assert issubclass( - controller._postfetch_chain[-5], - warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 84e5110bcb1e70a8224ffed98d0d4b86b95d4a97 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 21:36:00 -0800 Subject: [PATCH 18/37] [0] isinstance of parent class --- tests/test_warcprox.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index f054d69..2f4f8e2 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1426,11 +1426,19 @@ def test_controller_with_defaults(): assert not wwp.writer_pool.default_warc_writer.record_builder.base32 assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' + +class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): + CHAIN_POSITION = 'early' + def _process_put(self): + pass + + def test_load_plugin(): options = warcprox.Options(port=0, plugins=[ 'warcprox.stats.RunningStats', 'warcprox.BaseStandardPostfetchProcessor', - 'warcprox.BaseBatchPostfetchProcessor',]) + 'warcprox.BaseBatchPostfetchProcessor', + '%s.%s' % (__name__, MyEarlyPlugin.__name__),]) controller = warcprox.controller.WarcproxController(options) assert isinstance( controller._postfetch_chain[-1], @@ -1451,6 +1459,10 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) + # MyEarlyPlugin + assert isinstance( + controller._postfetch_chain[0], + warcprox.BaseStandardPostfetchProcessor) def test_choose_a_port_for_me(warcprox_): options = warcprox.Options() From 3161793c5c1077141e6e0ff7e7a46b5403d1243d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 27 Feb 2018 22:23:40 -0800 Subject: [PATCH 19/37] add test_do_not_archive --- tests/test_writer.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_writer.py b/tests/test_writer.py index fb39378..126932a 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -163,6 +163,47 @@ def test_special_dont_write_prefix(): wwt.join() +def test_do_not_archive(): + with tempfile.TemporaryDirectory() as tmpdir: + logging.debug('cd %s', tmpdir) + os.chdir(tmpdir) + + wwt = warcprox.writerthread.WarcWriterProcessor( + Options(writer_threads=1)) + wwt.inq = warcprox.TimestampedQueue(maxsize=1) + wwt.outq = warcprox.TimestampedQueue(maxsize=1) + try: + wwt.start() + # to be written -- default do_not_archive False + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/yes', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest)) + # not to be written -- do_not_archive set True + recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) + recorder.read() + wwt.inq.put(RecordedUrl( + url='http://example.com/no', content_type='text/plain', + status=200, client_ip='127.0.0.2', request_data=b'abc', + response_recorder=recorder, remote_ip='127.0.0.3', + timestamp=datetime.utcnow(), + payload_digest=recorder.block_digest, + warcprox_meta={'warc-prefix': '-'}, + do_not_archive=True)) + recorded_url = wwt.outq.get(timeout=10) + assert recorded_url.warc_records + recorded_url = wwt.outq.get(timeout=10) + assert not recorded_url.warc_records + assert wwt.outq.empty() + finally: + wwt.stop.set() + wwt.join() + + def test_warc_writer_filename(tmpdir): """Test if WarcWriter is writing WARC files with custom filenames. """ From 3f10aafdc4c7d737c6391cbbac60f07e98c57daa Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 28 Feb 2018 15:06:21 -0800 Subject: [PATCH 20/37] fix merge conflict --- tests/test_warcprox.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 3df6be2..ac0b986 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1427,15 +1427,9 @@ def test_controller_with_defaults(): assert wwp.writer_pool.default_warc_writer.record_builder.digest_algorithm == 'sha1' -<<<<<<< HEAD -class MyEarlyPlugin(warcprox.BaseStandardPostfetchProcessor): - CHAIN_POSITION = 'early' - def _process_put(self): -======= class EarlyPlugin(warcprox.BaseStandardPostfetchProcessor): CHAIN_POSITION = 'early' def _process_url(self): ->>>>>>> do_not_archive pass From 56a65741bc54fd9bcbb3bd5b5663731169dac3aa Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 20 Jun 2019 14:52:28 -0700 Subject: [PATCH 21/37] fix link --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 77e7e58..73b9161 100644 --- a/README.rst +++ b/README.rst @@ -90,7 +90,7 @@ for deduplication works similarly to deduplication by `Heritrix a. Write ``response`` record with full payload b. Store new entry in deduplication database (can be disabled, see - `Warcprox-Meta HTTP request header ` + `Warcprox-Meta HTTP request header `_ The deduplication database is partitioned into different "buckets". URLs are deduplicated only against other captures in the same bucket. If specified, the From 32200db7ab546d610a00b28fe804334a49e6387f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 13 Sep 2019 11:43:39 -0700 Subject: [PATCH 22/37] log long-running fetches --- warcprox/mitmproxy.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 6a7ce4c..9bac478 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -561,15 +561,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): 'bytes exceeded for URL %s', self._max_resource_size, self.url) break - elif (not 'content-length' in self.headers - and time.time() - start > 3 * 60 * 60): - prox_rec_res.truncated = b'time' - self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) - self._remote_server_conn.sock.close() - self.logger.info( - 'reached hard timeout of 3 hours fetching url ' - 'without content-length: %s', self.url) - break + elif time.time() - start > 3 * 60 * 60: + if not 'content-length' in self.headers: + prox_rec_res.truncated = b'time' + self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) + self._remote_server_conn.sock.close() + self.logger.info( + 'reached hard timeout of 3 hours fetching url ' + 'without content-length: %s', self.url) + break + else: + self.logger.info( + 'long-running fetch for URL %s', self.url) self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) # Let's close off the remote end. If remote connection is fine, From 89e6745274d8470624c2e3c147060f31b3621e9a Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 8 Jul 2020 16:48:05 +0000 Subject: [PATCH 23/37] Handle RuntimeError Some times when warcprox runs for several days under load it freezes and the last error in the log is: ``` WARNING:warcprox.warcproxy.WarcProxy:exception processing request from ('207.241.225.241', 40738) Traceback (most recent call last): File "/usr/lib/python3.7/socketserver.py", line 316, in _handle_request_noblock self.process_request(request, client_address) File "/opt/spn2/lib/python3.7/site-packages/warcprox/mitmproxy.py", line 641, in process_request self.process_request_thread, request, client_address) File "/usr/lib/python3.7/concurrent/futures/thread.py", line 172, in submit self._adjust_thread_count() File "/usr/lib/python3.7/concurrent/futures/thread.py", line 193, in _adjust_thread_count t.start() File "/usr/lib/python3.7/threading.py", line 852, in start _start_new_thread(self._bootstrap, ()) RuntimeError: can't start new thread ``` The process seems to run but it doesn't respond to any connection, not even `status` requests. We handle this exception and allow it to continue operation. --- warcprox/mitmproxy.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 6b32a40..88f6c1e 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -637,13 +637,17 @@ class PooledMixIn(socketserver.ThreadingMixIn): def process_request(self, request, client_address): self.active_requests[request] = doublethink.utcnow() - future = self.pool.submit( - self.process_request_thread, request, client_address) - future.add_done_callback( - lambda f: self.active_requests.pop(request, None)) - if future.done(): - # avoid theoretical timing issue, in case process_request_thread - # managed to finish before future.add_done_callback() ran + try: + future = self.pool.submit( + self.process_request_thread, request, client_address) + future.add_done_callback( + lambda f: self.active_requests.pop(request, None)) + if future.done(): + # avoid theoretical timing issue, in case process_request_thread + # managed to finish before future.add_done_callback() ran + self.active_requests.pop(request, None) + except RuntimeError as exc: + self.logger.error("Error processing request %s", str(exc)) self.active_requests.pop(request, None) def get_request(self): From e29d377dfd602d0dbdfb71e7a292c160855404bc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 25 Sep 2020 15:58:47 -0700 Subject: [PATCH 24/37] fix for TypeError --- warcprox/controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 84c3b93..f0a7815 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -166,7 +166,7 @@ class WarcproxController(object): with processor.inq.mutex: l = list(processor.inq.queue) for recorded_url in l: - if earliest is None or recorded_url.timestamp < earliest: + if not earliest or (earliest and (recorded_url.timestamp < earliest)): earliest = recorded_url.timestamp return earliest From 42676cfb35b501f7fb5c6f64f699993d80665765 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 30 Sep 2020 11:00:35 -0700 Subject: [PATCH 25/37] check record_url.timestamp --- warcprox/controller.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index f0a7815..43ef25f 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -166,8 +166,9 @@ class WarcproxController(object): with processor.inq.mutex: l = list(processor.inq.queue) for recorded_url in l: - if not earliest or (earliest and (recorded_url.timestamp < earliest)): - earliest = recorded_url.timestamp + if recorded_url.timestamp: + if not earliest or (recorded_url.timestamp < earliest): + earliest = recorded_url.timestamp return earliest def postfetch_status(self): From 533234162ee7bafd6c6ce6e0110be0619117be99 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 6 Dec 2021 19:32:35 -0800 Subject: [PATCH 26/37] str, not object --- warcprox/dedup.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index e8e95c7..09f5996 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -418,11 +418,14 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): # for duplicate checks, see https://webarchive.jira.com/browse/WT-31 hash_plus_urls = set() for recorded_url in batch: + if recorded_url.payload_digest: + hash_plus_url = ''.join((warcprox.digest_str( + recorded_url.payload_digest, self.options.base32), recorded_url.url.decode())) if (recorded_url.response_recorder and recorded_url.payload_digest and self.trough_dedup_db.should_dedup(recorded_url) and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) not in hash_plus_urls): - hash_plus_urls.add('{}{}'.format(recorded_url.payload_digest, recorded_url.url)) + hash_plus_urls.add(hash_plus_url) if (recorded_url.warcprox_meta and 'dedup-buckets' in recorded_url.warcprox_meta): for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items(): @@ -430,9 +433,9 @@ class BatchTroughLoader(warcprox.BaseBatchPostfetchProcessor): else: buckets['__unspecified__'].append(recorded_url) else: - if recorded_url.payload_digest and '{}{}'.format(recorded_url.payload_digest, recorded_url.url) in hash_plus_urls: + if hash_plus_url in hash_plus_urls: self.logger.debug( - 'discarding duplicate {} {}'.format(recorded_url.payload_digest, recorded_url.url)) + 'discarding duplicate {}'.format(hash_plus_url) discards.append( warcprox.digest_str( recorded_url.payload_digest, self.options.base32) From a66a5157c76e771b77bae6e341e7bc5417653e97 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 29 Dec 2021 11:57:35 -0800 Subject: [PATCH 27/37] bump qa version too --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d76e45e..c9418fe 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.27', + version='2.4.28', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From d3fdcbe152a72ac4548ae92f9bfac11d83b2545d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 1 Jul 2022 11:28:28 -0700 Subject: [PATCH 28/37] bump qa version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 52af206..7619340 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.29', + version='2.4.30-qa', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 106cb905db6ab5526fdfc22b79ecf1e5e4cce8cd Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 11:15:25 -0700 Subject: [PATCH 29/37] zlib decompression --- warcprox/warcproxy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 05eb8b7..2034ec0 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -46,6 +46,7 @@ import tempfile import hashlib import doublethink import re +import zlib class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -175,7 +176,9 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox_meta = json.loads(self.headers['Warcprox-Meta']) self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) - self._enforce_blocks(warcprox_meta) + if 'blocks' in warcprox_meta: + warcprox_meta['blocks'] = zlib.decompress(warcprox_meta['blocks']).decode() + self._enforce_blocks(warcprox_meta) def _connect_to_remote_server(self): ''' From 3e8102221d794ab67e8d1c85d8cef47a086da5fa Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 14:59:36 -0700 Subject: [PATCH 30/37] use 'compressed_blocks' --- warcprox/warcproxy.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2034ec0..be2affd 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -176,9 +176,10 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox_meta = json.loads(self.headers['Warcprox-Meta']) self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) - if 'blocks' in warcprox_meta: - warcprox_meta['blocks'] = zlib.decompress(warcprox_meta['blocks']).decode() - self._enforce_blocks(warcprox_meta) + if 'compressed_blocks' in warcprox_meta: + warcprox_meta['blocks'] = zlib.decompress(warcprox_meta['compressed_blocks']).decode() + del warcprox_meta['compressed_blocks'] + self._enforce_blocks(warcprox_meta) def _connect_to_remote_server(self): ''' From a232ffc6ba3bcf61285294166caa275cc7e82d05 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 15:04:39 -0700 Subject: [PATCH 31/37] bump qa version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7619340..92a0daf 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.30-qa', + version='2.4.31-qa', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 2cdfceade15bbf013ce7f0653e8b3b6f9caec7e1 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 15:49:54 -0700 Subject: [PATCH 32/37] decompress and split --- warcprox/warcproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index be2affd..645055e 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -177,7 +177,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) if 'compressed_blocks' in warcprox_meta: - warcprox_meta['blocks'] = zlib.decompress(warcprox_meta['compressed_blocks']).decode() + warcprox_meta['blocks'] = zlib.decompress(warcprox_meta['compressed_blocks']).decode().split('~~') del warcprox_meta['compressed_blocks'] self._enforce_blocks(warcprox_meta) From 053a42a371ccbdb5f1fc22846b3b3dbf8c82a79d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 16:30:49 -0700 Subject: [PATCH 33/37] bump qa version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 68f81ba..4bf6650 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.31-qa', + version='2.4.31-qa-1', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 1205589faeea9dc467ac17ccbdd61d5dac7ebc18 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 16:52:31 -0700 Subject: [PATCH 34/37] decompress and json.loads --- warcprox/warcproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 645055e..42996e5 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -177,7 +177,7 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) if 'compressed_blocks' in warcprox_meta: - warcprox_meta['blocks'] = zlib.decompress(warcprox_meta['compressed_blocks']).decode().split('~~') + warcprox_meta['blocks'] = json.loads(zlib.decompress(warcprox_meta['compressed_blocks']).decode()) del warcprox_meta['compressed_blocks'] self._enforce_blocks(warcprox_meta) From 20789e4edb9a89a94260058886a51a4bb0c63cc4 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 16:53:52 -0700 Subject: [PATCH 35/37] bump qa version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4bf6650..40ebddb 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.31-qa-1', + version='2.4.31-qa-2', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 8f10fce93ade2f45311f15cba404a893e39f7bea Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 3 Aug 2022 19:58:54 -0700 Subject: [PATCH 36/37] resetting to Jul 1 updates --- setup.py | 4 ++-- tests/test_warcprox.py | 1 - warcprox/controller.py | 5 ++--- warcprox/mitmproxy.py | 39 ++++++++++++++++----------------------- warcprox/warcproxy.py | 4 ---- 5 files changed, 20 insertions(+), 33 deletions(-) diff --git a/setup.py b/setup.py index 40ebddb..52af206 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - setuptools installation configuration for warcprox -Copyright (C) 2013-2022 Internet Archive +Copyright (C) 2013-2021 Internet Archive This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.31-qa-2', + version='2.4.29', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 3ca74f2..c115b4f 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -1730,7 +1730,6 @@ def test_load_plugin(): assert isinstance( controller._postfetch_chain[-4].listener, warcprox.stats.RunningStats) - # MyEarlyPlugin assert isinstance( controller._postfetch_chain[0], EarlyPlugin) diff --git a/warcprox/controller.py b/warcprox/controller.py index 954cbc1..8d670cb 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -166,9 +166,8 @@ class WarcproxController(object): with processor.inq.mutex: l = list(processor.inq.queue) for recorded_url in l: - if recorded_url.timestamp: - if not earliest or (recorded_url.timestamp < earliest): - earliest = recorded_url.timestamp + if earliest is None or recorded_url.timestamp < earliest: + earliest = recorded_url.timestamp return earliest def postfetch_status(self): diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index a55b860..a423a22 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -596,18 +596,15 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): 'bytes exceeded for URL %s', self._max_resource_size, self.url) break - elif time.time() - start > 3 * 60 * 60: - if not 'content-length' in self.headers: - prox_rec_res.truncated = b'time' - self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) - self._remote_server_conn.sock.close() - self.logger.info( - 'reached hard timeout of 3 hours fetching url ' - 'without content-length: %s', self.url) - break - else: - self.logger.info( - 'long-running fetch for URL %s', self.url) + elif (not 'content-length' in self.headers + and time.time() - start > 3 * 60 * 60): + prox_rec_res.truncated = b'time' + self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR) + self._remote_server_conn.sock.close() + self.logger.info( + 'reached hard timeout of 3 hours fetching url ' + 'without content-length: %s', self.url) + break self.log_request(prox_rec_res.status, prox_rec_res.recorder.len) # Let's close off the remote end. If remote connection is fine, @@ -675,17 +672,13 @@ class PooledMixIn(socketserver.ThreadingMixIn): def process_request(self, request, client_address): self.active_requests[request] = doublethink.utcnow() - try: - future = self.pool.submit( - self.process_request_thread, request, client_address) - future.add_done_callback( - lambda f: self.active_requests.pop(request, None)) - if future.done(): - # avoid theoretical timing issue, in case process_request_thread - # managed to finish before future.add_done_callback() ran - self.active_requests.pop(request, None) - except RuntimeError as exc: - self.logger.error("Error processing request %s", str(exc)) + future = self.pool.submit( + self.process_request_thread, request, client_address) + future.add_done_callback( + lambda f: self.active_requests.pop(request, None)) + if future.done(): + # avoid theoretical timing issue, in case process_request_thread + # managed to finish before future.add_done_callback() ran self.active_requests.pop(request, None) def get_request(self): diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 42996e5..05eb8b7 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -46,7 +46,6 @@ import tempfile import hashlib import doublethink import re -import zlib class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): ''' @@ -176,9 +175,6 @@ class WarcProxyHandler(warcprox.mitmproxy.MitmProxyHandler): warcprox_meta = json.loads(self.headers['Warcprox-Meta']) self._security_check(warcprox_meta) self._enforce_limits(warcprox_meta) - if 'compressed_blocks' in warcprox_meta: - warcprox_meta['blocks'] = json.loads(zlib.decompress(warcprox_meta['compressed_blocks']).decode()) - del warcprox_meta['compressed_blocks'] self._enforce_blocks(warcprox_meta) def _connect_to_remote_server(self): From 58f3e58531119266c307387fee7c266f80561c25 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 4 Aug 2022 15:13:43 -0700 Subject: [PATCH 37/37] bump qa version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 52af206..c3450eb 100755 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.29', + version='2.4.29-qa-220804', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt',