From 2b408b3af0b86227d42466773154aa3ab2abfeaa Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Sep 2019 10:56:58 -0700 Subject: [PATCH 01/12] avoid this problem 2019-09-13 17:15:40,659 594 CRITICAL MainThread warcprox.controller.WarcproxController.run_until_shutdown(controller.py:447) graceful shutdown failed Traceback (most recent call last): File "/opt/warcprox-ve3/lib/python3.5/site-packages/warcprox/controller.py", line 445, in run_until_shutdown self.shutdown() File "/opt/warcprox-ve3/lib/python3.5/site-packages/warcprox/controller.py", line 371, in shutdown self.proxy.server_close() File "/opt/warcprox-ve3/lib/python3.5/site-packages/warcprox/warcproxy.py", line 503, in server_close warcprox.mitmproxy.PooledMitmProxy.server_close(self) File "/opt/warcprox-ve3/lib/python3.5/site-packages/warcprox/mitmproxy.py", line 754, in server_close for sock in self.remote_server_socks: RuntimeError: Set changed size during iteration --- warcprox/mitmproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 6a7ce4c..2d7f9b6 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -759,7 +759,7 @@ class PooledMitmProxy(PooledMixIn, MitmProxy): Abort active connections to remote servers to achieve prompt shutdown. ''' self.shutting_down = True - for sock in self.remote_server_socks: + for sock in list(self.remote_server_socks): self.shutdown_request(sock) class SingleThreadedMitmProxy(http_server.HTTPServer): From a8cd219da7fa2c228f650f397d817bd034e6bf57 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Sep 2019 10:57:28 -0700 Subject: [PATCH 02/12] add missing import fixes this problem: Traceback (most recent call last): File "/opt/warcprox-ve3/lib/python3.5/site-packages/warcprox/main.py", line 330, in main controller.run_until_shutdown() File "/opt/warcprox-ve3/lib/python3.5/site-packages/warcprox/controller.py", line 449, in run_until_shutdown os.kill(os.getpid(), 9) NameError: name 'os' is not defined --- warcprox/controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/warcprox/controller.py b/warcprox/controller.py index 9a2880e..84c3b93 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -37,6 +37,7 @@ import doublethink import importlib import queue import socket +import os class Factory: @staticmethod From 88a7f79a7edcc86e44f354e4907b3d7456a6366b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Sep 2019 10:58:16 -0700 Subject: [PATCH 03/12] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8c0415e..bf35b6a 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.18', + version='2.4.19', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 84a46e4323265b40950d7a57b94db5ca5e7d4b34 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 20 Sep 2019 05:55:51 +0000 Subject: [PATCH 04/12] Increase remote_connection_pool maxsize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We noticed a lot of log entries like this in production: ``` WARNING:urllib3.connectionpool:Connection pool is full, discarding connection: static.xx.fbcdn.net ``` this happens because we use a `PoolManager` and create a number of pools (param `num_pools`) but the number of connections each pool can have is just 1 by default (param `maxsize` is 1 by default). `urllib3` docs say: `maxsize` – Number of connections to save that can be reused. More than 1 is useful in multithreaded situations. Ref: https://urllib3.readthedocs.io/en/1.2.1/pools.html#urllib3.connectionpool.HTTPConnectionPool I suggest to use `maxsize=10` and re-evaluate after some time if its big enough. This improvement will boost performance as we'll reuse more connections to remote hosts. --- warcprox/mitmproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 2d7f9b6..929bc49 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -777,7 +777,7 @@ class SingleThreadedMitmProxy(http_server.HTTPServer): self.bad_hostnames_ports_lock = RLock() self.remote_connection_pool = PoolManager( - num_pools=max((options.max_threads or 0) // 6, 400)) + num_pools=max((options.max_threads or 0) // 6, 400), maxsize=10) if options.onion_tor_socks_proxy: try: From 8f20fc014ea003e0ec7f39b84907d34378973d71 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 20 Sep 2019 06:31:15 +0000 Subject: [PATCH 05/12] Skip cdx dedup for volatile URLs with session params A lot of cdx dedup requests fail. Checking production logs, we see that we try to dedup URLs that are certainly volative and session-specific. We can skip them to reduce cdx dedup load. We won't find any matches anyway since they contain session-specific vars. We suggest to skip cdx dedup for URL that include `JSESSIONID=`, `session=` or `sess=`. These are common session URL params, there could be many-many more. Example URLs: ``` /session/683/urii8zej/xhr_streaming?JSESSIONID=dv0jkbk2-8xm9t9tf-7wp8lx0m-x4vb22ys https://tw.popin.cc/popin_discovery/recommend?mode=new&url=https%3A%2F%2Fwww.nownews.com%2Fcat%2Fpolitics%2Fmilitary%2F&&device=pc&media=www.nownews.com&extra=other&agency=cnplus&topn=100&ad=100&r_category=all&country=tw&redirect=false&infinite=nownews&infinite_domain=m.nownews.com&piuid=43757d2474f09288b8410a9f2a40acf1&info=eyJ1c2VyX3RkX29zIjoib3RoZXIiLCJ1c2VyX3RkX29zX3ZlcnNpb24iOiIwLjAuMCIsInVzZXJfdGRfYnJvd3NlciI6IkNocm9tZSIsInVzZXJfdGRfYnJvd3Nlcl92ZXJzaW9uIjoiNzQuMC4zNzI5IiwidXNlcl90ZF9zY3JlZW4iOiIxNjAweDEwMDAiLCJ1c2VyX3RkX3ZpZXdwb3J0IjoiMTEwMHg3ODQiLCJ1c2VyX3RkX3VzZXJfYWdlbnQiOiJNb3ppbGxhLzUuMCAoWDExOyBMaW51eCB4ODZfNjQpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIFVidW50dSBDaHJvbWl1bS83NC4wLjM3MjkuMTY5IENocm9tZS83NC4wLjM3MjkuMTY5IFNhZmFyaS81MzcuMzYiLCJ1c2VyX3RkX3JlZmVycmVyIjoiIiwidXNlcl90ZF9wYXRoIjoiL2NhdC9wb2xpdGljcy9taWxpdGFyeS8iLCJ1c2VyX3RkX2NoYXJzZXQiOiJ1dGYtOCIsInVzZXJfdGRfbGFuZ3VhZ2UiOiJlbi11cyIsInVzZXJfdGRfY29sb3IiOiIyNC1iaXQiLCJ1c2VyX3RkX3RpdGxlIjoiJUU4JUJCJThEJUU2JUFEJUE2JTIwJTdDJTIwTk9XbmV3cyUyMCVFNCVCQiU4QSVFNiU5NyVBNSVFNiU5NiVCMCVFOCU4MSU5RSIsInVzZXJfdGRfdXJsIjoiaHR0cHM6Ly93d3cubm93bmV3cy5jb20vY2F0L3BvbGl0aWNzL21pbGl0YXJ5LyIsInVzZXJfdGRfcGxhdGZvcm0iOiJMaW51eCB4ODZfNjQiLCJ1c2VyX3RkX2hvc3QiOiJ3d3cubm93bmV3cy5jb20iLCJ1c2VyX2RldmljZSI6InBjIiwidXNlcl90aW1lIjoxNTYyMDAxMzkyNzY2fQ==&session=13927861b5403&callback=_p6_8e102dd0c975 http://c.statcounter.com/text.php?sc_project=4092884&java=1&security=10fe3b6b&u1=915B47A927524F10185B2F074074BDCB&sc_random=0.017686960888044556&jg=310&rr=1.1.1.1.1.1.1.1.1&resolution=1600&h=1000&camefrom=&u=http%3A//buchlatech.blogspot.com/search/label/prototype&t=Buchla%20Tech%3A%20prototype&rcat=d&rdomo=d&rdomg=310&bb=0&sc_snum=1&sess=cfa820&p=0&text=2 ``` --- warcprox/dedup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/warcprox/dedup.py b/warcprox/dedup.py index 9562fa5..0e09239 100644 --- a/warcprox/dedup.py +++ b/warcprox/dedup.py @@ -266,6 +266,9 @@ class CdxServerDedup(DedupDb): performance optimisation to handle that. limit < 0 is very inefficient in general. Maybe it could be configurable in the future. + Skip dedup for URLs with session params. These URLs are certainly + unique and highly volatile, we cannot dedup them. + :param digest_key: b'sha1:' (prefix is optional). Example: b'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' :param url: Target URL string @@ -274,6 +277,8 @@ class CdxServerDedup(DedupDb): """ u = url.decode("utf-8") if isinstance(url, bytes) else url try: + if any(s in u for s in ('JSESSIONID=', 'session=', 'sess=')): + return None result = self.http_pool.request('GET', self.cdx_url, fields=dict( url=u, fl="timestamp,digest", filter="!mimetype:warc/revisit", limit=-1)) From 6536516375e65841531ad3fc2e687828d887804a Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 20 Sep 2019 12:49:09 +0000 Subject: [PATCH 06/12] Handle ValueError when trying to close WARC file We get a lot of the following error in production and warcprox becomes totally unresponsive when this happens. ``` CRITICAL:warcprox.writerthread.WarcWriterProcessor:WarcWriterProcessor(tid=16646) will try to continue after unexpected error Traceback (most recent call last): File "/opt/spn2/lib/python3.5/site-packages/warcprox/__init__.py", line 140, in _run self._get_process_put() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writerthread.py", line 60, in _get_process_put self.writer_pool.maybe_idle_rollover() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 233, in maybe_idle_rollover w.maybe_idle_rollover() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 188, in maybe_idle_rollover self.close() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 169, in close fcntl.lockf(self.f, fcntl.LOCK_UN) ValueError: I/O operation on closed file ``` Current code handles `IOError`. We also need to handle `ValueError` to address this. --- warcprox/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/writer.py b/warcprox/writer.py index 730d606..639e96d 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -167,7 +167,7 @@ class WarcWriter: if self.open_suffix == '': try: fcntl.lockf(self.f, fcntl.LOCK_UN) - except IOError as exc: + except (IOError, ValueError) as exc: self.logger.error( 'could not unlock file %s (%s)', self.path, exc) self.f.close() From 407e89025844cfd8261aa43e4fdbea33e244c801 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sat, 21 Sep 2019 09:29:19 +0000 Subject: [PATCH 07/12] Set connection pool maxsize=6 --- warcprox/mitmproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 929bc49..6b32a40 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -777,7 +777,7 @@ class SingleThreadedMitmProxy(http_server.HTTPServer): self.bad_hostnames_ports_lock = RLock() self.remote_connection_pool = PoolManager( - num_pools=max((options.max_threads or 0) // 6, 400), maxsize=10) + num_pools=max((options.max_threads or 0) // 6, 400), maxsize=6) if options.onion_tor_socks_proxy: try: From a09901dcefe8060bb2802d7937aec8570048d4c5 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sat, 21 Sep 2019 09:43:27 +0000 Subject: [PATCH 08/12] Use "except Exception" to catch all exception types --- warcprox/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warcprox/writer.py b/warcprox/writer.py index 639e96d..5187d08 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -167,7 +167,7 @@ class WarcWriter: if self.open_suffix == '': try: fcntl.lockf(self.f, fcntl.LOCK_UN) - except (IOError, ValueError) as exc: + except Exception as exc: self.logger.error( 'could not unlock file %s (%s)', self.path, exc) self.f.close() From d1b52f8d804d7b0ad196d97aa7a7cda13b7fa786 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 23 Sep 2019 11:16:48 -0700 Subject: [PATCH 09/12] try to fix test failing due to url-encoding https://travis-ci.org/internetarchive/warcprox/jobs/588557539 test_domain_data_soft_limit not sure what changed, maybe the requests library, though i can't reproduce locally, but explicitly decoding should fix the problem --- tests/test_warcprox.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 884ddd4..d95f51c 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -52,6 +52,7 @@ import mock import email.message import socketserver from concurrent import futures +import urllib.parse try: import http.server as http_server @@ -175,8 +176,10 @@ class _TestHttpRequestHandler(http_server.BaseHTTPRequestHandler): def build_response(self): m = re.match(r'^/([^/]+)/([^/]+)$', self.path) if m is not None: - special_header = 'warcprox-test-header: {}!'.format(m.group(1)).encode('utf-8') - payload = 'I am the warcprox test payload! {}!\n'.format(10*m.group(2)).encode('utf-8') + seg1 = urllib.parse.unquote(m.group(1)) + seg2 = urllib.parse.unquote(m.group(2)) + special_header = 'warcprox-test-header: {}!'.format(seg1).encode('utf-8') + payload = 'I am the warcprox test payload! {}!\n'.format(10*seg2).encode('utf-8') headers = (b'HTTP/1.1 200 OK\r\n' + b'Content-Type: text/plain\r\n' + special_header + b'\r\n' @@ -1351,7 +1354,7 @@ def test_domain_data_soft_limit( warcprox_.proxy.remote_connection_pool.clear() # novel, pushes stats over the limit - url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/~'.format(https_daemon.server_port) + url = 'https://muh.XN--Zz-2Ka.locALHOst:{}/z/%7E'.format(https_daemon.server_port) response = requests.get( url, proxies=archiving_proxies, headers=headers, stream=True, verify=False) From 1f852f5f3646aa6f6b1a7eb27a3a0517dddd4ae4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 23 Sep 2019 11:55:00 -0700 Subject: [PATCH 10/12] bump version after merges --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bf35b6a..ccb4b22 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.19', + version='2.4.20', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt', From 52e83632dd9bfab46c3387f50cb218d72ea3a005 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Thu, 26 Sep 2019 17:34:31 +0000 Subject: [PATCH 11/12] Another exception when trying to close a WARC file Recently, we found and fixed a problem when closing a WARC file. https://github.com/internetarchive/warcprox/pull/140 After using the updated warcprox in production, we got another exception in the same method, right after that point. ``` ERROR:root:caught exception processing b'https://abs.twimg.com/favicons/favicon.ico' Traceback (most recent call last): File "/opt/spn2/lib/python3.5/site-packages/warcprox/writerthread.py", line 78, in _process_url records = self.writer_pool.write_records(recorded_url) File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 227, in write_records return self._writer(recorded_url).write_records(recorded_url) File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 139, in write_records offset = self.f.tell() ValueError: I/O operation on closed file ERROR:warcprox.writer.WarcWriter:could not unlock file /1/liveweb/warcs/liveweb-20190923194044-wwwb-spn14.us.archive.org.warc.gz (I/O operation on closed file) CRITICAL:warcprox.writerthread.WarcWriterProcessor:WarcWriterProcessor(tid=6228) will try to continue after unexpected error Traceback (most recent call last): File "/opt/spn2/lib/python3.5/site-packages/warcprox/__init__.py", line 140, in _run self._get_process_put() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writerthread.py", line 60, in _get_process_put self.writer_pool.maybe_idle_rollover() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 233, in maybe_idle_rollover w.maybe_idle_rollover() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 188, in maybe_idle_rollover self.close() File "/opt/spn2/lib/python3.5/site-packages/warcprox/writer.py", line 176, in close os.rename(self.path, finalpath) FileNotFoundError: [Errno 2] No such file or directory: '/1/liveweb/warcs/liveweb-20190923194044-wwwb-spn14.us.archive.org.warc.gz' -> '/1/liveweb/warcs/liveweb-20190923194044-wwwb-spn14.us.archive.org.warc.gz' ``` We don't have a WARC file and our code tries to run `os.rename` on a file that doesn't exist. We add exception handling for that case as well. I should have foreseen that when doing the previous fix :( --- warcprox/writer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/warcprox/writer.py b/warcprox/writer.py index 5187d08..cc44be2 100644 --- a/warcprox/writer.py +++ b/warcprox/writer.py @@ -170,11 +170,14 @@ class WarcWriter: except Exception as exc: self.logger.error( 'could not unlock file %s (%s)', self.path, exc) - self.f.close() - finalpath = os.path.sep.join( - [self.directory, self.finalname]) - os.rename(self.path, finalpath) - + try: + self.f.close() + finalpath = os.path.sep.join( + [self.directory, self.finalname]) + os.rename(self.path, finalpath) + except Exception as exc: + self.logger.error( + 'could not close and rename file %s (%s)', self.path, exc) self.path = None self.f = None From f77c15203775cb06007364fd1ff04af8d40e6ca3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 26 Sep 2019 11:49:07 -0700 Subject: [PATCH 12/12] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ccb4b22..63d8488 100755 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ except: setuptools.setup( name='warcprox', - version='2.4.20', + version='2.4.21', description='WARC writing MITM HTTP/S proxy', url='https://github.com/internetarchive/warcprox', author='Noah Levitt',