From 7eab061cd48fe2f0dbde59ee4bf07572c939ef49 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sat, 17 Feb 2018 14:53:18 +0000 Subject: [PATCH 1/5] Use updated list of SSL ciphers We use the default list of SSL ciphers of python `ssl` module when we connect to remote hosts. That list is probably outdated. https://github.com/python/cpython/blob/3.6/Lib/ssl.py#L192 We noticed problems when connecting to various targets. E.g. ``` 2018-01-31 21:29:23,870 3067 WARNING MitmProxyHandler(tid=8052,started=2018-01-31T21:29:22.501118,client=127.0.0.1:56340) warcprox.warcprox.WarcProxyHandler.log_error(mitmproxy.py:447) code 500, message EOF occurred in violation of protocol (_ssl.c:645) 2018-01-31 21:29:23,987 3067 ERROR MitmProxyHandler(tid=7327,started=2018-01-31T21:29:22.741262,client=127.0.0.1:56448) warcprox.warcprox.WarcProxyHandler.do_CONNECT(mitmproxy.py:311) problem handling 'CONNECT beacon.krxd.net:443 HTTP/1.1': SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:645)') 2018-01-31 21:29:23,870 3067 ERROR MitmProxyHandler(tid=8052,started=2018-01-31T21:29:22.501118,client=127.0.0.1:56340) warcprox.warcprox.WarcProxyHandler.do_CONNECT(mitmproxy.py:311) problem handling 'CONNECT px.surveywall-api.survata.com:443 HTTP/1.1': SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:645)') ``` Research indicated that the cipher selection is not proper. I use `urllib3` cipher selection for better compatibility. https://github.com/shazow/urllib3/blob/master/urllib3/util/ssl_.py#L71 The `urllib3` list is bigger and includes TLS13 which from my experience is the latest state of the art. 
`ssl` module ciphers: ``` 'ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES:ECDH+RC4:DH+RC4:RSA+RC4:!aNULL:!eNULL:!MD5' ``` `urllib3` module ciphers: ``` 'TLS13-AES-256-GCM-SHA384:TLS13-CHACHA20-POLY1305-SHA256:TLS13-AES-128-GCM-SHA256:ECDH+AESGCM:ECDH+CHACHA20:DH+AESGCM:DH+CHACHA20:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:RSA+AESGCM:RSA+AES:!aNULL:!eNULL:!MD5' ``` --- warcprox/mitmproxy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 95d5b31..cfb86c9 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -64,6 +64,7 @@ import urlcanon import time import collections import cProfile +from urllib3.util.ssl_ import DEFAULT_CIPHERS class ProxyingRecorder(object): """ @@ -257,8 +258,10 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): context = ssl.create_default_context() context.check_hostname = False context.verify_mode = ssl.CERT_NONE + context.ciphers = DEFAULT_CIPHERS self._remote_server_sock = context.wrap_socket( - self._remote_server_sock, server_hostname=self.hostname) + self._remote_server_sock, server_hostname=self.hostname, + ) except AttributeError: try: self._remote_server_sock = ssl.wrap_socket( From 7d76059d4e1e8f9abebf92cca93ac778c2d395b6 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sat, 17 Feb 2018 19:24:14 +0000 Subject: [PATCH 2/5] Fixed typo --- warcprox/mitmproxy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index cfb86c9..1c40968 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -260,8 +260,7 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): context.verify_mode = ssl.CERT_NONE context.ciphers = DEFAULT_CIPHERS self._remote_server_sock = context.wrap_socket( - self._remote_server_sock, server_hostname=self.hostname, - ) + self._remote_server_sock, 
server_hostname=self.hostname) except AttributeError: try: self._remote_server_sock = ssl.wrap_socket( From 46dd01de892215ea08a822e3188097b030204f64 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 15:48:21 -0800 Subject: [PATCH 3/5] add do_not_archive check to should_archive --- warcprox/writerthread.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 1010161..27c5eea 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -81,8 +81,12 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.warcprox_meta and 'warc-prefix' in recorded_url.warcprox_meta else self.options.prefix) + do_not_archive = (recorded_url.do_not_archive + if recorded_url.do_not_archive + else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and self._filter_accepts(recorded_url) + return prefix != '-' and (not do_not_archive) and + self._filter_accepts(recorded_url) def _log(self, recorded_url, records): try: From 982700d503d38ce95ad09f18f6a87314a99e0a6f Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 14 Feb 2018 17:55:09 -0800 Subject: [PATCH 4/5] add CHAIN_POSITION support --- warcprox/controller.py | 2 ++ warcprox/writerthread.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/warcprox/controller.py b/warcprox/controller.py index 30446c3..644fdec 100644 --- a/warcprox/controller.py +++ b/warcprox/controller.py @@ -212,6 +212,8 @@ class WarcproxController(object): self._postfetch_chain.append( warcprox.ListenerPostfetchProcessor( plugin, self.options)) + elif hasattr(plugin, 'CHAIN_POSITION') and plugin.CHAIN_POSITION == 'early': + self._postfetch_chain.insert(0, plugin) # or insert early but later than 0? 
else: self._postfetch_chain.append(plugin) diff --git a/warcprox/writerthread.py b/warcprox/writerthread.py index 27c5eea..854319c 100644 --- a/warcprox/writerthread.py +++ b/warcprox/writerthread.py @@ -85,8 +85,8 @@ class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): if recorded_url.do_not_archive else False) # special warc name prefix '-' means "don't archive" - return prefix != '-' and (not do_not_archive) and - self._filter_accepts(recorded_url) + return (prefix != '-' and (not do_not_archive) + and self._filter_accepts(recorded_url)) def _log(self, recorded_url, records): try: From 483ed8016e84cefd704b63b6cae355dd271a90fc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Thu, 15 Feb 2018 13:56:14 -0800 Subject: [PATCH 5/5] add do_not_archive to class --- warcprox/warcproxy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 5b36300..e55b295 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -330,7 +330,7 @@ class RecordedUrl: warcprox_meta=None, content_type=None, custom_type=None, status=None, size=None, client_ip=None, method=None, timestamp=None, host=None, duration=None, referer=None, - payload_digest=None, warc_records=None): + payload_digest=None, warc_records=None, do_not_archive=False): # XXX should test what happens with non-ascii url (when does # url-encoding happen?) if type(url) is not bytes: @@ -370,6 +370,7 @@ class RecordedUrl: self.referer = referer self.payload_digest = payload_digest self.warc_records = warc_records + self.do_not_archive = do_not_archive # inherit from object so that multiple inheritance from this class works # properly in python 2