From 56e7c78ea3da3e72d4a305e4e30c6240e3941be0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 29 Aug 2019 11:59:45 -0700 Subject: [PATCH] SOCKS Proxy Improvements (#504) * https over socks fix: fix issue with https url handling by using 'adapter.proxy_manager_for()' instead of 'adapter.get_connection' to get the proxy manager, which creates the connection indirectly (parallel to no-proxy path). - simplify socks config, avoiding global monkey-patch, as requests/urllib3 now support socks proxy directly and do not require patching global socket. - add SOCKS_DISABLE env var for dynamically disabling the socks proxy --- CHANGES.rst | 6 +- pywb/warcserver/http.py | 71 +++------------------- pywb/warcserver/resource/responseloader.py | 37 ++++++----- tests/test_socks.py | 21 +++---- 4 files changed, 46 insertions(+), 89 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 882b3b1c..87b50873 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,11 +1,15 @@ pywb 2.3.5 changelist ~~~~~~~~~~~~~~~~~~~~~ -* General auto-fetch fixes +* General auto-fetch fixes (#503) - Fixed issue that caused HTTP 404 errors to happen when parsing stylesheet hrefs as sheets (webrecorder/wombat #11) - Ensured that requests made are cached by the browser (webrecorder/wombat #13 & #15) - Ensured that the request made by the backing web worker when in proxy mode are not blocked by CORS (webrecorder/wombat #13 & #15) +* SOCKS proxy fixes (#504) + - simplify SOCKS config (avoiding global socket monkey patch), default to no cert verify to match non-proxy behavior + - SOCKS proxy can be disabled dynamically by setting SOCKS_DISABLE + pywb 2.3.4 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pywb/warcserver/http.py b/pywb/warcserver/http.py index 1ee6bfa9..4c2aa8c3 100644 --- a/pywb/warcserver/http.py +++ b/pywb/warcserver/http.py @@ -8,16 +8,17 @@ from urllib3.poolmanager import PoolManager six.moves.http_client._MAXHEADERS = 10000 six.moves.http_client._MAXLINE = 131072 -SOCKS_PROXIES = None -orig_getaddrinfo = None 
- +# ============================================================================= class PywbHttpAdapter(HTTPAdapter): """This adaptor exists exists to restore the default behavior of urllib3 < 1.25.x, which was to not verify ssl certs, until a better solution is found """ + # todo: allow configuring this later? + cert_reqs = 'CERT_NONE' + def init_poolmanager( self, connections, maxsize, block=DEFAULT_POOLBLOCK, **pool_kwargs ): @@ -29,10 +30,14 @@ class PywbHttpAdapter(HTTPAdapter): maxsize=maxsize, block=block, strict=True, - cert_reqs='CERT_NONE', + cert_reqs=self.cert_reqs, **pool_kwargs ) + def proxy_manager_for(self, proxy, **proxy_kwargs): + proxy_kwargs['cert_reqs'] = self.cert_reqs + return super(PywbHttpAdapter, self).proxy_manager_for(proxy, **proxy_kwargs) + # ============================================================================= class DefaultAdapters(object): @@ -42,61 +47,3 @@ class DefaultAdapters(object): requests.packages.urllib3.disable_warnings() - -# ============================================================================= -def patch_socks(): - try: - import socks - except ImportError: # pragma: no cover - print('Ignoring SOCKS_HOST: PySocks must be installed to use SOCKS proxy') - return - - import socket - - socks_host = os.environ.get('SOCKS_HOST') - socks_port = os.environ.get('SOCKS_PORT', 9050) - - # Set socks proxy and wrap the urllib module - socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, socks_host, socks_port, True) - # socket.socket = socks.socksocket # sets default socket to be the sockipy socket - - # store original getaddrinfo - global orig_getaddrinfo - orig_getaddrinfo = socks.socket.getaddrinfo - - # Perform DNS resolution through socket - def getaddrinfo(*args): - if args[0] in ('127.0.0.1', 'localhost'): - res = orig_getaddrinfo(*args) - - else: - res = [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))] - - return res - - socks.socket.getaddrinfo = getaddrinfo - - socks_url = 
'socks5h://{0}:{1}'.format(socks_host, socks_port) - - global SOCKS_PROXIES - SOCKS_PROXIES = {'http': socks_url, 'https': socks_url} - - -# ============================================================================= -def unpatch_socks(): - global orig_getaddrinfo - if not orig_getaddrinfo: - return - - import socks - - socks.socket.getaddrinfo = orig_getaddrinfo - orig_getaddrinfo = None - - global SOCKS_PROXIES - SOCKS_PROXIES = None - - -# ============================================================================= -if os.environ.get('SOCKS_HOST'): - patch_socks() diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index 830c9fbe..dd964c1d 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -1,6 +1,7 @@ import datetime import json import logging +import os import uuid from io import BytesIO @@ -23,7 +24,7 @@ from pywb.utils.format import ParamFormatter from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close from pywb.utils.memento import MementoUtils from pywb.utils.wbexception import LiveResourceException -from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES +from pywb.warcserver.http import DefaultAdapters from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin from pywb.warcserver.resource.resolvingloader import ResolvingLoader @@ -256,6 +257,13 @@ class LiveWebLoader(BaseLoader): def __init__(self, forward_proxy_prefix=None, adapter=None): self.forward_proxy_prefix = forward_proxy_prefix + socks_host = os.environ.get('SOCKS_HOST') + socks_port = os.environ.get('SOCKS_PORT', 9050) + if socks_host and socks_port: + self.socks_proxy = 'socks5h://{0}:{1}'.format(socks_host, socks_port) + else: + self.socks_proxy = None + def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: @@ -475,23 +483,24 @@ class LiveWebLoader(BaseLoader): adapter = 
DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter max_retries = adapter.max_retries - if SOCKS_PROXIES: - conn = adapter.get_connection(load_url, SOCKS_PROXIES) + # get either the poolmanager or proxy manager to handle this connection + if self.socks_proxy and not os.environ.get('SOCKS_DISABLE'): + manager = adapter.proxy_manager_for(self.socks_proxy) else: - conn = adapter.poolmanager + manager = adapter.poolmanager upstream_res = None try: - upstream_res = conn.urlopen(method=method, - url=load_url, - body=data, - headers=req_headers, - redirect=False, - assert_same_host=False, - preload_content=False, - decode_content=False, - retries=max_retries, - timeout=params.get('_timeout')) + upstream_res = manager.urlopen(method=method, + url=load_url, + body=data, + headers=req_headers, + redirect=False, + assert_same_host=False, + preload_content=False, + decode_content=False, + retries=max_retries, + timeout=params.get('_timeout')) return upstream_res diff --git a/tests/test_socks.py b/tests/test_socks.py index 68e16532..4981a77b 100644 --- a/tests/test_socks.py +++ b/tests/test_socks.py @@ -1,9 +1,6 @@ from .base_config_test import BaseConfigTest, fmod_sl -import pywb.warcserver.http as pywb_http import os -import socket -import gevent import pytest @@ -15,24 +12,24 @@ class TestSOCKSProxy(BaseConfigTest): os.environ['SOCKS_HOST'] = 'localhost' os.environ['SOCKS_PORT'] = '0' - pywb_http.patch_socks() - import pywb.warcserver.resource.responseloader - pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES super(TestSOCKSProxy, cls).setup_class('config_test.yaml') @classmethod def teardown_class(cls): - pywb_http.unpatch_socks() super(TestSOCKSProxy, cls).teardown_class() - def test_socks_proxy_set(self): - assert pywb_http.SOCKS_PROXIES == {'http': 'socks5h://localhost:0', - 'https': 'socks5h://localhost:0' - } - def test_socks_attempt_connect(self, fmod_sl): # no proxy is set, expect to fail if socks is being used 
resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400) assert resp.status_int == 400 + def test_socks_disable_enable(self, fmod_sl): + os.environ['SOCKS_DISABLE'] = '1' + resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=200) + assert resp.status_int == 200 + + os.environ['SOCKS_DISABLE'] = '' + + resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400) + assert resp.status_int == 400