diff --git a/appveyor.yml b/appveyor.yml index 38fe0205..c268f471 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,7 +17,7 @@ install: - "pip install coverage pytest-cov coveralls" - "pip install cffi" - "pip install pyopenssl" - - "pip install certauth boto3 youtube-dl" + - "pip install certauth boto3 youtube-dl pysocks" - "pip install codecov" build_script: diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index 4b16b3e4..b15c4693 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -223,6 +223,16 @@ This configures the ``/live/`` route to point to the live web. This collection can be useful for testing, or even more powerful, when combined with recording. +SOCKS Proxy for Live Web +"""""""""""""""""""""""" + +pywb can be configured to use a SOCKS5 proxy when connecting to the live web. This allows pywb to be used with `Tor <https://www.torproject.org/>`_ and other +services that require a SOCKS proxy. + +If the ``SOCKS_HOST`` and optionally ``SOCKS_PORT`` environment variables are set, pywb will attempt to route all live web traffic through the SOCKS5 proxy. +Note that, at this time, it is not possible to configure a SOCKS proxy per pywb collection -- all live web traffic will use the SOCKS proxy if enabled. + + .. 
_auto-all: Auto "All" Aggregate Collection diff --git a/extra_requirements.txt b/extra_requirements.txt index 2edeb5de..355c0e2c 100644 --- a/extra_requirements.txt +++ b/extra_requirements.txt @@ -4,3 +4,4 @@ boto3 uwsgi git+https://github.com/t0m/pyamf.git@python3 git+https://github.com/esnme/ultrajson.git +pysocks diff --git a/pywb/warcserver/http.py b/pywb/warcserver/http.py index a667caf4..ee6ce790 100644 --- a/pywb/warcserver/http.py +++ b/pywb/warcserver/http.py @@ -1,9 +1,78 @@ from requests.adapters import HTTPAdapter import requests +import os +import six.moves.http_client +six.moves.http_client._MAXHEADERS = 10000 + +SOCKS_PROXIES = None +orig_getaddrinfo = None + + +#============================================================================= class DefaultAdapters(object): live_adapter = HTTPAdapter(max_retries=3) remote_adapter = HTTPAdapter(max_retries=3) requests.packages.urllib3.disable_warnings() + +#============================================================================= +def patch_socks(): + try: + import socks + except ImportError: #pragma: no cover + print('Ignoring SOCKS_HOST: PySocks must be installed to use SOCKS proxy') + return + + import socket + + socks_host = os.environ.get('SOCKS_HOST') + socks_port = os.environ.get('SOCKS_PORT', 9050) + + # Set socks proxy and wrap the urllib module + socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, socks_host, socks_port, True) + #socket.socket = socks.socksocket # sets default socket to be the sockipy socket + + # store original getaddrinfo + global orig_getaddrinfo + orig_getaddrinfo = socks.socket.getaddrinfo + + # Perform DNS resolution through socket + def getaddrinfo(*args): + if args[0] in ('127.0.0.1', 'localhost'): + res = orig_getaddrinfo(*args) + + else: + res = [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))] + + return res + + socks.socket.getaddrinfo = getaddrinfo + + socks_url = 'socks5h://{0}:{1}'.format(socks_host, socks_port) + + global SOCKS_PROXIES + 
SOCKS_PROXIES = {'http': socks_url, + 'https': socks_url} + +# ============================================================================= +def unpatch_socks(): + global orig_getaddrinfo + if not orig_getaddrinfo: + return + + import socks + socks.socket.getaddrinfo = orig_getaddrinfo + orig_getaddrinfo = None + + global SOCKS_PROXIES + SOCKS_PROXIES = None + + +# ============================================================================= +if os.environ.get('SOCKS_HOST'): + patch_socks() + + + diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index 6b7cccc9..ddf615a4 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -14,7 +14,7 @@ from pywb.utils.format import ParamFormatter from pywb.warcserver.resource.resolvingloader import ResolvingLoader from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin -from pywb.warcserver.http import DefaultAdapters +from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES from six.moves.urllib.parse import urlsplit, quote, unquote @@ -30,9 +30,6 @@ import logging from requests.models import PreparedRequest -import six.moves.http_client -six.moves.http_client._MAXHEADERS = 10000 - logger = logging.getLogger('warcserver') @@ -447,11 +444,15 @@ class LiveWebLoader(BaseLoader): def _do_request(self, method, load_url, data, req_headers, params, is_live): adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter - pool = adapter.poolmanager max_retries = adapter.max_retries + if SOCKS_PROXIES: + conn = adapter.get_connection(load_url, SOCKS_PROXIES) + else: + conn = adapter.poolmanager + try: - upstream_res = pool.urlopen(method=method, + upstream_res = conn.urlopen(method=method, url=load_url, body=data, headers=req_headers, @@ -465,7 +466,11 @@ class LiveWebLoader(BaseLoader): return upstream_res except Exception as e: - logger.debug('FAILED: ' + method + ' ' + load_url + ': ' 
+ str(e)) + if logger.isEnabledFor(logging.DEBUG): + import traceback + traceback.print_exc() + logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e)) + raise LiveResourceException(load_url) def get_custom_metadata(self, content_type, dt): diff --git a/tests/test_socks.py b/tests/test_socks.py new file mode 100644 index 00000000..ba0ba494 --- /dev/null +++ b/tests/test_socks.py @@ -0,0 +1,38 @@ +from .base_config_test import BaseConfigTest, fmod_sl + +import pywb.warcserver.http as pywb_http +import os +import socket +import gevent +import pytest + + +# ============================================================================ +class TestSOCKSProxy(BaseConfigTest): + @classmethod + def setup_class(cls): + os.environ['SOCKS_HOST'] = 'localhost' + os.environ['SOCKS_PORT'] = '8080' + + pywb_http.patch_socks() + import pywb.warcserver.resource.responseloader + pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES + super(TestSOCKSProxy, cls).setup_class('config_test.yaml') + + @classmethod + def teardown_class(cls): + pywb_http.unpatch_socks() + super(TestSOCKSProxy, cls).teardown_class() + + def test_socks_proxy_set(self): + assert pywb_http.SOCKS_PROXIES == {'http': 'socks5h://localhost:8080', + 'https': 'socks5h://localhost:8080' + } + + def test_socks_attempt_connect(self, fmod_sl): + pytest.importorskip('socks') + # no proxy is set, expect to fail if socks is being used + resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400) + assert resp.status_int == 400 + +