mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
SOCKS proxy (#281)
warcserver: SOCKS proxy: - add support for running warcserver through a socks proxy specified via SOCKS_HOST and SOCKS_PORT - move socks patch setup, http max_header adjustment to http module - logging: print stack trace only if debugging - add pysocks to extra_requirements, enable in ci - add simple test (not actual proxy) to check that connection through proxy is attempted - docs: add SOCKS proxy section to docs
This commit is contained in:
parent
4f340933f3
commit
131c5ff5da
@ -17,7 +17,7 @@ install:
|
|||||||
- "pip install coverage pytest-cov coveralls"
|
- "pip install coverage pytest-cov coveralls"
|
||||||
- "pip install cffi"
|
- "pip install cffi"
|
||||||
- "pip install pyopenssl"
|
- "pip install pyopenssl"
|
||||||
- "pip install certauth boto3 youtube-dl"
|
- "pip install certauth boto3 youtube-dl pysocks"
|
||||||
- "pip install codecov"
|
- "pip install codecov"
|
||||||
|
|
||||||
build_script:
|
build_script:
|
||||||
|
@ -223,6 +223,16 @@ This configures the ``/live/`` route to point to the live web.
|
|||||||
This collection can be useful for testing, or even more powerful, when combined with recording.
|
This collection can be useful for testing, or even more powerful, when combined with recording.
|
||||||
|
|
||||||
|
|
||||||
|
SOCKS Proxy for Live Web
|
||||||
|
""""""""""""""""""""""""
|
||||||
|
|
||||||
|
pywb can be configured to use a SOCKS5 proxy when connecting to the live web. This allows pywb to be used with `Tor <https://torproject.org/>`_ and other
|
||||||
|
services that require a SOCKS proxy.
|
||||||
|
|
||||||
|
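If the ``SOCKS_HOST`` environment variable is set, pywb will attempt to route all live web traffic through the SOCKS5 proxy at that host. The port is taken from the optional ``SOCKS_PORT`` environment variable and defaults to ``9050``. The ``pysocks`` package must be installed; if it is missing, ``SOCKS_HOST`` is ignored and a message is printed.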
|
||||||
|
Note that, at this time, it is not possible to configure a SOCKS proxy per pywb collection -- all live web traffic will use the SOCKS proxy if enabled.
|
||||||
|
|
||||||
|
|
||||||
.. _auto-all:
|
.. _auto-all:
|
||||||
|
|
||||||
Auto "All" Aggregate Collection
|
Auto "All" Aggregate Collection
|
||||||
|
@ -4,3 +4,4 @@ boto3
|
|||||||
uwsgi
|
uwsgi
|
||||||
git+https://github.com/t0m/pyamf.git@python3
|
git+https://github.com/t0m/pyamf.git@python3
|
||||||
git+https://github.com/esnme/ultrajson.git
|
git+https://github.com/esnme/ultrajson.git
|
||||||
|
pysocks
|
||||||
|
@ -1,9 +1,78 @@
|
|||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
import requests
|
import requests
|
||||||
|
import os
|
||||||
|
|
||||||
|
import six.moves.http_client
|
||||||
|
six.moves.http_client._MAXHEADERS = 10000
|
||||||
|
|
||||||
|
SOCKS_PROXIES = None
|
||||||
|
orig_getaddrinfo = None
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
class DefaultAdapters(object):
|
class DefaultAdapters(object):
|
||||||
live_adapter = HTTPAdapter(max_retries=3)
|
live_adapter = HTTPAdapter(max_retries=3)
|
||||||
remote_adapter = HTTPAdapter(max_retries=3)
|
remote_adapter = HTTPAdapter(max_retries=3)
|
||||||
|
|
||||||
requests.packages.urllib3.disable_warnings()
|
requests.packages.urllib3.disable_warnings()
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
def patch_socks():
    """Route live web traffic through the SOCKS5 proxy named in the environment.

    Reads ``SOCKS_HOST`` and ``SOCKS_PORT`` (default 9050) from the
    environment, registers them as the PySocks default proxy, replaces
    ``getaddrinfo`` so that non-local hostnames are not resolved locally,
    and publishes the proxy urls in the module-level ``SOCKS_PROXIES`` dict.

    If PySocks is not installed, a message is printed and nothing is changed.
    Calling this function more than once is safe: the original resolver is
    saved only on the first call, so ``unpatch_socks()`` can always restore it.

    Raises:
        ValueError: if ``SOCKS_PORT`` is set but is not an integer.
    """
    global orig_getaddrinfo
    global SOCKS_PROXIES

    try:
        import socks
    except ImportError:  #pragma: no cover
        print('Ignoring SOCKS_HOST: PySocks must be installed to use SOCKS proxy')
        return

    import socket

    socks_host = os.environ.get('SOCKS_HOST')
    # environment values are strings, but the proxy port must be numeric
    socks_port = int(os.environ.get('SOCKS_PORT', 9050))

    # Register the default proxy; the final True asks for remote DNS resolution.
    # NOTE: socket.socket itself is left untouched (that replacement was
    # disabled in the original code).
    socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, socks_host, socks_port, True)

    # Only wrap the resolver once. Wrapping it again would save the wrapper
    # as the "original", which breaks unpatch_socks() and makes local
    # lookups recurse.
    if orig_getaddrinfo is None:
        local_resolver = socks.socket.getaddrinfo
        orig_getaddrinfo = local_resolver

        def getaddrinfo(host, port, *args, **kwargs):
            # local addresses are still resolved by the real resolver
            if host in ('127.0.0.1', 'localhost'):
                return local_resolver(host, port, *args, **kwargs)

            # everything else is handed back unresolved, so the hostname
            # (not an ip) is what gets passed on to the proxy
            return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (host, port))]

        socks.socket.getaddrinfo = getaddrinfo

    socks_url = 'socks5h://{0}:{1}'.format(socks_host, socks_port)

    SOCKS_PROXIES = {'http': socks_url,
                     'https': socks_url}
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
def unpatch_socks():
    """Undo patch_socks(): restore the saved resolver and clear SOCKS_PROXIES.

    Does nothing when no original ``getaddrinfo`` has been saved.
    """
    global orig_getaddrinfo, SOCKS_PROXIES

    if orig_getaddrinfo:
        import socks

        # put back the resolver that patch_socks() stored
        socks.socket.getaddrinfo = orig_getaddrinfo
        orig_getaddrinfo = None

        # the proxy settings are no longer in effect
        SOCKS_PROXIES = None
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Enable the SOCKS proxy at import time when SOCKS_HOST is set to a non-empty value
if os.getenv('SOCKS_HOST'):
    patch_socks()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ from pywb.utils.format import ParamFormatter
|
|||||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||||
|
|
||||||
from pywb.warcserver.http import DefaultAdapters
|
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
||||||
|
|
||||||
from six.moves.urllib.parse import urlsplit, quote, unquote
|
from six.moves.urllib.parse import urlsplit, quote, unquote
|
||||||
|
|
||||||
@ -30,9 +30,6 @@ import logging
|
|||||||
|
|
||||||
from requests.models import PreparedRequest
|
from requests.models import PreparedRequest
|
||||||
|
|
||||||
import six.moves.http_client
|
|
||||||
six.moves.http_client._MAXHEADERS = 10000
|
|
||||||
|
|
||||||
logger = logging.getLogger('warcserver')
|
logger = logging.getLogger('warcserver')
|
||||||
|
|
||||||
|
|
||||||
@ -447,11 +444,15 @@ class LiveWebLoader(BaseLoader):
|
|||||||
|
|
||||||
def _do_request(self, method, load_url, data, req_headers, params, is_live):
|
def _do_request(self, method, load_url, data, req_headers, params, is_live):
|
||||||
adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter
|
adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter
|
||||||
pool = adapter.poolmanager
|
|
||||||
max_retries = adapter.max_retries
|
max_retries = adapter.max_retries
|
||||||
|
|
||||||
|
if SOCKS_PROXIES:
|
||||||
|
conn = adapter.get_connection(load_url, SOCKS_PROXIES)
|
||||||
|
else:
|
||||||
|
conn = adapter.poolmanager
|
||||||
|
|
||||||
try:
|
try:
|
||||||
upstream_res = pool.urlopen(method=method,
|
upstream_res = conn.urlopen(method=method,
|
||||||
url=load_url,
|
url=load_url,
|
||||||
body=data,
|
body=data,
|
||||||
headers=req_headers,
|
headers=req_headers,
|
||||||
@ -465,7 +466,11 @@ class LiveWebLoader(BaseLoader):
|
|||||||
return upstream_res
|
return upstream_res
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
|
if logger.isEnabledFor(logging.DEBUG):
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
|
||||||
|
|
||||||
raise LiveResourceException(load_url)
|
raise LiveResourceException(load_url)
|
||||||
|
|
||||||
def get_custom_metadata(self, content_type, dt):
|
def get_custom_metadata(self, content_type, dt):
|
||||||
|
38
tests/test_socks.py
Normal file
38
tests/test_socks.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
from .base_config_test import BaseConfigTest, fmod_sl
|
||||||
|
|
||||||
|
import pywb.warcserver.http as pywb_http
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import gevent
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class TestSOCKSProxy(BaseConfigTest):
    """Check that live web requests are attempted through the SOCKS proxy.

    The proxy address used here (localhost:8080) is set by setup_class; the
    connect test expects the request through it to fail with a 400.
    """

    # environment variables overridden for the duration of this class
    SOCKS_ENV_VARS = ('SOCKS_HOST', 'SOCKS_PORT')

    @classmethod
    def setup_class(cls):
        # remember the previous values so teardown_class can restore them
        cls._saved_env = dict((name, os.environ.get(name))
                              for name in cls.SOCKS_ENV_VARS)

        os.environ['SOCKS_HOST'] = 'localhost'
        os.environ['SOCKS_PORT'] = '8080'

        pywb_http.patch_socks()

        # the loader module keeps its own SOCKS_PROXIES name, so it has to be
        # pointed at the freshly created settings explicitly
        import pywb.warcserver.resource.responseloader
        pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES
        super(TestSOCKSProxy, cls).setup_class('config_test.yaml')

    @classmethod
    def teardown_class(cls):
        pywb_http.unpatch_socks()

        # keep the loader module in sync, otherwise later tests would still
        # be routed through the (non-existent) proxy
        import pywb.warcserver.resource.responseloader
        pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES

        # restore the environment exactly as it was before setup_class
        for name, value in cls._saved_env.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value

        super(TestSOCKSProxy, cls).teardown_class()

    def test_socks_proxy_set(self):
        # without PySocks, patch_socks() leaves SOCKS_PROXIES unset
        pytest.importorskip('socks')
        assert pywb_http.SOCKS_PROXIES == {'http': 'socks5h://localhost:8080',
                                           'https': 'socks5h://localhost:8080'
                                          }

    def test_socks_attempt_connect(self, fmod_sl):
        pytest.importorskip('socks')
        # no proxy is set, expect to fail if socks is being used
        resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400)
        assert resp.status_int == 400
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user