mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
SOCKS proxy (#281)
warcserver: SOCKS proxy: - add support for running warcserver through a socks proxy specified via SOCKS_HOST and SOCKS_PORT - move socks patch setup, http max_header adjustment to http module - logging: print stack trace only if debugging - add pysocks to extra_requirements, enable in ci - add simple test (not actual proxy) to check that connection through proxy is attempted - docs: add SOCKS proxy section to docs
This commit is contained in:
parent
4f340933f3
commit
131c5ff5da
@ -17,7 +17,7 @@ install:
|
||||
- "pip install coverage pytest-cov coveralls"
|
||||
- "pip install cffi"
|
||||
- "pip install pyopenssl"
|
||||
- "pip install certauth boto3 youtube-dl"
|
||||
- "pip install certauth boto3 youtube-dl pysocks"
|
||||
- "pip install codecov"
|
||||
|
||||
build_script:
|
||||
|
@ -223,6 +223,16 @@ This configures the ``/live/`` route to point to the live web.
|
||||
This collection can be useful for testing, or even more powerful, when combined with recording.
|
||||
|
||||
|
||||
SOCKS Proxy for Live Web
|
||||
""""""""""""""""""""""""
|
||||
|
||||
pywb can be configured to use a SOCKS5 proxy when connecting to the live web. This allows pywb to be used with `Tor <https://torproject.org/>`_ and other
|
||||
services that require a SOCKS proxy.
|
||||
|
||||
If the ``SOCKS_HOST`` environment variable is set, pywb will attempt to route all live web traffic through the SOCKS5 proxy at that host. The proxy port is taken from the optional ``SOCKS_PORT`` environment variable and defaults to ``9050``.
|
||||
Note that, at this time, it is not possible to configure a SOCKS proxy per pywb collection -- all live web traffic will use the SOCKS proxy if enabled.
|
||||
|
||||
|
||||
.. _auto-all:
|
||||
|
||||
Auto "All" Aggregate Collection
|
||||
|
@ -4,3 +4,4 @@ boto3
|
||||
uwsgi
|
||||
git+https://github.com/t0m/pyamf.git@python3
|
||||
git+https://github.com/esnme/ultrajson.git
|
||||
pysocks
|
||||
|
@ -1,9 +1,78 @@
|
||||
from requests.adapters import HTTPAdapter
|
||||
import requests
|
||||
import os
|
||||
|
||||
import six.moves.http_client
|
||||
# Raise the http client's cap on the number of headers it accepts per
# response to 10000.
six.moves.http_client._MAXHEADERS = 10000

# Proxy mapping ({'http': url, 'https': url}) filled in by patch_socks()
# and reset to None by unpatch_socks().
SOCKS_PROXIES = None

# Saved getaddrinfo function while patch_socks() is in effect; None otherwise.
orig_getaddrinfo = None
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class DefaultAdapters(object):
    """Shared ``requests`` HTTP adapters, created once when the module loads.

    Both adapters are built with ``max_retries=3``.
    """

    # adapter for requests made to the live web
    live_adapter = HTTPAdapter(max_retries=3)

    # adapter for non-live (remote) requests
    remote_adapter = HTTPAdapter(max_retries=3)


# silence the warnings urllib3 would otherwise emit
requests.packages.urllib3.disable_warnings()
|
||||
|
||||
|
||||
#=============================================================================
|
||||
def patch_socks():
    """Enable the SOCKS5 proxy described by the environment.

    Reads ``SOCKS_HOST`` and ``SOCKS_PORT`` (default ``9050``), registers the
    proxy as the PySocks default, replaces ``getaddrinfo`` so that only
    ``127.0.0.1`` / ``localhost`` are resolved locally, and stores the proxy
    urls in the module-level ``SOCKS_PROXIES`` mapping.

    If PySocks is not installed, a notice is printed and nothing is changed.

    Calling this function repeatedly is safe: the getaddrinfo that was in
    place before the *first* call is the one that is remembered in
    ``orig_getaddrinfo``, so ``unpatch_socks()`` always restores the real
    resolver and the wrapper never ends up calling itself.

    Raises:
        ValueError: if ``SOCKS_PORT`` is set but is not an integer.
    """
    global SOCKS_PROXIES, orig_getaddrinfo

    try:
        import socks
    except ImportError:  #pragma: no cover
        print('Ignoring SOCKS_HOST: PySocks must be installed to use SOCKS proxy')
        return

    import socket

    socks_host = os.environ.get('SOCKS_HOST')
    # environment values are strings, but the proxy port must be numeric
    socks_port = int(os.environ.get('SOCKS_PORT', 9050))

    # Register the proxy as the PySocks default (last argument: remote dns)
    socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, socks_host, socks_port, True)

    # If a previous call already installed our wrapper, reuse the original
    # it recorded instead of saving the wrapper as the "original".
    current = socks.socket.getaddrinfo
    original = getattr(current, '_socks_orig', current)
    orig_getaddrinfo = original

    def getaddrinfo(host, port, *args, **kwargs):
        """Resolve only local names; pass every other host through as-is."""
        if host in ('127.0.0.1', 'localhost'):
            return original(host, port, *args, **kwargs)

        # hand back the hostname unresolved instead of doing a local lookup
        return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (host, port))]

    # marker that lets a later patch_socks() call find the real function
    getaddrinfo._socks_orig = original
    socks.socket.getaddrinfo = getaddrinfo

    socks_url = 'socks5h://{0}:{1}'.format(socks_host, socks_port)
    SOCKS_PROXIES = {'http': socks_url,
                     'https': socks_url}
|
||||
|
||||
# =============================================================================
|
||||
def unpatch_socks():
    """Undo ``patch_socks()``.

    Puts the saved getaddrinfo back on ``socks.socket`` and resets both
    ``orig_getaddrinfo`` and ``SOCKS_PROXIES`` to ``None``. Does nothing
    when no original getaddrinfo has been saved.
    """
    global orig_getaddrinfo, SOCKS_PROXIES

    saved = orig_getaddrinfo
    if not saved:
        # nothing was patched
        return

    import socks

    socks.socket.getaddrinfo = saved

    orig_getaddrinfo = None
    SOCKS_PROXIES = None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Turn the SOCKS proxy on at import time when SOCKS_HOST is configured.
if os.getenv('SOCKS_HOST'):
    patch_socks()
|
||||
|
||||
|
||||
|
||||
|
@ -14,7 +14,7 @@ from pywb.utils.format import ParamFormatter
|
||||
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
|
||||
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
|
||||
|
||||
from pywb.warcserver.http import DefaultAdapters
|
||||
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
|
||||
|
||||
from six.moves.urllib.parse import urlsplit, quote, unquote
|
||||
|
||||
@ -30,9 +30,6 @@ import logging
|
||||
|
||||
from requests.models import PreparedRequest
|
||||
|
||||
import six.moves.http_client
|
||||
six.moves.http_client._MAXHEADERS = 10000
|
||||
|
||||
logger = logging.getLogger('warcserver')
|
||||
|
||||
|
||||
@ -447,11 +444,15 @@ class LiveWebLoader(BaseLoader):
|
||||
|
||||
def _do_request(self, method, load_url, data, req_headers, params, is_live):
|
||||
adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter
|
||||
pool = adapter.poolmanager
|
||||
max_retries = adapter.max_retries
|
||||
|
||||
if SOCKS_PROXIES:
|
||||
conn = adapter.get_connection(load_url, SOCKS_PROXIES)
|
||||
else:
|
||||
conn = adapter.poolmanager
|
||||
|
||||
try:
|
||||
upstream_res = pool.urlopen(method=method,
|
||||
upstream_res = conn.urlopen(method=method,
|
||||
url=load_url,
|
||||
body=data,
|
||||
headers=req_headers,
|
||||
@ -465,7 +466,11 @@ class LiveWebLoader(BaseLoader):
|
||||
return upstream_res
|
||||
|
||||
except Exception as e:
|
||||
if logger.isEnabledFor(logging.DEBUG):
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
|
||||
|
||||
raise LiveResourceException(load_url)
|
||||
|
||||
def get_custom_metadata(self, content_type, dt):
|
||||
|
38
tests/test_socks.py
Normal file
38
tests/test_socks.py
Normal file
@ -0,0 +1,38 @@
|
||||
from .base_config_test import BaseConfigTest, fmod_sl
|
||||
|
||||
import pywb.warcserver.http as pywb_http
|
||||
import os
|
||||
import socket
|
||||
import gevent
|
||||
import pytest
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestSOCKSProxy(BaseConfigTest):
    """Check that live web requests are attempted through the SOCKS proxy.

    The proxy settings are applied for the lifetime of the class only:
    teardown restores the environment variables and the proxy mapping held
    by the response loader, so tests that run afterwards are not proxied.
    """

    # environment overrides applied while this class runs
    SOCKS_ENV = {'SOCKS_HOST': 'localhost', 'SOCKS_PORT': '8080'}

    @classmethod
    def setup_class(cls):
        # remember prior values (None == unset) so teardown can restore them
        cls._saved_env = dict((name, os.environ.get(name))
                              for name in cls.SOCKS_ENV)
        os.environ.update(cls.SOCKS_ENV)

        pywb_http.patch_socks()

        # responseloader bound SOCKS_PROXIES by name at import time,
        # so its copy has to be refreshed explicitly
        import pywb.warcserver.resource.responseloader
        pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES

        super(TestSOCKSProxy, cls).setup_class('config_test.yaml')

    @classmethod
    def teardown_class(cls):
        pywb_http.unpatch_socks()

        # keep the response loader's copy in sync with the unpatched state
        import pywb.warcserver.resource.responseloader
        pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES

        # put the environment back the way it was before setup_class
        for name, value in cls._saved_env.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value

        super(TestSOCKSProxy, cls).teardown_class()

    def test_socks_proxy_set(self):
        # without PySocks, patch_socks() leaves SOCKS_PROXIES unset
        pytest.importorskip('socks')
        assert pywb_http.SOCKS_PROXIES == {'http': 'socks5h://localhost:8080',
                                           'https': 'socks5h://localhost:8080'
                                          }

    def test_socks_attempt_connect(self, fmod_sl):
        pytest.importorskip('socks')
        # no SOCKS proxy is expected to be running at the configured address,
        # so the request must fail if it is really routed through the proxy
        resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400)
        assert resp.status_int == 400
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user