1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

SOCKS proxy (#281)

warcserver: SOCKS proxy:
- add support for running warcserver through a socks proxy specified via SOCKS_HOST and SOCKS_PORT
- move socks patch setup, http max_header adjustment to http module
- logging: print stack trace only if debugging
- add pysocks to extra_requirements, enable in ci
- add simple test (not actual proxy) to check that connection through proxy is attempted
- docs: add SOCKS proxy section to docs
This commit is contained in:
Ilya Kreymer 2018-01-17 10:51:49 -08:00 committed by GitHub
parent 4f340933f3
commit 131c5ff5da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 131 additions and 8 deletions

View File

@ -17,7 +17,7 @@ install:
- "pip install coverage pytest-cov coveralls"
- "pip install cffi"
- "pip install pyopenssl"
- "pip install certauth boto3 youtube-dl"
- "pip install certauth boto3 youtube-dl pysocks"
- "pip install codecov"
build_script:

View File

@ -223,6 +223,16 @@ This configures the ``/live/`` route to point to the live web.
This collection can be useful for testing or, even more powerfully, when combined with recording.
SOCKS Proxy for Live Web
""""""""""""""""""""""""
pywb can be configured to use a SOCKS5 proxy when connecting to the live web. This allows pywb to be used with `Tor <https://torproject.org/>`_ and other
services that require a SOCKS proxy.
If the ``SOCKS_HOST`` environment variable is set, pywb will attempt to route all live web traffic through the SOCKS5 proxy at that host. The port can optionally be set with ``SOCKS_PORT`` and defaults to ``9050``.
Note that, at this time, it is not possible to configure a SOCKS proxy per pywb collection -- all live web traffic will use the SOCKS proxy if enabled.
.. _auto-all:
Auto "All" Aggregate Collection

View File

@ -4,3 +4,4 @@ boto3
uwsgi
git+https://github.com/t0m/pyamf.git@python3
git+https://github.com/esnme/ultrajson.git
pysocks

View File

@ -1,9 +1,78 @@
from requests.adapters import HTTPAdapter
import requests
import os
import six.moves.http_client
# Raise the http client's private header-count limit (presumably the maximum
# number of headers it will parse per response -- confirm against http.client).
# Done at import time so it applies wherever this module is loaded.
six.moves.http_client._MAXHEADERS = 10000

# Proxies mapping ({'http': url, 'https': url}) set by patch_socks();
# None while no SOCKS proxy is active.
SOCKS_PROXIES = None

# The getaddrinfo function that was in place before patch_socks() replaced it,
# kept so unpatch_socks() can restore it; None while unpatched.
orig_getaddrinfo = None
#=============================================================================
class DefaultAdapters(object):
    # Holder for two shared requests HTTPAdapter instances, created once when
    # the class body runs. Both are configured with up to 3 retries.

    # Adapter intended for live-web requests.
    live_adapter = HTTPAdapter(max_retries=3)

    # Adapter intended for non-live (remote) requests.
    remote_adapter = HTTPAdapter(max_retries=3)
# Turn off urllib3's warnings for the whole process; runs once at import time.
requests.packages.urllib3.disable_warnings()
#=============================================================================
def patch_socks():
    """Enable routing of live-web traffic through a SOCKS5 proxy.

    Reads the proxy location from the ``SOCKS_HOST`` and ``SOCKS_PORT``
    environment variables (port defaults to ``9050``) and then:

    * registers the proxy as the PySocks default proxy,
    * replaces ``getaddrinfo`` on the ``socket`` module referenced by PySocks
      so that only ``localhost`` / ``127.0.0.1`` are resolved locally, and
    * stores a ``socks5h://host:port`` proxies mapping in the module-level
      ``SOCKS_PROXIES``.

    The function may be called repeatedly: the original ``getaddrinfo`` is
    captured only on the first call, so ``unpatch_socks()`` always restores
    the real resolver and the wrapper never ends up calling itself.

    If PySocks is not installed, a notice is printed and nothing is changed.
    """
    try:
        import socks
    except ImportError:  #pragma: no cover
        print('Ignoring SOCKS_HOST: PySocks must be installed to use SOCKS proxy')
        return

    import socket

    global orig_getaddrinfo
    global SOCKS_PROXIES

    socks_host = os.environ.get('SOCKS_HOST')
    # NOTE(review): when SOCKS_PORT comes from the environment it is a str,
    # whereas the fallback is the int 9050 -- confirm PySocks accepts both.
    socks_port = os.environ.get('SOCKS_PORT', 9050)

    # Register the default proxy with PySocks; the final True is the fourth
    # positional argument (rdns in PySocks' signature).
    socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, socks_host, socks_port, True)

    # Capture the real resolver only once. Capturing again on a repeated call
    # would store the wrapper itself, losing the original and making local
    # lookups recurse.
    if orig_getaddrinfo is None:
        orig_getaddrinfo = socks.socket.getaddrinfo

    # Bind the resolver locally so the wrapper does not depend on the global,
    # which unpatch_socks() resets to None.
    resolve_locally = orig_getaddrinfo

    def getaddrinfo(host, port, *args, **kwargs):
        # Local addresses are resolved normally. Every other host is returned
        # unresolved as a single IPv4/TCP entry, so no local lookup happens
        # here for that name.
        if host in ('127.0.0.1', 'localhost'):
            return resolve_locally(host, port, *args, **kwargs)

        return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (host, port))]

    socks.socket.getaddrinfo = getaddrinfo

    # socks5h:// asks the proxy to resolve hostnames as well
    socks_url = 'socks5h://{0}:{1}'.format(socks_host, socks_port)

    SOCKS_PROXIES = {'http': socks_url,
                     'https': socks_url}
# =============================================================================
def unpatch_socks():
    """Undo ``patch_socks()``.

    When a saved ``getaddrinfo`` is present, it is put back on the ``socket``
    module referenced by PySocks, and both ``orig_getaddrinfo`` and
    ``SOCKS_PROXIES`` are reset to ``None``. When nothing was saved, the
    function does nothing at all (``SOCKS_PROXIES`` is left untouched).
    """
    global orig_getaddrinfo, SOCKS_PROXIES

    if orig_getaddrinfo:
        # Imported lazily: PySocks is optional and is only needed when a
        # patch is actually in place.
        import socks

        socks.socket.getaddrinfo = orig_getaddrinfo
        orig_getaddrinfo = None
        SOCKS_PROXIES = None
# =============================================================================
# Import-time hook: enable the SOCKS proxy automatically whenever SOCKS_HOST
# is set to a non-empty value in the environment.
if os.environ.get('SOCKS_HOST'):
    patch_socks()

View File

@ -14,7 +14,7 @@ from pywb.utils.format import ParamFormatter
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
from pywb.warcserver.http import DefaultAdapters
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
from six.moves.urllib.parse import urlsplit, quote, unquote
@ -30,9 +30,6 @@ import logging
from requests.models import PreparedRequest
import six.moves.http_client
six.moves.http_client._MAXHEADERS = 10000
logger = logging.getLogger('warcserver')
@ -447,11 +444,15 @@ class LiveWebLoader(BaseLoader):
def _do_request(self, method, load_url, data, req_headers, params, is_live):
adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter
pool = adapter.poolmanager
max_retries = adapter.max_retries
if SOCKS_PROXIES:
conn = adapter.get_connection(load_url, SOCKS_PROXIES)
else:
conn = adapter.poolmanager
try:
upstream_res = pool.urlopen(method=method,
upstream_res = conn.urlopen(method=method,
url=load_url,
body=data,
headers=req_headers,
@ -465,7 +466,11 @@ class LiveWebLoader(BaseLoader):
return upstream_res
except Exception as e:
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
if logger.isEnabledFor(logging.DEBUG):
import traceback
traceback.print_exc()
logger.debug('FAILED: ' + method + ' ' + load_url + ': ' + str(e))
raise LiveResourceException(load_url)
def get_custom_metadata(self, content_type, dt):

38
tests/test_socks.py Normal file
View File

@ -0,0 +1,38 @@
from .base_config_test import BaseConfigTest, fmod_sl
import pywb.warcserver.http as pywb_http
import os
import socket
import gevent
import pytest
# ============================================================================
class TestSOCKSProxy(BaseConfigTest):
    """Check that live-web requests are attempted through a SOCKS proxy.

    No real proxy is started: the tests point pywb at ``localhost:8080`` and
    expect the live request to fail, which shows the proxy route was used.

    The whole class is skipped when PySocks is not installed, and everything
    changed in ``setup_class`` (environment variables, the patched resolver
    and the proxies mapping copied into ``responseloader``) is reverted in
    ``teardown_class`` so later tests are not routed through the dead proxy.
    """

    _ENV_NAMES = ('SOCKS_HOST', 'SOCKS_PORT')

    @classmethod
    def setup_class(cls):
        # Without PySocks, patch_socks() is a no-op and every assertion below
        # would fail rather than skip.
        pytest.importorskip('socks')

        # Remember prior values so teardown can restore the environment
        cls._saved_env = dict((name, os.environ.get(name))
                              for name in cls._ENV_NAMES)

        os.environ['SOCKS_HOST'] = 'localhost'
        os.environ['SOCKS_PORT'] = '8080'

        pywb_http.patch_socks()

        # responseloader bound SOCKS_PROXIES by value at import time, so the
        # freshly created mapping has to be copied over explicitly.
        from pywb.warcserver.resource import responseloader
        responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES

        super(TestSOCKSProxy, cls).setup_class('config_test.yaml')

    @classmethod
    def teardown_class(cls):
        from pywb.warcserver.resource import responseloader

        pywb_http.unpatch_socks()

        # unpatch_socks() only resets pywb_http.SOCKS_PROXIES; clear the copy
        # held by responseloader as well.
        responseloader.SOCKS_PROXIES = None

        for name, value in cls._saved_env.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value

        super(TestSOCKSProxy, cls).teardown_class()

    def test_socks_proxy_set(self):
        assert pywb_http.SOCKS_PROXIES == {'http': 'socks5h://localhost:8080',
                                           'https': 'socks5h://localhost:8080'
                                          }

    def test_socks_attempt_connect(self, fmod_sl):
        # nothing is listening as a SOCKS proxy on localhost:8080, so the
        # request is expected to fail if the proxy is actually being used
        resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400)
        assert resp.status_int == 400