1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

SOCKS Proxy Improvements (#504)

* https over socks fix: fix https url handling by using 'adapter.proxy_manager_for()' instead of 'adapter.get_connection()' to get the proxy manager, which creates the connection indirectly (parallel to the no-proxy path).
- simplify socks config, avoiding the global monkey-patch, as requests/urllib3 now support socks proxies directly and do not require patching the global socket.
- add SOCKS_DISABLE env variable for dynamically disabling the socks proxy
This commit is contained in:
Ilya Kreymer 2019-08-29 11:59:45 -07:00 committed by GitHub
parent 295f67e675
commit 56e7c78ea3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 46 additions and 89 deletions

View File

@ -1,11 +1,15 @@
pywb 2.3.5 changelist
~~~~~~~~~~~~~~~~~~~~~
* General auto-fetch fixes
* General auto-fetch fixes (#503)
- Fixed issue that caused HTTP 404 errors to happen when parsing <link> stylesheet hrefs as sheets (webrecorder/wombat #11)
- Ensured that requests made are cached by the browser (webrecorder/wombat #13 & #15)
- Ensured that the request made by the backing web worker when in proxy mode are not blocked by CORS (webrecorder/wombat #13 & #15)
* SOCKS proxy fixes (#504)
- simplify SOCKS config (avoiding global socket monkey patch), default to no cert verify to match non-proxy behavior
- SOCKS proxy can be disabled dynamically by setting SOCKS_DISABLE
pywb 2.3.4 changelist
~~~~~~~~~~~~~~~~~~~~~

View File

@ -8,16 +8,17 @@ from urllib3.poolmanager import PoolManager
# Raise the http client's private parsing limits (header count and line
# length) -- presumably so responses with many or very long headers can
# still be parsed; confirm against the stdlib defaults.
six.moves.http_client._MAXHEADERS = 10000
six.moves.http_client._MAXLINE = 131072
# Proxy mapping ({'http': ..., 'https': ...}) set by patch_socks() and reset
# to None by unpatch_socks().
SOCKS_PROXIES = None
# The getaddrinfo function saved by patch_socks() so that unpatch_socks()
# can restore it; None while nothing is patched.
orig_getaddrinfo = None
# =============================================================================
class PywbHttpAdapter(HTTPAdapter):
"""This adapter exists to restore the default behavior
of urllib3 < 1.25.x, which was to not verify ssl certs,
until a better solution is found
"""
# todo: allow configuring this later?
cert_reqs = 'CERT_NONE'
def init_poolmanager(
self, connections, maxsize, block=DEFAULT_POOLBLOCK, **pool_kwargs
):
@ -29,10 +30,14 @@ class PywbHttpAdapter(HTTPAdapter):
maxsize=maxsize,
block=block,
strict=True,
cert_reqs='CERT_NONE',
cert_reqs=self.cert_reqs,
**pool_kwargs
)
def proxy_manager_for(self, proxy, **proxy_kwargs):
    """Return the parent adapter's proxy manager for ``proxy``.

    The adapter's ``cert_reqs`` setting is written into ``proxy_kwargs``
    (replacing any value the caller supplied) before delegating, so that
    proxied connections use the same certificate policy as this adapter.

    :param proxy: proxy url, passed through to the parent implementation
    :param proxy_kwargs: extra keyword arguments for the parent implementation
    :return: whatever the parent ``proxy_manager_for`` returns
    """
    proxy_kwargs.update(cert_reqs=self.cert_reqs)
    parent = super(PywbHttpAdapter, self)
    return parent.proxy_manager_for(proxy, **proxy_kwargs)
# =============================================================================
class DefaultAdapters(object):
@ -42,61 +47,3 @@ class DefaultAdapters(object):
requests.packages.urllib3.disable_warnings()
# =============================================================================
def patch_socks():
    """Enable SOCKS proxying based on the SOCKS_HOST / SOCKS_PORT env vars.

    Reads ``SOCKS_HOST`` and ``SOCKS_PORT`` (default ``9050``) from the
    environment, registers them as the PySocks default SOCKS5 proxy,
    replaces ``socks.socket.getaddrinfo`` with a wrapper that only resolves
    ``127.0.0.1`` / ``localhost`` locally, and publishes a ``socks5h://`` url
    for both http and https in the module-level ``SOCKS_PROXIES``.

    If PySocks is not installed, a message is printed and nothing is changed.

    The function is safe to call more than once: the proxy settings are
    refreshed on every call, but ``getaddrinfo`` is wrapped only the first
    time, so ``orig_getaddrinfo`` always holds the real resolver that
    ``unpatch_socks()`` needs to restore.
    """
    global orig_getaddrinfo, SOCKS_PROXIES

    try:
        import socks
    except ImportError:  # pragma: no cover
        print('Ignoring SOCKS_HOST: PySocks must be installed to use SOCKS proxy')
        return

    import socket

    socks_host = os.environ.get('SOCKS_HOST')
    socks_port = os.environ.get('SOCKS_PORT', 9050)

    # Register the proxy as the PySocks default. socket.socket itself is
    # deliberately not replaced with socks.socksocket.
    socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, socks_host, socks_port, True)

    # Wrap getaddrinfo only once. Re-wrapping would store the wrapper itself
    # as the "original", which makes unpatching impossible and previously
    # caused unbounded recursion for localhost lookups.
    if not orig_getaddrinfo:
        # Captured in the closure so the wrapper keeps working even after
        # the module-level reference is reset.
        real_getaddrinfo = socks.socket.getaddrinfo
        orig_getaddrinfo = real_getaddrinfo

        def getaddrinfo(host, port, *args, **kwargs):
            """Resolve local hosts normally; leave all others to the proxy."""
            if host in ('127.0.0.1', 'localhost'):
                return real_getaddrinfo(host, port, *args, **kwargs)

            # Hand back the unresolved host so DNS happens on the proxy side
            return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (host, port))]

        socks.socket.getaddrinfo = getaddrinfo

    socks_url = 'socks5h://{0}:{1}'.format(socks_host, socks_port)
    SOCKS_PROXIES = {'http': socks_url, 'https': socks_url}
# =============================================================================
def unpatch_socks():
    """Undo patch_socks(): restore getaddrinfo and clear SOCKS_PROXIES.

    Does nothing at all (SOCKS_PROXIES included) when no original
    getaddrinfo has been saved, i.e. when nothing is currently patched.
    """
    global orig_getaddrinfo, SOCKS_PROXIES

    saved_getaddrinfo = orig_getaddrinfo
    if saved_getaddrinfo:
        import socks

        # Put the saved resolver back, then forget it and the proxy mapping
        socks.socket.getaddrinfo = saved_getaddrinfo
        orig_getaddrinfo = None
        SOCKS_PROXIES = None
# =============================================================================
# Apply the SOCKS patch at import time when SOCKS_HOST is set to a non-empty value
if os.environ.get('SOCKS_HOST'):
patch_socks()

View File

@ -1,6 +1,7 @@
import datetime
import json
import logging
import os
import uuid
from io import BytesIO
@ -23,7 +24,7 @@ from pywb.utils.format import ParamFormatter
from pywb.utils.io import StreamIter, call_release_conn, compress_gzip_iter, no_except_close
from pywb.utils.memento import MementoUtils
from pywb.utils.wbexception import LiveResourceException
from pywb.warcserver.http import DefaultAdapters, SOCKS_PROXIES
from pywb.warcserver.http import DefaultAdapters
from pywb.warcserver.resource.pathresolvers import DefaultResolverMixin
from pywb.warcserver.resource.resolvingloader import ResolvingLoader
@ -256,6 +257,13 @@ class LiveWebLoader(BaseLoader):
def __init__(self, forward_proxy_prefix=None, adapter=None):
    """Store the forward proxy prefix and work out the SOCKS proxy url.

    ``self.socks_proxy`` becomes ``socks5h://<SOCKS_HOST>:<SOCKS_PORT>`` when
    the ``SOCKS_HOST`` env var is set and the port (``SOCKS_PORT``, default
    9050) is not empty; otherwise it is ``None``.

    :param forward_proxy_prefix: stored as-is on the instance
    :param adapter: not used by the code visible in this constructor
    """
    self.forward_proxy_prefix = forward_proxy_prefix

    env = os.environ
    host = env.get('SOCKS_HOST')
    port = env.get('SOCKS_PORT', 9050)

    # Both parts are required to build a usable proxy url
    self.socks_proxy = (
        'socks5h://{0}:{1}'.format(host, port) if host and port else None
    )
def load_resource(self, cdx, params):
load_url = cdx.get('load_url')
if not load_url:
@ -475,14 +483,15 @@ class LiveWebLoader(BaseLoader):
adapter = DefaultAdapters.live_adapter if is_live else DefaultAdapters.remote_adapter
max_retries = adapter.max_retries
if SOCKS_PROXIES:
conn = adapter.get_connection(load_url, SOCKS_PROXIES)
# get either the poolmanager or proxy manager to handle this connection
if self.socks_proxy and not os.environ.get('SOCKS_DISABLE'):
manager = adapter.proxy_manager_for(self.socks_proxy)
else:
conn = adapter.poolmanager
manager = adapter.poolmanager
upstream_res = None
try:
upstream_res = conn.urlopen(method=method,
upstream_res = manager.urlopen(method=method,
url=load_url,
body=data,
headers=req_headers,

View File

@ -1,9 +1,6 @@
from .base_config_test import BaseConfigTest, fmod_sl
import pywb.warcserver.http as pywb_http
import os
import socket
import gevent
import pytest
@ -15,24 +12,24 @@ class TestSOCKSProxy(BaseConfigTest):
os.environ['SOCKS_HOST'] = 'localhost'
os.environ['SOCKS_PORT'] = '0'
pywb_http.patch_socks()
import pywb.warcserver.resource.responseloader
pywb.warcserver.resource.responseloader.SOCKS_PROXIES = pywb_http.SOCKS_PROXIES
super(TestSOCKSProxy, cls).setup_class('config_test.yaml')
@classmethod
def teardown_class(cls):
pywb_http.unpatch_socks()
super(TestSOCKSProxy, cls).teardown_class()
def test_socks_proxy_set(self):
assert pywb_http.SOCKS_PROXIES == {'http': 'socks5h://localhost:0',
'https': 'socks5h://localhost:0'
}
def test_socks_attempt_connect(self, fmod_sl):
# no proxy is set, expect to fail if socks is being used
resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400)
assert resp.status_int == 400
def test_socks_disable_enable(self, fmod_sl):
    """SOCKS_DISABLE turns the SOCKS proxy off and on without a restart.

    With SOCKS_DISABLE set the live request is expected to succeed (200);
    once it is cleared the request goes through the unreachable SOCKS proxy
    configured for this test class and is expected to fail (400).
    """
    os.environ['SOCKS_DISABLE'] = '1'
    try:
        resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=200)
        assert resp.status_int == 200
    finally:
        # Always clear the flag so a failure above cannot leak the disabled
        # state into the rest of this test or into later tests.
        os.environ.pop('SOCKS_DISABLE', None)

    resp = self.get('/live/{0}http://httpbin.org/get', fmod_sl, status=400)
    assert resp.status_int == 400