From 00dc9eed84cda7556c9a2719da8e621fbc74384f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 13 Nov 2015 01:17:35 +0000 Subject: [PATCH] new option --onion-tor-socks-proxy, host:port of tor socks proxy, used only to connect to .onion sites --- .travis.yml | 1 + setup.py | 11 +++++++++-- tests/Dockerfile | 2 ++ tests/single-threaded-proxy.py | 2 ++ tests/test_warcprox.py | 15 ++++++++++++++- warcprox/main.py | 2 ++ warcprox/mitmproxy.py | 14 +++++++++++++- warcprox/warcproxy.py | 10 ++++++++++ 8 files changed, 53 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 56470e2..de744f3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,6 +12,7 @@ addons: packages: - python-gdbm - python3-gdbm + - tor services: - docker diff --git a/setup.py b/setup.py index ef33be2..0245625 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools.command.test import test as TestCommand import sys -import setuptools +import setuptools # special class needs to be added to support the pytest written dump-anydbm tests class PyTest(TestCommand): @@ -17,7 +17,14 @@ class PyTest(TestCommand): errno = pytest.main(self.test_args) sys.exit(errno) -deps = ['certauth>=1.1.0', 'warctools', 'kafka-python', 'surt==0.3b2', 'rethinkstuff'] +deps = [ + 'certauth>=1.1.0', + 'warctools', + 'kafka-python', + 'surt==0.3b2', + 'rethinkstuff', + 'PySocks', +] try: import concurrent.futures except: diff --git a/tests/Dockerfile b/tests/Dockerfile index 4a8b5b8..aa3746f 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -20,3 +20,5 @@ RUN mkdir -vp /etc/service/rethinkdb \ RUN apt-get -y install python-virtualenv git RUN apt-get -y install python-gdbm python3-gdbm libpython2.7-dev libpython3.4-dev libffi-dev libssl-dev RUN pip install devpi-client +RUN apt-get -y install tor # starts tor socks proxy on port 9050 + diff --git a/tests/single-threaded-proxy.py b/tests/single-threaded-proxy.py index fd6808e..69db94c 100755 --- a/tests/single-threaded-proxy.py +++ b/tests/single-threaded-proxy.py @@ -43,6 +43,8 @@ def parse_args(): arg_parser.add_argument('--certs-dir', dest='certs_dir', default='./{0}-warcprox-ca'.format(socket.gethostname()), help='where to store and load generated certificates') + arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', + default=None, help='host:port of tor socks proxy, used only to connect to .onion sites') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') diff --git a/tests/test_warcprox.py b/tests/test_warcprox.py index 404279d..c660964 100755 --- a/tests/test_warcprox.py +++ b/tests/test_warcprox.py @@ -272,7 +272,8 @@ def warcprox_(request, captures_db, dedup_db, stats_db, service_registry): recorded_url_q = queue.Queue() - options = warcprox.Options(port=0, playback_port=0) + options = warcprox.Options(port=0, playback_port=0, + onion_tor_socks_proxy='localhost:9050') proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q, stats_db=stats_db, options=options) options.port = proxy.server_port @@ -696,6 +697,18 @@ def test_dedup_buckets(https_daemon, http_daemon, warcprox_, archiving_proxies, finally: fh.close() +# XXX this test relies on a tor proxy running at localhost:9050 with a working +# connection to the internet, and relies on a third party site (facebook) being +# up and behaving a certain way +def test_tor_onion(archiving_proxies): + response = requests.get('http://www.facebookcorewwwi.onion/', + proxies=archiving_proxies, verify=False, allow_redirects=False) + assert response.status_code == 302 + + response = requests.get('https://www.facebookcorewwwi.onion/', + proxies=archiving_proxies, verify=False, allow_redirects=False) + assert response.status_code == 200 + if __name__ == '__main__': pytest.main() diff --git a/warcprox/main.py b/warcprox/main.py index bcff5b0..3ad92fe 100644 --- a/warcprox/main.py +++ b/warcprox/main.py @@ -84,6 +84,8 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): help=argparse.SUPPRESS) arg_parser.add_argument('--profile', action='store_true', default=False, help=argparse.SUPPRESS) + arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', + default=None, help='host:port of tor socks proxy, used only to connect to .onion sites') arg_parser.add_argument('--version', action='version', version="warcprox {}".format(warcprox.__version__)) arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') diff --git a/warcprox/mitmproxy.py b/warcprox/mitmproxy.py index 3df9f33..b8f645e 100644 --- a/warcprox/mitmproxy.py +++ b/warcprox/mitmproxy.py @@ -16,6 +16,7 @@ import ssl import warcprox import threading import datetime +import socks class MitmProxyHandler(http_server.BaseHTTPRequestHandler): logger = logging.getLogger("warcprox.mitmproxy.MitmProxyHandler") @@ -51,7 +52,18 @@ class MitmProxyHandler(http_server.BaseHTTPRequestHandler): def _connect_to_host(self): # Connect to destination - self._proxy_sock = socket.socket() + if self.onion_tor_socks_proxy_host and self.hostname.lower().endswith('.onion'): + self.logger.info("using tor socks proxy at %s:%s to connect to %s", + self.onion_tor_socks_proxy_host, + self.onion_tor_socks_proxy_port or 1080, + self.hostname) + self._proxy_sock = socks.socksocket() + self._proxy_sock.set_proxy(socks.SOCKS5, + addr=self.onion_tor_socks_proxy_host, + port=self.onion_tor_socks_proxy_port, rdns=True) + else: + self._proxy_sock = socket.socket() + self._proxy_sock.settimeout(60) # XXX what value should this have? self._proxy_sock.connect((self.hostname, int(self.port))) diff --git a/warcprox/warcproxy.py b/warcprox/warcproxy.py index 2b83564..b46f610 100644 --- a/warcprox/warcproxy.py +++ b/warcprox/warcproxy.py @@ -350,6 +350,16 @@ class SingleThreadedWarcProxy(http_server.HTTPServer): def __init__(self, ca=None, recorded_url_q=None, stats_db=None, options=warcprox.Options()): server_address = (options.address or 'localhost', options.port if options.port is not None else 8000) + + if options.onion_tor_socks_proxy: + try: + host, port = options.onion_tor_socks_proxy.split(':') + WarcProxyHandler.onion_tor_socks_proxy_host = host + WarcProxyHandler.onion_tor_socks_proxy_port = int(port) + except ValueError: + WarcProxyHandler.onion_tor_socks_proxy_host = options.onion_tor_socks_proxy + WarcProxyHandler.onion_tor_socks_proxy_port = None + http_server.HTTPServer.__init__(self, server_address, WarcProxyHandler, bind_and_activate=True) self.digest_algorithm = options.digest_algorithm or 'sha1'