From 1a1aa814d06dbb3ba1f40bdfdda25f80a2f602eb Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 5 Feb 2014 13:08:10 -0800
Subject: [PATCH] first pass at simple http proxy! #8

* proxy router for handling only proxy
* proxy/archival router for handling both archival and proxy mode,
  togglable with 'enable_http_proxy' setting in config
* supports only most recent capture playback -- no support for selecting
  replay date/calendar view yet
* not testable with WebTest -- need better way to unit test proxy mode
---
 config.yaml       |   3 ++
 pywb/proxy.py     | 117 ++++++++++++++++++++++++++++++++++++++++++++++
 pywb/pywb_init.py |   9 +++-
 run-tests.py      |  11 ++++-
 test_config.yaml  |   3 ++
 5 files changed, 139 insertions(+), 4 deletions(-)
 create mode 100644 pywb/proxy.py

diff --git a/config.yaml b/config.yaml
index 79c80279..934ded44 100644
--- a/config.yaml
+++ b/config.yaml
@@ -75,5 +75,8 @@ hostpaths: ['http://localhost:8080/']
 # ==== New / Experimental Settings ====
 # Not yet production ready -- used primarily for testing
 
+# Enable simple http proxy mode
+enable_http_proxy: false
+
 # enable cdx server api for querying cdx directly (experimental)
 enable_cdx_api: false
diff --git a/pywb/proxy.py b/pywb/proxy.py
new file mode 100644
index 00000000..51e92f83
--- /dev/null
+++ b/pywb/proxy.py
@@ -0,0 +1,117 @@
+from wbrequestresponse import WbResponse, WbRequest
+from archivalrouter import ArchivalRouter
+import urlparse
+
+#=================================================================
+# An experimental router which combines both archival and proxy modes
+# http proxy mode support is very simple: only latest capture is available currently
+#=================================================================
+
+class ProxyArchivalRouter:
+    """Router that tries archival routing first, then falls back to http proxy routing."""
+
+    def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
+        self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view)
+        # Proxy requests are always served by the handler of the first route
+        self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
+        self.error_view = error_view
+
+    def __call__(self, env):
+        """Return the first truthy response: archival router, then proxy router, else None."""
+        response = self.archival(env)
+        if response:
+            return response
+
+        response = self.proxy(env)
+        if response:
+            return response
+
+        # Neither router produced a response
+        return None
+
+
+#=================================================================
+# Simple router which routes http proxy requests
+# Handles requests of the form: GET http://example.com
+# Only supports latest capture replay at the moment
+#=================================================================
+class ProxyRouter:
+    """Routes absolute-url proxy requests to a single handler and serves the PAC script."""
+
+    def __init__(self, handler, hostpaths = None, error_view = None):
+        self.handler = handler
+        self.hostpaths = hostpaths
+
+        self.error_view = error_view
+
+    def __call__(self, env):
+        """Return the PAC script, the handler's response, or None if not a proxy request."""
+        url = env['REL_REQUEST_URI']
+
+        if url.endswith('/proxy.pac'):
+            return self.make_pac_response(env)
+
+        # Only absolute http:// urls are treated as proxy requests
+        if not url.startswith('http://'):
+            return None
+
+        wbrequest = WbRequest(env,
+                              request_uri = url,
+                              coll = '',
+                              wb_url_str = url,
+                              wb_prefix = '',
+                              use_abs_prefix = False,
+                              wburl_class = self.handler.get_wburl_type(),
+                              url_rewriter_class = ProxyHttpsUrlRewriter,
+                              is_proxy = True)
+
+        return self.handler(wbrequest)
+
+
+    # Proxy Auto-Config (PAC) script for the proxy
+    def make_pac_response(self, env):
+        """Build the PAC script: DIRECT for this server's own hosts, PROXY otherwise."""
+        server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
+
+        buff = 'function FindProxyForURL (url, host) {\n'
+
+        direct_cond = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
+
+        # hostpaths defaults to None -- guard so the PAC script can still be built
+        for hostpath in self.hostpaths or []:
+            # .hostname excludes any port or userinfo present in the netloc
+            hostname = urlparse.urlsplit(hostpath).hostname
+            if hostname:
+                buff += direct_cond.format(hostname)
+
+        buff += direct_cond.format(env['SERVER_NAME'])
+
+        buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
+
+        return WbResponse.text_response(buff, content_type = 'application/x-ns-proxy-autoconfig')
+
+
+#=================================================================
+# A rewriter which only rewrites https -> http
+#=================================================================
+class ProxyHttpsUrlRewriter:
+    """Url rewriter for proxy mode: urls are left as-is, except https:// becomes http://."""
+
+    HTTP = 'http://'
+    HTTPS = 'https://'
+
+    def __init__(self, wbrequest, prefix):
+        # Stateless: rewriting depends on neither the request nor the prefix
+        pass
+
+    def rewrite(self, url, mod = None):
+        if url.startswith(self.HTTPS):
+            return self.HTTP + url[len(self.HTTPS):]
+        else:
+            return url
+
+    def get_timestamp_url(self, timestamp, url):
+        return url
+
+    def get_abs_url(self, url = ''):
+        return url
diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py
index 38772c99..6db4103b 100644
--- a/pywb/pywb_init.py
+++ b/pywb/pywb_init.py
@@ -5,7 +5,7 @@ import os
 import yaml
 import config_utils
 import logging
-
+import proxy
 
 #=================================================================
 ## Reference non-YAML config
@@ -54,9 +54,14 @@ def pywb_config_manual(config = {}):
     if config.get('debug_echo_req', False):
         routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
 
+    # Check for new proxy mode!
+    if config.get('enable_http_proxy', False):
+        router = proxy.ProxyArchivalRouter
+    else:
+        router = archivalrouter.ArchivalRouter
 
     # Finally, create wb router
-    return archivalrouter.ArchivalRouter(
+    return router(
         routes,
         # Specify hostnames that pywb will be running on
         # This will help catch occasionally missed rewrites that fall-through to the host
diff --git a/run-tests.py b/run-tests.py
index 9b39436c..bfba91ed 100644
--- a/run-tests.py
+++ b/run-tests.py
@@ -75,6 +75,15 @@ class TestWb:
         assert 'Mon, Jan 27 2014 17:12:51' in resp.body
         assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
 
+    # XX: Doesn't work as webtest does not support proxy mode
+    # need a way to test
+    #def test_proxy_replay(self):
+        #resp = self.testapp.get('http://www.iana.org/domains/idn-tables')
+        #self._assert_basic_html(resp)
+
+        #assert 'Sun, Jan 26 2014 20:11:27' in resp.body
+        #assert 'wb.js' in resp.body
+
     def test_cdx_server_filters(self):
         resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')
         self._assert_basic_text(resp)
@@ -102,5 +111,3 @@ class TestWb:
         resp = self.testapp.get('/pywb/?abc', status = 400)
         assert resp.status_int == 400
         assert 'Bad Request Url: http://?abc' in resp.body
-
-
diff --git a/test_config.yaml b/test_config.yaml
index 1193e023..df16f0b3 100644
--- a/test_config.yaml
+++ b/test_config.yaml
@@ -75,5 +75,8 @@ hostpaths: ['http://localhost:8080/']
 # ==== New / Experimental Settings ====
 # Not yet production ready -- used primarily for testing
 
+# Enable simple http proxy mode
+enable_http_proxy: true
+
 # enable cdx server api for querying cdx directly (experimental)
 enable_cdx_api: true