From eaaefbfd2455147ac52342526863430802778a2b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 20 Aug 2014 02:02:47 -0400 Subject: [PATCH] * config cleanup: remove 'hostpaths' setting entirely, avoiding the need to specify host on which pywb will run (this was cumbersome to maintain and not really useful) ReferRedirect just checks that the current request host header, if present, matches that of the referrer and checks that the coll and script name match. * removed proxy_pac as it was also unneeded/unused and required use of the hostpaths * added test for invalid CONNECT usage (405 response) --- config.yaml | 10 ------ pywb/framework/archivalrouter.py | 21 +++--------- pywb/framework/proxy.py | 37 +--------------------- pywb/framework/test/test_archivalrouter.py | 21 ++++-------- tests/test_config.yaml | 10 ------ tests/test_integration.py | 9 +++--- tests/test_proxy.py | 8 +++++ tests/test_proxy_https.py | 9 ++---- 8 files changed, 29 insertions(+), 96 deletions(-) diff --git a/config.yaml b/config.yaml index fc2290ba..c5e662e5 100644 --- a/config.yaml +++ b/config.yaml @@ -66,16 +66,6 @@ archive_paths: ./sample_archive/warcs/ # ==== Other Paths ==== -# list of host names that pywb will be running from to detect -# 'fallthrough' requests based on referrer -# -# eg: an incorrect request for http://localhost:8080/image.gif with a referrer -# of http://localhost:8080/pywb/index.html, pywb can correctly redirect -# to http://localhost:8080/pywb/image.gif -# - -#hostpaths: ['http://localhost:8080'] - # Rewrite urls with absolute paths instead of relative #absoulte_paths: true diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 861fad90..df5b7ec6 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -16,12 +16,7 @@ class ArchivalRouter(object): # optional port setting may be ignored by wsgi container self.port = kwargs.get('port') - hostpaths = kwargs.get('hostpaths') - - if hostpaths: - self.fallback = ReferRedirect(hostpaths) - else: - self.fallback = None + self.fallback = ReferRedirect() self.abs_path = kwargs.get('abs_path') @@ -133,12 +128,6 @@ class Route(object): # based on the referrer settings #================================================================= class ReferRedirect: - def __init__(self, match_prefixs): - if isinstance(match_prefixs, list): - self.match_prefixs = match_prefixs - else: - self.match_prefixs = [match_prefixs] - def __call__(self, env, the_router): referrer = env.get('HTTP_REFERER') @@ -151,10 +140,10 @@ class ReferRedirect: # get referrer path name ref_split = urlparse.urlsplit(referrer) - # ensure referrer starts with one of allowed hosts - if not any(referrer.startswith(i) for i in self.match_prefixs): - if ref_split.netloc != env.get('HTTP_HOST'): - return None + # require that referrer starts with current Host, if any + curr_host = env.get('HTTP_HOST') + if curr_host and curr_host != ref_split.netloc: + return None path = ref_split.path diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 848489e5..e07a531f 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -53,7 +53,6 @@ class ProxyRouter(object): for more details. """ - PAC_PATH = '/proxy.pac' BLOCK_SIZE = 4096 DEF_MAGIC_NAME = 'pywb.proxy' @@ -64,8 +63,6 @@ class ProxyRouter(object): 'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'} def __init__(self, routes, **kwargs): - self.hostpaths = kwargs.get('hostpaths') - self.error_view = kwargs.get('error_view') proxy_options = kwargs.get('config', {}) @@ -89,9 +86,6 @@ class ProxyRouter(object): self.unaltered = proxy_options.get('unaltered_replay', False) - self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH) - - if not proxy_options.get('enable_https_proxy'): self.ca = None self.proxy_cert_dl_view = None @@ -116,13 +110,10 @@ class ProxyRouter(object): def __call__(self, env): is_https = (env['REQUEST_METHOD'] == 'CONNECT') - # for non-https requests, check pac path and non-proxy urls + # for non-https requests, check non-proxy urls if not is_https: url = env['REL_REQUEST_URI'] - if url == self.proxy_pac_path: - return self.make_pac_response(env) - if not url.startswith(('http://', 'https://')): return None @@ -359,29 +350,3 @@ class ProxyRouter(object): content_type=content_type) else: return None - - # Proxy Auto-Config (PAC) script for the proxy - def make_pac_response(self, env): - hostname = env.get('HTTP_HOST') - if not hostname: - server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] - hostonly = env['SERVER_NAME'] - else: - server_hostport = hostname - hostonly = hostname.split(':')[0] - - buff = 'function FindProxyForURL (url, host) {\n' - - direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n' - - for hostpath in self.hostpaths: - parts = urlparse.urlsplit(hostpath).netloc.split(':') - buff += direct.format(parts[0]) - - buff += direct.format(hostonly) - - buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport) - - content_type = 'application/x-ns-proxy-autoconfig' - - return WbResponse.text_response(buff, content_type=content_type) diff --git a/pywb/framework/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py index 52009353..4509947c 100644 --- a/pywb/framework/test/test_archivalrouter.py +++ b/pywb/framework/test/test_archivalrouter.py @@ -25,14 +25,7 @@ # not matching route -- skipped >>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}) - -# Referer Redirect Test ->>> ReferRedirect('http://localhost:8080/').match_prefixs -['http://localhost:8080/'] - ->>> ReferRedirect(['http://example:9090/']).match_prefixs -['http://example:9090/'] - +# Test Refer Redirects >>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') 'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html' @@ -55,11 +48,11 @@ 'http://localhost:8080/coll/20131010/http://example.com/path/other.html' # Wrong Host ->>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') +>>> _test_redir('http://example.com:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') False # Right Host ->>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080') +>>> _test_redir('http://example.com:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html') 'http://example.com:8080/coll/20131010/http://example.com/other.html' # With custom SCRIPT_NAME @@ -87,6 +80,7 @@ False from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter from pywb.framework.basehandlers import BaseHandler, WbUrlHandler import pprint +import urlparse def _test_route_req(route, env, abs_path=False): matcher, coll = route.is_handling(env['REL_REQUEST_URI']) @@ -101,17 +95,16 @@ def _test_route_req(route, env, abs_path=False): pprint.pprint(the_dict) -def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None): +def _test_redir(match_host, request_uri, referrer, script_name='', coll='coll'): env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name} - if http_host: - env['HTTP_HOST'] = http_host + env['HTTP_HOST'] = urlparse.urlsplit(match_host).netloc routes = [Route(coll, WbUrlHandler())] the_router = ArchivalRouter(routes) - redir = ReferRedirect(match_host) + redir = ReferRedirect() #req = WbRequest.from_uri(request_uri, env) rep = redir(env, the_router) if not rep: diff --git a/tests/test_config.yaml b/tests/test_config.yaml index 468a3131..2603bb2a 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -85,16 +85,6 @@ error_html: ui/error.html # ==== Other Paths ==== -# list of host names that pywb will be running from to detect -# 'fallthrough' requests based on referrer -# -# eg: an incorrect request for http://localhost:8080/image.gif with a referrer -# of http://localhost:8080/pywb/index.html, pywb can correctly redirect -# to http://localhost:8080/pywb/image.gif -# - -#hostpaths: ['http://localhost:8080'] - # Rewrite urls with absolute paths instead of relative absoulte_paths: true diff --git a/tests/test_integration.py b/tests/test_integration.py index 7a360341..a3d9d26c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -219,16 +219,17 @@ class TestWb: assert '/pywb/20140127171251mp_/http://www.iana.org/domains/example' in resp.body def test_redirect_relative_3(self): + # webtest uses Host: localhost:80 by default # first two requests should result in same redirect - target = 'http://localhost:8080/pywb/2014mp_/http://iana.org/_css/2013.1/screen.css' + target = 'http://localhost:80/pywb/2014mp_/http://iana.org/_css/2013.1/screen.css' # without timestamp - resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')]) + resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:80/pywb/2014mp_/http://iana.org/')]) assert resp.status_int == 302 assert resp.headers['Location'] == target, resp.headers['Location'] # with timestamp - resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')]) + resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:80/pywb/2014mp_/http://iana.org/')]) assert resp.status_int == 302 assert resp.headers['Location'] == target, resp.headers['Location'] @@ -313,7 +314,7 @@ class TestWb: def test_post_redirect(self): # post handled without redirect (since 307 not allowed) - resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014mp_/http://httpbin.org/post')]) + resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:80/pywb/2014mp_/http://httpbin.org/post')]) assert resp.status_int == 200 assert '"foo": "bar"' in resp.body assert '"test": "abc"' in resp.body diff --git a/tests/test_proxy.py b/tests/test_proxy.py index 124b6b1e..cee011af 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -77,3 +77,11 @@ class TestProxyWb: resp = self.testapp.get('/x-ignore-this-x', headers = headers, extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''), status=407) + + + def test_proxy_connect_unsupported(self): + resp = self.testapp.request('/x-ignore-this-x', method='CONNECT', + environ=dict(REQUEST_URI='example:443', SCRIPT_NAME=''), + status=405) + + assert resp.status_int == 405 diff --git a/tests/test_proxy_https.py b/tests/test_proxy_https.py index d5e864fc..940f6e06 100644 --- a/tests/test_proxy_https.py +++ b/tests/test_proxy_https.py @@ -62,7 +62,7 @@ class TestHttpsProxy: def setup(self): self.session = requests.Session() - def get_url(self, url, headers=None): + def get_url(self, url): global sesh_key if sesh_key: self.session.headers.update({'Cookie': '__pywb_proxy_sesh=' + sesh_key}) @@ -174,16 +174,14 @@ class TestHttpsProxy: def test_replay_static(self): resp = self.get_url('https://pywb.proxy/static/default/wb.js') assert resp.status_code == 200 - found = u'function init_banner' in resp.text - assert found, resp.text + assert 'function init_banner' in resp.text # download index page and cert downloads def test_replay_dl_page(self): resp = self.get_url('https://pywb.proxy/') assert resp.status_code == 200 assert 'text/html' in resp.headers['content-type'] - found = u'Download' in resp.text - assert found, resp.text + assert 'Download' in resp.text def test_dl_pem(self): resp = self.get_url('https://pywb.proxy/pywb-ca.pem') @@ -194,4 +192,3 @@ class TestHttpsProxy: resp = self.get_url('https://pywb.proxy/pywb-ca.p12') assert resp.headers['content-type'] == 'application/x-pkcs12' -