1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

* config cleanup: remove 'hostpaths' setting entirely, avoiding the need to specify the host on which pywb

will run (this was cumbersome to maintain and not really useful)
ReferRedirect now only checks that the current request's Host header, if present, matches the host of the referrer,
and that the coll and script name match.
* removed proxy_pac, as it was also unneeded/unused and required the 'hostpaths' setting
* added test for invalid CONNECT usage (405 response)
This commit is contained in:
Ilya Kreymer 2014-08-20 02:02:47 -04:00
parent 6b476d83de
commit eaaefbfd24
8 changed files with 29 additions and 96 deletions

View File

@ -66,16 +66,6 @@ archive_paths: ./sample_archive/warcs/
# ==== Other Paths ==== # ==== Other Paths ====
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#
# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
# to http://localhost:8080/pywb/image.gif
#
#hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative # Rewrite urls with absolute paths instead of relative
#absoulte_paths: true #absoulte_paths: true

View File

@ -16,12 +16,7 @@ class ArchivalRouter(object):
# optional port setting may be ignored by wsgi container # optional port setting may be ignored by wsgi container
self.port = kwargs.get('port') self.port = kwargs.get('port')
hostpaths = kwargs.get('hostpaths') self.fallback = ReferRedirect()
if hostpaths:
self.fallback = ReferRedirect(hostpaths)
else:
self.fallback = None
self.abs_path = kwargs.get('abs_path') self.abs_path = kwargs.get('abs_path')
@ -133,12 +128,6 @@ class Route(object):
# based on the referrer settings # based on the referrer settings
#================================================================= #=================================================================
class ReferRedirect: class ReferRedirect:
def __init__(self, match_prefixs):
if isinstance(match_prefixs, list):
self.match_prefixs = match_prefixs
else:
self.match_prefixs = [match_prefixs]
def __call__(self, env, the_router): def __call__(self, env, the_router):
referrer = env.get('HTTP_REFERER') referrer = env.get('HTTP_REFERER')
@ -151,10 +140,10 @@ class ReferRedirect:
# get referrer path name # get referrer path name
ref_split = urlparse.urlsplit(referrer) ref_split = urlparse.urlsplit(referrer)
# ensure referrer starts with one of allowed hosts # require that referrer starts with current Host, if any
if not any(referrer.startswith(i) for i in self.match_prefixs): curr_host = env.get('HTTP_HOST')
if ref_split.netloc != env.get('HTTP_HOST'): if curr_host and curr_host != ref_split.netloc:
return None return None
path = ref_split.path path = ref_split.path

View File

@ -53,7 +53,6 @@ class ProxyRouter(object):
for more details. for more details.
""" """
PAC_PATH = '/proxy.pac'
BLOCK_SIZE = 4096 BLOCK_SIZE = 4096
DEF_MAGIC_NAME = 'pywb.proxy' DEF_MAGIC_NAME = 'pywb.proxy'
@ -64,8 +63,6 @@ class ProxyRouter(object):
'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'} 'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'}
def __init__(self, routes, **kwargs): def __init__(self, routes, **kwargs):
self.hostpaths = kwargs.get('hostpaths')
self.error_view = kwargs.get('error_view') self.error_view = kwargs.get('error_view')
proxy_options = kwargs.get('config', {}) proxy_options = kwargs.get('config', {})
@ -89,9 +86,6 @@ class ProxyRouter(object):
self.unaltered = proxy_options.get('unaltered_replay', False) self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
if not proxy_options.get('enable_https_proxy'): if not proxy_options.get('enable_https_proxy'):
self.ca = None self.ca = None
self.proxy_cert_dl_view = None self.proxy_cert_dl_view = None
@ -116,13 +110,10 @@ class ProxyRouter(object):
def __call__(self, env): def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT') is_https = (env['REQUEST_METHOD'] == 'CONNECT')
# for non-https requests, check pac path and non-proxy urls # for non-https requests, check non-proxy urls
if not is_https: if not is_https:
url = env['REL_REQUEST_URI'] url = env['REL_REQUEST_URI']
if url == self.proxy_pac_path:
return self.make_pac_response(env)
if not url.startswith(('http://', 'https://')): if not url.startswith(('http://', 'https://')):
return None return None
@ -359,29 +350,3 @@ class ProxyRouter(object):
content_type=content_type) content_type=content_type)
else: else:
return None return None
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
hostname = env.get('HTTP_HOST')
if not hostname:
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
hostonly = env['SERVER_NAME']
else:
server_hostport = hostname
hostonly = hostname.split(':')[0]
buff = 'function FindProxyForURL (url, host) {\n'
direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
for hostpath in self.hostpaths:
parts = urlparse.urlsplit(hostpath).netloc.split(':')
buff += direct.format(parts[0])
buff += direct.format(hostonly)
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
content_type = 'application/x-ns-proxy-autoconfig'
return WbResponse.text_response(buff, content_type=content_type)

View File

@ -25,14 +25,7 @@
# not matching route -- skipped # not matching route -- skipped
>>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}) >>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''})
# Test Refer Redirects
# Referer Redirect Test
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']
>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']
>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') >>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html' 'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
@ -55,11 +48,11 @@
'http://localhost:8080/coll/20131010/http://example.com/path/other.html' 'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host # Wrong Host
>>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') >>> _test_redir('http://example.com:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False False
# Right Host # Right Host
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080') >>> _test_redir('http://example.com:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html')
'http://example.com:8080/coll/20131010/http://example.com/other.html' 'http://example.com:8080/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME # With custom SCRIPT_NAME
@ -87,6 +80,7 @@ False
from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
import pprint import pprint
import urlparse
def _test_route_req(route, env, abs_path=False): def _test_route_req(route, env, abs_path=False):
matcher, coll = route.is_handling(env['REL_REQUEST_URI']) matcher, coll = route.is_handling(env['REL_REQUEST_URI'])
@ -101,17 +95,16 @@ def _test_route_req(route, env, abs_path=False):
pprint.pprint(the_dict) pprint.pprint(the_dict)
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None): def _test_redir(match_host, request_uri, referrer, script_name='', coll='coll'):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name} env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
if http_host: env['HTTP_HOST'] = urlparse.urlsplit(match_host).netloc
env['HTTP_HOST'] = http_host
routes = [Route(coll, WbUrlHandler())] routes = [Route(coll, WbUrlHandler())]
the_router = ArchivalRouter(routes) the_router = ArchivalRouter(routes)
redir = ReferRedirect(match_host) redir = ReferRedirect()
#req = WbRequest.from_uri(request_uri, env) #req = WbRequest.from_uri(request_uri, env)
rep = redir(env, the_router) rep = redir(env, the_router)
if not rep: if not rep:

View File

@ -85,16 +85,6 @@ error_html: ui/error.html
# ==== Other Paths ==== # ==== Other Paths ====
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#
# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
# to http://localhost:8080/pywb/image.gif
#
#hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative # Rewrite urls with absolute paths instead of relative
absoulte_paths: true absoulte_paths: true

View File

@ -219,16 +219,17 @@ class TestWb:
assert '/pywb/20140127171251mp_/http://www.iana.org/domains/example' in resp.body assert '/pywb/20140127171251mp_/http://www.iana.org/domains/example' in resp.body
def test_redirect_relative_3(self): def test_redirect_relative_3(self):
# webtest uses Host: localhost:80 by default
# first two requests should result in same redirect # first two requests should result in same redirect
target = 'http://localhost:8080/pywb/2014mp_/http://iana.org/_css/2013.1/screen.css' target = 'http://localhost:80/pywb/2014mp_/http://iana.org/_css/2013.1/screen.css'
# without timestamp # without timestamp
resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')]) resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:80/pywb/2014mp_/http://iana.org/')])
assert resp.status_int == 302 assert resp.status_int == 302
assert resp.headers['Location'] == target, resp.headers['Location'] assert resp.headers['Location'] == target, resp.headers['Location']
# with timestamp # with timestamp
resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')]) resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:80/pywb/2014mp_/http://iana.org/')])
assert resp.status_int == 302 assert resp.status_int == 302
assert resp.headers['Location'] == target, resp.headers['Location'] assert resp.headers['Location'] == target, resp.headers['Location']
@ -313,7 +314,7 @@ class TestWb:
def test_post_redirect(self): def test_post_redirect(self):
# post handled without redirect (since 307 not allowed) # post handled without redirect (since 307 not allowed)
resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014mp_/http://httpbin.org/post')]) resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:80/pywb/2014mp_/http://httpbin.org/post')])
assert resp.status_int == 200 assert resp.status_int == 200
assert '"foo": "bar"' in resp.body assert '"foo": "bar"' in resp.body
assert '"test": "abc"' in resp.body assert '"test": "abc"' in resp.body

View File

@ -77,3 +77,11 @@ class TestProxyWb:
resp = self.testapp.get('/x-ignore-this-x', headers = headers, resp = self.testapp.get('/x-ignore-this-x', headers = headers,
extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''), extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''),
status=407) status=407)
def test_proxy_connect_unsupported(self):
resp = self.testapp.request('/x-ignore-this-x', method='CONNECT',
environ=dict(REQUEST_URI='example:443', SCRIPT_NAME=''),
status=405)
assert resp.status_int == 405

View File

@ -62,7 +62,7 @@ class TestHttpsProxy:
def setup(self): def setup(self):
self.session = requests.Session() self.session = requests.Session()
def get_url(self, url, headers=None): def get_url(self, url):
global sesh_key global sesh_key
if sesh_key: if sesh_key:
self.session.headers.update({'Cookie': '__pywb_proxy_sesh=' + sesh_key}) self.session.headers.update({'Cookie': '__pywb_proxy_sesh=' + sesh_key})
@ -174,16 +174,14 @@ class TestHttpsProxy:
def test_replay_static(self): def test_replay_static(self):
resp = self.get_url('https://pywb.proxy/static/default/wb.js') resp = self.get_url('https://pywb.proxy/static/default/wb.js')
assert resp.status_code == 200 assert resp.status_code == 200
found = u'function init_banner' in resp.text assert 'function init_banner' in resp.text
assert found, resp.text
# download index page and cert downloads # download index page and cert downloads
def test_replay_dl_page(self): def test_replay_dl_page(self):
resp = self.get_url('https://pywb.proxy/') resp = self.get_url('https://pywb.proxy/')
assert resp.status_code == 200 assert resp.status_code == 200
assert 'text/html' in resp.headers['content-type'] assert 'text/html' in resp.headers['content-type']
found = u'Download' in resp.text assert 'Download' in resp.text
assert found, resp.text
def test_dl_pem(self): def test_dl_pem(self):
resp = self.get_url('https://pywb.proxy/pywb-ca.pem') resp = self.get_url('https://pywb.proxy/pywb-ca.pem')
@ -194,4 +192,3 @@ class TestHttpsProxy:
resp = self.get_url('https://pywb.proxy/pywb-ca.p12') resp = self.get_url('https://pywb.proxy/pywb-ca.p12')
assert resp.headers['content-type'] == 'application/x-pkcs12' assert resp.headers['content-type'] == 'application/x-pkcs12'