1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

* config cleanup: remove the 'hostpaths' setting entirely, avoiding the need to specify the host on which pywb
will run (this was cumbersome to maintain and not really useful).
ReferRedirect now just checks that the current request's Host header, if present, matches the referrer's host,
and that the coll and script name match.
* removed proxy_pac, as it was also unneeded/unused and depended on the 'hostpaths' setting
* added test for invalid CONNECT usage (405 response)
This commit is contained in:
Ilya Kreymer 2014-08-20 02:02:47 -04:00
parent 6b476d83de
commit eaaefbfd24
8 changed files with 29 additions and 96 deletions

View File

@ -66,16 +66,6 @@ archive_paths: ./sample_archive/warcs/
# ==== Other Paths ====
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#
# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
# to http://localhost:8080/pywb/image.gif
#
#hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative
#absoulte_paths: true

View File

@ -16,12 +16,7 @@ class ArchivalRouter(object):
# optional port setting may be ignored by wsgi container
self.port = kwargs.get('port')
hostpaths = kwargs.get('hostpaths')
if hostpaths:
self.fallback = ReferRedirect(hostpaths)
else:
self.fallback = None
self.fallback = ReferRedirect()
self.abs_path = kwargs.get('abs_path')
@ -133,12 +128,6 @@ class Route(object):
# based on the referrer settings
#=================================================================
class ReferRedirect:
def __init__(self, match_prefixs):
if isinstance(match_prefixs, list):
self.match_prefixs = match_prefixs
else:
self.match_prefixs = [match_prefixs]
def __call__(self, env, the_router):
referrer = env.get('HTTP_REFERER')
@ -151,9 +140,9 @@ class ReferRedirect:
# get referrer path name
ref_split = urlparse.urlsplit(referrer)
# ensure referrer starts with one of allowed hosts
if not any(referrer.startswith(i) for i in self.match_prefixs):
if ref_split.netloc != env.get('HTTP_HOST'):
# require that the referrer's host matches the current Host header, if one is present
curr_host = env.get('HTTP_HOST')
if curr_host and curr_host != ref_split.netloc:
return None
path = ref_split.path

View File

@ -53,7 +53,6 @@ class ProxyRouter(object):
for more details.
"""
PAC_PATH = '/proxy.pac'
BLOCK_SIZE = 4096
DEF_MAGIC_NAME = 'pywb.proxy'
@ -64,8 +63,6 @@ class ProxyRouter(object):
'p3p': 'CP="NOI ADM DEV COM NAV OUR STP"'}
def __init__(self, routes, **kwargs):
self.hostpaths = kwargs.get('hostpaths')
self.error_view = kwargs.get('error_view')
proxy_options = kwargs.get('config', {})
@ -89,9 +86,6 @@ class ProxyRouter(object):
self.unaltered = proxy_options.get('unaltered_replay', False)
self.proxy_pac_path = proxy_options.get('pac_path', self.PAC_PATH)
if not proxy_options.get('enable_https_proxy'):
self.ca = None
self.proxy_cert_dl_view = None
@ -116,13 +110,10 @@ class ProxyRouter(object):
def __call__(self, env):
is_https = (env['REQUEST_METHOD'] == 'CONNECT')
# for non-https requests, check pac path and non-proxy urls
# for non-https requests, check non-proxy urls
if not is_https:
url = env['REL_REQUEST_URI']
if url == self.proxy_pac_path:
return self.make_pac_response(env)
if not url.startswith(('http://', 'https://')):
return None
@ -359,29 +350,3 @@ class ProxyRouter(object):
content_type=content_type)
else:
return None
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
    """Build the Proxy Auto-Config (PAC) script response for this proxy.

    The generated ``FindProxyForURL`` function returns ``DIRECT`` for the
    host of every configured ``hostpaths`` entry and for the host serving
    this request, and ``PROXY <host:port>`` for everything else.

    :param env: WSGI environ dict. ``HTTP_HOST`` is used when present;
        otherwise ``SERVER_NAME`` and ``SERVER_PORT`` are required.
    :return: the result of ``WbResponse.text_response`` for the script,
        with content type ``application/x-ns-proxy-autoconfig``.
    :raises KeyError: if ``HTTP_HOST`` is missing or empty and
        ``SERVER_NAME`` / ``SERVER_PORT`` are not in ``env``.
    """
    hostname = env.get('HTTP_HOST')
    if hostname:
        server_hostport = hostname
        hostonly = hostname.split(':')[0]
    else:
        server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
        hostonly = env['SERVER_NAME']

    # self.hostpaths is read with kwargs.get(), so it is None when the
    # setting is absent; a bare string is also tolerated (as in
    # ReferRedirect) instead of being iterated character by character.
    hostpaths = self.hostpaths or []
    if not isinstance(hostpaths, list):
        hostpaths = [hostpaths]

    direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'

    parts = ['function FindProxyForURL (url, host) {\n']
    for hostpath in hostpaths:
        # only the host portion of the netloc is matched; any port is dropped
        host = urlparse.urlsplit(hostpath).netloc.split(':')[0]
        parts.append(direct.format(host))

    parts.append(direct.format(hostonly))
    parts.append('\n return "PROXY {0}";\n}}\n'.format(server_hostport))

    content_type = 'application/x-ns-proxy-autoconfig'
    return WbResponse.text_response(''.join(parts),
                                    content_type=content_type)

View File

@ -25,14 +25,7 @@
# not matching route -- skipped
>>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''})
# Referer Redirect Test
>>> ReferRedirect('http://localhost:8080/').match_prefixs
['http://localhost:8080/']
>>> ReferRedirect(['http://example:9090/']).match_prefixs
['http://example:9090/']
# Test Refer Redirects
>>> _test_redir('http://localhost:8080/', '/diff_path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
'http://localhost:8080/coll/20131010/http://example.com/diff_path/other.html'
@ -55,11 +48,11 @@
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
# Wrong Host
>>> _test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
>>> _test_redir('http://example.com:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False
# Right Host
>>> _test_redir('http://localhost:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html', http_host = 'example.com:8080')
>>> _test_redir('http://example.com:8080/', '/other.html', 'http://example.com:8080/coll/20131010/http://example.com/path/page.html')
'http://example.com:8080/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME
@ -87,6 +80,7 @@ False
from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
import pprint
import urlparse
def _test_route_req(route, env, abs_path=False):
matcher, coll = route.is_handling(env['REL_REQUEST_URI'])
@ -101,17 +95,16 @@ def _test_route_req(route, env, abs_path=False):
pprint.pprint(the_dict)
def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'coll', http_host = None):
def _test_redir(match_host, request_uri, referrer, script_name='', coll='coll'):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}
if http_host:
env['HTTP_HOST'] = http_host
env['HTTP_HOST'] = urlparse.urlsplit(match_host).netloc
routes = [Route(coll, WbUrlHandler())]
the_router = ArchivalRouter(routes)
redir = ReferRedirect(match_host)
redir = ReferRedirect()
#req = WbRequest.from_uri(request_uri, env)
rep = redir(env, the_router)
if not rep:

View File

@ -85,16 +85,6 @@ error_html: ui/error.html
# ==== Other Paths ====
# list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer
#
# eg: an incorrect request for http://localhost:8080/image.gif with a referrer
# of http://localhost:8080/pywb/index.html, pywb can correctly redirect
# to http://localhost:8080/pywb/image.gif
#
#hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative
absoulte_paths: true

View File

@ -219,16 +219,17 @@ class TestWb:
assert '/pywb/20140127171251mp_/http://www.iana.org/domains/example' in resp.body
def test_redirect_relative_3(self):
# webtest uses Host: localhost:80 by default
# first two requests should result in same redirect
target = 'http://localhost:8080/pywb/2014mp_/http://iana.org/_css/2013.1/screen.css'
target = 'http://localhost:80/pywb/2014mp_/http://iana.org/_css/2013.1/screen.css'
# without timestamp
resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')])
resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:80/pywb/2014mp_/http://iana.org/')])
assert resp.status_int == 302
assert resp.headers['Location'] == target, resp.headers['Location']
# with timestamp
resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')])
resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:80/pywb/2014mp_/http://iana.org/')])
assert resp.status_int == 302
assert resp.headers['Location'] == target, resp.headers['Location']
@ -313,7 +314,7 @@ class TestWb:
def test_post_redirect(self):
# post handled without redirect (since 307 not allowed)
resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014mp_/http://httpbin.org/post')])
resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:80/pywb/2014mp_/http://httpbin.org/post')])
assert resp.status_int == 200
assert '"foo": "bar"' in resp.body
assert '"test": "abc"' in resp.body

View File

@ -77,3 +77,11 @@ class TestProxyWb:
resp = self.testapp.get('/x-ignore-this-x', headers = headers,
extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''),
status=407)
def test_proxy_connect_unsupported(self):
    """A CONNECT request to this proxy app must be answered with 405."""
    connect_environ = {'REQUEST_URI': 'example:443', 'SCRIPT_NAME': ''}

    response = self.testapp.request('/x-ignore-this-x',
                                    method='CONNECT',
                                    environ=connect_environ,
                                    status=405)

    assert response.status_int == 405

View File

@ -62,7 +62,7 @@ class TestHttpsProxy:
def setup(self):
self.session = requests.Session()
def get_url(self, url, headers=None):
def get_url(self, url):
global sesh_key
if sesh_key:
self.session.headers.update({'Cookie': '__pywb_proxy_sesh=' + sesh_key})
@ -174,16 +174,14 @@ class TestHttpsProxy:
def test_replay_static(self):
resp = self.get_url('https://pywb.proxy/static/default/wb.js')
assert resp.status_code == 200
found = u'function init_banner' in resp.text
assert found, resp.text
assert 'function init_banner' in resp.text
# download index page and cert downloads
def test_replay_dl_page(self):
resp = self.get_url('https://pywb.proxy/')
assert resp.status_code == 200
assert 'text/html' in resp.headers['content-type']
found = u'Download' in resp.text
assert found, resp.text
assert 'Download' in resp.text
def test_dl_pem(self):
resp = self.get_url('https://pywb.proxy/pywb-ca.pem')
@ -194,4 +192,3 @@ class TestHttpsProxy:
resp = self.get_url('https://pywb.proxy/pywb-ca.p12')
assert resp.headers['content-type'] == 'application/x-pkcs12'