def __call__(self, env):
    """
    Dispatch a request to the first route that claims its uri.

    Each route in self.routes is asked, in order, whether it handles
    env['REL_REQUEST_URI']. For the first match a request object is
    built with parse_request() and passed to that route's handler.

    With no matching route, the home page is rendered for '/',
    '/index.html' and '/index.htm'. Otherwise the fallback, if one is
    set, is called with the environ and this router; without a
    fallback the result is None.
    """
    uri = env['REL_REQUEST_URI']

    for candidate in self.routes:
        match, collection = candidate.is_handling(uri)
        if not match:
            continue

        request = self.parse_request(candidate, env, match,
                                     collection, uri,
                                     use_abs_prefix=self.abs_path)

        return candidate.handler(request)

    # Default Home Page
    if uri in ['/', '/index.html', '/index.htm']:
        return self.render_home_page(env)

    if not self.fallback:
        return None

    return self.fallback(env, self)


def parse_request(self, route, env, matcher, coll, request_uri,
                  use_abs_prefix=False):
    """
    Build the route's request object for an already matched uri.

    :param route: route whose request_class, handler and filters are used
    :param env: WSGI environ; 'SCRIPT_NAME' is read for the prefix
    :param matcher: match object returned by route.is_handling()
    :param coll: collection value returned by route.is_handling()
    :param request_uri: the uri that was matched
    :param use_abs_prefix: passed through to the request class
    :return: the new request, after route.apply_filters() has run on it
    """
    prefix_match = matcher.group(0)

    if not prefix_match:
        # the request_uri is the wb_url, since no coll
        rel_prefix = env['SCRIPT_NAME'] + '/'
        url_start = 1
    else:
        rel_prefix = env['SCRIPT_NAME'] + '/' + prefix_match + '/'
        # skip the leading '/', the matched text and the '/' after it
        url_start = len(prefix_match) + 2

    wb_url_str = request_uri[url_start:]

    wbrequest = route.request_class(env,
                                    request_uri=request_uri,
                                    wb_url_str=wb_url_str,
                                    rel_prefix=rel_prefix,
                                    coll=coll,
                                    use_abs_prefix=use_abs_prefix,
                                    wburl_class=route.handler.get_wburl_type(),
                                    urlrewriter_class=UrlRewriter)

    # Allow for applying of additional filters
    route.apply_filters(wbrequest, matcher)

    return wbrequest
def is_handling(self, request_uri):
    """
    Check whether this route matches request_uri.

    The uri is matched against self.regex with its first character
    removed.

    :return: (matcher, coll) where coll is group self.coll_group of
        the match, or (None, None) when the regex does not match
    """
    found = self.regex.match(request_uri[1:])
    if found:
        return found, found.group(self.coll_group)

    return None, None
def __init__(self, routes, **kwargs):
    """
    Initialize the base router, then create the proxy router.

    The ProxyRouter receives the same routes and keyword arguments
    as the base class and is stored as self.proxy.
    """
    super(ProxyArchivalRouter, self).__init__(routes, **kwargs)

    proxy_router = ProxyRouter(routes, **kwargs)
    self.proxy = proxy_router
def __init__(self, routes, **kwargs):
    """
    Store the routes and optional settings for the proxy router.

    :param routes: the routes this router keeps as self.routes
    :param kwargs: optional 'hostpaths', 'error_view' and 'auth_msg'.
        'auth_msg' is the text used for the realm and the body of the
        407 response; a default message is used when it is not given.
    """
    self.routes = routes
    self.hostpaths = kwargs.get('hostpaths')
    self.error_view = kwargs.get('error_view')

    self.auth_msg = kwargs.get('auth_msg',
        'Please enter name of a collection to use for proxy mode')


def proxy_auth_coll_response(self):
    """
    Build the 407 response that asks the client for proxy credentials.

    The response carries a 'Proxy-Authenticate: Basic' challenge whose
    realm is self.auth_msg, and the same message as its text/plain body.
    """
    # the realm is a quoted-string, so a backslash or double quote in
    # the message must be escaped to keep the header well-formed
    realm = self.auth_msg.replace('\\', '\\\\').replace('"', '\\"')
    proxy_msg = 'Basic realm="{0}"'.format(realm)

    headers = [('Content-Type', 'text/plain'),
               ('Proxy-Authenticate', proxy_msg)]

    # standard reason phrase for status 407
    status_headers = StatusAndHeaders('407 Proxy Authentication Required',
                                      headers)

    return WbResponse(status_headers, value=[self.auth_msg])


@staticmethod
def read_basic_auth_coll(value):
    """
    Extract the user name from a Basic authorization header value.

    :param value: raw header value: the scheme 'Basic' (any case),
        whitespace, then the base64 encoded 'user:password' pair
    :return: the text before the first ':' of the decoded credentials,
        or '' when the value is not a well-formed Basic credential
    """
    parts = value.split()
    if len(parts) != 2 or parts[0].lower() != 'basic':
        return ''

    try:
        user_pass = base64.b64decode(parts[1])
    except (TypeError, ValueError):
        # malformed base64 raises TypeError on Python 2 and
        # binascii.Error (a ValueError subclass) on Python 3
        return ''

    if not isinstance(user_pass, str):
        # Python 3 returns bytes; convert before splitting on ':'
        user_pass = user_pass.decode('utf-8', 'replace')

    return user_pass.split(':')[0]
def _test_route_req(route, env, abs_path=False):
    """
    Doctest helper: parse env's request through route and print it.

    Returns None without printing when the route does not match
    env['REL_REQUEST_URI']. Otherwise the request is built by an
    ArchivalRouter holding only this route, and its 'request_uri',
    'wb_prefix', 'wb_url' and 'coll' attributes are pretty-printed.
    """
    request_uri = env['REL_REQUEST_URI']

    matcher, coll = route.is_handling(request_uri)
    if not matcher:
        return

    the_router = ArchivalRouter([route], abs_path=abs_path)
    req = the_router.parse_request(route, env, matcher, coll,
                                   request_uri, abs_path)

    attrs = vars(req)
    shown = ('request_uri', 'wb_prefix', 'wb_url', 'coll')
    pprint.pprint(dict((key, attrs[key]) for key in shown))
def _proxy_auth_get(self, proxy_auth, **kwargs):
    """
    Issue the GET shared by the proxy auth tests.

    The request is sent with REQUEST_URI 'http://www.iana.org/', an
    empty SCRIPT_NAME and the given Proxy-Authorization header value.
    Extra keyword arguments (e.g. status=407) go to testapp.get().
    """
    headers = [('Proxy-Authorization', proxy_auth)]
    return self.testapp.get('/x-ignore-this-x', headers = headers,
                            extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''),
                            **kwargs)


def test_proxy_replay_auth_filtered(self):
    # credentials 'pywb-filt-2:' -- user name followed by an empty password
    resp = self._proxy_auth_get('Basic ' + base64.b64encode('pywb-filt-2:'))

    self._assert_basic_html(resp)

    assert 'Sun, Jan 26 2014 20:06:24' in resp.body
    assert 'wb.js' in resp.body


def test_proxy_replay_auth(self):
    # credentials 'pywb' -- user name only, no ':' separator
    resp = self._proxy_auth_get('Basic ' + base64.b64encode('pywb'))

    self._assert_basic_html(resp)

    assert 'Mon, Jan 27 2014 17:12:38' in resp.body
    assert 'wb.js' in resp.body


def test_proxy_replay_auth_no_coll(self):
    # well-formed credentials for the name 'no-such-coll': expect a 407
    resp = self._proxy_auth_get('Basic ' + base64.b64encode('no-such-coll'),
                                status=407)

    assert resp.status_int == 407


def test_proxy_replay_auth_invalid(self):
    # header value without a 'Basic ' scheme prefix: expect a 407
    resp = self._proxy_auth_get('abc' + base64.b64encode('no-such-coll'),
                                status=407)

    assert resp.status_int == 407