From 02326a2b12d669e82dfa78290ecc08b73e36369d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Jul 2014 17:02:28 -0700 Subject: [PATCH 01/26] bump dev version to 0.4.8 --- README.rst | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 2fc4a3f8..c7d7cbc7 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.4.7 +PyWb 0.4.8 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/setup.py b/setup.py index 018af47f..8f788bb2 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.4.7', + version='0.4.8', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From daffc7ff5d07a070d0b89cccd32f38e2aa239fdb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 7 Jul 2014 17:02:44 -0700 Subject: [PATCH 02/26] header rewrite: pass through 'content-range' header --- pywb/rewrite/header_rewriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/rewrite/header_rewriter.py b/pywb/rewrite/header_rewriter.py index 2b22c000..2dfc824d 100644 --- a/pywb/rewrite/header_rewriter.py +++ b/pywb/rewrite/header_rewriter.py @@ -31,7 +31,7 @@ class HeaderRewriter: 'xml': ['/xml', '+xml', '.xml', '.rss'], } - PROXY_HEADERS = ['content-type', 'content-disposition'] + PROXY_HEADERS = ['content-type', 'content-disposition', 'content-range'] URL_REWRITE_HEADERS = ['location', 'content-location', 'content-base'] From 1317b2b10fa9680863570dd8675b3b7e290ce910 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 10 Jul 2014 21:54:23 -0700 Subject: [PATCH 03/26] route selection via proxy auth! 
refactor route request parsing to happen in the actual router class instead of in the route in proxy mode, add support for picking a route via proxy-auth improve test for 'top' rewriting --- pywb/framework/archivalrouter.py | 99 +++++++++++----------- pywb/framework/proxy.py | 80 ++++++++++++++--- pywb/framework/test/test_archivalrouter.py | 23 +++-- pywb/rewrite/test/test_rewrite_live.py | 3 +- sample_archive/text_content/toptest.js | 1 + tests/test_config.yaml | 4 + tests/test_integration.py | 37 ++++++++ 7 files changed, 181 insertions(+), 66 deletions(-) create mode 100644 sample_archive/text_content/toptest.js diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 7404b35b..1b027488 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -29,16 +29,49 @@ class ArchivalRouter(object): self.error_view = kwargs.get('error_view') def __call__(self, env): + request_uri = env['REL_REQUEST_URI'] + for route in self.routes: - result = route(env, self.abs_path) - if result: - return result + matcher, coll = route.is_handling(request_uri) + if matcher: + wbrequest = self.parse_request(route, env, matcher, + coll, request_uri, + use_abs_prefix=self.abs_path) + + return route.handler(wbrequest) # Default Home Page - if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']: + if request_uri in ['/', '/index.html', '/index.htm']: return self.render_home_page(env) - return self.fallback(env, self.routes) if self.fallback else None + return self.fallback(env, self) if self.fallback else None + + def parse_request(self, route, env, matcher, coll, request_uri, + use_abs_prefix=False): + matched_str = matcher.group(0) + if matched_str: + rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' + # remove the '/' + rel_prefix part of uri + wb_url_str = request_uri[len(matched_str) + 2:] + else: + rel_prefix = env['SCRIPT_NAME'] + '/' + # the request_uri is the wb_url, since no coll + wb_url_str = 
request_uri[1:] + + wbrequest = route.request_class(env, + request_uri=request_uri, + wb_url_str=wb_url_str, + rel_prefix=rel_prefix, + coll=coll, + use_abs_prefix=use_abs_prefix, + wburl_class=route.handler.get_wburl_type(), + urlrewriter_class=UrlRewriter) + + # Allow for applying of additional filters + route.apply_filters(wbrequest, matcher) + + return wbrequest + def render_home_page(self, env): # render the homepage! @@ -73,45 +106,15 @@ class Route(object): self.coll_group = coll_group self._custom_init(config) - def __call__(self, env, use_abs_prefix): - wbrequest = self.parse_request(env, use_abs_prefix) - return self.handler(wbrequest) if wbrequest else None - - def parse_request(self, env, use_abs_prefix, request_uri=None): - if not request_uri: - request_uri = env['REL_REQUEST_URI'] - + def is_handling(self, request_uri): matcher = self.regex.match(request_uri[1:]) if not matcher: - return None - - matched_str = matcher.group(0) - if matched_str: - rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' - # remove the '/' + rel_prefix part of uri - wb_url_str = request_uri[len(matched_str) + 2:] - else: - rel_prefix = env['SCRIPT_NAME'] + '/' - # the request_uri is the wb_url, since no coll - wb_url_str = request_uri[1:] + return None, None coll = matcher.group(self.coll_group) + return matcher, coll - wbrequest = self.request_class(env, - request_uri=request_uri, - wb_url_str=wb_url_str, - rel_prefix=rel_prefix, - coll=coll, - use_abs_prefix=use_abs_prefix, - wburl_class=self.handler.get_wburl_type(), - urlrewriter_class=UrlRewriter) - - # Allow for applying of additional filters - self._apply_filters(wbrequest, matcher) - - return wbrequest - - def _apply_filters(self, wbrequest, matcher): + def apply_filters(self, wbrequest, matcher): for filter in self.filters: last_grp = len(matcher.groups()) filter_str = filter.format(matcher.group(last_grp)) @@ -136,9 +139,11 @@ class ReferRedirect: else: self.match_prefixs = [match_prefixs] - def 
__call__(self, env, routes): + def __call__(self, env, the_router): referrer = env.get('HTTP_REFERER') + routes = the_router.routes + # ensure there is a referrer if referrer is None: return None @@ -166,17 +171,15 @@ class ReferRedirect: ref_request = None for route in routes: - ref_request = route.parse_request(env, False, request_uri=path) - if ref_request: + matcher, coll = route.is_handling(path) + if matcher: + ref_request = the_router.parse_request(route, env, + matcher, coll, path) ref_route = route break - # must have matched one of the routes - if not ref_request: - return None - - # must have a rewriter - if not ref_request.urlrewriter: + # must have matched one of the routes with a urlrewriter + if not ref_request or not ref_request.urlrewriter: return None rewriter = ref_request.urlrewriter diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 2ab0c9bc..faf6b72e 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -1,8 +1,11 @@ from wbrequestresponse import WbResponse, WbRequest from archivalrouter import ArchivalRouter + import urlparse +import base64 from pywb.rewrite.url_rewriter import HttpsUrlRewriter +from pywb.utils.statusandheaders import StatusAndHeaders #================================================================= @@ -15,10 +18,7 @@ class ProxyArchivalRouter(ArchivalRouter): """ def __init__(self, routes, **kwargs): super(ProxyArchivalRouter, self).__init__(routes, **kwargs) - request_class = routes[0].request_class - self.proxy = ProxyRouter(routes[0].handler, - request_class=request_class, - **kwargs) + self.proxy = ProxyRouter(routes, **kwargs) def __call__(self, env): response = self.proxy(env) @@ -43,12 +43,14 @@ class ProxyRouter(object): See: http://www.mementoweb.org/guide/rfc/#Pattern1.3 for more details. 
""" - def __init__(self, handler, **kwargs): - self.handler = handler + def __init__(self, routes, **kwargs): + self.routes = routes self.hostpaths = kwargs.get('hostpaths') self.error_view = kwargs.get('error_view') - self.request_class = kwargs.get('request_class') + + self.auth_msg = kwargs.get('auth_msg', + 'Please enter name of a collection to use for proxy mode') def __call__(self, env): url = env['REL_REQUEST_URI'] @@ -59,16 +61,50 @@ class ProxyRouter(object): if not url.startswith('http://'): return None - wbrequest = self.request_class(env, + proxy_auth = env.get('HTTP_PROXY_AUTHORIZATION') + + route = None + coll = None + matcher = None + + if proxy_auth: + proxy_coll = self.read_basic_auth_coll(proxy_auth) + proxy_coll = '/' + proxy_coll + '/' + + if not proxy_coll: + return self.proxy_auth_coll_response() + + for r in self.routes: + matcher, c = r.is_handling(proxy_coll) + print r.regex.pattern + if matcher: + route = r + coll = c + break + + if not route: + return self.proxy_auth_coll_response() + + print 'COLL ', coll + + else: + route = self.routes[0] + coll = self.routes[0].regex.pattern + + wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, + coll=coll, host_prefix=self.hostpaths[0], - wburl_class=self.handler.get_wburl_type(), + wburl_class=route.handler.get_wburl_type(), urlrewriter_class=HttpsUrlRewriter, use_abs_prefix=False, is_proxy=True) - return self.handler(wbrequest) + if matcher: + route.apply_filters(wbrequest, matcher) + + return route.handler(wbrequest) # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): @@ -97,3 +133,27 @@ class ProxyRouter(object): content_type = 'application/x-ns-proxy-autoconfig' return WbResponse.text_response(buff, content_type=content_type) + + def proxy_auth_coll_response(self): + proxy_msg = 'Basic realm="{0}"'.format(self.auth_msg) + + headers = [('Content-Type', 'text/plain'), + ('Proxy-Authenticate', proxy_msg)] + + status_headers = StatusAndHeaders('407 
Proxy Authentication', headers) + + value = self.auth_msg + + return WbResponse(status_headers, value=[value]) + + @staticmethod + def read_basic_auth_coll(value): + parts = value.split(' ') + if parts[0].lower() != 'basic': + return '' + + if len(parts) != 2: + return '' + + user_pass = base64.b64decode(parts[1]) + return user_pass.split(':')[0] diff --git a/pywb/framework/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py index b27f5f45..52009353 100644 --- a/pywb/framework/test/test_archivalrouter.py +++ b/pywb/framework/test/test_archivalrouter.py @@ -1,7 +1,7 @@ """ # Test WbRequest parsed via a Route # route with relative path, print resulting wbrequest ->>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)) +>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}) {'coll': 'web', 'request_uri': '/web/test.example.com', 'wb_prefix': '/web/', @@ -9,21 +9,21 @@ # route with absolute path, running at script /my_pywb, print resultingwbrequest ->>> print_req(Route('web', WbUrlHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)) +>>> _test_route_req(Route('web', WbUrlHandler()), {'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True) {'coll': 'web', 'request_uri': '/web/2013im_/test.example.com', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com')} # route with no collection ->>> print_req(Route('', BaseHandler())({'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}, False)) +>>> _test_route_req(Route('', BaseHandler()), {'REL_REQUEST_URI': 'http://example.com', 'SCRIPT_NAME': '/pywb'}) {'coll': '', 'request_uri': 
'http://example.com', 'wb_prefix': '/pywb/', 'wb_url': None} # not matching route -- skipped ->>> Route('web', BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) +>>> _test_route_req(Route('web', BaseHandler()), {'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}) # Referer Redirect Test @@ -84,11 +84,18 @@ False """ -from pywb.framework.archivalrouter import Route, ReferRedirect +from pywb.framework.archivalrouter import Route, ReferRedirect, ArchivalRouter from pywb.framework.basehandlers import BaseHandler, WbUrlHandler import pprint -def print_req(req): +def _test_route_req(route, env, abs_path=False): + matcher, coll = route.is_handling(env['REL_REQUEST_URI']) + if not matcher: + return + + the_router = ArchivalRouter([route], abs_path=abs_path) + req = the_router.parse_request(route, env, matcher, coll, env['REL_REQUEST_URI'], abs_path) + varlist = vars(req) the_dict = dict((k, varlist[k]) for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')) pprint.pprint(the_dict) @@ -102,9 +109,11 @@ def _test_redir(match_host, request_uri, referrer, script_name = '', coll = 'col routes = [Route(coll, WbUrlHandler())] + the_router = ArchivalRouter(routes) + redir = ReferRedirect(match_host) #req = WbRequest.from_uri(request_uri, env) - rep = redir(env, routes) + rep = redir(env, the_router) if not rep: return False diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index 1b2faacc..24f76da1 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -83,7 +83,8 @@ def test_example_domain_specific_3(): assert '/* Bootloader.configurePage' in buff def test_wombat_top(): - status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', urlrewriter) + #status_headers, buff = get_rewritten('https://assets-cdn.github.com/assets/github-0f06d0f46fe7bcfbf31f2380f23aec15ba21b8ec.js', 
urlrewriter) + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/toptest.js', urlrewriter) assert 'WB_wombat_top!==window' in buff diff --git a/sample_archive/text_content/toptest.js b/sample_archive/text_content/toptest.js new file mode 100644 index 00000000..53af2de0 --- /dev/null +++ b/sample_archive/text_content/toptest.js @@ -0,0 +1 @@ +!function(){top!==window&&(alert("For security reasons, framing is not allowed."),top.location.replace(document.location))} diff --git a/tests/test_config.yaml b/tests/test_config.yaml index bace37eb..bbb96849 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -17,6 +17,10 @@ collections: index_paths: './sample_archive/cdx/' filters: ['filename:dupe*'] + pywb-filt-2: + index_paths: './sample_archive/cdx/' + filters: ['!filename:dupe*'] + pywb-nonframe: index_paths: './sample_archive/cdx/' framed_replay: false diff --git a/tests/test_integration.py b/tests/test_integration.py index 7e915acd..9427e2af 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,5 +1,6 @@ from pytest import raises import webtest +import base64 from pywb.webapp.pywb_init import create_wb_router from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject @@ -317,6 +318,42 @@ class TestWb: assert 'Sun, Jan 26 2014 20:11:27' in resp.body assert 'wb.js' in resp.body + def test_proxy_replay_auth_filtered(self): + headers = [('Proxy-Authorization', 'Basic ' + base64.b64encode('pywb-filt-2:'))] + resp = self.testapp.get('/x-ignore-this-x', headers = headers, + extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = '')) + + self._assert_basic_html(resp) + + assert 'Sun, Jan 26 2014 20:06:24' in resp.body + assert 'wb.js' in resp.body + + def test_proxy_replay_auth(self): + headers = [('Proxy-Authorization', 'Basic ' + base64.b64encode('pywb'))] + resp = self.testapp.get('/x-ignore-this-x', headers = headers, + extra_environ = dict(REQUEST_URI = 
'http://www.iana.org/', SCRIPT_NAME = '')) + + self._assert_basic_html(resp) + + assert 'Mon, Jan 27 2014 17:12:38' in resp.body + assert 'wb.js' in resp.body + + def test_proxy_replay_auth_no_coll(self): + headers = [('Proxy-Authorization', 'Basic ' + base64.b64encode('no-such-coll'))] + resp = self.testapp.get('/x-ignore-this-x', headers = headers, + extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''), + status=407) + + assert resp.status_int == 407 + + def test_proxy_replay_auth_invalid(self): + headers = [('Proxy-Authorization', 'abc' + base64.b64encode('no-such-coll'))] + resp = self.testapp.get('/x-ignore-this-x', headers = headers, + extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''), + status=407) + + assert resp.status_int == 407 + def test_proxy_pac(self): resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080')) assert resp.content_type == 'application/x-ns-proxy-autoconfig' From 1b1a1f811508e757418acf2b809a03d63faeabda Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 14 Jul 2014 19:12:30 -0700 Subject: [PATCH 04/26] proxy: add 'proxy_coll_select' config which will require a proxy-auth to select a collection for proxy mode. 
Otherwise, defaults to first available collection, though proxy-auth can still be sent to specify different collection --- config.yaml | 4 ++++ pywb/framework/proxy.py | 13 +++++++++---- pywb/webapp/pywb_init.py | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/config.yaml b/config.yaml index 534eb4ff..91051b81 100644 --- a/config.yaml +++ b/config.yaml @@ -91,6 +91,10 @@ static_routes: # Enable simple http proxy mode enable_http_proxy: true +# additional options for routing +routing_options: + proxy_coll_select: false + # enable cdx server api for querying cdx directly (experimental) enable_cdx_api: true diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index faf6b72e..21bb65b5 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -49,9 +49,15 @@ class ProxyRouter(object): self.error_view = kwargs.get('error_view') - self.auth_msg = kwargs.get('auth_msg', + routing_options = kwargs.get('routing_options') + if not routing_options: + routing_options = {} + + self.auth_msg = routing_options.get('auth_msg', 'Please enter name of a collection to use for proxy mode') + self.proxy_coll_select = routing_options.get('proxy_coll_select', False) + def __call__(self, env): url = env['REL_REQUEST_URI'] @@ -76,7 +82,6 @@ class ProxyRouter(object): for r in self.routes: matcher, c = r.is_handling(proxy_coll) - print r.regex.pattern if matcher: route = r coll = c @@ -85,8 +90,8 @@ class ProxyRouter(object): if not route: return self.proxy_auth_coll_response() - print 'COLL ', coll - + elif self.proxy_coll_select: + return self.proxy_auth_coll_response() else: route = self.routes[0] coll = self.routes[0].regex.pattern diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 3ec39dfc..b3ff1448 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -237,5 +237,7 @@ def create_wb_router(passed_config={}): 'Home Page'), error_view=J2TemplateView.create_template(config.get('error_html'), - 'Error Page') + 
'Error Page'), + + routing_options=config.get('routing_options') ) From 7032160cf9327565948315de15b65433458c6c68 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 14 Jul 2014 19:13:19 -0700 Subject: [PATCH 05/26] rewrite: fix rel url resolution to better handle parent rel path. Explicitly resolve path when possible, remove only if at root level --- pywb/rewrite/test/test_regex_rewriters.py | 3 +++ pywb/rewrite/url_rewriter.py | 32 ++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index cbd2cb21..4391edee 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -86,6 +86,9 @@ r""" >>> _test_css("background: url(file.jpeg)") 'background: url(/web/20131010em_/http://example.com/file.jpeg)' +>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')") +"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')" + >>> _test_css("background: url('')") "background: url('')" diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 79136ff5..236aba96 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -57,7 +57,7 @@ class UrlRewriter(object): else: # optimize: join if not absolute url, otherwise just use that if not is_abs: - new_url = urlparse.urljoin(wburl.url, url).replace('../', '') + new_url = self.urljoin(wburl.url, url) else: new_url = url @@ -92,6 +92,36 @@ class UrlRewriter(object): def __repr__(self): return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix) + @staticmethod + def urljoin(orig_url, url): + new_url = urlparse.urljoin(orig_url, url) + if '../' not in new_url: + return new_url + + parts = urlparse.urlsplit(new_url) + scheme, netloc, path, query, frag = parts + + path_parts = path.split('/') + i = len(path_parts) - 1 + while i >= 0: + if path_parts[i] == '..': + del 
path_parts[i] + if i > 0: + del path_parts[i - 1] + i -= 1 + i -= 1 + + if path_parts == ['']: + path = '/' + else: + path = '/'.join(path_parts) + + parts = (scheme, netloc, path, query, frag) + + + new_url = urlparse.urlunsplit(parts) + return new_url + #================================================================= class HttpsUrlRewriter(object): From e858b8faaeecd69405cb32c03a98264d1946f05d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 14 Jul 2014 20:50:45 -0700 Subject: [PATCH 06/26] rewrite: better fix for multiple ../ in urls, additional tests --- pywb/rewrite/test/test_regex_rewriters.py | 3 +++ pywb/rewrite/test/test_url_rewriter.py | 17 +++++++++++++++++ pywb/rewrite/url_rewriter.py | 10 +++++++--- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 4391edee..29783cdd 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -89,6 +89,9 @@ r""" >>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')") "background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')" +>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')") +"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')" + >>> _test_css("background: url('')") "background: url('')" diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index 59669b96..a4173d3a 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -1,4 +1,21 @@ """ +# urljoin tests + +>>> UrlRewriter.urljoin('http://example.com/test/', '../file.html') +'http://example.com/file.html' + +>>> UrlRewriter.urljoin('http://example.com/test/', '../path/../../../file.html') +'http://example.com/file.html' + +>>> UrlRewriter.urljoin('http://example.com/test/', '/../file.html') 
+'http://example.com/file.html' + +>>> UrlRewriter.urljoin('http://example.com/', '/abc/../../file.html') +'http://example.com/file.html' + +>>> UrlRewriter.urljoin('http://example.com/path/more/', 'abc/../../file.html') +'http://example.com/path/file.html' + # UrlRewriter tests >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'https://web.archive.org/web/20131010/http://example.com/path/other.html' diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 236aba96..70d5d2a8 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -102,14 +102,18 @@ class UrlRewriter(object): scheme, netloc, path, query, frag = parts path_parts = path.split('/') - i = len(path_parts) - 1 - while i >= 0: + i = 0 + n = len(path_parts) - 1 + while i < n: if path_parts[i] == '..': del path_parts[i] + n -= 1 if i > 0: del path_parts[i - 1] + n -= 1 i -= 1 - i -= 1 + else: + i += 1 if path_parts == ['']: path = '/' From fa52e0126d44046d3bb8ff19296bf153de665f16 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 15 Jul 2014 12:52:42 -0700 Subject: [PATCH 07/26] cookies: support client side rewriting of document.cookie -> WB_wombat_cookie to rewrite cookie path, if present --- pywb/rewrite/regex_rewriters.py | 1 + pywb/static/wombat.js | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index c2359c1e..3f440eee 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -126,6 +126,7 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): (r'(? 
Date: Tue, 15 Jul 2014 12:57:02 -0700 Subject: [PATCH 08/26] cookie: add test for 'document.cookie' rewriting --- pywb/rewrite/test/test_regex_rewriters.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 29783cdd..3f3b4638 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -53,6 +53,10 @@ r""" >>> _test_js('cool_Location = "//example.com/abc.html" //comment') 'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' +# document.cookie test +>>> _test_js('document.cookie = "a=b; Path=/"') +'document.WB_wombat_cookie = "a=b; Path=/"' + #================================================================= # XML Rewriting From 96fcaab521398315433bf76c18a327742fe1cdcf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 19 Jul 2014 14:43:28 -0700 Subject: [PATCH 09/26] live-rewrite-server: add ability to specify http/https proxy for live fetching (for example, for use with a recording proxy) --- pywb/apps/live_rewrite_server.py | 25 +++++++++++++++++++++-- pywb/framework/test/test_wsgi_wrapper.py | 2 +- pywb/framework/wsgi_wrappers.py | 17 +++------------- pywb/rewrite/rewrite_live.py | 26 ++++++++---------------- pywb/webapp/live_rewrite_handler.py | 6 +++--- pywb/webapp/replay_views.py | 4 +++- tests/test_live_rewriter.py | 3 ++- 7 files changed, 44 insertions(+), 39 deletions(-) diff --git a/pywb/apps/live_rewrite_server.py b/pywb/apps/live_rewrite_server.py index 9b29e42b..8d3544f3 100644 --- a/pywb/apps/live_rewrite_server.py +++ b/pywb/apps/live_rewrite_server.py @@ -2,15 +2,36 @@ from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server from pywb.webapp.live_rewrite_handler import create_live_rewriter_app +from argparse import ArgumentParser + + #================================================================= -# init cdx server app +# init rewrite server app 
#================================================================= -application = init_app(create_live_rewriter_app, load_yaml=False) +def create_app(): + parser = ArgumentParser(description='Live Rewrite Server') + + parser.add_argument('-x', '--proxy', + action='store', + help='Specify host:port to use as HTTP/S proxy') + + result, unknown = parser.parse_known_args() + + config=dict(proxyhostport=result.proxy, framed_replay=True) + + app = init_app(create_live_rewriter_app, load_yaml=False, + config=config) + + return app + + +application = create_app() def main(): # pragma: no cover start_wsgi_server(application, 'Live Rewriter App', default_port=8090) + if __name__ == "__main__": main() diff --git a/pywb/framework/test/test_wsgi_wrapper.py b/pywb/framework/test/test_wsgi_wrapper.py index f3d65135..e46cded5 100644 --- a/pywb/framework/test/test_wsgi_wrapper.py +++ b/pywb/framework/test/test_wsgi_wrapper.py @@ -22,7 +22,7 @@ class TestCustomErrApp: def initer(app_class): - def init(): + def init(config=None): return app_class() return init diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 837a7c74..3729a660 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -112,7 +112,7 @@ DEFAULT_CONFIG_FILE = 'config.yaml' #================================================================= -def init_app(init_func, load_yaml=True, config_file=None): +def init_app(init_func, load_yaml=True, config_file=None, config={}): logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG) logging.debug('') @@ -129,9 +129,7 @@ def init_app(init_func, load_yaml=True, config_file=None): config = load_yaml_config(config_file) - wb_router = init_func(config) - else: - wb_router = init_func() + wb_router = init_func(config) except: msg = '*** pywb app init FAILED config from "%s"!\n' logging.exception(msg, init_func.__name__) @@ -146,17 +144,8 @@ def init_app(init_func, load_yaml=True, 
config_file=None): #================================================================= def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover from wsgiref.simple_server import make_server - from optparse import OptionParser - opt = OptionParser('%prog [OPTIONS]') - opt.add_option('-p', '--port', type='int', default=None) - - options, args = opt.parse_args() - - port = options.port - - if not port: - port = the_app.port + port = the_app.port if not port: if default_port: diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index de137ae3..fbda24f4 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -5,6 +5,7 @@ Fetch a url from live web and apply rewriting rules import requests import datetime import mimetypes +import logging from urlparse import urlsplit @@ -19,24 +20,11 @@ from pywb.rewrite.rewrite_content import RewriteContent #================================================================= class LiveRewriter(object): - PROXY_HEADER_LIST = [('HTTP_USER_AGENT', 'User-Agent'), - ('HTTP_ACCEPT', 'Accept'), - ('HTTP_ACCEPT_LANGUAGE', 'Accept-Language'), - ('HTTP_ACCEPT_CHARSET', 'Accept-Charset'), - ('HTTP_ACCEPT_ENCODING', 'Accept-Encoding'), - ('HTTP_RANGE', 'Range'), - ('HTTP_CACHE_CONTROL', 'Cache-Control'), - ('HTTP_X_REQUESTED_WITH', 'X-Requested-With'), - ('HTTP_X_CSRF_TOKEN', 'X-CSRF-Token'), - ('HTTP_PE_TOKEN', 'PE-Token'), - ('HTTP_COOKIE', 'Cookie'), - ('CONTENT_TYPE', 'Content-Type'), - ('CONTENT_LENGTH', 'Content-Length'), - ('REL_REFERER', 'Referer'), - ] - - def __init__(self, defmod=''): + def __init__(self, defmod='', default_proxy=None): self.rewriter = RewriteContent(defmod=defmod) + self.default_proxy = default_proxy + if self.default_proxy: + logging.debug('Live Rewrite via proxy ' + self.default_proxy) def fetch_local_file(self, uri): fh = open(uri) @@ -89,6 +77,10 @@ class LiveRewriter(object): method = 'GET' data = None + if not proxies and self.default_proxy: + proxies = 
{'http': self.default_proxy, + 'https': self.default_proxy} + if env is not None: method = env['REQUEST_METHOD'].upper() input_ = env['wsgi.input'] diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index a69cf8e9..6b1d69e3 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -9,7 +9,7 @@ from replay_views import RewriteLiveView #================================================================= class RewriteHandler(WbUrlHandler): - def __init__(self, config=dict(framed_replay=True)): + def __init__(self, config): self.rewrite_view = RewriteLiveView(config) def __call__(self, wbrequest): @@ -17,8 +17,8 @@ class RewriteHandler(WbUrlHandler): #================================================================= -def create_live_rewriter_app(): - routes = [Route('rewrite', RewriteHandler()), +def create_live_rewriter_app(config={}): + routes = [Route('rewrite', RewriteHandler(config)), Route('static/default', StaticHandler('pywb/static/')) ] diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 8cc14b7d..9cc0aa6a 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -88,7 +88,9 @@ class RewriteLiveView(BaseContentView): def __init__(self, config): super(RewriteLiveView, self).__init__(config) - self.rewriter = LiveRewriter(defmod=self._mp_mod) + default_proxy = config.get('proxyhostport') + self.rewriter = LiveRewriter(defmod=self._mp_mod, + default_proxy=default_proxy) def render_content(self, wbrequest, *args): head_insert_func = self.head_insert_view.create_insert_func(wbrequest) diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py index b2a6dada..ca79c828 100644 --- a/tests/test_live_rewriter.py +++ b/tests/test_live_rewriter.py @@ -4,7 +4,8 @@ import webtest class TestLiveRewriter: def setup(self): - self.app = init_app(create_live_rewriter_app, load_yaml=False) + self.app = init_app(create_live_rewriter_app, load_yaml=False, + 
config=dict(framed_replay=True)) self.testapp = webtest.TestApp(self.app) def test_live_rewrite_1(self): From b785cd6f08556368a7643e02a0e771b543e7554e Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 15:43:39 -0700 Subject: [PATCH 10/26] memento: use mp_ modifier to support memento with frame or non-frame replay change memento test to use frame replay --- pywb/framework/memento.py | 6 ++++-- tests/test_config_memento.yaml | 3 ++- tests/test_memento.py | 26 +++++++++++++------------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index c9981a80..4b3eecc1 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -117,6 +117,7 @@ def make_memento_link(cdx, prefix, datetime=None, rel='memento', end=',\n'): memento = '<{0}>; rel="{1}"; datetime="{2}"' + end string = WbUrl.to_wburl_str(url=cdx['original'], + mod='mp_', timestamp=cdx['timestamp'], type=WbUrl.REPLAY) @@ -140,7 +141,8 @@ def make_timemap(wbrequest, cdx_lines): # timemap link timemap = ('<{0}>; rel="self"; ' + 'type="application/link-format"; from="{1}",\n') - yield timemap.format(prefix + wbrequest.wb_url.to_str(), from_date) + yield timemap.format(prefix + wbrequest.wb_url.to_str(), + from_date) # original link original = '<{0}>; rel="original",\n' @@ -148,7 +150,7 @@ def make_timemap(wbrequest, cdx_lines): # timegate link timegate = '<{0}>; rel="timegate",\n' - yield timegate.format(prefix + url) + yield timegate.format(prefix + 'mp_/' + url) # first memento link yield make_memento_link(first_cdx, prefix, diff --git a/tests/test_config_memento.yaml b/tests/test_config_memento.yaml index c17dabd9..003a3145 100644 --- a/tests/test_config_memento.yaml +++ b/tests/test_config_memento.yaml @@ -17,4 +17,5 @@ enable_http_proxy: true # enable cdx server api for timemap enable_cdx_api: true - +# test memento with framed replay +framed_replay: true diff --git a/tests/test_memento.py b/tests/test_memento.py index 
42840e7e..f697f1bc 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -34,7 +34,7 @@ class TestWb: """ TimeGate with no Accept-Datetime header """ - resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css') + resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_css/2013.1/screen.css') assert resp.status_int == 302 @@ -46,7 +46,7 @@ class TestWb: assert MEMENTO_DATETIME not in resp.headers - assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] + assert '/pywb/20140127171239mp_/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] def test_timegate_accept_datetime(self): @@ -54,7 +54,7 @@ class TestWb: TimeGate with Accept-Datetime header """ headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'} - resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css', headers=headers) + resp = self.testapp.get('/pywb/mp_/http://www.iana.org/_css/2013.1/screen.css', headers=headers) assert resp.status_int == 302 @@ -67,7 +67,7 @@ class TestWb: assert MEMENTO_DATETIME not in resp.headers - assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] + assert '/pywb/20140126200804mp_/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] def test_non_timegate_intermediate_redir(self): @@ -76,7 +76,7 @@ class TestWb: """ headers = {ACCEPT_DATETIME: 'Sun, 26 Jan 2014 20:08:04'} # not a timegate, partial timestamp /2014/ present - resp = self.testapp.get('/pywb/2014/http://www.iana.org/_css/2013.1/screen.css', headers=headers) + resp = self.testapp.get('/pywb/2014mp_/http://www.iana.org/_css/2013.1/screen.css', headers=headers) assert resp.status_int == 302 @@ -90,14 +90,14 @@ class TestWb: # redirect to latest, not negotiation via Accept-Datetime - assert '/pywb/20140127171239/' in resp.headers['Location'] + assert '/pywb/20140127171239mp_/' in resp.headers['Location'] def test_memento_url(self): """ Memento 
response, 200 capture """ - resp = self.testapp.get('/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css') + resp = self.testapp.get('/pywb/20140126200804mp_/http://www.iana.org/_css/2013.1/screen.css') assert resp.status_int == 200 @@ -105,7 +105,7 @@ class TestWb: links = self.get_links(resp) assert '; rel="original"' in links - assert '; rel="timegate"' in links + assert '; rel="timegate"' in links assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links assert resp.headers[MEMENTO_DATETIME] == 'Sun, 26 Jan 2014 20:08:04 GMT' @@ -115,7 +115,7 @@ class TestWb: """ Memento (capture) of a 302 response """ - resp = self.testapp.get('/pywb/20140128051539/http://www.iana.org/domains/example') + resp = self.testapp.get('/pywb/20140128051539mp_/http://www.iana.org/domains/example') assert resp.status_int == 302 @@ -123,7 +123,7 @@ class TestWb: links = self.get_links(resp) assert '; rel="original"' in links - assert '; rel="timegate"' in links + assert '; rel="timegate"' in links assert self.make_timemap_link('http://www.iana.org/domains/example') in links assert resp.headers[MEMENTO_DATETIME] == 'Tue, 28 Jan 2014 05:15:39 GMT' @@ -147,12 +147,12 @@ rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT" assert lines[1] == '; rel="original",' - assert lines[2] == '; rel="timegate",' + assert lines[2] == '; rel="timegate",' - assert lines[3] == '; \ + assert lines[3] == '; \ rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT",' - assert lines[4] == '; \ + assert lines[4] == '; \ rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"' def test_timemap_2(self): From 6da27789eb5cec817e34638f8ecb639a64725411 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 16:36:49 -0700 Subject: [PATCH 11/26] live handler: allow live rewrite handler to be specified as one of the collections in pywb by setting index_paths to '$liveweb'.
When used, creates a RewriteHandler instead of WBHandler Can also specify 'proxyhostport' to set the live rewrite to go through a proxy fallback: allow fallback to a different handler (usually live rewrite) by specifying 'redir_fallback' with name of handler. Instead of 404, a not found response will internally call the fallback handler to get a response --- pywb/rewrite/rewrite_live.py | 5 ++- pywb/webapp/cdx_api_handler.py | 2 +- pywb/webapp/handlers.py | 31 ++++++++++++++++--- pywb/webapp/live_rewrite_handler.py | 3 ++ pywb/webapp/pywb_init.py | 48 +++++++++++++++++++++-------- 5 files changed, 69 insertions(+), 20 deletions(-) diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index fbda24f4..b81b0144 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -25,6 +25,8 @@ class LiveRewriter(object): self.default_proxy = default_proxy if self.default_proxy: logging.debug('Live Rewrite via proxy ' + self.default_proxy) + else: + logging.debug('Live Rewrite Direct (no proxy)') def fetch_local_file(self, uri): fh = open(uri) @@ -148,7 +150,8 @@ class LiveRewriter(object): 'timestamp': timestamp, 'original': url, 'statuscode': status_headers.get_statuscode(), - 'mimetype': status_headers.get_header('Content-Type') + 'mimetype': status_headers.get_header('Content-Type'), + 'is_live': True, } result = (self.rewriter. 
diff --git a/pywb/webapp/cdx_api_handler.py b/pywb/webapp/cdx_api_handler.py index e3e16a72..659e6048 100644 --- a/pywb/webapp/cdx_api_handler.py +++ b/pywb/webapp/cdx_api_handler.py @@ -25,7 +25,7 @@ class CDXAPIHandler(BaseHandler): return WbResponse.text_stream(cdx_iter) def __str__(self): - return 'CDX Handler: ' + str(self.index_handler) + return 'CDX Index Handler' @staticmethod def extract_params_from_wsgi_env(env): diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 2299d2e1..8ebe5ec2 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -14,7 +14,7 @@ from pywb.framework.wbrequestresponse import WbResponse #================================================================= class WBHandler(WbUrlHandler): def __init__(self, index_reader, replay, - search_view=None, config=None): + search_view=None, config=None, handler_dict=None): self.index_reader = index_reader @@ -22,24 +22,45 @@ class WBHandler(WbUrlHandler): self.search_view = search_view + self.fallback_handler = None + + if handler_dict: + fallback = config.get('redir_fallback') + if fallback: + self.fallback_handler = handler_dict.get(fallback) + def __call__(self, wbrequest): if wbrequest.wb_url_str == '/': return self.render_search_page(wbrequest) - with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: - response = self.index_reader.load_for_request(wbrequest) + try: + with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: + response = self.index_reader.load_for_request(wbrequest) + except NotFoundException as nfe: + return self.handle_not_found(wbrequest, nfe) if isinstance(response, WbResponse): return response - cdx_lines = response[0] - cdx_callback = response[1] + cdx_lines, cdx_callback = response + return self.handle_replay(wbrequest, cdx_lines, cdx_callback) + def handle_replay(self, wbrequest, cdx_lines, cdx_callback): with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: return self.replay(wbrequest, cdx_lines, cdx_callback) + def 
handle_not_found(self, wbrequest, nfe): + if (not self.fallback_handler or + wbrequest.wb_url.is_query() or + wbrequest.wb_url.is_identity): + raise + + return self.fallback_handler(wbrequest) + #new_url = (self.redir_fallback + wbrequest.wb_url.to_str(timestamp='')) + #return WbResponse.redir_response(new_url) + def render_search_page(self, wbrequest, **kwargs): if self.search_view: return self.search_view.render_response(wbrequest=wbrequest, diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index 6b1d69e3..d2af7028 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -15,6 +15,9 @@ class RewriteHandler(WbUrlHandler): def __call__(self, wbrequest): return self.rewrite_view(wbrequest) + def __str__(self): + return 'Live Web Rewrite Handler' + #================================================================= def create_live_rewriter_app(config={}): diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index b3ff1448..ffa2101b 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -13,6 +13,7 @@ from views import J2TemplateView, add_env_globals from views import J2HtmlCapturesView, HeadInsertView from replay_views import ReplayView +from live_rewrite_handler import RewriteHandler from query_handler import QueryHandler from handlers import WBHandler @@ -61,7 +62,7 @@ class DictChain: #================================================================= -def create_wb_handler(query_handler, config): +def create_wb_handler(query_handler, config, handler_dict={}): cookie_maker = config.get('cookie_maker') record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) @@ -88,29 +89,40 @@ def create_wb_handler(query_handler, config): replayer, search_view=search_view, config=config, + handler_dict=handler_dict, ) return wb_handler #================================================================= -def init_collection(value, config): +def create_live_handler(config): + 
live_handler = RewriteHandler(config) + return live_handler + + +#================================================================= +def init_route_config(value, config): if isinstance(value, str): - value = {'index_paths': value} + value = dict(index_paths=value) route_config = DictChain(value, config) + return route_config + +#================================================================= +def init_collection(route_config): ds_rules_file = route_config.get('domain_specific_rules', None) html_view = (J2HtmlCapturesView. - create_template(config.get('query_html'), + create_template(route_config.get('query_html'), 'Captures Page')) query_handler = QueryHandler.init_from_config(route_config, ds_rules_file, html_view) - return route_config, query_handler + return query_handler #================================================================= @@ -139,8 +151,8 @@ def create_cdx_server_app(passed_config): routes = [] for name, value in collections.iteritems(): - result = init_collection(value, config) - route_config, query_handler = result + route_config = init_route_config(value, config) + query_handler = init_collection(route_config) cdx_api_suffix = route_config.get('enable_cdx_api', True) @@ -173,23 +185,33 @@ def create_wb_router(passed_config={}): else: request_class = WbRequest - #if config.get('use_lxml_parser', False): - # use_lxml_parser() + # store live and replay handlers + handler_dict = {} for name, value in collections.iteritems(): - if isinstance(value, BaseHandler): + handler_dict[name] = value routes.append(Route(name, value)) continue - result = init_collection(value, config) - route_config, query_handler = result + route_config = init_route_config(value, config) + + if route_config.get('index_paths') == '$liveweb': + live = create_live_handler(route_config) + handler_dict[name] = live + routes.append(Route(name, live)) + continue + + query_handler = init_collection(route_config) wb_handler = create_wb_handler( query_handler=query_handler, - 
config=route_config + config=route_config, + handler_dict=handler_dict, ) + handler_dict[name] = wb_handler + logging.debug('Adding Collection: ' + name) route_class = route_config.get('route_class', Route) From 3be27630811aa30fd1dde4a17ede8dd6c5395532 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 16:45:14 -0700 Subject: [PATCH 12/26] handlers: change 'redir_fallback' to 'fallback' as no redirect happens, fallback called internally --- pywb/webapp/handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 8ebe5ec2..4a8466d7 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -25,7 +25,7 @@ class WBHandler(WbUrlHandler): self.fallback_handler = None if handler_dict: - fallback = config.get('redir_fallback') + fallback = config.get('fallback') if fallback: self.fallback_handler = handler_dict.get(fallback) From aa0bc86543438b51dd4680ffc349a5553cffaf21 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 16:45:44 -0700 Subject: [PATCH 13/26] cdxindexer: when indexing entire dir, only look at files with ext .warc.gz, .warc, .arc.gz, .arc files and skip the rest. 
(Files with other ext may be specified explicitly) --- pywb/warc/cdxindexer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index 0bdebee2..dd7f08a7 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -77,6 +77,8 @@ class SortedCDXWriter(CDXWriter): return False +ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz') + #================================================================= def iter_file_or_dir(inputs): for input_ in inputs: @@ -84,12 +86,13 @@ def iter_file_or_dir(inputs): yield input_, os.path.basename(input_) else: for filename in os.listdir(input_): - yield os.path.join(input_, filename), filename + if filename.endswith(ALLOWED_EXT): + yield os.path.join(input_, filename), filename #================================================================= def remove_ext(filename): - for ext in ('.arc', '.arc.gz', '.warc', '.warc.gz'): + for ext in ALLOWED_EXT: if filename.endswith(ext): filename = filename[:-len(ext)] break From fcbc2c29667fc24156f21997e6130f9d13f7d3a1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 16:46:30 -0700 Subject: [PATCH 14/26] replay ui: improvements to framed replay messages. 'is_live' added to live rewrite to allow for different message for live replay vs archived replay to be used. When using framed replay, default initial message to 'Loading...' default index.html: list non-replay access points in default home page --- pywb/static/wb.js | 51 +++++++++++++++++++++++++++++---------- pywb/ui/frame_insert.html | 27 ++++++++++++++------- pywb/ui/head_insert.html | 1 + pywb/ui/index.html | 11 +++++++++ 4 files changed, 68 insertions(+), 22 deletions(-) diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 84588324..81d40f42 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -19,6 +19,12 @@ This file is part of pywb. 
_wb_js = (function() { + +var labels = {LOADING_MSG: "Loading...", + REPLAY_MSG: "This is an archived page from ", + LIVE_MSG: "This is a live page just fetched on "}; + + function init_banner() { var PLAIN_BANNER_ID = "_wb_plain_banner"; var FRAME_BANNER_ID = "_wb_frame_top_banner"; @@ -40,19 +46,33 @@ function init_banner() { var banner = document.getElementById(bid); - if (!banner) { - banner = document.createElement("wb_div"); - banner.setAttribute("id", bid); - banner.setAttribute("lang", "en"); - - text = "This is an archived page "; - if (wbinfo && wbinfo.capture_str) { - text += " from " + wbinfo.capture_str + ""; - } - banner.innerHTML = text; - - document.body.insertBefore(banner, document.body.firstChild); + if (banner) { + return; } + + banner = document.createElement("wb_div"); + banner.setAttribute("id", bid); + banner.setAttribute("lang", "en"); + + var text; + + if (wbinfo.is_frame) { + text = labels.LOADING_MSG; + } else if (wbinfo.is_live) { + text = labels.LIVE_MSG; + } else { + text = labels.REPLAY_MSG; + } + + text = "" + text + ""; + + var capture_str = (wbinfo ? 
wbinfo.capture_str : ""); + + text += "" + capture_str + ""; + + banner.innerHTML = text; + + document.body.insertBefore(banner, document.body.firstChild); } function add_event(name, func, object) { @@ -105,7 +125,10 @@ function notify_top(event) { } if (window.top.update_wb_url) { - window.top.update_wb_url(window.WB_wombat_location.href, wbinfo.timestamp, wbinfo.capture_str); + window.top.update_wb_url(window.WB_wombat_location.href, + wbinfo.timestamp, + wbinfo.capture_str, + wbinfo.is_live); } } @@ -126,4 +149,6 @@ if (wbinfo.is_frame_mp && wbinfo.canon_url && window.location.replace(wbinfo.canon_url); } +return {'labels': labels}; + })(); diff --git a/pywb/ui/frame_insert.html b/pywb/ui/frame_insert.html index d8e7b6d9..19426c40 100644 --- a/pywb/ui/frame_insert.html +++ b/pywb/ui/frame_insert.html @@ -3,7 +3,6 @@ diff --git a/pywb/ui/index.html b/pywb/ui/index.html index 3bbabbe2..3a8ff0c9 100644 --- a/pywb/ui/index.html +++ b/pywb/ui/index.html @@ -9,3 +9,14 @@ The following archive collections are available: {% endif %} {% endfor %} + +Other endpoints in this deployment: + +
    +{% for route in routes %} + {% if not route | is_wb_handler %} +
  • {{ '/' + route.path }} - {{ route | string }}
  • + {% endif %} +{% endfor %} +
+ From ca405ef179b32f11149ae3dba06067d4002736d6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 16:54:41 -0700 Subject: [PATCH 15/26] update version to 0.5.0, update CHANGELIST and README --- CHANGES.rst | 20 ++++++++++++++++++++ README.rst | 2 +- setup.py | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 477bf135..dfe8a531 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,23 @@ +pywb 0.5.0 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* Fixes to memento timemap/timegate to work with framed replay mode. + +* Support for a fallback handler which will be called from a replay handler instead of a 404 response. The handler, specified via the ``fallback`` option +can be the name of any other replay handler. Typically, it can be used with a live rewrite handler to fetch missing content from live instead of showing a 404. + +* Live Rewrite can now be included as a 'collection type' in a pywb deployment by setting index path to ``$liveweb`` + +* ``live-rewrite-server`` has optional ``--proxy host:port`` param to specify a loading live web data through an HTTP/S proxy, such as for use with a +recording proxy. + +* wombat: add document.cookie -> document.WB_wombat_cookie rewriting to check and rewrite Path= to archival url + +* Better parent relative '../' path rewriting, resolved to correct absolute urls when rewritten. Additional testing for parent relative urls. + +* Improved support for proxy mode, allow different collections to be selected via proxy auth + + pywb 0.4.7 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index c7d7cbc7..7a02fd3c 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.4.8 +PyWb 0.5.0 ========== .. 
image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop diff --git a/setup.py b/setup.py index 8f788bb2..305a432c 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.4.8', + version='0.5.0', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', From 49deb501a6fa75a4d60f84b8787f5a81cac8e4d5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 18:24:08 -0700 Subject: [PATCH 16/26] fallback: better way of setting fallbacks, check for 'resolve_refs' method on the handler and pass handler_dict if one exists --- pywb/webapp/handlers.py | 14 +++++++------- pywb/webapp/pywb_init.py | 10 +++++++--- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 4a8466d7..5c227b9f 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -8,13 +8,15 @@ from pywb.utils.loaders import BlockLoader from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse +import logging + #================================================================= # Standard WB Handler #================================================================= class WBHandler(WbUrlHandler): def __init__(self, index_reader, replay, - search_view=None, config=None, handler_dict=None): + search_view=None, config=None): self.index_reader = index_reader @@ -23,11 +25,11 @@ class WBHandler(WbUrlHandler): self.search_view = search_view self.fallback_handler = None + self.fallback_name = config.get('fallback') - if handler_dict: - fallback = config.get('fallback') - if fallback: - self.fallback_handler = handler_dict.get(fallback) + def resolve_refs(self, handler_dict): + if self.fallback_name: + self.fallback_handler = handler_dict.get(self.fallback_name) def __call__(self, wbrequest): if wbrequest.wb_url_str == '/': @@ -58,8 +60,6 @@ class WBHandler(WbUrlHandler): 
raise return self.fallback_handler(wbrequest) - #new_url = (self.redir_fallback + wbrequest.wb_url.to_str(timestamp='')) - #return WbResponse.redir_response(new_url) def render_search_page(self, wbrequest, **kwargs): if self.search_view: diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index ffa2101b..fb77d211 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -62,7 +62,7 @@ class DictChain: #================================================================= -def create_wb_handler(query_handler, config, handler_dict={}): +def create_wb_handler(query_handler, config): cookie_maker = config.get('cookie_maker') record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) @@ -89,7 +89,6 @@ def create_wb_handler(query_handler, config, handler_dict={}): replayer, search_view=search_view, config=config, - handler_dict=handler_dict, ) return wb_handler @@ -207,7 +206,6 @@ def create_wb_router(passed_config={}): wb_handler = create_wb_handler( query_handler=query_handler, config=route_config, - handler_dict=handler_dict, ) handler_dict[name] = wb_handler @@ -237,6 +235,12 @@ def create_wb_router(passed_config={}): for static_name, static_path in static_routes.iteritems(): routes.append(Route(static_name, StaticHandler(static_path))) + # resolve any cross handler references + for route in routes: + if hasattr(route.handler, 'resolve_refs'): + route.handler.resolve_refs(handler_dict) + + # Check for new proxy mode! 
if config.get('enable_http_proxy', False): router = ProxyArchivalRouter From e4297ddabec30f805f516b8fa1e6ee786dd18015 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 18:25:47 -0700 Subject: [PATCH 17/26] tests: add integration tests for $liveweb rewrite handler and replay with fallback --- tests/test_config.yaml | 8 ++++++++ tests/test_integration.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/tests/test_config.yaml b/tests/test_config.yaml index bbb96849..c1e562f8 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -30,6 +30,14 @@ collections: index_paths: './sample_archive/non-surt-cdx/' surt_ordered: false + # live collection + live: $liveweb + + # coll with fallback + pywb-fallback: + index_paths: ./sample_archive/cdx/ + fallback: live + # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # SURT keys are recommended for future indices, but non-SURT cdxs diff --git a/tests/test_integration.py b/tests/test_integration.py index 9427e2af..4532b0d6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -259,6 +259,14 @@ class TestWb: resp = self.testapp.get('/pywb/20140603030351mp_/http://example.com?example=3', status = 503) assert resp.status_int == 503 + def test_live_frame(self): + resp = self.testapp.get('/live/mp_/http://example.com/?test=test') + assert resp.status_int == 200 + + def test_live_fallback(self): + resp = self.testapp.get('/pywb-fallback/mp_/http://example.com/?test=test') + assert resp.status_int == 200 + def test_post_1(self): resp = self.testapp.post('/pywb/mp_/httpbin.org/post', {'foo': 'bar', 'test': 'abc'}) From fa813bdd1995c4d5d6e134728b8df78ec5dbeb2b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 18:26:16 -0700 Subject: [PATCH 18/26] pep8 cleanup pass --- pywb/apps/live_rewrite_server.py | 2 +- pywb/framework/archivalrouter.py | 1 - pywb/framework/proxy.py | 3 ++- pywb/rewrite/rewrite_content.py | 2 +- pywb/rewrite/url_rewriter.py | 
3 --- pywb/warc/archiveiterator.py | 6 ++++-- pywb/warc/cdxindexer.py | 2 ++ pywb/webapp/replay_views.py | 2 +- 8 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pywb/apps/live_rewrite_server.py b/pywb/apps/live_rewrite_server.py index 8d3544f3..8de26f10 100644 --- a/pywb/apps/live_rewrite_server.py +++ b/pywb/apps/live_rewrite_server.py @@ -18,7 +18,7 @@ def create_app(): result, unknown = parser.parse_known_args() - config=dict(proxyhostport=result.proxy, framed_replay=True) + config = dict(proxyhostport=result.proxy, framed_replay=True) app = init_app(create_live_rewriter_app, load_yaml=False, config=config) diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 1b027488..749654ba 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -72,7 +72,6 @@ class ArchivalRouter(object): return wbrequest - def render_home_page(self, env): # render the homepage! if self.home_view: diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index 21bb65b5..bdf5753f 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -56,7 +56,8 @@ class ProxyRouter(object): self.auth_msg = routing_options.get('auth_msg', 'Please enter name of a collection to use for proxy mode') - self.proxy_coll_select = routing_options.get('proxy_coll_select', False) + self.proxy_coll_select = routing_options.get('proxy_coll_select', + False) def __call__(self, env): url = env['REL_REQUEST_URI'] diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 01de8d2d..ec93593a 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -160,7 +160,7 @@ class RewriteContent: first_buff=first_buff) @staticmethod - def _decode_buff(buff, stream, encoding): # pragma: no coverage + def _decode_buff(buff, stream, encoding): # pragma: no coverage try: buff = buff.decode(encoding) except UnicodeDecodeError, e: diff --git a/pywb/rewrite/url_rewriter.py 
b/pywb/rewrite/url_rewriter.py index 70d5d2a8..d5593a22 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -81,8 +81,6 @@ class UrlRewriter(object): if new_url.startswith(self.prefix): new_url = new_url[len(self.prefix):] - #new_wburl = copy.copy(self.wburl) - #new_wburl.url = new_url new_wburl = WbUrl(new_url) return UrlRewriter(new_wburl, self.prefix) @@ -122,7 +120,6 @@ class UrlRewriter(object): parts = (scheme, netloc, path, query, frag) - new_url = urlparse.urlunsplit(parts) return new_url diff --git a/pywb/warc/archiveiterator.py b/pywb/warc/archiveiterator.py index 5d4c521b..6e9488e9 100644 --- a/pywb/warc/archiveiterator.py +++ b/pywb/warc/archiveiterator.py @@ -215,7 +215,8 @@ def create_record_iter(arcv_iter, options): not append_post): continue - elif (not include_all and record.content_type == 'application/warc-fields'): + elif (not include_all and + record.content_type == 'application/warc-fields'): continue entry = parse_warc_record(record) @@ -226,7 +227,8 @@ def create_record_iter(arcv_iter, options): continue if entry.url and not entry.key: - entry.key = canonicalize(entry.url, options.get('surt_ordered', True)) + entry.key = canonicalize(entry.url, + options.get('surt_ordered', True)) compute_digest = False diff --git a/pywb/warc/cdxindexer.py b/pywb/warc/cdxindexer.py index dd7f08a7..585b5711 100644 --- a/pywb/warc/cdxindexer.py +++ b/pywb/warc/cdxindexer.py @@ -77,8 +77,10 @@ class SortedCDXWriter(CDXWriter): return False +#================================================================= ALLOWED_EXT = ('.arc', '.arc.gz', '.warc', '.warc.gz') + #================================================================= def iter_file_or_dir(inputs): for input_ in inputs: diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 9cc0aa6a..ee88219d 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -72,7 +72,7 @@ class BaseContentView(object): embed_url = 
wbrequest.wb_url.to_str(mod=self._mp_mod) timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) url = wbrequest.wb_url.url - ctype='text/html' + ctype = 'text/html' return self.frame_insert_view.render_response(embed_url=embed_url, wbrequest=wbrequest, From 221cf701f2dc3f1388ddd72061b534af2c726070 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 20 Jul 2014 19:07:56 -0700 Subject: [PATCH 19/26] Fix spacing in CHANGES.rst --- CHANGES.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index dfe8a531..43f16d5f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,13 +3,13 @@ pywb 0.5.0 changelist * Fixes to memento timemap/timegate to work with framed replay mode. -* Support for a fallback handler which will be called from a replay handler instead of a 404 response. The handler, specified via the ``fallback`` option -can be the name of any other replay handler. Typically, it can be used with a live rewrite handler to fetch missing content from live instead of showing a 404. +* Support for a fallback handler which will be called from a replay handler instead of a 404 response. + + The handler, specified via the ``fallback`` option, can be the name of any other replay handler. Typically, it can be used with a live rewrite handler to fetch missing content from live instead of showing a 404. * Live Rewrite can now be included as a 'collection type' in a pywb deployment by setting index path to ``$liveweb`` -* ``live-rewrite-server`` has optional ``--proxy host:port`` param to specify a loading live web data through an HTTP/S proxy, such as for use with a -recording proxy. +* ``live-rewrite-server`` has optional ``--proxy host:port`` param to specify a loading live web data through an HTTP/S proxy, such as for use with a recording proxy. 
* wombat: add document.cookie -> document.WB_wombat_cookie rewriting to check and rewrite Path= to archival url From a2973b04e7e13238448fb6aa3da6c4cb982f17bf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Jul 2014 14:02:31 -0700 Subject: [PATCH 20/26] wbrequest: add options dictionary to store misc request options --- pywb/framework/memento.py | 10 ++++------ pywb/framework/wbrequestresponse.py | 7 +++---- pywb/webapp/handlers.py | 2 -- pywb/webapp/replay_views.py | 11 +++++------ 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/pywb/framework/memento.py b/pywb/framework/memento.py index 4b3eecc1..d7221adb 100644 --- a/pywb/framework/memento.py +++ b/pywb/framework/memento.py @@ -11,15 +11,13 @@ LINK_FORMAT = 'application/link-format' #================================================================= class MementoReqMixin(object): def _parse_extra(self): - self.is_timegate = False - if not self.wb_url: return if self.wb_url.type != self.wb_url.LATEST_REPLAY: return - self.is_timegate = True + self.options['is_timegate'] = True accept_datetime = self.env.get('HTTP_ACCEPT_DATETIME') if not accept_datetime: @@ -48,7 +46,7 @@ class MementoRespMixin(object): if not wbrequest or not wbrequest.wb_url: return - is_timegate = wbrequest.is_timegate + is_timegate = wbrequest.options.get('is_timegate', False) if is_timegate: self.status_headers.headers.append(('Vary', 'accept-datetime')) @@ -59,7 +57,7 @@ class MementoRespMixin(object): is_memento = False # otherwise, if in proxy mode, then always a memento - elif wbrequest.is_proxy: + elif wbrequest.options['is_proxy']: is_memento = True # otherwise only for replay @@ -80,7 +78,7 @@ class MementoRespMixin(object): link.append(self.make_link(req_url, 'original')) # for now, include timemap only in non-proxy mode - if not wbrequest.is_proxy and (is_memento or is_timegate): + if not wbrequest.options['is_proxy'] and (is_memento or is_timegate): link.append(self.make_timemap_link(wbrequest)) if 
is_memento and not is_timegate: diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 85ff2eb8..0f1a9f32 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -78,12 +78,11 @@ class WbRequest(object): self.referrer = env.get('HTTP_REFERER') - self.is_ajax = self._is_ajax() + self.options = dict() + self.options['is_ajax'] = self._is_ajax() + self.options['is_proxy'] = is_proxy self.query_filter = [] - - self.is_proxy = is_proxy - self.custom_params = {} # PERF diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 5c227b9f..ecc477bc 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -8,8 +8,6 @@ from pywb.utils.loaders import BlockLoader from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -import logging - #================================================================= # Standard WB Handler diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index ee88219d..c113d733 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -66,8 +66,9 @@ class BaseContentView(object): # render top level frame if in frame mode # (not supported in proxy mode) if (self.is_frame_mode and - not wbrequest.is_proxy and - not wbrequest.wb_url.mod): + not wbrequest.wb_url.mod and + not wbrequest.options['is_proxy'] and + not wbrequest.options.get('is_timegate', False)): embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod) timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) @@ -259,12 +260,10 @@ class ReplayView(BaseContentView): return content def _redirect_if_needed(self, wbrequest, cdx): - if wbrequest.is_proxy: + if wbrequest.options['is_proxy']: return None - # todo: generalize this? 
- redir_needed = (hasattr(wbrequest, 'is_timegate') and - wbrequest.is_timegate) + redir_needed = (wbrequest.options.get('is_timegate', False)) if not redir_needed and self.redir_to_exact: redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp) From 950673908dc0eeb6fc13992378d87926ddc2eac7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Jul 2014 14:41:49 -0700 Subject: [PATCH 21/26] proxy: fix wombat.js to work in proxy mode! rewrite only https -> http --- pywb/static/wombat.js | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 9c6ba1c0..e14c9d7d 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -121,6 +121,15 @@ WB_wombat_init = (function() { return url; } + // proxy mode: If no wb_replay_prefix, only rewrite https:// -> http:// + if (!wb_replay_prefix) { + if (starts_with(url, HTTPS_PREFIX)) { + return HTTP_PREFIX + url.substr(HTTPS_PREFIX.length); + } else { + return url; + } + } + // just in case wombat reference made it into url! 
url = url.replace("WB_wombat_", ""); @@ -181,6 +190,11 @@ WB_wombat_init = (function() { return ""; } + // proxy mode: no extraction needed + if (!wb_replay_prefix) { + return href; + } + href = href.toString(); var index = href.indexOf("/http", 1); @@ -683,20 +697,22 @@ WB_wombat_init = (function() { //============================================ function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp) { wb_replay_prefix = replay_prefix; - - wb_replay_date_prefix = replay_prefix + capture_date + "em_/"; - if (capture_date.length > 0) { - wb_capture_date_part = "/" + capture_date + "/"; - } else { - wb_capture_date_part = ""; + if (wb_replay_prefix) { + wb_replay_date_prefix = replay_prefix + capture_date + "em_/"; + + if (capture_date.length > 0) { + wb_capture_date_part = "/" + capture_date + "/"; + } else { + wb_capture_date_part = ""; + } + + wb_orig_scheme = orig_scheme + '://'; + + wb_orig_host = wb_orig_scheme + orig_host; + + init_bad_prefixes(replay_prefix); } - - wb_orig_scheme = orig_scheme + '://'; - - wb_orig_host = wb_orig_scheme + orig_host; - - init_bad_prefixes(replay_prefix); // Location var wombat_location = new WombatLocation(window.self.location); From 7c573453634d7abcc682ebca5e6de1abfa5ec6e7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Jul 2014 16:42:14 -0700 Subject: [PATCH 22/26] proxy: add 'unaltered_replay' option to proxy_options to replay all content unaltered (no rewriting html, no banner, no wombat) use 'proxy_options' instead of 'routing_options', add additional tests for proxy mode --- config.yaml | 8 +++++--- pywb/framework/proxy.py | 30 ++++++++++++++++++++---------- pywb/static/wb.js | 2 +- pywb/webapp/pywb_init.py | 2 +- tests/test_config.yaml | 6 ++++++ tests/test_config_memento.yaml | 4 ++++ tests/test_integration.py | 10 +++++++++- 7 files changed, 46 insertions(+), 16 deletions(-) diff --git a/config.yaml b/config.yaml index 91051b81..937b4545 100644 --- a/config.yaml +++ b/config.yaml 
@@ -91,9 +91,11 @@ static_routes: # Enable simple http proxy mode enable_http_proxy: true -# additional options for routing -routing_options: - proxy_coll_select: false +# Additional proxy options (defaults) +#proxy_options: +# use_default_coll: true +# +# unaltered_replay: false # enable cdx server api for querying cdx directly (experimental) enable_cdx_api: true diff --git a/pywb/framework/proxy.py b/pywb/framework/proxy.py index bdf5753f..62bc06b0 100644 --- a/pywb/framework/proxy.py +++ b/pywb/framework/proxy.py @@ -43,21 +43,23 @@ class ProxyRouter(object): See: http://www.mementoweb.org/guide/rfc/#Pattern1.3 for more details. """ + def __init__(self, routes, **kwargs): self.routes = routes self.hostpaths = kwargs.get('hostpaths') self.error_view = kwargs.get('error_view') - routing_options = kwargs.get('routing_options') - if not routing_options: - routing_options = {} + proxy_options = kwargs.get('config', {}) + if proxy_options: + proxy_options = proxy_options.get('proxy_options', {}) - self.auth_msg = routing_options.get('auth_msg', + self.auth_msg = proxy_options.get('auth_msg', 'Please enter name of a collection to use for proxy mode') - self.proxy_coll_select = routing_options.get('proxy_coll_select', - False) + self.use_default_coll = proxy_options.get('use_default_coll', True) + + self.unaltered = proxy_options.get('unaltered_replay', False) def __call__(self, env): url = env['REL_REQUEST_URI'] @@ -76,11 +78,12 @@ class ProxyRouter(object): if proxy_auth: proxy_coll = self.read_basic_auth_coll(proxy_auth) - proxy_coll = '/' + proxy_coll + '/' if not proxy_coll: return self.proxy_auth_coll_response() + proxy_coll = '/' + proxy_coll + '/' + for r in self.routes: matcher, c = r.is_handling(proxy_coll) if matcher: @@ -91,12 +94,16 @@ class ProxyRouter(object): if not route: return self.proxy_auth_coll_response() - elif self.proxy_coll_select: - return self.proxy_auth_coll_response() - else: + # if 'use_default_coll' or only one collection, use that + # 
for proxy mode + elif self.use_default_coll or len(self.routes) == 1: route = self.routes[0] coll = self.routes[0].regex.pattern + # otherwise, require proxy auth 407 to select collection + else: + return self.proxy_auth_coll_response() + wbrequest = route.request_class(env, request_uri=url, wb_url_str=url, @@ -110,6 +117,9 @@ class ProxyRouter(object): if matcher: route.apply_filters(wbrequest, matcher) + if self.unaltered: + wbrequest.wb_url.mod = 'id_' + return route.handler(wbrequest) # Proxy Auto-Config (PAC) script for the proxy diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 81d40f42..0244cde8 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -22,7 +22,7 @@ _wb_js = (function() { var labels = {LOADING_MSG: "Loading...", REPLAY_MSG: "This is an archived page from ", - LIVE_MSG: "This is a live page just fetched on "}; + LIVE_MSG: "This is a live page loaded on "}; function init_banner() { diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index fb77d211..e17a9485 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -265,5 +265,5 @@ def create_wb_router(passed_config={}): error_view=J2TemplateView.create_template(config.get('error_html'), 'Error Page'), - routing_options=config.get('routing_options') + config=config ) diff --git a/tests/test_config.yaml b/tests/test_config.yaml index c1e562f8..468a3131 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -106,6 +106,12 @@ static_routes: # Enable simple http proxy mode enable_http_proxy: true +# Additional proxy options (defaults) +proxy_options: + use_default_coll: true + + unaltered_replay: false + # enable cdx server api for querying cdx directly (experimental) #enable_cdx_api: True # or specify suffix diff --git a/tests/test_config_memento.yaml b/tests/test_config_memento.yaml index 003a3145..e8d0eb21 100644 --- a/tests/test_config_memento.yaml +++ b/tests/test_config_memento.yaml @@ -14,6 +14,10 @@ enable_memento: true # Enable simple http 
proxy mode enable_http_proxy: true +# test unaltered replay for proxy as well +proxy_options: + unaltered_replay: true + # enable cdx server api for timemap enable_cdx_api: true diff --git a/tests/test_integration.py b/tests/test_integration.py index 4532b0d6..94ce45cf 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -354,7 +354,7 @@ class TestWb: assert resp.status_int == 407 - def test_proxy_replay_auth_invalid(self): + def test_proxy_replay_auth_invalid_1(self): headers = [('Proxy-Authorization', 'abc' + base64.b64encode('no-such-coll'))] resp = self.testapp.get('/x-ignore-this-x', headers = headers, extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''), @@ -362,6 +362,14 @@ class TestWb: assert resp.status_int == 407 + def test_proxy_replay_auth_invalid_2(self): + headers = [('Proxy-Authorization', 'basic')] + resp = self.testapp.get('/x-ignore-this-x', headers = headers, + extra_environ = dict(REQUEST_URI = 'http://www.iana.org/', SCRIPT_NAME = ''), + status=407) + + assert resp.status_int == 407 + def test_proxy_pac(self): resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080')) assert resp.content_type == 'application/x-ns-proxy-autoconfig' From 84e83658385dabbc26fc8a920644b37f9749bedb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Jul 2014 17:10:13 -0700 Subject: [PATCH 23/26] Update README.rst with usage examples --- README.rst | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 7a02fd3c..632aca5b 100644 --- a/README.rst +++ b/README.rst @@ -11,9 +11,25 @@ pywb is a python implementation of web archival replay tools, sometimes also kno pywb allows high-quality replay (browsing) of archived web data stored in standardized `ARC `_ and `WARC `_. 
-*For an example of deployed service using pywb, please see the https://webrecorder.io project* -pywb Tools +Usage Examples +----------------------------- + +This README contains a basic overview of using pywb. After reading this intro, consider also taking a look at these seperate projects: + +* `pywb-webrecorder `_ demonstrates a way to use pywb and warcprox to record web content while browsing. + +* `pywb-samples `_ provides additional archive samples with difficult-to-replay content. + + +The following deployed applications use pywb: + +* https://perma.cc embeds pywb as part of a larger `open source application `_ to provide web archive replay for law libraries. + +* https://webrecorder.io uses pywb and builds upon pywb-webrecorder to create a hosted web recording and replay system. + + +pywb Tools Overview ----------------------------- In addition to the standard wayback machine (explained further below), pywb tool suite includes a @@ -72,7 +88,7 @@ This process can be done by running the ``cdx-indexer`` script and only needs to Given an archive of warcs at ``myarchive/warcs`` -1. Create a dir for indexs, .eg. ``myarchive/cdx`` +1. Create a dir for indexes, .eg. ``myarchive/cdx`` 2. 
Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each warc/arc file in ``myarchive/warcs`` From b8a17b7cab3da2b50b8b0945b1c428e2ef58d9ce Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Jul 2014 21:25:10 -0700 Subject: [PATCH 24/26] refactor webapp: RewriteLiveHandler and WBHandler share a common base class, SearchPageWbUrlHandler which renders the search page when there is no wburl move some inits from pywb_init to WBHandler itself --- pywb/webapp/handlers.py | 56 +++++++++++++++++++++-------- pywb/webapp/live_rewrite_handler.py | 8 +++-- pywb/webapp/pywb_init.py | 27 +------------- pywb/webapp/replay_views.py | 2 +- pywb/webapp/views.py | 5 ++- 5 files changed, 52 insertions(+), 46 deletions(-) diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index ecc477bc..6228de3e 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -8,19 +8,55 @@ from pywb.utils.loaders import BlockLoader from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse +from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.resolvingloader import ResolvingLoader + +from views import J2TemplateView, add_env_globals +from replay_views import ReplayView + + +#================================================================= +class SearchPageWbUrlHandler(WbUrlHandler): + """ + Loads a default search page html template to be shown when + the wb_url is empty + """ + def __init__(self, config): + self.search_view = (J2TemplateView. 
+ create_template(config.get('search_html'), + 'Search Page')) + + def render_search_page(self, wbrequest, **kwargs): + if self.search_view: + return self.search_view.render_response(wbrequest=wbrequest, + prefix=wbrequest.wb_prefix, + **kwargs) + else: + return WbResponse.text_response('No Lookup Url Specified') + #================================================================= # Standard WB Handler #================================================================= -class WBHandler(WbUrlHandler): - def __init__(self, index_reader, replay, - search_view=None, config=None): +class WBHandler(SearchPageWbUrlHandler): + def __init__(self, query_handler, config=None): + super(WBHandler, self).__init__(config) - self.index_reader = index_reader + self.index_reader = query_handler - self.replay = replay + cookie_maker = config.get('cookie_maker') + record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) - self.search_view = search_view + paths = config.get('archive_paths') + + resolving_loader = ResolvingLoader(paths=paths, + record_loader=record_loader) + + template_globals = config.get('template_globals') + if template_globals: + add_env_globals(template_globals) + + self.replay = ReplayView(resolving_loader, config) self.fallback_handler = None self.fallback_name = config.get('fallback') @@ -59,14 +95,6 @@ class WBHandler(WbUrlHandler): return self.fallback_handler(wbrequest) - def render_search_page(self, wbrequest, **kwargs): - if self.search_view: - return self.search_view.render_response(wbrequest=wbrequest, - prefix=wbrequest.wb_prefix, - **kwargs) - else: - return WbResponse.text_response('No Lookup Url Specified') - def __str__(self): return 'Web Archive Replay Handler' diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index d2af7028..0a89bd4c 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -2,17 +2,21 @@ from pywb.framework.basehandlers import WbUrlHandler from 
pywb.framework.wbrequestresponse import WbResponse from pywb.framework.archivalrouter import ArchivalRouter, Route -from handlers import StaticHandler +from handlers import StaticHandler, SearchPageWbUrlHandler from replay_views import RewriteLiveView #================================================================= -class RewriteHandler(WbUrlHandler): +class RewriteHandler(SearchPageWbUrlHandler): def __init__(self, config): + super(RewriteHandler, self).__init__(config) self.rewrite_view = RewriteLiveView(config) def __call__(self, wbrequest): + if wbrequest.wb_url_str == '/': + return self.render_search_page(wbrequest) + return self.rewrite_view(wbrequest) def __str__(self): diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index e17a9485..2fd02377 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -6,13 +6,9 @@ from pywb.framework.wbrequestresponse import WbRequest from pywb.framework.memento import MementoRequest from pywb.framework.basehandlers import BaseHandler -from pywb.warc.recordloader import ArcWarcRecordLoader -from pywb.warc.resolvingloader import ResolvingLoader - -from views import J2TemplateView, add_env_globals +from views import J2TemplateView from views import J2HtmlCapturesView, HeadInsertView -from replay_views import ReplayView from live_rewrite_handler import RewriteHandler from query_handler import QueryHandler @@ -63,31 +59,10 @@ class DictChain: #================================================================= def create_wb_handler(query_handler, config): - - cookie_maker = config.get('cookie_maker') - record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) - - paths = config.get('archive_paths') - - resolving_loader = ResolvingLoader(paths=paths, - record_loader=record_loader) - - template_globals = config.get('template_globals') - if template_globals: - add_env_globals(template_globals) - - replayer = ReplayView(resolving_loader, config) - - search_view = (J2TemplateView. 
- create_template(config.get('search_html'), - 'Search Page')) - wb_handler_class = config.get('wb_handler_class', WBHandler) wb_handler = wb_handler_class( query_handler, - replayer, - search_view=search_view, config=config, ) diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index c113d733..0b8bb528 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -65,7 +65,7 @@ class BaseContentView(object): def __call__(self, wbrequest, *args): # render top level frame if in frame mode # (not supported in proxy mode) - if (self.is_frame_mode and + if (self.is_frame_mode and wbrequest.wb_url and not wbrequest.wb_url.mod and not wbrequest.options['is_proxy'] and not wbrequest.options.get('is_timegate', False)): diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 903cc818..c49be8c9 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -2,8 +2,6 @@ from pywb.utils.timeutils import timestamp_to_datetime from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import make_timemap, LINK_FORMAT -from handlers import WBHandler - import urlparse import logging @@ -62,7 +60,8 @@ def is_wb_handler(obj): if not hasattr(obj, 'handler'): return False - return isinstance(obj.handler, WBHandler) + #return isinstance(obj.handler, WBHandler) + return obj.handler.__class__.__name__ == "WBHandler" #================================================================= From 2f50a3eafbeb7aacd93ffa859ed9fb6fcda4d9eb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Jul 2014 21:43:40 -0700 Subject: [PATCH 25/26] a few more tweaks to CHANGES.rst --- CHANGES.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 43f16d5f..4c7c3abe 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ pywb 0.5.0 changelist ~~~~~~~~~~~~~~~~~~~~~ +* LiveRewriteHandler and WBHandler refactoring: LiveRewriteHandler now supports a root search page html template. 
+ +* Proxy mode option: 'unaltered_replay' to proxy archival data with no modifications (no banner, no server or client side rewriting) + +* Fix client side rewriting (wombat.js) for proxy mode: only rewrite https -> http in absolute urls. + * Fixes to memento timemap/timegate to work with framed replay mode. * Support for a fallback handler which will be called from a replay handler instead of a 404 response. @@ -15,6 +21,8 @@ pywb 0.5.0 changelist * Better parent relative '../' path rewriting, resolved to correct absolute urls when rewritten. Additional testing for parent relative urls. +* New 'proxy_options' block, including 'use_default_coll' to allow defaulting to first collection w/o proxy auth. + * Improved support for proxy mode, allow different collections to be selected via proxy auth From 0b8a8f0ae2804a5687d7e5144445bace591df4ea Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 21 Jul 2014 22:43:34 -0700 Subject: [PATCH 26/26] live rewrite: catch errors from live rewrite and raise a new LiveResourceError with a 400 error code, indicating bad request for live resource. Add test for invalid live rewrite requests --- CHANGES.rst | 6 ++++-- pywb/cdx/cdxserver.py | 2 +- pywb/webapp/live_rewrite_handler.py | 16 +++++++++++++++- tests/test_live_rewriter.py | 8 ++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 4c7c3abe..21aff406 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,9 +1,11 @@ pywb 0.5.0 changelist ~~~~~~~~~~~~~~~~~~~~~ +* Catch live rewrite errors and display more friendly pywb error message. + * LiveRewriteHandler and WBHandler refactoring: LiveRewriteHandler now supports a root search page html template. -* Proxy mode option: 'unaltered_replay' to proxy archival data with no modifications (no banner, no server or client side rewriting) +* Proxy mode option: 'unaltered_replay' to proxy archival data with no modifications (no banner, no server or client side rewriting). 
* Fix client side rewriting (wombat.js) for proxy mode: only rewrite https -> http in absolute urls. @@ -13,7 +15,7 @@ pywb 0.5.0 changelist The handler, specified via the ``fallback`` option, can be the name of any other replay handler. Typically, it can be used with a live rewrite handler to fetch missing content from live instead of showing a 404. -* Live Rewrite can now be included as a 'collection type' in a pywb deployment by setting index path to ``$liveweb`` +* Live Rewrite can now be included as a 'collection type' in a pywb deployment by setting index path to ``$liveweb``. * ``live-rewrite-server`` has optional ``--proxy host:port`` param to specify a loading live web data through an HTTP/S proxy, such as for use with a recording proxy. diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 137a2555..b5a5745e 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -58,7 +58,7 @@ class BaseCDXServer(object): return self.load_cdx(**fuzzy_query_params) msg = 'No Captures found for: ' + query.url - raise NotFoundException(msg) + raise NotFoundException(msg, url=query.url) def _calc_search_keys(self, query): return calc_search_range(url=query.url, diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index 0a89bd4c..e1e2d53e 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -6,6 +6,14 @@ from handlers import StaticHandler, SearchPageWbUrlHandler from replay_views import RewriteLiveView +from pywb.utils.wbexception import WbException + + +#================================================================= +class LiveResourceException(WbException): + def status(self): + return '400 Bad Live Resource' + #================================================================= class RewriteHandler(SearchPageWbUrlHandler): @@ -17,7 +25,13 @@ class RewriteHandler(SearchPageWbUrlHandler): if wbrequest.wb_url_str == '/': return self.render_search_page(wbrequest) - return 
self.rewrite_view(wbrequest) + try: + return self.rewrite_view(wbrequest) + + except Exception as exc: + url = wbrequest.wb_url.url + msg = 'Could not load the url from the live web: ' + url + raise LiveResourceException(msg=msg, url=url) def __str__(self): return 'Live Web Rewrite Handler' diff --git a/tests/test_live_rewriter.py b/tests/test_live_rewriter.py index ca79c828..5ce19414 100644 --- a/tests/test_live_rewriter.py +++ b/tests/test_live_rewriter.py @@ -23,4 +23,12 @@ class TestLiveRewriter: assert '