From 44f38f44d540e1bc9404d8b57b5b3ded1376f487 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 8 Feb 2014 20:07:16 -0800 Subject: [PATCH] paths cleanup: - don't store explicit static path, but allow it to be set in the insert - store host_prefix, which is either server name or empty - for archival mode, absolute_paths settings controls if using absolute paths, - for proxy always use absolute_paths - default static path is: /static/default/ - allow extension apps to provide custom /static/X/ path Route overriding: - ability to set Route class - custom init method Archival Relative Redirect: - if starting with timestamp, drop timestamp and assume host-relative path Integration Tests: - test proxy mode by using REQUEST_URI - test archival relative redirect! --- config.yaml | 10 ++++++--- pywb/archivalrouter.py | 31 +++++++++++++++++++++----- pywb/config_utils.py | 16 ++++++++++++-- pywb/handlers.py | 6 ++--- pywb/proxy.py | 4 ++-- pywb/pywb_init.py | 24 ++++++++++++-------- pywb/replay_views.py | 10 ++++----- pywb/ui/head_insert.html | 4 ++-- pywb/wbapp.py | 2 ++ pywb/wbrequestresponse.py | 16 +++++++++----- run-tests.py | 46 ++++++++++++++++++++++++++++++++------- test_config.yaml | 7 ++++-- 12 files changed, 127 insertions(+), 49 deletions(-) diff --git a/config.yaml b/config.yaml index 82acddbb..07a2c303 100644 --- a/config.yaml +++ b/config.yaml @@ -74,18 +74,22 @@ archive_paths: ./sample_archive/warcs/ # to http://localhost:8080/pywb/image.gif # -#hostpaths: ['http://localhost:8080/'] +#hostpaths: ['http://localhost:8080'] + +# Rewrite urls with absolute paths instead of relative +#absolute_paths: true # List of route names: # : +# default route static/default for pywb defaults static_routes: - static: static/ + static/default: static/ # ==== New / Experimental Settings ==== # Not yet production ready -- used primarily for testing # Enable simple http proxy mode -#enable_http_proxy: false +enable_http_proxy: true # enable cdx server api for querying cdx 
directly (experimental) #enable_cdx_api: false diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index e805095a..55a85dfb 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -73,6 +73,7 @@ class Route: # collection id from regex group (default 0) self.coll_group = coll_group self.filters = filters + self._custom_init() def __call__(self, env, use_abs_prefix): @@ -94,10 +95,10 @@ class Route: wbrequest = WbRequest(env, request_uri = request_uri, - coll = coll, wb_url_str = wb_url_str, wb_prefix = wb_prefix, - use_abs_prefix = use_abs_prefix, + coll = coll, + host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '', wburl_class = self.handler.get_wburl_type()) @@ -111,6 +112,9 @@ class Route: last_grp = len(matcher.groups()) wbrequest.query_filter.append(filter.format(matcher.group(last_grp))) + def _custom_init(self): + pass + def _handle_request(self, wbrequest): return self.handler(wbrequest) @@ -140,6 +144,14 @@ class ReferRedirect: >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') 'http://localhost:8080/coll/20131010/http://example.com/other.html' + # With timestamp included + >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') + 'http://localhost:8080/coll/20131010/http://example.com/other.html' + + # With timestamp included + >>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html') + 'http://localhost:8080/coll/20131010/http://example.com/path/other.html' + >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') False @@ -147,6 +159,10 @@ class ReferRedirect: >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra') 
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html' + # With custom SCRIPT_NAME + timestamp + >>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra') + 'http://localhost:8080/extra/coll/20131010/http://example.com/other.html' + # With custom SCRIPT_NAME, bad match >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') False @@ -185,11 +201,14 @@ class ReferRedirect: rel_request_uri = wbrequest.request_uri[1:] - #ref_wb_url = archiveurl('/' + ref_path[1]) - #ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:]) - #ref_wb_url.url = ref_wb_url.url.replace('../', '') + timestamp_path = rewriter.wburl.timestamp + '/' + + # check if timestamp is already part of the path + if rel_request_uri.startswith(timestamp_path): + # remove timestamp but leave / to make host relative url + # 2013/path.html -> /path.html + rel_request_uri = rel_request_uri[len(timestamp_path) - 1:] - #final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', '')) final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', '')) return WbResponse.redir_response(final_url) diff --git a/pywb/config_utils.py b/pywb/config_utils.py index 5a315d51..c3ebcf84 100644 --- a/pywb/config_utils.py +++ b/pywb/config_utils.py @@ -42,10 +42,22 @@ def create_wb_handler(**config): html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView), search_view = load_template_file(config.get('search_html'), 'Search Page'), - - static_path = config.get('static_path'), ) return wb_handler +#================================================================= +def load_class(name): + result = name.rsplit('.', 1) + + if len(result) == 1: + modname == '' + klass = 
result[0] + else: + modname = result[0] + klass = result[1] + + mod = __import__(modname, fromlist=[klass]) + return getattr(mod, klass) + diff --git a/pywb/handlers.py b/pywb/handlers.py index 329d99e6..c2c7b949 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -22,7 +22,7 @@ class BaseHandler: # Standard WB Handler #================================================================= class WBHandler(BaseHandler): - def __init__(self, cdx_reader, replay, html_view = None, search_view = None, static_path = '/static/'): + def __init__(self, cdx_reader, replay, html_view = None, search_view = None): self.cdx_reader = cdx_reader self.replay = replay @@ -31,8 +31,6 @@ class WBHandler(BaseHandler): self.html_view = html_view self.search_view = search_view - self.static_path = static_path - def __call__(self, wbrequest): @@ -51,7 +49,7 @@ class WBHandler(BaseHandler): return query_view.render_response(wbrequest, cdx_lines) with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: - return self.replay(wbrequest, cdx_lines, self.cdx_reader, self.static_path) + return self.replay(wbrequest, cdx_lines, self.cdx_reader) def render_search_page(self, wbrequest): diff --git a/pywb/proxy.py b/pywb/proxy.py index 51e92f83..107f9d96 100644 --- a/pywb/proxy.py +++ b/pywb/proxy.py @@ -46,10 +46,10 @@ class ProxyRouter: wbrequest = WbRequest(env, request_uri = url, - coll = '', wb_url_str = url, wb_prefix = '', - use_abs_prefix = False, + coll = '', + host_prefix = self.hostpaths[0], wburl_class = self.handler.get_wburl_type(), url_rewriter_class = ProxyHttpsUrlRewriter, is_proxy = True) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index bd8c7400..c2ddaffd 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -22,7 +22,7 @@ def pywb_config_manual(config = {}): routes = [] - hostpaths = config.get('hostpaths', ['http://localhost:8080/']) + hostpaths = config.get('hostpaths', ['http://localhost:8080']) # collections based on cdx source collections = 
config.get('collections', {'pywb': './sample_archive/cdx/'}) @@ -40,23 +40,27 @@ def pywb_config_manual(config = {}): cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config) - # cdx query handler - if route_config.get('enable_cdx_api', False): - routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source))) - wb_handler = config_utils.create_wb_handler( cdx_source = cdx_source, archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'), head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT), query_html = route_config.get('query_html', DEFAULT_QUERY), search_html = route_config.get('search_html', DEFAULT_SEARCH), - - static_path = hostpaths[0] + route_config.get('static_path', 'static/') ) logging.info('Adding Collection: ' + name) - routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', []))) + route_class = route_config.get('route_class', None) + if route_class: + route_class = config_utils.load_class(route_class) + else: + route_class = archivalrouter.Route + + routes.append(route_class(name, wb_handler, filters = route_config.get('filters', []))) + + # cdx query handler + if route_config.get('enable_cdx_api', False): + routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source))) if config.get('debug_echo_env', False): @@ -66,7 +70,7 @@ def pywb_config_manual(config = {}): routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler())) - static_routes = config.get('static_routes', {'static': 'static/'}) + static_routes = config.get('static_routes', {'static/default': 'static/'}) for static_name, static_path in static_routes.iteritems(): routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path))) @@ -85,6 +89,8 @@ def pywb_config_manual(config = {}): # (See archivalrouter.ReferRedirect) hostpaths = hostpaths, + abs_path = config.get('absolute_paths', True), + home_view = 
config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'), error_view = config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page') ) diff --git a/pywb/replay_views.py b/pywb/replay_views.py index d1c36ead..0f076e02 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -23,7 +23,7 @@ class ReplayView: self.loader = loader if loader else archiveloader.ArchiveLoader() - def __call__(self, wbrequest, cdx_lines, cdx_reader, static_path): + def __call__(self, wbrequest, cdx_lines, cdx_reader): last_e = None first = True @@ -41,7 +41,7 @@ class ReplayView: (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files) - return self.make_response(wbrequest, cdx, status_headers, stream, static_path) + return self.make_response(wbrequest, cdx, status_headers, stream) except wbexceptions.CaptureException as ce: @@ -142,7 +142,7 @@ class ReplayView: # done here! just return response # subclasses make override to do additional processing - def make_response(self, wbrequest, cdx, status_headers, stream, static_path): + def make_response(self, wbrequest, cdx, status_headers, stream): return self.create_stream_response(status_headers, stream) @@ -250,7 +250,7 @@ class RewritingReplayView(ReplayView): return None - def make_response(self, wbrequest, cdx, status_headers, stream, static_path): + def make_response(self, wbrequest, cdx, status_headers, stream): # check and reject self-redirect self._reject_self_redirect(wbrequest, cdx, status_headers) @@ -312,7 +312,7 @@ class RewritingReplayView(ReplayView): status_headers = rewritten_headers.status_headers if text_type == 'html': - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx, static_path = static_path) if self.head_insert_view else None + head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None rewriter = 
html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str) elif text_type == 'css': rewriter = regex_rewriters.CSSRewriter(urlrewriter) diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index f049aacd..b30cd015 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -3,6 +3,6 @@ wbinfo = {} wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}"; - - + + diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 44b85a0f..afd8391f 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -19,6 +19,8 @@ def create_wb_app(wb_router): else: env['REL_REQUEST_URI'] = env['REQUEST_URI'] + print env['REL_REQUEST_URI'] + response = None try: diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index 2bdc8bbb..a1a82045 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -52,19 +52,21 @@ class WbRequest: wb_url_str = parts[0] coll = '' - return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix) + host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '' + + return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix) @staticmethod - def make_abs_prefix(env, rel_prefix): + def make_host_prefix(env): try: - return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix + return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] except KeyError: - return rel_prefix + return '' def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, - use_abs_prefix = False, + host_prefix = '', wburl_class = WbUrl, url_rewriter_class = UrlRewriter, is_proxy = False): @@ -73,7 +75,9 @@ class WbRequest: self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') - self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix) + self.host_prefix = host_prefix + + self.wb_prefix = host_prefix + wb_prefix if not wb_url_str: wb_url_str = '/' diff --git a/run-tests.py b/run-tests.py index 
4782af2c..014eb753 100644 --- a/run-tests.py +++ b/run-tests.py @@ -88,21 +88,51 @@ class TestWb: assert 'Mon, Jan 27 2014 17:12:51' in resp.body assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body + def test_redirect_relative_3(self): + # first two requests should result in same redirect + target = 'http://localhost:8080/pywb/2014/http://iana.org/_css/2013.1/screen.css' + + # without timestamp + resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014/http://iana.org/')]) + assert resp.status_int == 302 + assert resp.headers['Location'] == target, resp.headers['Location'] + + # with timestamp + resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014/http://iana.org/')]) + assert resp.status_int == 302 + assert resp.headers['Location'] == target, resp.headers['Location'] + + + resp = resp.follow() + assert resp.status_int == 302 + assert resp.headers['Location'].endswith('/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css') + + resp = resp.follow() + assert resp.status_int == 200 + assert resp.content_type == 'text/css' + + def test_static_content(self): - resp = self.testapp.get('/test-static/wb.css') + resp = self.testapp.get('/static/test/route/wb.css') assert resp.status_int == 200 assert resp.content_type == 'text/css' assert resp.content_length > 0 - # XX: Doesn't work as webtest does not support proxy mode - # need a way to test - #def test_proxy_replay(self): - #resp = self.testapp.get('http://www.iana.org/domains/idn-tables') - #self._assert_basic_html(resp) + # 'Simulating' proxy by settings REQUEST_URI explicitly to http:// url and no SCRIPT_NAME + # would be nice to be able to test proxy more + def test_proxy_replay(self): + resp = self.testapp.get('/x-ignore-this-x', extra_environ = dict(REQUEST_URI = 'http://www.iana.org/domains/idn-tables', SCRIPT_NAME = '')) + self._assert_basic_html(resp) - #assert 'Sun, 
Jan 26 2014 20:11:27' in resp.body + assert 'wb.js' in resp.body + + def test_proxy_pac(self): + resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080')) + assert resp.content_type == 'application/x-ns-proxy-autoconfig' + assert '"PROXY pywb-proxy:8080"' in resp.body + assert '"localhost"' in resp.body def test_cdx_server_filters(self): resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz') diff --git a/test_config.yaml b/test_config.yaml index 45226cc9..7460c825 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -70,12 +70,15 @@ error_html: ui/error.html # to http://localhost:8080/pywb/image.gif # -#hostpaths: ['http://localhost:8080/'] +#hostpaths: ['http://localhost:8080'] + +# Rewrite urls with absolute paths instead of relative +absolute_paths: true # List of route names: # : static_routes: - test-static: static/ + static/test/route: static/ # ==== New / Experimental Settings ====