1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

paths cleanup:

- don't store explicit static path, but allow it to be set in the insert
- store host_prefix, which is either server name or empty
- for archival mode, the absolute_paths setting controls whether absolute paths are used,
- for proxy always use absolute_paths
- default static path is: /static/default/
- allow extension apps to provide custom /static/X/ path

Route overriding:
- ability to set Route class
- custom init method

Archival Relative Redirect:
- if starting with timestamp, drop timestamp and assume host-relative path

Integration Tests:
- test proxy mode by using REQUEST_URI
- test archival relative redirect!
This commit is contained in:
Ilya Kreymer 2014-02-08 20:07:16 -08:00
parent b11f4fad93
commit 44f38f44d5
12 changed files with 127 additions and 49 deletions

View File

@ -74,18 +74,22 @@ archive_paths: ./sample_archive/warcs/
# to http://localhost:8080/pywb/image.gif # to http://localhost:8080/pywb/image.gif
# #
#hostpaths: ['http://localhost:8080/'] #hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative
#absolute_paths: true
# List of route names: # List of route names:
# <route>: <package or file path> # <route>: <package or file path>
# default route static/default for pywb defaults
static_routes: static_routes:
static: static/ static/default: static/
# ==== New / Experimental Settings ==== # ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing # Not yet production ready -- used primarily for testing
# Enable simple http proxy mode # Enable simple http proxy mode
#enable_http_proxy: false enable_http_proxy: true
# enable cdx server api for querying cdx directly (experimental) # enable cdx server api for querying cdx directly (experimental)
#enable_cdx_api: false #enable_cdx_api: false

View File

@ -73,6 +73,7 @@ class Route:
# collection id from regex group (default 0) # collection id from regex group (default 0)
self.coll_group = coll_group self.coll_group = coll_group
self.filters = filters self.filters = filters
self._custom_init()
def __call__(self, env, use_abs_prefix): def __call__(self, env, use_abs_prefix):
@ -94,10 +95,10 @@ class Route:
wbrequest = WbRequest(env, wbrequest = WbRequest(env,
request_uri = request_uri, request_uri = request_uri,
coll = coll,
wb_url_str = wb_url_str, wb_url_str = wb_url_str,
wb_prefix = wb_prefix, wb_prefix = wb_prefix,
use_abs_prefix = use_abs_prefix, coll = coll,
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '',
wburl_class = self.handler.get_wburl_type()) wburl_class = self.handler.get_wburl_type())
@ -111,6 +112,9 @@ class Route:
last_grp = len(matcher.groups()) last_grp = len(matcher.groups())
wbrequest.query_filter.append(filter.format(matcher.group(last_grp))) wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
def _custom_init(self):
pass
def _handle_request(self, wbrequest): def _handle_request(self, wbrequest):
return self.handler(wbrequest) return self.handler(wbrequest)
@ -140,6 +144,14 @@ class ReferRedirect:
>>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html' 'http://localhost:8080/coll/20131010/http://example.com/other.html'
# With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
'http://localhost:8080/coll/20131010/http://example.com/other.html'
# With timestamp included
>>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
>>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
False False
@ -147,6 +159,10 @@ class ReferRedirect:
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra') >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html' 'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME + timestamp
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
# With custom SCRIPT_NAME, bad match # With custom SCRIPT_NAME, bad match
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr') >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
False False
@ -185,11 +201,14 @@ class ReferRedirect:
rel_request_uri = wbrequest.request_uri[1:] rel_request_uri = wbrequest.request_uri[1:]
#ref_wb_url = archiveurl('/' + ref_path[1]) timestamp_path = rewriter.wburl.timestamp + '/'
#ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
#ref_wb_url.url = ref_wb_url.url.replace('../', '') # check if timestamp is already part of the path
if rel_request_uri.startswith(timestamp_path):
# remove timestamp but leave / to make host relative url
# 2013/path.html -> /path.html
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
#final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', ''))
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', '')) final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
return WbResponse.redir_response(final_url) return WbResponse.redir_response(final_url)

View File

@ -42,10 +42,22 @@ def create_wb_handler(**config):
html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView), html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView),
search_view = load_template_file(config.get('search_html'), 'Search Page'), search_view = load_template_file(config.get('search_html'), 'Search Page'),
static_path = config.get('static_path'),
) )
return wb_handler return wb_handler
#=================================================================
def load_class(name):
    """Import and return the object named by a dotted path.

    Everything before the last dot is treated as the module to import and
    the final component as the attribute to fetch from that module.

    :param name: fully qualified dotted name, e.g. 'package.module.ClassName'
    :return: the attribute found on the imported module (typically a class)
    :raises ValueError: if name has no module component (no dot, or nothing
        before the last dot)
    :raises ImportError: if the module cannot be imported
    :raises AttributeError: if the module does not define the attribute
    """
    modname, _, klass = name.rpartition('.')

    # A bare name carries no module to import; fail with a clear message
    # instead of an opaque "Empty module name" error from __import__.
    if not modname:
        raise ValueError('Expected a dotted path "module.Class", got: %r' % (name,))

    # A non-empty fromlist makes __import__ return the leaf module
    # (e.g. a.b.c) rather than the top-level package (a).
    mod = __import__(modname, fromlist=[klass])
    return getattr(mod, klass)

View File

@ -22,7 +22,7 @@ class BaseHandler:
# Standard WB Handler # Standard WB Handler
#================================================================= #=================================================================
class WBHandler(BaseHandler): class WBHandler(BaseHandler):
def __init__(self, cdx_reader, replay, html_view = None, search_view = None, static_path = '/static/'): def __init__(self, cdx_reader, replay, html_view = None, search_view = None):
self.cdx_reader = cdx_reader self.cdx_reader = cdx_reader
self.replay = replay self.replay = replay
@ -31,8 +31,6 @@ class WBHandler(BaseHandler):
self.html_view = html_view self.html_view = html_view
self.search_view = search_view self.search_view = search_view
self.static_path = static_path
def __call__(self, wbrequest): def __call__(self, wbrequest):
@ -51,7 +49,7 @@ class WBHandler(BaseHandler):
return query_view.render_response(wbrequest, cdx_lines) return query_view.render_response(wbrequest, cdx_lines)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines, self.cdx_reader, self.static_path) return self.replay(wbrequest, cdx_lines, self.cdx_reader)
def render_search_page(self, wbrequest): def render_search_page(self, wbrequest):

View File

@ -46,10 +46,10 @@ class ProxyRouter:
wbrequest = WbRequest(env, wbrequest = WbRequest(env,
request_uri = url, request_uri = url,
coll = '',
wb_url_str = url, wb_url_str = url,
wb_prefix = '', wb_prefix = '',
use_abs_prefix = False, coll = '',
host_prefix = self.hostpaths[0],
wburl_class = self.handler.get_wburl_type(), wburl_class = self.handler.get_wburl_type(),
url_rewriter_class = ProxyHttpsUrlRewriter, url_rewriter_class = ProxyHttpsUrlRewriter,
is_proxy = True) is_proxy = True)

View File

@ -22,7 +22,7 @@ def pywb_config_manual(config = {}):
routes = [] routes = []
hostpaths = config.get('hostpaths', ['http://localhost:8080/']) hostpaths = config.get('hostpaths', ['http://localhost:8080'])
# collections based on cdx source # collections based on cdx source
collections = config.get('collections', {'pywb': './sample_archive/cdx/'}) collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
@ -40,23 +40,27 @@ def pywb_config_manual(config = {}):
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config) cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
# cdx query handler
if route_config.get('enable_cdx_api', False):
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
wb_handler = config_utils.create_wb_handler( wb_handler = config_utils.create_wb_handler(
cdx_source = cdx_source, cdx_source = cdx_source,
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'), archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT), head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT),
query_html = route_config.get('query_html', DEFAULT_QUERY), query_html = route_config.get('query_html', DEFAULT_QUERY),
search_html = route_config.get('search_html', DEFAULT_SEARCH), search_html = route_config.get('search_html', DEFAULT_SEARCH),
static_path = hostpaths[0] + route_config.get('static_path', 'static/')
) )
logging.info('Adding Collection: ' + name) logging.info('Adding Collection: ' + name)
routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', []))) route_class = route_config.get('route_class', None)
if route_class:
route_class = config_utils.load_class(route_class)
else:
route_class = archivalrouter.Route
routes.append(route_class(name, wb_handler, filters = route_config.get('filters', [])))
# cdx query handler
if route_config.get('enable_cdx_api', False):
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
if config.get('debug_echo_env', False): if config.get('debug_echo_env', False):
@ -66,7 +70,7 @@ def pywb_config_manual(config = {}):
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler())) routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
static_routes = config.get('static_routes', {'static': 'static/'}) static_routes = config.get('static_routes', {'static/default': 'static/'})
for static_name, static_path in static_routes.iteritems(): for static_name, static_path in static_routes.iteritems():
routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path))) routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path)))
@ -85,6 +89,8 @@ def pywb_config_manual(config = {}):
# (See archivalrouter.ReferRedirect) # (See archivalrouter.ReferRedirect)
hostpaths = hostpaths, hostpaths = hostpaths,
abs_path = config.get('absolute_paths', True),
home_view = config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'), home_view = config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'),
error_view = config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page') error_view = config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page')
) )

View File

@ -23,7 +23,7 @@ class ReplayView:
self.loader = loader if loader else archiveloader.ArchiveLoader() self.loader = loader if loader else archiveloader.ArchiveLoader()
def __call__(self, wbrequest, cdx_lines, cdx_reader, static_path): def __call__(self, wbrequest, cdx_lines, cdx_reader):
last_e = None last_e = None
first = True first = True
@ -41,7 +41,7 @@ class ReplayView:
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files) (cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
return self.make_response(wbrequest, cdx, status_headers, stream, static_path) return self.make_response(wbrequest, cdx, status_headers, stream)
except wbexceptions.CaptureException as ce: except wbexceptions.CaptureException as ce:
@ -142,7 +142,7 @@ class ReplayView:
# done here! just return response # done here! just return response
# subclasses may override to do additional processing # subclasses may override to do additional processing
def make_response(self, wbrequest, cdx, status_headers, stream, static_path): def make_response(self, wbrequest, cdx, status_headers, stream):
return self.create_stream_response(status_headers, stream) return self.create_stream_response(status_headers, stream)
@ -250,7 +250,7 @@ class RewritingReplayView(ReplayView):
return None return None
def make_response(self, wbrequest, cdx, status_headers, stream, static_path): def make_response(self, wbrequest, cdx, status_headers, stream):
# check and reject self-redirect # check and reject self-redirect
self._reject_self_redirect(wbrequest, cdx, status_headers) self._reject_self_redirect(wbrequest, cdx, status_headers)
@ -312,7 +312,7 @@ class RewritingReplayView(ReplayView):
status_headers = rewritten_headers.status_headers status_headers = rewritten_headers.status_headers
if text_type == 'html': if text_type == 'html':
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx, static_path = static_path) if self.head_insert_view else None head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None
rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str) rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
elif text_type == 'css': elif text_type == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter) rewriter = regex_rewriters.CSSRewriter(urlrewriter)

View File

@ -3,6 +3,6 @@
wbinfo = {} wbinfo = {}
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}"; wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
</script> </script>
<script src='{{ static_path }}wb.js'> </script> <script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
<link rel='stylesheet' href='{{ static_path }}wb.css'/> <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
<!-- End WB Insert --> <!-- End WB Insert -->

View File

@ -19,6 +19,8 @@ def create_wb_app(wb_router):
else: else:
env['REL_REQUEST_URI'] = env['REQUEST_URI'] env['REL_REQUEST_URI'] = env['REQUEST_URI']
print env['REL_REQUEST_URI']
response = None response = None
try: try:

View File

@ -52,19 +52,21 @@ class WbRequest:
wb_url_str = parts[0] wb_url_str = parts[0]
coll = '' coll = ''
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix) host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)
@staticmethod @staticmethod
def make_abs_prefix(env, rel_prefix): def make_host_prefix(env):
try: try:
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST']
except KeyError: except KeyError:
return rel_prefix return ''
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll,
use_abs_prefix = False, host_prefix = '',
wburl_class = WbUrl, wburl_class = WbUrl,
url_rewriter_class = UrlRewriter, url_rewriter_class = UrlRewriter,
is_proxy = False): is_proxy = False):
@ -73,7 +75,9 @@ class WbRequest:
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix) self.host_prefix = host_prefix
self.wb_prefix = host_prefix + wb_prefix
if not wb_url_str: if not wb_url_str:
wb_url_str = '/' wb_url_str = '/'

View File

@ -88,21 +88,51 @@ class TestWb:
assert 'Mon, Jan 27 2014 17:12:51' in resp.body assert 'Mon, Jan 27 2014 17:12:51' in resp.body
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
def test_redirect_relative_3(self):
    """Check host-relative requests are redirected into the referring capture.

    The same stylesheet path is requested with and without a leading
    timestamp; both must redirect to the same archival url, which is then
    followed through a second redirect to the final css response.
    """
    referer = 'http://localhost:8080/pywb/2014/http://iana.org/'

    # both request forms should result in the same redirect
    target = 'http://localhost:8080/pywb/2014/http://iana.org/_css/2013.1/screen.css'

    # first without timestamp, then with timestamp
    for request_path in ('/_css/2013.1/screen.css', '/2014/_css/2013.1/screen.css'):
        resp = self.testapp.get(request_path, headers = [('Referer', referer)])
        assert resp.status_int == 302
        assert resp.headers['Location'] == target, resp.headers['Location']

    # continue from the last (timestamped) response
    resp = resp.follow()
    assert resp.status_int == 302
    assert resp.headers['Location'].endswith('/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css')

    resp = resp.follow()
    assert resp.status_int == 200
    assert resp.content_type == 'text/css'
def test_static_content(self): def test_static_content(self):
resp = self.testapp.get('/test-static/wb.css') resp = self.testapp.get('/static/test/route/wb.css')
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_type == 'text/css' assert resp.content_type == 'text/css'
assert resp.content_length > 0 assert resp.content_length > 0
# XX: Doesn't work as webtest does not support proxy mode # 'Simulating' proxy by setting REQUEST_URI explicitly to http:// url and no SCRIPT_NAME
# need a way to test # would be nice to be able to test proxy more
#def test_proxy_replay(self): def test_proxy_replay(self):
#resp = self.testapp.get('http://www.iana.org/domains/idn-tables') resp = self.testapp.get('/x-ignore-this-x', extra_environ = dict(REQUEST_URI = 'http://www.iana.org/domains/idn-tables', SCRIPT_NAME = ''))
#self._assert_basic_html(resp) self._assert_basic_html(resp)
#assert 'Sun, Jan 26 2014 20:11:27' in resp.body assert 'Sun, Jan 26 2014 20:11:27' in resp.body
#assert 'wb.js' in resp.body assert 'wb.js' in resp.body
def test_proxy_pac(self):
    """Fetch /proxy.pac with an explicit server name and port and check the result.

    The response must use the proxy autoconfig content type, and its body
    must contain the PROXY directive for that server plus a "localhost" entry.
    """
    server_env = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080')
    resp = self.testapp.get('/proxy.pac', extra_environ = server_env)

    assert resp.content_type == 'application/x-ns-proxy-autoconfig'

    for expected in ('"PROXY pywb-proxy:8080"', '"localhost"'):
        assert expected in resp.body
def test_cdx_server_filters(self): def test_cdx_server_filters(self):
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz') resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')

View File

@ -70,12 +70,15 @@ error_html: ui/error.html
# to http://localhost:8080/pywb/image.gif # to http://localhost:8080/pywb/image.gif
# #
#hostpaths: ['http://localhost:8080/'] #hostpaths: ['http://localhost:8080']
# Rewrite urls with absolute paths instead of relative
absolute_paths: true
# List of route names: # List of route names:
# <route>: <package or file path> # <route>: <package or file path>
static_routes: static_routes:
test-static: static/ static/test/route: static/
# ==== New / Experimental Settings ==== # ==== New / Experimental Settings ====