mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
paths cleanup:
- don't store explicit static path, but allow it to be set in the insert - store host_prefix, which is either the server name or empty - for archival mode, the absolute_paths setting controls whether absolute paths are used - for proxy mode, always use absolute paths - default static path is: /static/default/ - allow extension apps to provide custom /static/X/ path Route overriding: - ability to set Route class - custom init method Archival Relative Redirect: - if starting with timestamp, drop timestamp and assume host-relative path Integration Tests: - test proxy mode by using REQUEST_URI - test archival relative redirect!
This commit is contained in:
parent
b11f4fad93
commit
44f38f44d5
10
config.yaml
10
config.yaml
@ -74,18 +74,22 @@ archive_paths: ./sample_archive/warcs/
|
||||
# to http://localhost:8080/pywb/image.gif
|
||||
#
|
||||
|
||||
#hostpaths: ['http://localhost:8080/']
|
||||
#hostpaths: ['http://localhost:8080']
|
||||
|
||||
# Rewrite urls with absolute paths instead of relative
|
||||
#absolute_paths: true
|
||||
|
||||
# List of route names:
|
||||
# <route>: <package or file path>
|
||||
# default route static/default for pywb defaults
|
||||
static_routes:
|
||||
static: static/
|
||||
static/default: static/
|
||||
|
||||
# ==== New / Experimental Settings ====
|
||||
# Not yet production ready -- used primarily for testing
|
||||
|
||||
# Enable simple http proxy mode
|
||||
#enable_http_proxy: false
|
||||
enable_http_proxy: true
|
||||
|
||||
# enable cdx server api for querying cdx directly (experimental)
|
||||
#enable_cdx_api: false
|
||||
|
@ -73,6 +73,7 @@ class Route:
|
||||
# collection id from regex group (default 0)
|
||||
self.coll_group = coll_group
|
||||
self.filters = filters
|
||||
self._custom_init()
|
||||
|
||||
|
||||
def __call__(self, env, use_abs_prefix):
|
||||
@ -94,10 +95,10 @@ class Route:
|
||||
|
||||
wbrequest = WbRequest(env,
|
||||
request_uri = request_uri,
|
||||
coll = coll,
|
||||
wb_url_str = wb_url_str,
|
||||
wb_prefix = wb_prefix,
|
||||
use_abs_prefix = use_abs_prefix,
|
||||
coll = coll,
|
||||
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else '',
|
||||
wburl_class = self.handler.get_wburl_type())
|
||||
|
||||
|
||||
@ -111,6 +112,9 @@ class Route:
|
||||
last_grp = len(matcher.groups())
|
||||
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
|
||||
|
||||
def _custom_init(self):
|
||||
pass
|
||||
|
||||
def _handle_request(self, wbrequest):
|
||||
return self.handler(wbrequest)
|
||||
|
||||
@ -140,6 +144,14 @@ class ReferRedirect:
|
||||
>>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
|
||||
'http://localhost:8080/coll/20131010/http://example.com/other.html'
|
||||
|
||||
# With timestamp included
|
||||
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
|
||||
'http://localhost:8080/coll/20131010/http://example.com/other.html'
|
||||
|
||||
# With timestamp included
|
||||
>>> test_redir('http://localhost:8080/', '/20131010/path/other.html', 'http://localhost:8080/coll/20131010/http://example.com/some/index.html')
|
||||
'http://localhost:8080/coll/20131010/http://example.com/path/other.html'
|
||||
|
||||
>>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
|
||||
False
|
||||
|
||||
@ -147,6 +159,10 @@ class ReferRedirect:
|
||||
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
|
||||
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
|
||||
|
||||
# With custom SCRIPT_NAME + timestamp
|
||||
>>> test_redir('http://localhost:8080/', '/20131010/other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extra')
|
||||
'http://localhost:8080/extra/coll/20131010/http://example.com/other.html'
|
||||
|
||||
# With custom SCRIPT_NAME, bad match
|
||||
>>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/extra/coll/20131010/http://example.com/path/page.html', '/extr')
|
||||
False
|
||||
@ -185,11 +201,14 @@ class ReferRedirect:
|
||||
|
||||
rel_request_uri = wbrequest.request_uri[1:]
|
||||
|
||||
#ref_wb_url = archiveurl('/' + ref_path[1])
|
||||
#ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
|
||||
#ref_wb_url.url = ref_wb_url.url.replace('../', '')
|
||||
timestamp_path = rewriter.wburl.timestamp + '/'
|
||||
|
||||
# check if timestamp is already part of the path
|
||||
if rel_request_uri.startswith(timestamp_path):
|
||||
# remove timestamp but leave / to make host relative url
|
||||
# 2013/path.html -> /path.html
|
||||
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
|
||||
|
||||
#final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', ''))
|
||||
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
|
||||
|
||||
return WbResponse.redir_response(final_url)
|
||||
|
@ -42,10 +42,22 @@ def create_wb_handler(**config):
|
||||
html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView),
|
||||
|
||||
search_view = load_template_file(config.get('search_html'), 'Search Page'),
|
||||
|
||||
static_path = config.get('static_path'),
|
||||
)
|
||||
|
||||
return wb_handler
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_class(name):
    """Resolve a fully qualified dotted name to the object it refers to.

    ``name`` is split at its last dot: everything before it is imported as
    a module, and the part after it is looked up as an attribute of that
    module (e.g. ``'package.module.ClassName'``).

    Returns the attribute found on the imported module.

    Raises:
        ValueError: if ``name`` has no module part (no dot, or a leading
            dot only), since there is nothing to import it from.
        ImportError: if the module part cannot be imported.
        AttributeError: if the module has no attribute with that name.
    """
    modname, _, klass = name.rpartition('.')

    # A bare name has no module to import from; fail with a clear message
    # rather than an unbound-local NameError or "Empty module name".
    if not modname:
        raise ValueError('Expected a fully qualified name '
                         '(module.ClassName), got: ' + repr(name))

    # A non-empty fromlist makes __import__ return the leaf module
    # ('a.b.c') rather than the top-level package ('a').
    mod = __import__(modname, fromlist=[klass])
    return getattr(mod, klass)
|
||||
|
||||
|
@ -22,7 +22,7 @@ class BaseHandler:
|
||||
# Standard WB Handler
|
||||
#=================================================================
|
||||
class WBHandler(BaseHandler):
|
||||
def __init__(self, cdx_reader, replay, html_view = None, search_view = None, static_path = '/static/'):
|
||||
def __init__(self, cdx_reader, replay, html_view = None, search_view = None):
|
||||
self.cdx_reader = cdx_reader
|
||||
self.replay = replay
|
||||
|
||||
@ -31,8 +31,6 @@ class WBHandler(BaseHandler):
|
||||
self.html_view = html_view
|
||||
self.search_view = search_view
|
||||
|
||||
self.static_path = static_path
|
||||
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
|
||||
@ -51,7 +49,7 @@ class WBHandler(BaseHandler):
|
||||
return query_view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
return self.replay(wbrequest, cdx_lines, self.cdx_reader, self.static_path)
|
||||
return self.replay(wbrequest, cdx_lines, self.cdx_reader)
|
||||
|
||||
|
||||
def render_search_page(self, wbrequest):
|
||||
|
@ -46,10 +46,10 @@ class ProxyRouter:
|
||||
|
||||
wbrequest = WbRequest(env,
|
||||
request_uri = url,
|
||||
coll = '',
|
||||
wb_url_str = url,
|
||||
wb_prefix = '',
|
||||
use_abs_prefix = False,
|
||||
coll = '',
|
||||
host_prefix = self.hostpaths[0],
|
||||
wburl_class = self.handler.get_wburl_type(),
|
||||
url_rewriter_class = ProxyHttpsUrlRewriter,
|
||||
is_proxy = True)
|
||||
|
@ -22,7 +22,7 @@ def pywb_config_manual(config = {}):
|
||||
|
||||
routes = []
|
||||
|
||||
hostpaths = config.get('hostpaths', ['http://localhost:8080/'])
|
||||
hostpaths = config.get('hostpaths', ['http://localhost:8080'])
|
||||
|
||||
# collections based on cdx source
|
||||
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
|
||||
@ -40,23 +40,27 @@ def pywb_config_manual(config = {}):
|
||||
|
||||
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
|
||||
|
||||
# cdx query handler
|
||||
if route_config.get('enable_cdx_api', False):
|
||||
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
|
||||
|
||||
wb_handler = config_utils.create_wb_handler(
|
||||
cdx_source = cdx_source,
|
||||
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
|
||||
head_html = route_config.get('head_insert_html', DEFAULT_HEAD_INSERT),
|
||||
query_html = route_config.get('query_html', DEFAULT_QUERY),
|
||||
search_html = route_config.get('search_html', DEFAULT_SEARCH),
|
||||
|
||||
static_path = hostpaths[0] + route_config.get('static_path', 'static/')
|
||||
)
|
||||
|
||||
logging.info('Adding Collection: ' + name)
|
||||
|
||||
routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', [])))
|
||||
route_class = route_config.get('route_class', None)
|
||||
if route_class:
|
||||
route_class = config_utils.load_class(route_class)
|
||||
else:
|
||||
route_class = archivalrouter.Route
|
||||
|
||||
routes.append(route_class(name, wb_handler, filters = route_config.get('filters', [])))
|
||||
|
||||
# cdx query handler
|
||||
if route_config.get('enable_cdx_api', False):
|
||||
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
|
||||
|
||||
|
||||
if config.get('debug_echo_env', False):
|
||||
@ -66,7 +70,7 @@ def pywb_config_manual(config = {}):
|
||||
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler()))
|
||||
|
||||
|
||||
static_routes = config.get('static_routes', {'static': 'static/'})
|
||||
static_routes = config.get('static_routes', {'static/default': 'static/'})
|
||||
|
||||
for static_name, static_path in static_routes.iteritems():
|
||||
routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path)))
|
||||
@ -85,6 +89,8 @@ def pywb_config_manual(config = {}):
|
||||
# (See archivalrouter.ReferRedirect)
|
||||
hostpaths = hostpaths,
|
||||
|
||||
abs_path = config.get('absolute_paths', True),
|
||||
|
||||
home_view = config_utils.load_template_file(config.get('home_html', DEFAULT_INDEX), 'Home Page'),
|
||||
error_view = config_utils.load_template_file(config.get('error_html', DEFAULT_ERROR), 'Error Page')
|
||||
)
|
||||
|
@ -23,7 +23,7 @@ class ReplayView:
|
||||
self.loader = loader if loader else archiveloader.ArchiveLoader()
|
||||
|
||||
|
||||
def __call__(self, wbrequest, cdx_lines, cdx_reader, static_path):
|
||||
def __call__(self, wbrequest, cdx_lines, cdx_reader):
|
||||
last_e = None
|
||||
first = True
|
||||
|
||||
@ -41,7 +41,7 @@ class ReplayView:
|
||||
|
||||
(cdx, status_headers, stream) = self.resolve_headers_and_payload(cdx, wbrequest, cdx_reader, failed_files)
|
||||
|
||||
return self.make_response(wbrequest, cdx, status_headers, stream, static_path)
|
||||
return self.make_response(wbrequest, cdx, status_headers, stream)
|
||||
|
||||
|
||||
except wbexceptions.CaptureException as ce:
|
||||
@ -142,7 +142,7 @@ class ReplayView:
|
||||
|
||||
# done here! just return response
|
||||
# subclasses may override to do additional processing
|
||||
def make_response(self, wbrequest, cdx, status_headers, stream, static_path):
|
||||
def make_response(self, wbrequest, cdx, status_headers, stream):
|
||||
return self.create_stream_response(status_headers, stream)
|
||||
|
||||
|
||||
@ -250,7 +250,7 @@ class RewritingReplayView(ReplayView):
|
||||
return None
|
||||
|
||||
|
||||
def make_response(self, wbrequest, cdx, status_headers, stream, static_path):
|
||||
def make_response(self, wbrequest, cdx, status_headers, stream):
|
||||
# check and reject self-redirect
|
||||
self._reject_self_redirect(wbrequest, cdx, status_headers)
|
||||
|
||||
@ -312,7 +312,7 @@ class RewritingReplayView(ReplayView):
|
||||
status_headers = rewritten_headers.status_headers
|
||||
|
||||
if text_type == 'html':
|
||||
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx, static_path = static_path) if self.head_insert_view else None
|
||||
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) if self.head_insert_view else None
|
||||
rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
|
||||
elif text_type == 'css':
|
||||
rewriter = regex_rewriters.CSSRewriter(urlrewriter)
|
||||
|
@ -3,6 +3,6 @@
|
||||
wbinfo = {}
|
||||
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
|
||||
</script>
|
||||
<script src='{{ static_path }}wb.js'> </script>
|
||||
<link rel='stylesheet' href='{{ static_path }}wb.css'/>
|
||||
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
|
||||
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
|
||||
<!-- End WB Insert -->
|
||||
|
@ -19,6 +19,8 @@ def create_wb_app(wb_router):
|
||||
else:
|
||||
env['REL_REQUEST_URI'] = env['REQUEST_URI']
|
||||
|
||||
print env['REL_REQUEST_URI']
|
||||
|
||||
response = None
|
||||
|
||||
try:
|
||||
|
@ -52,19 +52,21 @@ class WbRequest:
|
||||
wb_url_str = parts[0]
|
||||
coll = ''
|
||||
|
||||
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix)
|
||||
host_prefix = WbRequest.make_host_prefix(env) if use_abs_prefix else ''
|
||||
|
||||
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, host_prefix = host_prefix)
|
||||
|
||||
|
||||
@staticmethod
|
||||
def make_abs_prefix(env, rel_prefix):
|
||||
def make_host_prefix(env):
|
||||
try:
|
||||
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix
|
||||
return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST']
|
||||
except KeyError:
|
||||
return rel_prefix
|
||||
return ''
|
||||
|
||||
|
||||
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll,
|
||||
use_abs_prefix = False,
|
||||
host_prefix = '',
|
||||
wburl_class = WbUrl,
|
||||
url_rewriter_class = UrlRewriter,
|
||||
is_proxy = False):
|
||||
@ -73,7 +75,9 @@ class WbRequest:
|
||||
|
||||
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
|
||||
|
||||
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
|
||||
self.host_prefix = host_prefix
|
||||
|
||||
self.wb_prefix = host_prefix + wb_prefix
|
||||
|
||||
if not wb_url_str:
|
||||
wb_url_str = '/'
|
||||
|
46
run-tests.py
46
run-tests.py
@ -88,21 +88,51 @@ class TestWb:
|
||||
assert 'Mon, Jan 27 2014 17:12:51' in resp.body
|
||||
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
|
||||
|
||||
def test_redirect_relative_3(self):
|
||||
# first two requests should result in same redirect
|
||||
target = 'http://localhost:8080/pywb/2014/http://iana.org/_css/2013.1/screen.css'
|
||||
|
||||
# without timestamp
|
||||
resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014/http://iana.org/')])
|
||||
assert resp.status_int == 302
|
||||
assert resp.headers['Location'] == target, resp.headers['Location']
|
||||
|
||||
# with timestamp
|
||||
resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014/http://iana.org/')])
|
||||
assert resp.status_int == 302
|
||||
assert resp.headers['Location'] == target, resp.headers['Location']
|
||||
|
||||
|
||||
resp = resp.follow()
|
||||
assert resp.status_int == 302
|
||||
assert resp.headers['Location'].endswith('/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css')
|
||||
|
||||
resp = resp.follow()
|
||||
assert resp.status_int == 200
|
||||
assert resp.content_type == 'text/css'
|
||||
|
||||
|
||||
def test_static_content(self):
|
||||
resp = self.testapp.get('/test-static/wb.css')
|
||||
resp = self.testapp.get('/static/test/route/wb.css')
|
||||
assert resp.status_int == 200
|
||||
assert resp.content_type == 'text/css'
|
||||
assert resp.content_length > 0
|
||||
|
||||
|
||||
# XX: Doesn't work as webtest does not support proxy mode
|
||||
# need a way to test
|
||||
#def test_proxy_replay(self):
|
||||
#resp = self.testapp.get('http://www.iana.org/domains/idn-tables')
|
||||
#self._assert_basic_html(resp)
|
||||
# 'Simulating' proxy by setting REQUEST_URI explicitly to http:// url and no SCRIPT_NAME
|
||||
# would be nice to be able to test proxy more
|
||||
def test_proxy_replay(self):
|
||||
resp = self.testapp.get('/x-ignore-this-x', extra_environ = dict(REQUEST_URI = 'http://www.iana.org/domains/idn-tables', SCRIPT_NAME = ''))
|
||||
self._assert_basic_html(resp)
|
||||
|
||||
#assert 'Sun, Jan 26 2014 20:11:27' in resp.body
|
||||
#assert 'wb.js' in resp.body
|
||||
assert 'Sun, Jan 26 2014 20:11:27' in resp.body
|
||||
assert 'wb.js' in resp.body
|
||||
|
||||
def test_proxy_pac(self):
|
||||
resp = self.testapp.get('/proxy.pac', extra_environ = dict(SERVER_NAME='pywb-proxy', SERVER_PORT='8080'))
|
||||
assert resp.content_type == 'application/x-ns-proxy-autoconfig'
|
||||
assert '"PROXY pywb-proxy:8080"' in resp.body
|
||||
assert '"localhost"' in resp.body
|
||||
|
||||
def test_cdx_server_filters(self):
|
||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')
|
||||
|
@ -70,12 +70,15 @@ error_html: ui/error.html
|
||||
# to http://localhost:8080/pywb/image.gif
|
||||
#
|
||||
|
||||
#hostpaths: ['http://localhost:8080/']
|
||||
#hostpaths: ['http://localhost:8080']
|
||||
|
||||
# Rewrite urls with absolute paths instead of relative
|
||||
absolute_paths: true
|
||||
|
||||
# List of route names:
|
||||
# <route>: <package or file path>
|
||||
static_routes:
|
||||
test-static: static/
|
||||
static/test/route: static/
|
||||
|
||||
|
||||
# ==== New / Experimental Settings ====
|
||||
|
Loading…
x
Reference in New Issue
Block a user