mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-21 03:02:09 +01:00
206 lines
7.1 KiB
Python
206 lines
7.1 KiB
Python
import urlparse
|
|
import re
|
|
|
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
|
from pywb.rewrite.wburl import WbUrl
|
|
from wbrequestresponse import WbRequest, WbResponse
|
|
|
|
|
|
#=================================================================
|
|
# ArchivalRouter -- route WB requests in archival mode
|
|
#=================================================================
|
|
class ArchivalRouter(object):
    """Dispatch an incoming request to the first route that handles it.

    Routes are tried in order; the first one whose ``is_handling`` returns
    a match gets a parsed request object and its handler is invoked.
    Unmatched requests fall through to the home page (for a few fixed
    paths) and finally to the referrer-based fallback.
    """

    def __init__(self, routes, **kwargs):
        """Store the route list and the optional settings from ``kwargs``.

        Recognized keys: ``port``, ``abs_path``, ``home_view``,
        ``error_view`` and ``config`` (a dict that may carry
        ``urlrewriter_class``).
        """
        config = kwargs.get('config', {})

        self.routes = routes

        # optional port setting; the wsgi container may ignore it
        self.port = kwargs.get('port')

        # used for requests that no route claims
        self.fallback = ReferRedirect()

        self.abs_path = kwargs.get('abs_path')

        self.home_view = kwargs.get('home_view')
        self.error_view = kwargs.get('error_view')

        self.urlrewriter_class = config.get('urlrewriter_class', UrlRewriter)

    def __call__(self, env):
        """Route the request described by ``env``.

        Returns the matching handler's result, the home page for the
        default paths, the fallback's result, or None.
        """
        uri = env['REL_REQUEST_URI']

        for candidate in self.routes:
            matcher, coll = candidate.is_handling(uri)
            if not matcher:
                continue

            parsed = self.parse_request(candidate, env, matcher,
                                        coll, uri,
                                        use_abs_prefix=self.abs_path)
            return candidate.handler(parsed)

        # no route matched: serve the default home page for these paths
        if uri in ('/', '/index.html', '/index.htm'):
            return self.render_home_page(env)

        if not self.fallback:
            return None

        return self.fallback(env, self)

    def parse_request(self, route, env, matcher, coll, request_uri,
                      use_abs_prefix=False):
        """Build the route's request object for a matched uri.

        ``matcher`` is the match object produced by the route; its full
        match (group 0) is the matched prefix of the uri. The resulting
        request is passed through ``route.apply_filters`` before being
        returned.
        """
        prefix_match = matcher.group(0)
        prefix = env.get('SCRIPT_NAME', '') + '/'

        if not prefix_match:
            # nothing was matched, so everything after the leading '/'
            # is the wb_url
            url_part = request_uri[1:]
        else:
            prefix += prefix_match + '/'
            # skip the leading '/', the matched text and the '/' after it
            url_part = request_uri[len(prefix_match) + 2:]

        options = {
            'request_uri': request_uri,
            'wb_url_str': url_part,
            'rel_prefix': prefix,
            'coll': coll,
            'use_abs_prefix': use_abs_prefix,
            'wburl_class': route.handler.get_wburl_type(),
            'urlrewriter_class': self.urlrewriter_class,
            'cookie_scope': route.cookie_scope,
            'rewrite_opts': route.rewrite_opts,
            'user_metadata': route.user_metadata,
        }
        parsed = route.request_class(env, **options)

        # give the route a chance to attach additional filters
        route.apply_filters(parsed, matcher)

        return parsed

    def render_home_page(self, env):
        """Render the configured home view, or a plain-text placeholder."""
        if not self.home_view:
            return WbResponse.text_response('No Home Page')

        return self.home_view.render_response(env=env, routes=self.routes)
|
|
|
|
|
|
#=================================================================
|
|
# Route by matching regex (or fixed prefix)
|
|
# of request uri (excluding first '/')
|
|
#=================================================================
|
|
class Route(object):
    """A single route, selected by matching a regex against the request uri.

    The regex is matched against the uri with its first character (the
    leading '/') removed. A lookahead is appended to the pattern so that a
    prefix only matches when followed by '/', '?' or the end of the string.
    """

    # match up to the next '/' or '?' or end of string
    # (raw string: '\?' is not a valid escape in a normal string literal)
    SLASH_QUERY_LOOKAHEAD = r'(?=/|$|\?)'

    def __init__(self, regex, handler, coll_group=0, config=None,
                 request_class=WbRequest,
                 lookahead=SLASH_QUERY_LOOKAHEAD):
        """Set up the route.

        regex         -- pattern string; an empty/falsy value matches
                         every uri with an empty match
        handler       -- callable invoked with the parsed request
        coll_group    -- regex group holding the collection id (default 0)
        config        -- optional dict; the keys 'cookie_scope',
                         'rewrite_opts', 'metadata' and 'filters' are read
        request_class -- class used to build the request object
        lookahead     -- pattern text appended to a non-empty regex
        """
        # a fresh dict per instance, never a shared default object
        if config is None:
            config = {}

        self.path = regex
        if regex:
            self.regex = re.compile(regex + lookahead)
        else:
            self.regex = re.compile('')

        self.handler = handler
        self.request_class = request_class

        # collection id from regex group (default 0)
        self.coll_group = coll_group
        self.cookie_scope = config.get('cookie_scope')
        self.rewrite_opts = config.get('rewrite_opts', {})
        self.user_metadata = config.get('metadata', {})
        self._custom_init(config)

    def is_handling(self, request_uri):
        """Match this route against ``request_uri``.

        Returns ``(matcher, coll)`` on a match, where ``coll`` is the
        text of the configured collection group, or ``(None, None)``
        when the route does not apply.
        """
        # the first character of the uri is excluded from matching
        matcher = self.regex.match(request_uri[1:])
        if not matcher:
            return None, None

        coll = matcher.group(self.coll_group)
        return matcher, coll

    def apply_filters(self, wbrequest, matcher):
        """Append this route's filters to ``wbrequest.query_filter``.

        Each filter is a format string that is filled in with the text
        of the last group of ``matcher``.
        """
        if not self.filters:
            return

        # the last group does not change between filters: look it up once
        last_grp = len(matcher.groups())
        last_value = matcher.group(last_grp)

        for filter_template in self.filters:
            wbrequest.query_filter.append(filter_template.format(last_value))

    def _custom_init(self, config):
        """Read the route-specific settings (the filter list) from config."""
        self.filters = config.get('filters', [])
|
|
|
|
|
|
#=================================================================
|
|
# ReferRedirect -- redirect urls that have 'fallen through'
|
|
# based on the referrer settings
|
|
#=================================================================
|
|
class ReferRedirect:
    """Fallback for requests that matched no route.

    The referrer is parsed as if it were a request of its own; when it
    belongs to one of the router's routes, the current request uri is
    rewritten through that route's url rewriter and the client is
    redirected there (or, for POST, the request is handled in place).
    """

    def __call__(self, env, the_router):
        """Return a response for ``env`` or None when no fallback applies."""
        referrer = env.get('HTTP_REFERER')

        routes = the_router.routes

        # nothing to go on without a referrer
        if referrer is None:
            return None

        ref_parts = urlparse.urlsplit(referrer)

        # if a current Host is known, the referrer must be on that host
        host = env.get('HTTP_HOST')
        if host and ref_parts.netloc != host:
            return None

        ref_path = ref_parts.path

        script_name = env.get('SCRIPT_NAME', '')

        # when the app is not mounted at the root, the referrer path must
        # start with the app name
        if script_name and not ref_path.startswith(script_name):
            return None

        if script_name:
            ref_path = ref_path[len(script_name):]

        matched_route = None
        ref_request = None

        # find the first route that handles the referrer path
        for candidate in routes:
            matcher, coll = candidate.is_handling(ref_path)
            if not matcher:
                continue

            ref_request = the_router.parse_request(candidate, env,
                                                   matcher, coll, ref_path)
            matched_route = candidate
            break

        # a route with a urlrewriter must have matched
        if not (ref_request and ref_request.urlrewriter):
            return None

        rewriter = ref_request.urlrewriter

        uri = env['REL_REQUEST_URI']

        ts_prefix = '/' + rewriter.wburl.timestamp + '/'

        # the timestamp may already be part of the path: drop it, but keep
        # the trailing '/' so the result is a host-relative url
        # 2013/path.html -> /path.html
        if uri.startswith(ts_prefix):
            uri = uri[len(ts_prefix) - 1:]

        rewritten_url = rewriter.rewrite(uri)

        # a redirect would lose the POST data, so handle the request here
        # (307 is not an option because FF will show a confirmation warning)
        if ref_request.method == 'POST':
            remainder = rewritten_url[len(rewriter.prefix):]
            ref_request.wb_url.url = WbUrl(remainder).url
            return matched_route.handler(ref_request)

        target_parts = (ref_parts.scheme, ref_parts.netloc,
                        rewritten_url, '', '')
        final_url = urlparse.urlunsplit(target_parts)

        return WbResponse.redir_response(final_url, status='302 Temp Redirect')
|