1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/archivalrouter.py
Ilya Kreymer 9194e867ea - add referrer self-redirect check and test case
- dispatching: cleanup wbrequestresponse, move tests to a separate file
- wbrequest: store both rel_prefix and host_prefix, with wb_prefix either full
or rel path as needed, so that full and relative paths are
both available in wbrequest
- create WbUrlHandler to differentiate handlers which
support WbUrl (timestamp[mod]/url) semantic vs other request handlers.
2014-02-23 23:31:54 -08:00

178 lines
5.8 KiB
Python

import urlparse
import re
from wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter
#=================================================================
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter:
    """Dispatch a request environ to the first route that produces a response.

    Routes are tried in order. If none responds, the home page is rendered
    for the root / index paths; otherwise the request is handed to the
    referrer-based fallback.
    """

    def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
        """Store the routing configuration.

        routes     -- iterable of callables invoked as ``route(env, abs_path)``
        hostpaths  -- passed through to ``ReferRedirect`` to build the fallback
        abs_path   -- forwarded to every route as its second argument
        home_view  -- optional view exposing ``render_response(routes=...)``
        error_view -- stored on the instance; not used by this class itself
        """
        self.routes = routes
        self.abs_path = abs_path
        self.home_view = home_view
        self.error_view = error_view

        # handler of last resort for requests that no route claims
        self.fallback = ReferRedirect(hostpaths)

    def __call__(self, env):
        """Return the response for ``env``, or None if nothing handles it."""
        for candidate in self.routes:
            response = candidate(env, self.abs_path)
            if response:
                return response

        # Home Page
        if env['REL_REQUEST_URI'] in ('/', '/index.html', '/index.htm'):
            return self.render_home_page()

        if not self.fallback:
            return None

        return self.fallback(env, self.routes)

    def render_home_page(self):
        """Render the home page via ``home_view`` or as a plain-text route list."""
        if not self.home_view:
            # default home page: one line per route
            listing = '\n'.join(str(entry) for entry in self.routes)
            return WbResponse.text_response(listing)

        return self.home_view.render_response(routes=self.routes)
#=================================================================
# Route by matching regex (or fixed prefix)
# of request uri (excluding first '/')
#=================================================================
class Route:
    """Match a request uri against a regex and hand matches to a handler.

    The pattern is applied to the request uri with its first character
    (the leading '/') removed. The text it matches becomes the route prefix
    and the remainder of the uri is passed on as the wb url string.
    """

    # match up to the next '/' or '?' or end of string
    # (raw string: '\?' is not a valid escape in a normal string literal)
    SLASH_QUERY_LOOKAHEAD = r'(?=/|$|\?)'

    def __init__(self, regex, handler, coll_group=0, config=None, lookahead=SLASH_QUERY_LOOKAHEAD):
        """Compile the route pattern and store the handler.

        regex      -- pattern source; ``lookahead`` is appended before compiling
        handler    -- callable invoked with the parsed request
        coll_group -- regex group holding the collection id (default 0)
        config     -- optional dict of settings; a fresh empty dict is used
                      when omitted so instances never share a default
        lookahead  -- regex suffix limiting where the match may end
        """
        self.path = regex
        self.regex = re.compile(regex + lookahead)
        self.handler = handler
        # collection id from regex group (default 0)
        self.coll_group = coll_group
        self._custom_init({} if config is None else config)

    def __call__(self, env, use_abs_prefix):
        """Return the handler's response if this route matches, else None."""
        wbrequest = self.parse_request(env, use_abs_prefix)
        return self.handler(wbrequest) if wbrequest else None

    def parse_request(self, env, use_abs_prefix, request_uri=None):
        """Build a WbRequest for ``request_uri`` if it matches this route.

        When ``request_uri`` is empty or omitted, ``env['REL_REQUEST_URI']``
        is used. Returns None when the pattern does not match.
        """
        if not request_uri:
            request_uri = env['REL_REQUEST_URI']

        # skip the first character of the uri (the leading '/') when matching
        matcher = self.regex.match(request_uri[1:])
        if not matcher:
            return None

        matched_str = matcher.group(0)
        if matched_str:
            rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
            # drop the leading '/', the matched prefix, and the '/' after it
            wb_url_str = request_uri[len(matched_str) + 2:]
        else:
            # empty match: no collection prefix, the whole uri is the wb url
            rel_prefix = env['SCRIPT_NAME'] + '/'
            wb_url_str = request_uri[1:]

        coll = matcher.group(self.coll_group)

        wbrequest = WbRequest(env,
                              request_uri=request_uri,
                              wb_url_str=wb_url_str,
                              rel_prefix=rel_prefix,
                              coll=coll,
                              use_abs_prefix=use_abs_prefix,
                              wburl_class=self.handler.get_wburl_type(),
                              urlrewriter_class=UrlRewriter)

        # Allow for applying of additional filters
        self._apply_filters(wbrequest, matcher)

        return wbrequest

    def _apply_filters(self, wbrequest, matcher):
        """Append each configured filter, formatted with the last regex group."""
        if not self.filters:
            return

        # every filter is formatted with the same (last) group: look it up once
        last_grp_value = matcher.group(len(matcher.groups()))
        for filter_fmt in self.filters:
            wbrequest.query_filter.append(filter_fmt.format(last_grp_value))

    def _custom_init(self, config):
        """Read route settings from ``config``; only 'filters' is used here."""
        self.filters = config.get('filters', [])

    def __str__(self):
        """Return the string form of the handler."""
        return str(self.handler)
#=================================================================
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
#=================================================================
class ReferRedirect:
    """Redirect a request that matched no route, using its referrer.

    The referrer's path is parsed with the given routes; if one matches and
    yields a url rewriter, the current request uri is rewritten with it and
    a redirect to the resulting url on the referrer's host is returned.
    """

    def __init__(self, match_prefixs):
        """Store the allowed referrer prefixes.

        match_prefixs -- a list of prefixes, a single prefix, or None.
                         None means no prefixes are configured, so only
                         the host check in ``__call__`` applies.
        """
        if match_prefixs is None:
            # [None] would make referrer.startswith(None) raise TypeError
            self.match_prefixs = []
        elif isinstance(match_prefixs, list):
            self.match_prefixs = match_prefixs
        else:
            self.match_prefixs = [match_prefixs]

    def __call__(self, env, routes):
        """Return a redirect response for ``env``, or None if not applicable."""
        referrer = env.get('HTTP_REFERER')

        # ensure there is a referrer
        if referrer is None:
            return None

        # get referrer path name
        ref_split = urlparse.urlsplit(referrer)

        # referrer must start with an allowed prefix or be on the request's host
        if not any(referrer.startswith(prefix) for prefix in self.match_prefixs):
            if ref_split.netloc != env.get('HTTP_HOST'):
                return None

        path = ref_split.path

        app_path = env['SCRIPT_NAME']
        if app_path:
            # must start with current app name, if not root
            if not path.startswith(app_path):
                return None
            path = path[len(app_path):]

        # stays None when routes is empty, so the check below is always safe
        ref_request = None
        for route in routes:
            ref_request = route.parse_request(env, False, request_uri=path)
            if ref_request:
                break

        # must have matched one of the routes
        if not ref_request:
            return None

        # must have a rewriter
        if not ref_request.urlrewriter:
            return None

        rewriter = ref_request.urlrewriter

        rel_request_uri = env['REL_REQUEST_URI']

        timestamp_path = '/' + rewriter.wburl.timestamp + '/'

        # check if timestamp is already part of the path
        if rel_request_uri.startswith(timestamp_path):
            # remove timestamp but leave / to make host relative url
            # /2013/path.html -> /path.html
            rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]

        final_url = urlparse.urlunsplit((ref_split.scheme,
                                         ref_split.netloc,
                                         rewriter.rewrite(rel_request_uri),
                                         '',
                                         ''))

        return WbResponse.redir_response(final_url)