From 80b2585d22a3362d00d69273e2ca4d3381d442cd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 19 Jan 2014 21:13:48 -0800 Subject: [PATCH] Should resolve #4 -- supports pywb running as a non-root app * Instead of relying on REQUEST_URI, pywb constructs a REL_REQUEST_URI from PATH_INFO + QUERY_STRING. SCRIPT_NAME is automatically added to the prefix * MatchPrefix is now superseded by MatchRegex, which can match a plain string -- collId defaults to the full match * Added optional archivalurl_class to router to allow for customized ArchivalUrl implementations to be specified * run.sh can test on a non-root mountpoint, e.g. ./run.sh "/approot" --- pywb/archivalrouter.py | 71 +++++++++++++++++++++++++++------------ pywb/utils.py | 8 ++--- pywb/wbapp.py | 16 ++++----- pywb/wbarchivalurl.py | 2 +- pywb/wbrequestresponse.py | 8 ++--- run.sh | 11 ++++-- 6 files changed, 75 insertions(+), 41 deletions(-) diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 675a6c21..b9ff2e7d 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -3,51 +3,62 @@ import re from wbrequestresponse import WbRequest, WbResponse from url_rewriter import ArchivalUrlRewriter +from wbarchivalurl import ArchivalUrl #================================================================= # ArchivalRequestRouter -- route WB requests in archival mode #================================================================= class ArchivalRequestRouter: - def __init__(self, handlers, hostpaths = None, abs_path = True): + def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = ArchivalUrl): self.handlers = handlers self.fallback = ReferRedirect(hostpaths) self.abs_path = abs_path + self.archivalurl_class = archivalurl_class def __call__(self, env): for handler in self.handlers: - result = handler(env, self.abs_path) + result = handler(env, self.abs_path, self.archivalurl_class) if result: return result if not self.fallback: return None - return
self.fallback(WbRequest.from_uri(None, env), self.abs_path) + return self.fallback(WbRequest.from_uri(None, env)) + #================================================================= -# Route by matching prefix +# Route by matching prefix -- deprecated, as MatchRegex +# also supports the same #================================================================= class MatchPrefix: def __init__(self, prefix, handler): - self.prefix = '/' + prefix + '/' + self.prefix = '/' + prefix + '/' if prefix else '/' self.coll = prefix self.handler = handler - def __call__(self, env, useAbsPrefix): - request_uri = env['REQUEST_URI'] + def __call__(self, env, useAbsPrefix, archivalurl_class): + request_uri = env['REL_REQUEST_URI'] if not request_uri.startswith(self.prefix): return None + if self.coll: + wb_prefix = env['SCRIPT_NAME'] + self.prefix + wb_url = request_uri[len(self.coll) + 1:] + else: + wb_prefix = env['SCRIPT_NAME'] + self.prefix + wb_url = request_uri wbrequest = WbRequest(env, request_uri = request_uri, coll = self.coll, - wb_url = request_uri[len(self.coll) + 1:], - wb_prefix = self.prefix, - use_abs_prefix = useAbsPrefix) + wb_url = wb_url, + wb_prefix = wb_prefix, + use_abs_prefix = useAbsPrefix, + archivalurl_class = archivalurl_class) return self._handleRequest(wbrequest) @@ -59,35 +70,53 @@ class MatchPrefix: #================================================================= # Route by matching regex of request uri (excluding first '/') +# May be a fixed prefix #================================================================= class MatchRegex: - def __init__(self, regex, handler): + def __init__(self, regex, handler, coll_group = 0): self.regex = re.compile(regex) self.handler = handler + # collection id from regex group (default 0) + self.coll_group = coll_group - def __call__(self, env, useAbsPrefix): - request_uri = env['REQUEST_URI'] + def __call__(self, env, useAbsPrefix, archivalurl_class): + request_uri = env['REL_REQUEST_URI'] matcher = 
self.regex.match(request_uri[1:]) if not matcher: return None rel_prefix = matcher.group(0) + + if rel_prefix: + wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/' + wb_url = request_uri[len(rel_prefix) + 1:] # remove the '/' + rel_prefix part of uri + else: + wb_prefix = env['SCRIPT_NAME'] + '/' + wb_url = request_uri # the request_uri is the wb_url, since no coll + + coll = matcher.group(self.coll_group) + wbrequest = WbRequest(env, request_uri = request_uri, - coll = matcher.group(1), - wb_url = request_uri[len(rel_prefix) + 1:], - wb_prefix = '/' + rel_prefix + '/', - use_abs_prefix = useAbsPrefix) + coll = coll, + wb_url = wb_url, + wb_prefix = wb_prefix, + use_abs_prefix = useAbsPrefix, + archivalurl_class = archivalurl_class) + # Allow for setup of additional filters self._addFilters(wbrequest, matcher) - return self.handler(wbrequest) + return self._handleRequest(wbrequest) def _addFilters(self, wbrequest, matcher): pass + def _handleRequest(self, wbrequest): + return self.handler(wbrequest) + #================================================================= # ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings @@ -121,7 +150,7 @@ class ReferRedirect: self.matchPrefixs = [matchPrefixs] - def __call__(self, wbrequest, abs_path): + def __call__(self, wbrequest): if wbrequest.referrer is None: return None @@ -152,11 +181,11 @@ if __name__ == "__main__": import doctest def test_redir(matchHost, request_uri, referrer): - env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer} + env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer} redir = ReferRedirect(matchHost) req = WbRequest.from_uri(request_uri, env) - rep = redir(req, None) + rep = redir(req) if not rep: return False diff --git a/pywb/utils.py b/pywb/utils.py index 52c74de8..f055736e 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -110,21 +110,21 @@ def iso_date_to_timestamp(string): # adapted -from wsgiref.request_uri, but doesn't include domain 
name and allows all characters # allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 # explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links -def request_uri(environ, include_query=1): +def rel_request_uri(environ, include_query=1): """ Return the requested path, optionally including the query string # Simple test: - >>> request_uri({'PATH_INFO': '/web/example.com'}) + >>> rel_request_uri({'PATH_INFO': '/web/example.com'}) '/web/example.com' # Test all unecoded special chars and double-quote # (double-quote must be encoded but not single quote) - >>> request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""}) + >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""}) "/web/example.com/0~!+$&'()*+,;=:%22" """ from urllib import quote - url = quote(environ.get('SCRIPT_NAME', '') + environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@') + url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@') if include_query and environ.get('QUERY_STRING'): url += '?' 
+ environ['QUERY_STRING'] diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 3e425608..5a2fec75 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,4 +1,4 @@ -from utils import request_uri +from utils import rel_request_uri from query import QueryHandler, EchoEnv, EchoRequest from replay import WBHandler import wbexceptions @@ -7,8 +7,6 @@ import indexreader from wbrequestresponse import WbResponse, StatusAndHeaders from archivalrouter import ArchivalRequestRouter, MatchPrefix - - ## =========== headInsert = """ @@ -82,8 +80,11 @@ except: def application(env, start_response): - if not env.get('REQUEST_URI'): - env['REQUEST_URI'] = request_uri(env) + + if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): + env['REL_REQUEST_URI'] = rel_request_uri(env) + else: + env['REL_REQUEST_URI'] = env['REQUEST_URI'] response = None @@ -91,7 +92,7 @@ def application(env, start_response): response = wbparser(env) if not response: - raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found') + raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found') except wbexceptions.InternalRedirect as ir: response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) @@ -117,7 +118,4 @@ def handleException(env, exc): return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) -#def handle_not_found(env): -# return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found') - diff --git a/pywb/wbarchivalurl.py b/pywb/wbarchivalurl.py index 450e5ac2..b3e024d3 100644 --- a/pywb/wbarchivalurl.py +++ b/pywb/wbarchivalurl.py @@ -162,7 +162,7 @@ class ArchivalUrl: return "/" + url def __str__(self): - return ArchivalUrl.to_str(self.type, self.mod, self.timestamp, self.url) + return self.to_str(self.type, self.mod, self.timestamp, self.url) def __repr__(self): return str((self.type, self.timestamp, self.mod, self.url, str(self))) diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index 
de71a953..ce64098e 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -31,7 +31,7 @@ class WbRequest: @staticmethod def from_uri(request_uri, env = {}, use_abs_prefix = False): if not request_uri: - request_uri = env.get('REQUEST_URI') + request_uri = env.get('REL_REQUEST_URI') parts = request_uri.split('/', 2) @@ -61,14 +61,14 @@ class WbRequest: return rel_prefix - def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False): + def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = ArchivalUrl): self.env = env - self.request_uri = request_uri if request_uri else env.get('REQUEST_URI') + self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.makeAbsPrefix(env, wb_prefix) - self.wb_url = ArchivalUrl(wb_url) + self.wb_url = archivalurl_class(wb_url) self.coll = coll diff --git a/run.sh b/run.sh index 40d6facf..44fe71b0 100755 --- a/run.sh +++ b/run.sh @@ -2,10 +2,17 @@ mypath=$(cd `dirname $0` && pwd) -app=$1 +app=$2 cd $mypath/pywb if [ -z "$app" ]; then app=wbapp.py fi -uwsgi --static-map /static=$mypath/static --http :8080 --wsgi-file $app +if [ -z "$1" ]; then + # Standard root config + uwsgi --static-map /static=$mypath/static --http-socket :8080 --wsgi-file $app +else + # Test on non-root mount + uwsgi --static-map /static=$mypath/static --http-socket :8080 --mount "$1=$app" --no-default-app --manage-script-name +fi +