1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Should resolve #4 -- supports pywb running as a non-root app

* Instead of relying on REQUEST_URI, pywb constructs a
REL_REQUEST_URI from PATH_INFO + QUERY_STRING.
SCRIPT_NAME is automatically added to the prefix.
* MatchPrefix is now superseded by MatchRegex, which
can match a plain string -- collId defaults to the full match
* Added optional archivalurl_class to router to allow for customized
ArchivalUrl implementations to be specified
* run.sh can test on a non-root mountpoint, e.g. ./run.sh "/approot"
This commit is contained in:
Ilya Kreymer 2014-01-19 21:13:48 -08:00
parent 2e4d78d079
commit 80b2585d22
6 changed files with 75 additions and 41 deletions

View File

@ -3,51 +3,62 @@ import re
from wbrequestresponse import WbRequest, WbResponse
from url_rewriter import ArchivalUrlRewriter
from wbarchivalurl import ArchivalUrl
#=================================================================
# ArchivalRequestRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRequestRouter:
def __init__(self, handlers, hostpaths = None, abs_path = True):
def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = ArchivalUrl):
self.handlers = handlers
self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path
self.archivalurl_class = archivalurl_class
def __call__(self, env):
for handler in self.handlers:
result = handler(env, self.abs_path)
result = handler(env, self.abs_path, self.archivalurl_class)
if result:
return result
if not self.fallback:
return None
return self.fallback(WbRequest.from_uri(None, env), self.abs_path)
return self.fallback(WbRequest.from_uri(None, env))
#=================================================================
# Route by matching prefix
# Route by matching prefix -- deprecated, as MatchRegex
# also supports the same
#=================================================================
class MatchPrefix:
def __init__(self, prefix, handler):
self.prefix = '/' + prefix + '/'
self.prefix = '/' + prefix + '/' if prefix else '/'
self.coll = prefix
self.handler = handler
def __call__(self, env, useAbsPrefix):
request_uri = env['REQUEST_URI']
def __call__(self, env, useAbsPrefix, archivalurl_class):
request_uri = env['REL_REQUEST_URI']
if not request_uri.startswith(self.prefix):
return None
if self.coll:
wb_prefix = env['SCRIPT_NAME'] + self.prefix
wb_url = request_uri[len(self.coll) + 1:]
else:
wb_prefix = env['SCRIPT_NAME'] + self.prefix
wb_url = request_uri
wbrequest = WbRequest(env,
request_uri = request_uri,
coll = self.coll,
wb_url = request_uri[len(self.coll) + 1:],
wb_prefix = self.prefix,
use_abs_prefix = useAbsPrefix)
wb_url = wb_url,
wb_prefix = wb_prefix,
use_abs_prefix = useAbsPrefix,
archivalurl_class = archivalurl_class)
return self._handleRequest(wbrequest)
@ -59,35 +70,53 @@ class MatchPrefix:
#=================================================================
# Route by matching regex of request uri (excluding first '/')
# May be a fixed prefix
#=================================================================
class MatchRegex:
def __init__(self, regex, handler):
def __init__(self, regex, handler, coll_group = 0):
self.regex = re.compile(regex)
self.handler = handler
# collection id from regex group (default 0)
self.coll_group = coll_group
def __call__(self, env, useAbsPrefix):
request_uri = env['REQUEST_URI']
def __call__(self, env, useAbsPrefix, archivalurl_class):
request_uri = env['REL_REQUEST_URI']
matcher = self.regex.match(request_uri[1:])
if not matcher:
return None
rel_prefix = matcher.group(0)
if rel_prefix:
wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/'
wb_url = request_uri[len(rel_prefix) + 1:] # remove the '/' + rel_prefix part of uri
else:
wb_prefix = env['SCRIPT_NAME'] + '/'
wb_url = request_uri # the request_uri is the wb_url, since no coll
coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env,
request_uri = request_uri,
coll = matcher.group(1),
wb_url = request_uri[len(rel_prefix) + 1:],
wb_prefix = '/' + rel_prefix + '/',
use_abs_prefix = useAbsPrefix)
coll = coll,
wb_url = wb_url,
wb_prefix = wb_prefix,
use_abs_prefix = useAbsPrefix,
archivalurl_class = archivalurl_class)
# Allow for setup of additional filters
self._addFilters(wbrequest, matcher)
return self.handler(wbrequest)
return self._handleRequest(wbrequest)
def _addFilters(self, wbrequest, matcher):
pass
def _handleRequest(self, wbrequest):
return self.handler(wbrequest)
#=================================================================
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
@ -121,7 +150,7 @@ class ReferRedirect:
self.matchPrefixs = [matchPrefixs]
def __call__(self, wbrequest, abs_path):
def __call__(self, wbrequest):
if wbrequest.referrer is None:
return None
@ -152,11 +181,11 @@ if __name__ == "__main__":
import doctest
def test_redir(matchHost, request_uri, referrer):
env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer}
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer}
redir = ReferRedirect(matchHost)
req = WbRequest.from_uri(request_uri, env)
rep = redir(req, None)
rep = redir(req)
if not rep:
return False

View File

@ -110,21 +110,21 @@ def iso_date_to_timestamp(string):
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def request_uri(environ, include_query=1):
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
# Simple test:
>>> request_uri({'PATH_INFO': '/web/example.com'})
>>> rel_request_uri({'PATH_INFO': '/web/example.com'})
'/web/example.com'
# Test all unecoded special chars and double-quote
# (double-quote must be encoded but not single quote)
>>> request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
>>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
"/web/example.com/0~!+$&'()*+,;=:%22"
"""
from urllib import quote
url = quote(environ.get('SCRIPT_NAME', '') + environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
if include_query and environ.get('QUERY_STRING'):
url += '?' + environ['QUERY_STRING']

View File

@ -1,4 +1,4 @@
from utils import request_uri
from utils import rel_request_uri
from query import QueryHandler, EchoEnv, EchoRequest
from replay import WBHandler
import wbexceptions
@ -7,8 +7,6 @@ import indexreader
from wbrequestresponse import WbResponse, StatusAndHeaders
from archivalrouter import ArchivalRequestRouter, MatchPrefix
## ===========
headInsert = """
@ -82,8 +80,11 @@ except:
def application(env, start_response):
if not env.get('REQUEST_URI'):
env['REQUEST_URI'] = request_uri(env)
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
env['REL_REQUEST_URI'] = rel_request_uri(env)
else:
env['REL_REQUEST_URI'] = env['REQUEST_URI']
response = None
@ -91,7 +92,7 @@ def application(env, start_response):
response = wbparser(env)
if not response:
raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')
raise wbexceptions.NotFoundException(env['REL_REQUEST_URI'] + ' was not found')
except wbexceptions.InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
@ -117,7 +118,4 @@ def handleException(env, exc):
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
#def handle_not_found(env):
# return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')

View File

@ -162,7 +162,7 @@ class ArchivalUrl:
return "/" + url
def __str__(self):
return ArchivalUrl.to_str(self.type, self.mod, self.timestamp, self.url)
return self.to_str(self.type, self.mod, self.timestamp, self.url)
def __repr__(self):
return str((self.type, self.timestamp, self.mod, self.url, str(self)))

View File

@ -31,7 +31,7 @@ class WbRequest:
@staticmethod
def from_uri(request_uri, env = {}, use_abs_prefix = False):
if not request_uri:
request_uri = env.get('REQUEST_URI')
request_uri = env.get('REL_REQUEST_URI')
parts = request_uri.split('/', 2)
@ -61,14 +61,14 @@ class WbRequest:
return rel_prefix
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False):
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, archivalurl_class = ArchivalUrl):
self.env = env
self.request_uri = request_uri if request_uri else env.get('REQUEST_URI')
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.makeAbsPrefix(env, wb_prefix)
self.wb_url = ArchivalUrl(wb_url)
self.wb_url = archivalurl_class(wb_url)
self.coll = coll

11
run.sh
View File

@ -2,10 +2,17 @@
mypath=$(cd `dirname $0` && pwd)
app=$1
app=$2
cd $mypath/pywb
if [ -z "$app" ]; then
app=wbapp.py
fi
uwsgi --static-map /static=$mypath/static --http :8080 --wsgi-file $app
if [ -z "$1" ]; then
# Standard root config
uwsgi --static-map /static=$mypath/static --http-socket :8080 --wsgi-file $app
else
# Test on non-root mount
uwsgi --static-map /static=$mypath/static --http-socket :8080 --mount "$1=$app" --no-default-app --manage-script-name
fi