request_uri(): leave the characters RFC 3986 section 3.3 allows in a path segment unescaped.
application(): only compute REQUEST_URI when the WSGI environ does not already provide one.

diff --git a/pywb/utils.py b/pywb/utils.py
index c74b777f..52c74de8 100644
--- a/pywb/utils.py
+++ b/pywb/utils.py
@@ -107,13 +107,27 @@ def iso_date_to_timestamp(string):
     return datetime_to_timestamp(iso_date_to_datetime(string))
 
 
-# adapted from wsgiref.request_uri, but doesn't include domain name and allows ':' in url
+# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
+# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
+# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
 def request_uri(environ, include_query=1):
-    """Return the requested path, optionally including the query string"""
+    """
+    Return the requested path, optionally including the query string
+
+    # Simple test:
+    >>> request_uri({'PATH_INFO': '/web/example.com'})
+    '/web/example.com'
+
+    # Test all unencoded special chars and double-quote
+    # (double-quote must be encoded but not single quote)
+    >>> request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
+    "/web/example.com/0~!+$&'()*+,;=:%22"
+    """
     from urllib import quote
-    url = quote(environ.get('SCRIPT_NAME', '')+environ.get('PATH_INFO',''),safe='/;=,:')
+    url = quote(environ.get('SCRIPT_NAME', '') + environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
     if include_query and environ.get('QUERY_STRING'):
         url += '?' + environ['QUERY_STRING']
+
     return url
 
 
diff --git a/pywb/wbapp.py b/pywb/wbapp.py
index b81467f0..3e425608 100644
--- a/pywb/wbapp.py
+++ b/pywb/wbapp.py
@@ -82,7 +82,9 @@ except:
 
 
 def application(env, start_response):
-    env['REQUEST_URI'] = request_uri(env)
+    if not env.get('REQUEST_URI'):
+        env['REQUEST_URI'] = request_uri(env)
+
     response = None
 
     try: