From 2e4d78d0791143b69ac77c4876727cfb73edcbbe Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 19 Jan 2014 16:51:17 -0800 Subject: [PATCH] request_uri: only generate REQUEST_URI manually if not provided by wsgi framework only encode chars that are not allowed in path segment, per http://tools.ietf.org/html/rfc3986#section-3.3 --- pywb/utils.py | 20 +++++++++++++++++--- pywb/wbapp.py | 4 +++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pywb/utils.py b/pywb/utils.py index c74b777f..52c74de8 100644 --- a/pywb/utils.py +++ b/pywb/utils.py @@ -107,13 +107,27 @@ def iso_date_to_timestamp(string): return datetime_to_timestamp(iso_date_to_datetime(string)) -# adapted from wsgiref.request_uri, but doesn't include domain name and allows ':' in url +# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters +# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 +# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links def request_uri(environ, include_query=1): - """Return the requested path, optionally including the query string""" + """ + Return the requested path, optionally including the query string + + # Simple test: + >>> request_uri({'PATH_INFO': '/web/example.com'}) + '/web/example.com' + + # Test all unencoded special chars and double-quote + # (double-quote must be encoded but not single quote) + >>> request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""}) + "/web/example.com/0~!+$&'()*+,;=:%22" + """ from urllib import quote - url = quote(environ.get('SCRIPT_NAME', '')+environ.get('PATH_INFO',''),safe='/;=,:') + url = quote(environ.get('SCRIPT_NAME', '') + environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@') if include_query and environ.get('QUERY_STRING'): url += '?' 
+ environ['QUERY_STRING'] + return url diff --git a/pywb/wbapp.py b/pywb/wbapp.py index b81467f0..3e425608 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -82,7 +82,9 @@ except: def application(env, start_response): - env['REQUEST_URI'] = request_uri(env) + if not env.get('REQUEST_URI'): + env['REQUEST_URI'] = request_uri(env) + response = None try: