diff --git a/pywb/framework/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py index 493ca0c2..e066d4d1 100644 --- a/pywb/framework/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -21,9 +21,9 @@ >>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'} -# No Scheme, so stick to relative +# No Scheme, default to http (shouldn't happen per WSGI standard) >>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) -{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} +{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'} # Referrer extraction >>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 80156aff..446aa88a 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -23,7 +23,7 @@ class WbRequest(object): if not host: host = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] - return env['wsgi.url_scheme'] + '://' + host + return env.get('wsgi.url_scheme', 'http') + '://' + host except KeyError: return '' @@ -66,7 +66,8 @@ class WbRequest(object): # wb_url present and not root page if wb_url_str != '/' and wburl_class: self.wb_url = wburl_class(wb_url_str) - self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix) + self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix, + host_prefix + rel_prefix) else: # no wb_url, just store blank wb_url self.wb_url = None diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index cc28a660..59669b96 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -24,6 +24,12 @@ >>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http://some-other-site.com' +>>> do_rewrite('http://localhost:8080/web/2014im_/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/') +'http://localhost:8080/web/2014im_/http://some-other-site.com' + +>>> do_rewrite('/web/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/') +'/web/http://some-other-site.com' + >>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com' @@ -62,8 +68,8 @@ from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter -def do_rewrite(rel_url, base_url, prefix, mod = None): - rewriter = UrlRewriter(base_url, prefix) +def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None): + rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix) return rewriter.rewrite(rel_url, mod) diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index cb35607f..df4f32eb 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -16,9 +16,10 @@ class UrlRewriter(object): PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:'] - def __init__(self, wburl, prefix): + def __init__(self, wburl, prefix, full_prefix=None): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix + self.full_prefix = full_prefix #if self.prefix.endswith('/'): # self.prefix = self.prefix[:-1] @@ -28,33 +29,43 @@ class UrlRewriter(object): if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX): return url + if (self.prefix and + self.prefix != '/' and + url.startswith(self.prefix)): + return url + + if (self.full_prefix and + self.full_prefix != self.prefix and + url.startswith(self.full_prefix)): + return url + wburl = self.wburl - isAbs = any(url.startswith(x) for x in self.PROTOCOLS) + is_abs = any(url.startswith(x) for x in self.PROTOCOLS) if url.startswith('//'): - isAbs = True + is_abs = True url = 'http:' + url # Optimized rewriter for # -rel urls that don't start with / and # do not contain ../ and no special mod - if not (isAbs or mod or url.startswith('/') or ('../' in url)): - finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url) + if not (is_abs or mod or url.startswith('/') or ('../' in url)): + final_url = urlparse.urljoin(self.prefix + wburl.original_url, url) else: # optimize: join if not absolute url, otherwise just use that - if not isAbs: - newUrl = urlparse.urljoin(wburl.url, url).replace('../', '') + if not is_abs: + new_url = urlparse.urljoin(wburl.url, url).replace('../', '') else: - newUrl = url + new_url = url if mod is None: mod = wburl.mod - finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl) + final_url = self.prefix + wburl.to_str(mod=mod, url=new_url) - return finalUrl + return final_url def get_abs_url(self, url=''): return self.prefix + self.wburl.to_str(url=url) @@ -85,7 +96,7 @@ class HttpsUrlRewriter(object): HTTP = 'http://' HTTPS = 'https://' - def __init__(self, wburl, prefix): + def __init__(self, wburl, prefix, full_prefix=None): pass def rewrite(self, url, mod=None):