mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
url rewriter: add optional 'full prefix'; check for it and don't rewrite urls that already start with the prefix or the full prefix.
wbrequest: if no scheme is present (shouldn't happen with wsgi), default to http.
This commit is contained in:
parent
cd017669ae
commit
53f0cb540f
@ -21,9 +21,9 @@
|
||||
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
# No Scheme, so stick to relative
|
||||
# No Scheme, default to http (shouldn't happen per WSGI standard)
|
||||
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
||||
|
||||
# Referrer extraction
|
||||
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
|
||||
|
@ -23,7 +23,7 @@ class WbRequest(object):
|
||||
if not host:
|
||||
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
|
||||
|
||||
return env['wsgi.url_scheme'] + '://' + host
|
||||
return env.get('wsgi.url_scheme', 'http') + '://' + host
|
||||
except KeyError:
|
||||
return ''
|
||||
|
||||
@ -66,7 +66,8 @@ class WbRequest(object):
|
||||
# wb_url present and not root page
|
||||
if wb_url_str != '/' and wburl_class:
|
||||
self.wb_url = wburl_class(wb_url_str)
|
||||
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
|
||||
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
|
||||
host_prefix + rel_prefix)
|
||||
else:
|
||||
# no wb_url, just store blank wb_url
|
||||
self.wb_url = None
|
||||
|
@ -24,6 +24,12 @@
|
||||
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http://some-other-site.com'
|
||||
|
||||
>>> do_rewrite('http://localhost:8080/web/2014im_/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/')
|
||||
'http://localhost:8080/web/2014im_/http://some-other-site.com'
|
||||
|
||||
>>> do_rewrite('/web/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/')
|
||||
'/web/http://some-other-site.com'
|
||||
|
||||
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||
|
||||
@ -62,8 +68,8 @@
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
|
||||
|
||||
|
||||
def do_rewrite(rel_url, base_url, prefix, mod = None):
|
||||
rewriter = UrlRewriter(base_url, prefix)
|
||||
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
|
||||
rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
|
||||
return rewriter.rewrite(rel_url, mod)
|
||||
|
||||
|
||||
|
@ -16,9 +16,10 @@ class UrlRewriter(object):
|
||||
|
||||
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||
|
||||
def __init__(self, wburl, prefix):
|
||||
def __init__(self, wburl, prefix, full_prefix=None):
|
||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||
self.prefix = prefix
|
||||
self.full_prefix = full_prefix
|
||||
|
||||
#if self.prefix.endswith('/'):
|
||||
# self.prefix = self.prefix[:-1]
|
||||
@ -28,33 +29,43 @@ class UrlRewriter(object):
|
||||
if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
|
||||
return url
|
||||
|
||||
if (self.prefix and
|
||||
self.prefix != '/' and
|
||||
url.startswith(self.prefix)):
|
||||
return url
|
||||
|
||||
if (self.full_prefix and
|
||||
self.full_prefix != self.prefix and
|
||||
url.startswith(self.full_prefix)):
|
||||
return url
|
||||
|
||||
wburl = self.wburl
|
||||
|
||||
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
|
||||
is_abs = any(url.startswith(x) for x in self.PROTOCOLS)
|
||||
|
||||
if url.startswith('//'):
|
||||
isAbs = True
|
||||
is_abs = True
|
||||
url = 'http:' + url
|
||||
|
||||
# Optimized rewriter for
|
||||
# -rel urls that don't start with / and
|
||||
# do not contain ../ and no special mod
|
||||
if not (isAbs or mod or url.startswith('/') or ('../' in url)):
|
||||
finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
|
||||
if not (is_abs or mod or url.startswith('/') or ('../' in url)):
|
||||
final_url = urlparse.urljoin(self.prefix + wburl.original_url, url)
|
||||
|
||||
else:
|
||||
# optimize: join if not absolute url, otherwise just use that
|
||||
if not isAbs:
|
||||
newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
|
||||
if not is_abs:
|
||||
new_url = urlparse.urljoin(wburl.url, url).replace('../', '')
|
||||
else:
|
||||
newUrl = url
|
||||
new_url = url
|
||||
|
||||
if mod is None:
|
||||
mod = wburl.mod
|
||||
|
||||
finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl)
|
||||
final_url = self.prefix + wburl.to_str(mod=mod, url=new_url)
|
||||
|
||||
return finalUrl
|
||||
return final_url
|
||||
|
||||
def get_abs_url(self, url=''):
|
||||
return self.prefix + self.wburl.to_str(url=url)
|
||||
@ -85,7 +96,7 @@ class HttpsUrlRewriter(object):
|
||||
HTTP = 'http://'
|
||||
HTTPS = 'https://'
|
||||
|
||||
def __init__(self, wburl, prefix):
|
||||
def __init__(self, wburl, prefix, full_prefix=None):
|
||||
pass
|
||||
|
||||
def rewrite(self, url, mod=None):
|
||||
|
Loading…
x
Reference in New Issue
Block a user