mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
url rewriter: add optional 'full prefix'; check and don't rewrite urls if starting with prefix or full prefix
wbrequest: if no scheme present (shouldn't happen with WSGI), default to http
This commit is contained in:
parent
cd017669ae
commit
53f0cb540f
@ -21,9 +21,9 @@
|
|||||||
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
||||||
|
|
||||||
# No Scheme, so stick to relative
|
# No Scheme, default to http (shouldn't happen per WSGI standard)
|
||||||
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
|
||||||
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
|
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
|
||||||
|
|
||||||
# Referrer extraction
|
# Referrer extraction
|
||||||
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
|
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url
|
||||||
|
@ -23,7 +23,7 @@ class WbRequest(object):
|
|||||||
if not host:
|
if not host:
|
||||||
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
|
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
|
||||||
|
|
||||||
return env['wsgi.url_scheme'] + '://' + host
|
return env.get('wsgi.url_scheme', 'http') + '://' + host
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@ -66,7 +66,8 @@ class WbRequest(object):
|
|||||||
# wb_url present and not root page
|
# wb_url present and not root page
|
||||||
if wb_url_str != '/' and wburl_class:
|
if wb_url_str != '/' and wburl_class:
|
||||||
self.wb_url = wburl_class(wb_url_str)
|
self.wb_url = wburl_class(wb_url_str)
|
||||||
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
|
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
|
||||||
|
host_prefix + rel_prefix)
|
||||||
else:
|
else:
|
||||||
# no wb_url, just store blank wb_url
|
# no wb_url, just store blank wb_url
|
||||||
self.wb_url = None
|
self.wb_url = None
|
||||||
|
@ -24,6 +24,12 @@
|
|||||||
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||||
'localhost:8080/20101226101112/http://some-other-site.com'
|
'localhost:8080/20101226101112/http://some-other-site.com'
|
||||||
|
|
||||||
|
>>> do_rewrite('http://localhost:8080/web/2014im_/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/')
|
||||||
|
'http://localhost:8080/web/2014im_/http://some-other-site.com'
|
||||||
|
|
||||||
|
>>> do_rewrite('/web/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/')
|
||||||
|
'/web/http://some-other-site.com'
|
||||||
|
|
||||||
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
|
||||||
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
|
||||||
|
|
||||||
@ -62,8 +68,8 @@
|
|||||||
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
|
||||||
|
|
||||||
|
|
||||||
def do_rewrite(rel_url, base_url, prefix, mod = None):
|
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
|
||||||
rewriter = UrlRewriter(base_url, prefix)
|
rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
|
||||||
return rewriter.rewrite(rel_url, mod)
|
return rewriter.rewrite(rel_url, mod)
|
||||||
|
|
||||||
|
|
||||||
|
@ -16,9 +16,10 @@ class UrlRewriter(object):
|
|||||||
|
|
||||||
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
|
||||||
|
|
||||||
def __init__(self, wburl, prefix):
|
def __init__(self, wburl, prefix, full_prefix=None):
|
||||||
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
self.full_prefix = full_prefix
|
||||||
|
|
||||||
#if self.prefix.endswith('/'):
|
#if self.prefix.endswith('/'):
|
||||||
# self.prefix = self.prefix[:-1]
|
# self.prefix = self.prefix[:-1]
|
||||||
@ -28,33 +29,43 @@ class UrlRewriter(object):
|
|||||||
if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
|
if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
if (self.prefix and
|
||||||
|
self.prefix != '/' and
|
||||||
|
url.startswith(self.prefix)):
|
||||||
|
return url
|
||||||
|
|
||||||
|
if (self.full_prefix and
|
||||||
|
self.full_prefix != self.prefix and
|
||||||
|
url.startswith(self.full_prefix)):
|
||||||
|
return url
|
||||||
|
|
||||||
wburl = self.wburl
|
wburl = self.wburl
|
||||||
|
|
||||||
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
|
is_abs = any(url.startswith(x) for x in self.PROTOCOLS)
|
||||||
|
|
||||||
if url.startswith('//'):
|
if url.startswith('//'):
|
||||||
isAbs = True
|
is_abs = True
|
||||||
url = 'http:' + url
|
url = 'http:' + url
|
||||||
|
|
||||||
# Optimized rewriter for
|
# Optimized rewriter for
|
||||||
# -rel urls that don't start with / and
|
# -rel urls that don't start with / and
|
||||||
# do not contain ../ and no special mod
|
# do not contain ../ and no special mod
|
||||||
if not (isAbs or mod or url.startswith('/') or ('../' in url)):
|
if not (is_abs or mod or url.startswith('/') or ('../' in url)):
|
||||||
finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
|
final_url = urlparse.urljoin(self.prefix + wburl.original_url, url)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# optimize: join if not absolute url, otherwise just use that
|
# optimize: join if not absolute url, otherwise just use that
|
||||||
if not isAbs:
|
if not is_abs:
|
||||||
newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
|
new_url = urlparse.urljoin(wburl.url, url).replace('../', '')
|
||||||
else:
|
else:
|
||||||
newUrl = url
|
new_url = url
|
||||||
|
|
||||||
if mod is None:
|
if mod is None:
|
||||||
mod = wburl.mod
|
mod = wburl.mod
|
||||||
|
|
||||||
finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl)
|
final_url = self.prefix + wburl.to_str(mod=mod, url=new_url)
|
||||||
|
|
||||||
return finalUrl
|
return final_url
|
||||||
|
|
||||||
def get_abs_url(self, url=''):
|
def get_abs_url(self, url=''):
|
||||||
return self.prefix + self.wburl.to_str(url=url)
|
return self.prefix + self.wburl.to_str(url=url)
|
||||||
@ -85,7 +96,7 @@ class HttpsUrlRewriter(object):
|
|||||||
HTTP = 'http://'
|
HTTP = 'http://'
|
||||||
HTTPS = 'https://'
|
HTTPS = 'https://'
|
||||||
|
|
||||||
def __init__(self, wburl, prefix):
|
def __init__(self, wburl, prefix, full_prefix=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def rewrite(self, url, mod=None):
|
def rewrite(self, url, mod=None):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user