1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

url rewriter: add optional 'full prefix'; check for and don't rewrite urls

that already start with the prefix or the full prefix
wbrequest: if no scheme is present (shouldn't happen with WSGI), default to http
This commit is contained in:
Ilya Kreymer 2014-04-24 10:44:08 -07:00
parent cd017669ae
commit 53f0cb540f
4 changed files with 35 additions and 17 deletions

View File

@ -21,9 +21,9 @@
>>> print_req_from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
# No Scheme, default to http (shouldn't happen per WSGI standard)
>>> print_req_from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'http://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# Referrer extraction
>>> WbUrl(req_from_uri('/web/2010/example.com', {'wsgi.url_scheme': 'http', 'HTTP_HOST': 'localhost:8080', 'HTTP_REFERER': 'http://localhost:8080/web/2011/blah.example.com/'}).extract_referrer_wburl_str()).url

View File

@ -23,7 +23,7 @@ class WbRequest(object):
if not host:
host = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
return env['wsgi.url_scheme'] + '://' + host
return env.get('wsgi.url_scheme', 'http') + '://' + host
except KeyError:
return ''
@ -66,7 +66,8 @@ class WbRequest(object):
# wb_url present and not root page
if wb_url_str != '/' and wburl_class:
self.wb_url = wburl_class(wb_url_str)
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix)
self.urlrewriter = urlrewriter_class(self.wb_url, self.wb_prefix,
host_prefix + rel_prefix)
else:
# no wb_url, just store blank wb_url
self.wb_url = None

View File

@ -24,6 +24,12 @@
>>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http://some-other-site.com'
>>> do_rewrite('http://localhost:8080/web/2014im_/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/')
'http://localhost:8080/web/2014im_/http://some-other-site.com'
>>> do_rewrite('/web/http://some-other-site.com', 'http://example.com/index.html', '/web/', full_prefix='http://localhost:8080/web/')
'/web/http://some-other-site.com'
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
@ -62,8 +68,8 @@
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
def do_rewrite(rel_url, base_url, prefix, mod = None):
rewriter = UrlRewriter(base_url, prefix)
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
return rewriter.rewrite(rel_url, mod)

View File

@ -16,9 +16,10 @@ class UrlRewriter(object):
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
def __init__(self, wburl, prefix):
def __init__(self, wburl, prefix, full_prefix=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.full_prefix = full_prefix
#if self.prefix.endswith('/'):
# self.prefix = self.prefix[:-1]
@ -28,33 +29,43 @@ class UrlRewriter(object):
if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
return url
if (self.prefix and
self.prefix != '/' and
url.startswith(self.prefix)):
return url
if (self.full_prefix and
self.full_prefix != self.prefix and
url.startswith(self.full_prefix)):
return url
wburl = self.wburl
isAbs = any(url.startswith(x) for x in self.PROTOCOLS)
is_abs = any(url.startswith(x) for x in self.PROTOCOLS)
if url.startswith('//'):
isAbs = True
is_abs = True
url = 'http:' + url
# Optimized rewriter for
# -rel urls that don't start with / and
# do not contain ../ and no special mod
if not (isAbs or mod or url.startswith('/') or ('../' in url)):
finalUrl = urlparse.urljoin(self.prefix + wburl.original_url, url)
if not (is_abs or mod or url.startswith('/') or ('../' in url)):
final_url = urlparse.urljoin(self.prefix + wburl.original_url, url)
else:
# optimize: join if not absolute url, otherwise just use that
if not isAbs:
newUrl = urlparse.urljoin(wburl.url, url).replace('../', '')
if not is_abs:
new_url = urlparse.urljoin(wburl.url, url).replace('../', '')
else:
newUrl = url
new_url = url
if mod is None:
mod = wburl.mod
finalUrl = self.prefix + wburl.to_str(mod=mod, url=newUrl)
final_url = self.prefix + wburl.to_str(mod=mod, url=new_url)
return finalUrl
return final_url
def get_abs_url(self, url=''):
return self.prefix + self.wburl.to_str(url=url)
@ -85,7 +96,7 @@ class HttpsUrlRewriter(object):
HTTP = 'http://'
HTTPS = 'https://'
def __init__(self, wburl, prefix):
def __init__(self, wburl, prefix, full_prefix=None):
pass
def rewrite(self, url, mod=None):