1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-28 00:25:21 +01:00
pywb/pywb/rewrite/url_rewriter.py
Ilya Kreymer 9194e867ea - add referrer self-redirect check and test case
- dispatching: cleanup wbrequestresponse, move tests to a separate file
- wbrequest: store both rel_prefix and host_prefix, with wb_prefix either full
or rel path as needed, so that full and relative paths are
both available in wbrequest
- create WbUrlHandler to differentiate handlers which
support WbUrl (timestamp[mod]/url) semantic vs other request handlers.
2014-02-23 23:31:54 -08:00

131 lines
4.6 KiB
Python

import copy
import urlparse
from wburl import WbUrl
class UrlRewriter(object):
    """
    Rewrite urls relative to an archived page so that they point back into
    the archive: the result is ``prefix`` followed by the WbUrl string
    (timestamp[mod]/url) for the resolved url.

    >>> do_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'
    >>> do_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
    'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
    >>> do_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/other.html'
    >>> do_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'
    >>> do_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
    '/coll/20131112im_/http://example.com/other.html'
    >>> do_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'
    >>> do_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'
    >>> do_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/20101226101112/http://some-other-site.com'
    >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'
    >>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '')
    '2020/http://example.com/other.html'
    >>> do_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
    '/web/20131010010203/http://example.com/file.html'
    >>> do_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    '#anchor'
    >>> do_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'mailto:example@example.com'
    >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
    '/abc/19960708im_/'
    >>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
    '/123/20131024id_/http://example.com/file/path/blah.html'
    >>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com')
    True
    """

    # Urls starting with any of these are returned by rewrite() untouched.
    NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:', 'mailto:', 'about:']

    # Urls starting with any of these are treated as absolute by rewrite()
    # and are the prefixes removed by strip_protocol().
    PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://']

    def __init__(self, wburl, prefix):
        """
        :param wburl: a WbUrl instance, or a string from which one is built.
        :param prefix: string prepended verbatim to every rewritten url
            (full or relative path, e.g. '/coll/').
        """
        self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
        # The prefix is used exactly as given (a trailing '/' is kept), since
        # results are built by plain concatenation: prefix + wburl string.
        self.prefix = prefix

    def rewrite(self, url, mod=None):
        """
        Return ``url`` rewritten to point into the archive.

        :param url: absolute or relative url, resolved against this
            rewriter's base WbUrl.
        :param mod: optional modifier (e.g. 'js_') for the rewritten url;
            when None, the modifier of the base WbUrl is used.
        :return: the rewritten url, or ``url`` unchanged if it starts with
            one of NO_REWRITE_URI_PREFIX.
        """
        # if special protocol, no rewriting at all
        if url.startswith(tuple(self.NO_REWRITE_URI_PREFIX)):
            return url

        wburl = self.wburl
        is_abs = url.startswith(tuple(self.PROTOCOLS))

        # Optimized rewriter for rel urls that don't start with / and
        # don't contain ../ and have no special mod: a single join against
        # the already-prefixed original url is sufficient.
        if not (is_abs or mod or url.startswith('/') or ('../' in url)):
            return urlparse.urljoin(self.prefix + wburl.original_url, url)

        # optimize: join if not absolute url, otherwise just use that
        if is_abs:
            new_url = url
        else:
            joined = urlparse.urljoin(wburl.url, url)
            new_url = self._strip_parent_refs(joined)

        if mod is None:
            mod = wburl.mod

        return self.prefix + wburl.to_str(mod=mod, url=new_url)

    @staticmethod
    def _strip_parent_refs(url):
        """
        Remove '../' references left over in the *path* of ``url``.

        urljoin may leave '../' segments in place when a relative url climbs
        above the root. Only the path component is cleaned, so a '../' that
        is part of the query string or fragment is preserved as-is.
        """
        parts = urlparse.urlsplit(url)
        if '../' not in parts.path:
            # nothing to strip: return the input untouched
            return url

        cleaned_path = parts.path.replace('../', '')
        return urlparse.urlunsplit(parts._replace(path=cleaned_path))

    def get_abs_url(self, url=''):
        """
        Return prefix + the base WbUrl string with its url replaced by
        ``url`` (empty by default).
        """
        return self.prefix + self.wburl.to_str(url=url)

    def get_timestamp_url(self, timestamp, url=None):
        """
        Return prefix + the base WbUrl string with ``timestamp`` substituted.

        :param timestamp: timestamp to use in the result.
        :param url: url to use in the result; defaults to the url of the
            base WbUrl when None.
        """
        if url is None:
            url = self.wburl.url

        return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)

    def set_base_url(self, newUrl):
        """
        Replace the url of the base WbUrl with ``newUrl``.

        NOTE: this mutates the WbUrl object held by this rewriter; if a
        WbUrl instance was passed to the constructor, that same object
        is modified.
        """
        self.wburl.url = newUrl

    def __repr__(self):
        return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)

    @staticmethod
    def strip_protocol(url):
        """
        Return ``url`` without its leading protocol, if it starts with one
        of PROTOCOLS; otherwise return ``url`` unchanged.
        """
        for protocol in UrlRewriter.PROTOCOLS:
            if url.startswith(protocol):
                return url[len(protocol):]

        return url
def do_rewrite(rel_url, base_url, prefix, mod=None):
    """
    Convenience wrapper used by the doctests: build a UrlRewriter for
    ``base_url`` and ``prefix`` and return ``rel_url`` rewritten with it,
    passing ``mod`` through to UrlRewriter.rewrite().
    """
    return UrlRewriter(base_url, prefix).rewrite(rel_url, mod)
# Run the doctests embedded in this module when executed as a script.
if __name__ == "__main__":
    from doctest import testmod
    testmod()