mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-31 03:04:12 +02:00
87 lines
3.1 KiB
Python
87 lines
3.1 KiB
Python
import copy
|
|
import urlparse
|
|
|
|
from wbarchivalurl import ArchivalUrl
|
|
|
|
class ArchivalUrlRewriter:
    """
    Rewrite links found in an archived page so they point into the archive.

    A rewriter is built from the archival url of the page being served and
    the prefix the archive is mounted under; ``rewrite()`` then maps a link
    taken from that page to ``prefix`` + the archival form of that link.

    >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'

    >>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
    'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'

    >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'

    >>> test_rewrite('../other.html', '/20131112im_/http://example.com/path/page.html', '/coll/')
    '/coll/20131112im_/http://example.com/other.html'

    >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/20101226101112/http://some-other-site.com'

    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'

    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
    '/2020/http://example.com/other.html'
    """

    # Urls starting with any of these are returned by rewrite() untouched.
    NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']

    # Urls starting with any of these are treated as absolute: they are not
    # joined against the url of the current page.
    PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']

    def __init__(self, wburl_str, prefix):
        """
        wburl_str -- archival url of the page whose links are rewritten,
                     parsed with ArchivalUrl
        prefix    -- string prepended to every rewritten url; a single
                     trailing '/' is dropped
        """
        self.wburl = ArchivalUrl(wburl_str)
        self.prefix = prefix

        if self.prefix.endswith('/'):
            self.prefix = self.prefix[:-1]

    @staticmethod
    def _strip_parent_refs(url):
        """
        Drop whole '..' path segments from url, leaving query/fragment alone.

        Only the part before the first '?' or '#' is examined, and only
        segments that are exactly '..' and followed by '/' are removed.
        A blanket replace('../', '') would also damage query strings such as
        '?next=../x' and would glue 'a../b' into 'ab'.
        """
        # Find where the query or fragment starts; that tail is kept verbatim.
        cut = len(url)
        for sep in ('?', '#'):
            idx = url.find(sep)
            if idx != -1 and idx < cut:
                cut = idx

        segments = url[:cut].split('/')
        # The final segment is never followed by '/', so it is always kept.
        kept = [seg for seg in segments[:-1] if seg != '..']
        kept.append(segments[-1])
        return '/'.join(kept) + url[cut:]

    def rewrite(self, url, mod=None):
        """
        Return the rewritten form of url.

        url -- link as it appears in the page (relative or absolute)
        mod -- modifier passed to ArchivalUrl.to_str; when None, the modifier
               of the current page's archival url is used

        Urls starting with one of NO_REWRITE_URI_PREFIX are returned as-is.
        """
        # if special protocol, no rewriting at all
        if url.startswith(tuple(ArchivalUrlRewriter.NO_REWRITE_URI_PREFIX)):
            return url

        wburl = self.wburl
        is_abs = url.startswith(tuple(ArchivalUrlRewriter.PROTOCOLS))

        # Optimized rewriter for
        # -rel urls that don't start with / and don't contain ../ and no special mod:
        # join directly against the already-prefixed archival url of the page
        if not (is_abs or mod or url.startswith('/') or ('../' in url)):
            return urlparse.urljoin(self.prefix + wburl.original_url, url)

        if is_abs:
            # absolute urls are used as given
            new_url = url
        else:
            # urljoin can leave unresolved '..' segments behind (the doctests
            # above cover links that climb past the site root); remove them
            # from the path only
            joined = urlparse.urljoin(wburl.url, url)
            new_url = self._strip_parent_refs(joined)

        if mod is None:
            mod = wburl.mod

        return self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, new_url)

    def setBaseUrl(self, newUrl):
        """Replace the url of the current page (self.wburl.url) with newUrl."""
        self.wburl.url = newUrl
|
|
|
|
if __name__ == "__main__":
    import doctest

    def test_rewrite(rel_url, base_url, prefix, mod=None):
        """Helper for the class doctests: rewrite rel_url as found on base_url."""
        return ArchivalUrlRewriter(base_url, prefix).rewrite(rel_url, mod)

    # Run the examples embedded in this module's docstrings.
    doctest.testmod()
|