1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-20 10:49:11 +01:00
pywb/pywb/rewrite/url_rewriter.py
Ilya Kreymer 498a864441 rewriting: support setting cookie_scope at collection level
js rewriting: add custom url rewrite option to per-url rewrite rules
2014-10-06 10:14:45 -07:00

162 lines
4.6 KiB
Python

import copy
import urlparse
from wburl import WbUrl
from cookie_rewriter import get_cookie_rewriter
#=================================================================
class UrlRewriter(object):
"""
Main pywb UrlRewriter which rewrites absolute and relative urls
to be relative to the current page, as specified via a WbUrl
instance and an optional full path prefix
"""
NO_REWRITE_URI_PREFIX = ['#', 'javascript:', 'data:',
'mailto:', 'about:', 'file:']
PROTOCOLS = ['http:', 'https:', 'ftp:', 'mms:', 'rtsp:', 'wais:']
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, root_path=None,
cookie_scope=None):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.full_prefix = full_prefix
self.rel_prefix = rel_prefix if rel_prefix else prefix
self.root_path = root_path if root_path else '/'
self.cookie_scope = cookie_scope
def rewrite(self, url, mod=None):
# if special protocol, no rewriting at all
if any(url.startswith(x) for x in self.NO_REWRITE_URI_PREFIX):
return url
if (self.prefix and
self.prefix != '/' and
url.startswith(self.prefix)):
return url
if (self.full_prefix and
self.full_prefix != self.prefix and
url.startswith(self.full_prefix)):
return url
wburl = self.wburl
is_abs = any(url.startswith(x) for x in self.PROTOCOLS)
if url.startswith('//'):
is_abs = True
url = 'http:' + url
# Optimized rewriter for
# -rel urls that don't start with / and
# do not contain ../ and no special mod
if not (is_abs or mod or url.startswith('/') or ('../' in url)):
final_url = urlparse.urljoin(self.prefix + wburl.original_url, url)
else:
# optimize: join if not absolute url, otherwise just use that
if not is_abs:
new_url = self.urljoin(wburl.url, url)
else:
new_url = url
if mod is None:
mod = wburl.mod
final_url = self.prefix + wburl.to_str(mod=mod, url=new_url)
return final_url
def get_abs_url(self, url=''):
return self.prefix + self.wburl.to_str(url=url)
def get_timestamp_url(self, timestamp, url=None):
if url is None:
url = self.wburl.url
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def rebase_rewriter(self, new_url):
if new_url.startswith(self.prefix):
new_url = new_url[len(self.prefix):]
new_wburl = WbUrl(new_url)
return UrlRewriter(new_wburl, self.prefix)
def get_cookie_rewriter(self, scope=None):
# collection scope overrides rule scope?
if self.cookie_scope:
scope = self.cookie_scope
cls = get_cookie_rewriter(scope)
return cls(self)
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@staticmethod
def urljoin(orig_url, url):
new_url = urlparse.urljoin(orig_url, url)
if '../' not in new_url:
return new_url
parts = urlparse.urlsplit(new_url)
scheme, netloc, path, query, frag = parts
path_parts = path.split('/')
i = 0
n = len(path_parts) - 1
while i < n:
if path_parts[i] == '..':
del path_parts[i]
n -= 1
if i > 0:
del path_parts[i - 1]
n -= 1
i -= 1
else:
i += 1
if path_parts == ['']:
path = '/'
else:
path = '/'.join(path_parts)
parts = (scheme, netloc, path, query, frag)
new_url = urlparse.urlunsplit(parts)
return new_url
#=================================================================
class HttpsUrlRewriter(UrlRewriter):
"""
A url rewriter which urls that start with https:// to http://
Other urls/input is unchanged.
"""
HTTP = 'http://'
HTTPS = 'https://'
def rewrite(self, url, mod=None):
if url.startswith(self.HTTPS):
result = self.HTTP + url[len(self.HTTPS):]
return result
else:
return url
def get_timestamp_url(self, timestamp, url=''):
return url
def get_abs_url(self, url=''):
return url
def rebase_rewriter(self, new_url):
return self
def get_cookie_rewriter(self, scope=None):
return None