pywb/pywb/wburlrewriter.py

import copy
import urlparse

from wbarchivalurl import ArchivalUrl

class ArchivalUrlRewriter:
    """
    Rewrite urls found in an archived page so they point back into the archive.

    A rewriter is created from the archival url of the page being served
    (parsed with ArchivalUrl) and the prefix under which the archive is
    mounted. rewrite() resolves a url taken from that page against the page's
    own url and returns it in archival form: prefix, then timestamp/modifier,
    then the resolved url.

    The examples below use the test_rewrite() helper defined in the
    __main__ section of this module.

    >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
    'https://web.archive.org/web/20131010/http://example.com/path/other.html'

    >>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
    'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'

    >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
    '/coll/20130907*/http://example.com/path/other.html'

    >>> test_rewrite('../other.html', '/20131112im_/http://example.com/path/page.html', '/coll/')
    '/coll/20131112im_/http://example.com/other.html'

    >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/*/http://example.com/other.html'

    >>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
    'localhost:8080/20101226101112/http://some-other-site.com'

    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
    '/2020/http://example.com/other.html'

    >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
    '/2020/http://example.com/other.html'
    """

    # Urls starting with one of these are returned untouched by rewrite().
    NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:']

    # Urls starting with one of these are treated as absolute: they are not
    # joined against the page url, only wrapped in the archival form.
    PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://']

    # Becomes True (per instance) once setBaseUrl() has been called. Kept as a
    # class-level default so every instance has the attribute.
    _base_changed = False

    def __init__(self, wburl_str, prefix):
        """
        wburl_str -- archival url of the current page, handed to ArchivalUrl
        prefix    -- string prepended to every rewritten url; one trailing
                     '/' is dropped (in the doctests the archival url part
                     supplies its own leading '/')
        """
        self.wburl = ArchivalUrl(wburl_str)
        self.prefix = prefix[:-1] if prefix.endswith('/') else prefix

    @staticmethod
    def _starts_with_any(url, prefixes):
        # URI schemes are case-insensitive, so match against a lower-cased
        # copy; the prefix lists above are all lower case.
        return url.lower().startswith(tuple(prefixes))

    def rewrite(self, url, mod=None):
        """
        Return url rewritten to point into the archive.

        url -- url as it appears in the page: absolute, scheme-relative,
               host-relative or path-relative
        mod -- optional modifier to use in the rewritten url; when None the
               modifier of the current page's archival url is used

        Urls starting with one of NO_REWRITE_URI_PREFIX are returned as-is.
        """
        cls = ArchivalUrlRewriter

        # special protocol: no rewriting at all
        if cls._starts_with_any(url, cls.NO_REWRITE_URI_PREFIX):
            return url

        wburl = self.wburl
        is_abs = cls._starts_with_any(url, cls.PROTOCOLS)

        # Fast path: a plain relative url (not absolute, not starting with
        # '/', no '../', no explicit mod) can be joined directly onto the
        # page's full archival url.
        # That url is wburl.original_url, which setBaseUrl() does not touch,
        # so the fast path is only valid while the base is unchanged.
        is_simple_rel = not (is_abs or mod or url.startswith('/') or '../' in url)
        if is_simple_rel and not self._base_changed:
            return urlparse.urljoin(self.prefix + wburl.original_url, url)

        if is_abs:
            new_url = url
        else:
            # Resolve against the current base, then strip any '../' left
            # over after joining (e.g. a url that climbs above the root, as
            # in the doctests).
            # NOTE(review): this is a plain text replace, so a '../' that
            # appears elsewhere in the url (such as in a query string) is
            # removed too.
            new_url = urlparse.urljoin(wburl.url, url).replace('../', '')

        if mod is None:
            mod = wburl.mod

        return self.prefix + ArchivalUrl.to_str(wburl.type, mod, wburl.timestamp, new_url)

    def setBaseUrl(self, newUrl):
        """
        Use newUrl as the base against which later relative urls are resolved.
        """
        self.wburl.url = newUrl
        # From now on rewrite() must resolve against wburl.url, not the
        # original archival url string.
        self._base_changed = True

if __name__ == "__main__":
    # Running this module as a script executes the doctests in the
    # ArchivalUrlRewriter docstring.
    import doctest

    # Helper the doctests call: build a rewriter for the page base_url served
    # under prefix, then rewrite rel_url (optionally with modifier mod).
    def test_rewrite(rel_url, base_url, prefix, mod=None):
        return ArchivalUrlRewriter(base_url, prefix).rewrite(rel_url, mod)

    doctest.testmod()