pywb/pywb/utils/canonicalize.py

""" Standard url-canonicalzation, surt and non-surt
"""

import surt
import urlparse

from wbexception import BadRequestException


#=================================================================
class UrlCanonicalizer(object):
    def __init__(self, surt_ordered=True):
        self.surt_ordered = surt_ordered

    def __call__(self, url):
        return canonicalize(url, self.surt_ordered)


#=================================================================
class UrlCanonicalizeException(BadRequestException):
    pass


#=================================================================
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'

    >>> canonicalize('urn:some:id')
    'urn:some:id'
    """
    try:
        key = surt.surt(url)
    except Exception as e:
        # urn is already canonical, so just use as-is
        if url.startswith('urn:'):
            return url

        raise UrlCanonicalizeException('Invalid Url: ' + url)

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
        key = unsurt(key)

    return key


#=================================================================
def unsurt(surt):
    """
    # Simple surt
    >>> unsurt('com,example)/')
    'example.com/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
    """

    try:
        index = surt.index(')/')
        parts = surt[0:index].split(',')
        parts.reverse()
        host = '.'.join(parts)
        host += surt[index + 1:]
        return host

    except ValueError:
        # May not be a valid surt
        return surt


#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Canonicalize a url (either with custom canonicalizer or
    standard canonicalizer with or without surt)

    Then, compute a start and end search url search range
    for a given match type.

    Support match types:
    * exact
    * prefix
    * host
    * domain (only available when for surt ordering)

    Examples below:

    # surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact')
    ('com,example)/path/file.html', 'com,example)/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
    ('com,example)/path/file.html', 'com,example)/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host')
    ('com,example)/', 'com,example*')

    >>> calc_search_range('http://example.com/path/file.html', 'domain')
    ('com,example)/', 'com,example-')

    special case for tld domain range
    >>> calc_search_range('com', 'domain')
    ('com,', 'com-')

    # non-surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
    ('example.com/path/file.html', 'example.com/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
    ('example.com/path/file.html', 'example.com/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

    # errors: domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
    UrlCanonicalizeException: matchType=domain unsupported for non-surt

    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
    Traceback (most recent call last):
    UrlCanonicalizeException: Invalid match_type: blah

    """
    def inc_last_char(x):
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches url_canon
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == 'exact':
        end_key = start_key + '!'

    elif match_type == 'prefix':
        # add trailing slash if url has it
        if url.endswith('/') and not start_key.endswith('/'):
            start_key += '/'

        end_key = inc_last_char(start_key)

    elif match_type == 'host':
        if surt_ordered:
            host = start_key.split(')/')[0]

            start_key = host + ')/'
            end_key = host + '*'
        else:
            host = urlparse.urlsplit(url).netloc

            start_key = host + '/'
            end_key = host + '0'

    elif match_type == 'domain':
        if not surt_ordered:
            msg = 'matchType=domain unsupported for non-surt'
            raise UrlCanonicalizeException(msg)

        host = start_key.split(')/')[0]

        # if tld, use com, as start_key
        # otherwise, stick with com,example)/
        if ',' not in host:
            start_key = host + ','
        else:
            start_key = host + ')/'

        end_key = host + '-'
    else:
        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)

    return (start_key, end_key)


if __name__ == "__main__":
    import doctest
    doctest.testmod()