1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00
pywb/pywb/utils/canonicalize.py
Ilya Kreymer 30ab27bb1c indexing: support indexing (and even replay of) records where target-uri is a 'urn:' identifier (#91)
for canonicalization, treat urns as is, already canonical
for wburl, don't add http:// prefix if urn: prefix is present
add example-wpull warc for testing
2015-03-30 17:23:50 -07:00

198 lines
5.5 KiB
Python

""" Standard url-canonicalzation, surt and non-surt
"""
import surt
import urlparse
from wbexception import BadRequestException
#=================================================================
class UrlCanonicalizer(object):
    """
    Callable wrapper around canonicalize() that remembers whether
    keys should be produced in surt order or not.
    """

    def __init__(self, surt_ordered=True):
        # kept as a public attribute; calc_search_range() reads it back
        self.surt_ordered = surt_ordered

    def __call__(self, url):
        """
        Return the canonical key for url, using the ordering
        selected when this canonicalizer was created.
        """
        return canonicalize(url, surt_ordered=self.surt_ordered)
#=================================================================
class UrlCanonicalizeException(BadRequestException):
    """
    Raised when a url can not be canonicalized, or when a search
    range can not be computed for the requested match type.
    """
#=================================================================
def canonicalize(url, surt_ordered=True):
    """
    Return the canonical key for a url.

    The url is always run through surt first, since surt conversion
    is currently part of canonicalization. When surt_ordered is false,
    the resulting surt is converted back to plain url form.

    A 'urn:' identifier that surt can not process is returned as-is.
    Any other url that surt fails on raises UrlCanonicalizeException.

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'

    >>> canonicalize('urn:some:id')
    'urn:some:id'
    """
    try:
        canon_key = surt.surt(url)
    except Exception:
        # anything other than a urn that surt rejects is a bad request
        if not url.startswith('urn:'):
            raise UrlCanonicalizeException('Invalid Url: ' + url)

        # urn is already canonical, so just use as-is
        return url

    # non-surt mode: unsurt the surt to get the canonicalized plain url
    return canon_key if surt_ordered else unsurt(canon_key)
#=================================================================
def unsurt(surt):
    """
    Convert a surt key back to plain 'host/path' form.

    The comma separated host labels before the first ')/' are reversed
    and joined with dots; everything from the '/' onwards is kept
    unchanged. A value without ')/' is returned as-is.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
    """
    try:
        split_at = surt.index(')/')
    except ValueError:
        # May not be a valid surt
        return surt

    labels = surt[:split_at].split(',')

    # split_at + 1 skips the ')' but keeps the '/' that starts the path
    return '.'.join(reversed(labels)) + surt[split_at + 1:]
#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Canonicalize a url (either with custom canonicalizer or
    standard canonicalizer with or without surt)

    Then, compute a start and end search url search range
    for a given match type.

    Supported match types:
    * exact
    * prefix
    * host
    * domain (only available for surt ordering)

    url:          the url to compute the range for
    match_type:   one of the match types listed above
    surt_ordered: whether keys are surt ordered; ignored when url_canon
                  is passed, in which case url_canon.surt_ordered is used
    url_canon:    optional callable mapping a url to its canonical key,
                  exposing a surt_ordered attribute

    Returns a (start_key, end_key) tuple.

    Raises UrlCanonicalizeException for an unknown match type, or
    for a domain match with non-surt ordering.

    For every match type the range is derived from the canonical key,
    not from the raw url, so that the host used in a non-surt host
    range is the same host that the canonicalizer produced.

    Examples below:

    # surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact')
    ('com,example)/path/file.html', 'com,example)/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix')
    ('com,example)/path/file.html', 'com,example)/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host')
    ('com,example)/', 'com,example*')

    >>> calc_search_range('http://example.com/path/file.html', 'domain')
    ('com,example)/', 'com,example-')

    special case for tld domain range
    >>> calc_search_range('com', 'domain')
    ('com,', 'com-')

    # non-surt ranges
    >>> calc_search_range('http://example.com/path/file.html', 'exact', False)
    ('example.com/path/file.html', 'example.com/path/file.html!')

    >>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
    ('example.com/path/file.html', 'example.com/path/file.htmm')

    >>> calc_search_range('http://example.com/path/file.html', 'host', False)
    ('example.com/', 'example.com0')

    # errors: domain range not supported
    >>> calc_search_range('http://example.com/path/file.html', 'domain', False)
    Traceback (most recent call last):
    UrlCanonicalizeException: matchType=domain unsupported for non-surt

    >>> calc_search_range('http://example.com/path/file.html', 'blah', False)
    Traceback (most recent call last):
    UrlCanonicalizeException: Invalid match_type: blah
    """
    def inc_last_char(x):
        # smallest string that sorts after every string prefixed by x
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches url_canon
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == 'exact':
        # '!' is the character right after space in ascii
        # NOTE(review): presumably index keys are followed by a space
        end_key = start_key + '!'

    elif match_type == 'prefix':
        # add trailing slash if url has it
        if url.endswith('/') and not start_key.endswith('/'):
            start_key += '/'

        end_key = inc_last_char(start_key)

    elif match_type == 'host':
        if surt_ordered:
            host = start_key.split(')/')[0]

            start_key = host + ')/'
            # '*' is the character right after ')'
            end_key = host + '*'
        else:
            # use the host of the canonical key rather than the netloc
            # of the raw url, so the range agrees with canonical keys
            # even if the input url was not in canonical form
            host = start_key.split('/', 1)[0]

            start_key = host + '/'
            # '0' is the character right after '/'
            end_key = host + '0'

    elif match_type == 'domain':
        if not surt_ordered:
            msg = 'matchType=domain unsupported for non-surt'
            raise UrlCanonicalizeException(msg)

        host = start_key.split(')/')[0]

        # if tld, use com, as start_key
        # otherwise, stick with com,example)/
        if ',' not in host:
            start_key = host + ','
        else:
            start_key = host + ')/'

        # '-' is the character right after ','
        end_key = host + '-'

    else:
        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)

    return (start_key, end_key)
if __name__ == "__main__":
    # When executed as a script, run the doctests embedded
    # in the docstrings of this module.
    import doctest

    doctest.testmod()