mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
for canonicalzation, treat urns as is, already canonical for wburl, don't add http:// prefix if urn: prefix is present add example-wpull warc for testing
198 lines
5.5 KiB
Python
198 lines
5.5 KiB
Python
""" Standard url-canonicalzation, surt and non-surt
|
|
"""
|
|
|
|
import surt
|
|
import urlparse
|
|
|
|
from wbexception import BadRequestException
|
|
|
|
|
|
#=================================================================
|
|
class UrlCanonicalizer(object):
|
|
def __init__(self, surt_ordered=True):
|
|
self.surt_ordered = surt_ordered
|
|
|
|
def __call__(self, url):
|
|
return canonicalize(url, self.surt_ordered)
|
|
|
|
|
|
#=================================================================
|
|
class UrlCanonicalizeException(BadRequestException):
|
|
pass
|
|
|
|
|
|
#=================================================================
|
|
def canonicalize(url, surt_ordered=True):
|
|
"""
|
|
Canonicalize url and convert to surt
|
|
If not in surt ordered mode, convert back to url form
|
|
as surt conversion is currently part of canonicalization
|
|
|
|
>>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
|
|
'com,example)/path/file.html'
|
|
|
|
>>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
|
|
'example.com/path/file.html'
|
|
|
|
>>> canonicalize('urn:some:id')
|
|
'urn:some:id'
|
|
"""
|
|
try:
|
|
key = surt.surt(url)
|
|
except Exception as e:
|
|
# urn is already canonical, so just use as-is
|
|
if url.startswith('urn:'):
|
|
return url
|
|
|
|
raise UrlCanonicalizeException('Invalid Url: ' + url)
|
|
|
|
# if not surt, unsurt the surt to get canonicalized non-surt url
|
|
if not surt_ordered:
|
|
key = unsurt(key)
|
|
|
|
return key
|
|
|
|
|
|
#=================================================================
|
|
def unsurt(surt):
|
|
"""
|
|
# Simple surt
|
|
>>> unsurt('com,example)/')
|
|
'example.com/'
|
|
|
|
# Broken surt
|
|
>>> unsurt('com,example)')
|
|
'com,example)'
|
|
|
|
# Long surt
|
|
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
|
|
index.html?a=b?c=)/')
|
|
'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
|
|
"""
|
|
|
|
try:
|
|
index = surt.index(')/')
|
|
parts = surt[0:index].split(',')
|
|
parts.reverse()
|
|
host = '.'.join(parts)
|
|
host += surt[index + 1:]
|
|
return host
|
|
|
|
except ValueError:
|
|
# May not be a valid surt
|
|
return surt
|
|
|
|
|
|
#=================================================================
|
|
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
|
"""
|
|
Canonicalize a url (either with custom canonicalizer or
|
|
standard canonicalizer with or without surt)
|
|
|
|
Then, compute a start and end search url search range
|
|
for a given match type.
|
|
|
|
Support match types:
|
|
* exact
|
|
* prefix
|
|
* host
|
|
* domain (only available when for surt ordering)
|
|
|
|
Examples below:
|
|
|
|
# surt ranges
|
|
>>> calc_search_range('http://example.com/path/file.html', 'exact')
|
|
('com,example)/path/file.html', 'com,example)/path/file.html!')
|
|
|
|
>>> calc_search_range('http://example.com/path/file.html', 'prefix')
|
|
('com,example)/path/file.html', 'com,example)/path/file.htmm')
|
|
|
|
>>> calc_search_range('http://example.com/path/file.html', 'host')
|
|
('com,example)/', 'com,example*')
|
|
|
|
>>> calc_search_range('http://example.com/path/file.html', 'domain')
|
|
('com,example)/', 'com,example-')
|
|
|
|
special case for tld domain range
|
|
>>> calc_search_range('com', 'domain')
|
|
('com,', 'com-')
|
|
|
|
# non-surt ranges
|
|
>>> calc_search_range('http://example.com/path/file.html', 'exact', False)
|
|
('example.com/path/file.html', 'example.com/path/file.html!')
|
|
|
|
>>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
|
|
('example.com/path/file.html', 'example.com/path/file.htmm')
|
|
|
|
>>> calc_search_range('http://example.com/path/file.html', 'host', False)
|
|
('example.com/', 'example.com0')
|
|
|
|
# errors: domain range not supported
|
|
>>> calc_search_range('http://example.com/path/file.html', 'domain', False)
|
|
Traceback (most recent call last):
|
|
UrlCanonicalizeException: matchType=domain unsupported for non-surt
|
|
|
|
>>> calc_search_range('http://example.com/path/file.html', 'blah', False)
|
|
Traceback (most recent call last):
|
|
UrlCanonicalizeException: Invalid match_type: blah
|
|
|
|
"""
|
|
def inc_last_char(x):
|
|
return x[0:-1] + chr(ord(x[-1]) + 1)
|
|
|
|
if not url_canon:
|
|
# make new canon
|
|
url_canon = UrlCanonicalizer(surt_ordered)
|
|
else:
|
|
# ensure surt order matches url_canon
|
|
surt_ordered = url_canon.surt_ordered
|
|
|
|
start_key = url_canon(url)
|
|
|
|
if match_type == 'exact':
|
|
end_key = start_key + '!'
|
|
|
|
elif match_type == 'prefix':
|
|
# add trailing slash if url has it
|
|
if url.endswith('/') and not start_key.endswith('/'):
|
|
start_key += '/'
|
|
|
|
end_key = inc_last_char(start_key)
|
|
|
|
elif match_type == 'host':
|
|
if surt_ordered:
|
|
host = start_key.split(')/')[0]
|
|
|
|
start_key = host + ')/'
|
|
end_key = host + '*'
|
|
else:
|
|
host = urlparse.urlsplit(url).netloc
|
|
|
|
start_key = host + '/'
|
|
end_key = host + '0'
|
|
|
|
elif match_type == 'domain':
|
|
if not surt_ordered:
|
|
msg = 'matchType=domain unsupported for non-surt'
|
|
raise UrlCanonicalizeException(msg)
|
|
|
|
host = start_key.split(')/')[0]
|
|
|
|
# if tld, use com, as start_key
|
|
# otherwise, stick with com,example)/
|
|
if ',' not in host:
|
|
start_key = host + ','
|
|
else:
|
|
start_key = host + ')/'
|
|
|
|
end_key = host + '-'
|
|
else:
|
|
raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
|
|
|
|
return (start_key, end_key)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import doctest
|
|
doctest.testmod()
|