1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-28 00:25:21 +01:00
pywb/pywb/utils/canonicalize.py
Ilya Kreymer bcbc00a89b
Fuzzy Rewrite Improvements ()
rules system:
- 'mixin' class for adding custom rewrite mixin, initialized with optional 'mixin_params'
- 'force_type' to always force rewriting text type for rule match (eg. if application/octet-stream)
- fuzzy rewrite: 'find_all' mode for matching via regex.findall() instead of search()
- load_function moved to generic load_py_name
- new rules for fb!
- JSReplaceFuzzy mixin to replace content based on query (or POST) regex match
- tests: tests JSReplaceFuzzy rewriting

query:
- append '?' for fuzzy matching if filters are set
- cdx['is_fuzzy'] set to '1' instead of True

client-side: rewrite
- add window.Request object rewrite
- improved rewrite of wb server + path, avoid double-slash
- fetch() rewrite proxy_to_obj()
- proxy_to_obj() null check
- WombatLocation prop change, skip if prop is the same
2017-10-31 20:35:29 -07:00

212 lines
6.1 KiB
Python

""" Standard url-canonicalzation, surt and non-surt
"""
import surt
import six.moves.urllib.parse as urlparse
from pywb.utils.wbexception import BadRequestException
#=================================================================
class UrlCanonicalizer(object):
def __init__(self, surt_ordered=True):
self.surt_ordered = surt_ordered
def __call__(self, url):
return canonicalize(url, self.surt_ordered)
#=================================================================
class UrlCanonicalizeException(BadRequestException):
pass
#=================================================================
def canonicalize(url, surt_ordered=True):
"""
Canonicalize url and convert to surt
If not in surt ordered mode, convert back to url form
as surt conversion is currently part of canonicalization
>>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
'com,example)/path/file.html'
>>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
'example.com/path/file.html'
>>> canonicalize('urn:some:id')
'urn:some:id'
"""
try:
key = surt.surt(url)
except Exception as e: #pragma: no cover
# doesn't happen with surt from 0.3b
# urn is already canonical, so just use as-is
if url.startswith('urn:'):
return url
raise UrlCanonicalizeException('Invalid Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url
if not surt_ordered:
key = unsurt(key)
return key
#=================================================================
def unsurt(surt):
"""
# Simple surt
>>> unsurt('com,example)/')
'example.com/'
# Broken surt
>>> unsurt('com,example)')
'com,example)'
# Long surt
>>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/\
index.html?a=b?c=)/')
'subdomain.another.subsub.sub.domain.suffix/path/file/index.html?a=b?c=)/'
"""
try:
index = surt.index(')/')
parts = surt[0:index].split(',')
parts.reverse()
host = '.'.join(parts)
host += surt[index + 1:]
return host
except ValueError:
# May not be a valid surt
return surt
#=================================================================
def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
"""
Canonicalize a url (either with custom canonicalizer or
standard canonicalizer with or without surt)
Then, compute a start and end search url search range
for a given match type.
Support match types:
* exact
* prefix
* host
* domain (only available when for surt ordering)
Examples below:
# surt ranges
>>> calc_search_range('http://example.com/path/file.html', 'exact')
('com,example)/path/file.html', 'com,example)/path/file.html!')
>>> calc_search_range('http://example.com/path/file.html', 'prefix')
('com,example)/path/file.html', 'com,example)/path/file.htmm')
# slash and ?
>>> calc_search_range('http://example.com/path/', 'prefix')
('com,example)/path/', 'com,example)/path0')
>>> calc_search_range('http://example.com/path?', 'prefix')
('com,example)/path?', 'com,example)/path@')
>>> calc_search_range('http://example.com/path/?', 'prefix')
('com,example)/path?', 'com,example)/path@')
>>> calc_search_range('http://example.com/path/file.html', 'host')
('com,example)/', 'com,example*')
>>> calc_search_range('http://example.com/path/file.html', 'domain')
('com,example)/', 'com,example-')
special case for tld domain range
>>> calc_search_range('com', 'domain')
('com,', 'com-')
# non-surt ranges
>>> calc_search_range('http://example.com/path/file.html', 'exact', False)
('example.com/path/file.html', 'example.com/path/file.html!')
>>> calc_search_range('http://example.com/path/file.html', 'prefix', False)
('example.com/path/file.html', 'example.com/path/file.htmm')
>>> calc_search_range('http://example.com/path/file.html', 'host', False)
('example.com/', 'example.com0')
# errors: domain range not supported
>>> calc_search_range('http://example.com/path/file.html', 'domain', False) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
UrlCanonicalizeException: matchType=domain unsupported for non-surt
>>> calc_search_range('http://example.com/path/file.html', 'blah', False) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
UrlCanonicalizeException: Invalid match_type: blah
"""
def inc_last_char(x):
return x[0:-1] + chr(ord(x[-1]) + 1)
if not url_canon:
# make new canon
url_canon = UrlCanonicalizer(surt_ordered)
else:
# ensure surt order matches url_canon
surt_ordered = url_canon.surt_ordered
start_key = url_canon(url)
if match_type == 'exact':
end_key = start_key + '!'
elif match_type == 'prefix':
# add trailing slash if url has it
if url.endswith('/') and not start_key.endswith('/'):
start_key += '/'
if url.endswith('?') and not start_key.endswith('?'):
start_key += '?'
end_key = inc_last_char(start_key)
elif match_type == 'host':
if surt_ordered:
host = start_key.split(')/')[0]
start_key = host + ')/'
end_key = host + '*'
else:
host = urlparse.urlsplit(url).netloc
start_key = host + '/'
end_key = host + '0'
elif match_type == 'domain':
if not surt_ordered:
msg = 'matchType=domain unsupported for non-surt'
raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0]
# if tld, use com, as start_key
# otherwise, stick with com,example)/
if ',' not in host:
start_key = host + ','
else:
start_key = host + ')/'
end_key = host + '-'
else:
raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
return (start_key, end_key)
if __name__ == "__main__":
import doctest
doctest.testmod()