diff --git a/pywb/indexreader.py b/pywb/indexreader.py index ea692a05..c2049fff 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -64,8 +64,10 @@ class LocalCDXServer(IndexReader): """ - def __init__(self, sources): + def __init__(self, sources, surt_ordered = True): self.sources = [] + self.surt_ordered = surt_ordered + logging.info('CDX Surt-Ordered? ' + str(surt_ordered)) for src in sources: if os.path.isdir(src): @@ -80,8 +82,13 @@ class LocalCDXServer(IndexReader): def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues): - # convert to surt + # canonicalize to surt (canonicalization is part of surt conversion) key = surt.surt(url) + + # if not surt, unsurt the surt to get canonicalized non-surt url + if not self.surt_ordered: + key = utils.unsurt(key) + match_func = binsearch.iter_exact params.update(**kwvalues) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index 4d1e5c48..1690c64c 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -88,11 +88,14 @@ def pywb_config(config_file = None): -def yaml_parse_index_loader(index_config): +def yaml_parse_index_loader(config): + index_config = config['index_paths'] + surt_ordered = config.get('surt_ordered', True) + # support mixed cdx streams and remote servers? 
#======================================
# PrefixResolver - convert cdx file entry to url with prefix if filename contains specified string
#======================================
class PrefixResolver:
    """Callable that maps an archive filename to a list of candidate locations.

    prefix   -- string prepended to the filename to form the location
    contains -- substring the filename must include to be resolved; the
                default '' is contained in every string, so nothing is
                filtered out

    Calling the instance returns a one-element list [prefix + filename]
    when the filter matches, and an empty list otherwise.
    """

    def __init__(self, prefix, contains = ''):
        self.prefix = prefix
        self.contains = contains

    def __call__(self, filename):
        # The filter must be tested against the argument actually passed in;
        # the previous body referenced an undefined name ('url') here and
        # raised NameError on every call.
        if self.contains not in filename:
            return []

        return [self.prefix + filename]
#=================================================================
def unsurt(surt):
    """
    Reverse the comma-separated host portion of a surt back into
    dotted host order, leaving everything from the first ')/' onward
    untouched. Input without a ')/' marker is returned unchanged.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com)/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
    """

    # split on the first ')/' only; later occurrences stay in the remainder
    host_part, marker, remainder = surt.partition(')/')

    if not marker:
        # May not be a valid surt
        return surt

    labels = host_part.split(',')
    dotted_host = '.'.join(reversed(labels))
    return dotted_host + marker + remainder