support non-surt ordered cdx

add unsurt() util func and surt_ordered init param to LocalCDXServer; add doctests for make_best_resolver()
2025-03-15 00:03:28 +01:00 · 2014-01-29 00:58:37 -08:00 · 2014-01-29 00:58:37 -08:00 · 7a20d26d5f
commit 7a20d26d5f
parent 9a3449dfd5
4 changed files with 79 additions and 12 deletions
--- a/pywb/indexreader.py
+++ b/pywb/indexreader.py
@ -64,8 +64,10 @@ class LocalCDXServer(IndexReader):

    """

-    def __init__(self, sources):
+    def __init__(self, sources, surt_ordered = True):
        self.sources = []
+        self.surt_ordered = surt_ordered
+        logging.info('CDX Surt-Ordered? ' + str(surt_ordered))

        for src in sources:
            if os.path.isdir(src):
@ -80,8 +82,13 @@ class LocalCDXServer(IndexReader):


    def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
-        # convert to surt
+        # canonicalize to surt (canonicalization is part of surt conversion)
        key = surt.surt(url)
+
+        # if not surt, unsurt the surt to get canonicalized non-surt url
+        if not self.surt_ordered:
+            key = utils.unsurt(key)
+
        match_func = binsearch.iter_exact

        params.update(**kwvalues)
--- a/pywb/pywb_init.py
+++ b/pywb/pywb_init.py
@ -88,11 +88,14 @@ def pywb_config(config_file = None):



-def yaml_parse_index_loader(index_config):
+def yaml_parse_index_loader(config):
+    index_config = config['index_paths']
+    surt_ordered = config.get('surt_ordered', True)
+
    # support mixed cdx streams and remote servers?
    # for now, list implies local sources
    if isinstance(index_config, list):
-        return indexreader.LocalCDXServer(index_config)
+        return indexreader.LocalCDXServer(index_config, surt_ordered)

    if isinstance(index_config, str):
        uri = index_config
@ -139,7 +142,7 @@ def yaml_parse_route(config):

    archive_loader = archiveloader.ArchiveLoader()

-    index_loader = yaml_parse_index_loader(config['index_paths'])
+    index_loader = yaml_parse_index_loader(config)

    archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths'])

--- a/pywb/replay_resolvers.py
+++ b/pywb/replay_resolvers.py
@ -8,12 +8,13 @@ import logging
 #======================================
 # PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
 #======================================
-def PrefixResolver(prefix, contains = ''):
-    def makeUrl(url):
-        return [prefix + url] if (contains in url) else []
+class PrefixResolver:
+    def __init__(self, prefix, contains = ''):
+        self.prefix = prefix
+        self.contains = contains

-    #print "prefix: " + prefix + " contains: " + contains
-    return makeUrl
+    def __call__(self, filename):
+        return [self.prefix + filename] if (self.contains in filename) else []

 #======================================
 class RedisResolver:
@ -50,6 +51,23 @@ class PathIndexResolver:
 #TODO: more options (remote files, contains param, etc..)
 # find best resolver given the path
 def make_best_resolver(path):
+    """
+    # http path
+    >>> class_name(make_best_resolver('http://myhost.example.com/warcs/'))
+    'PrefixResolver'
+
+    # redis path
+    >>> class_name(make_best_resolver('redis://myhost.example.com:1234/1'))
+    'RedisResolver'
+
+    # a file
+    >>> class_name(make_best_resolver('file://' + os.path.realpath(__file__)))
+    'PathIndexResolver'
+
+    # a dir
+    >>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__))))
+    'PrefixResolver'
+    """
    url_parts = urlparse.urlsplit(path)

    if url_parts.scheme == 'redis':
@ -68,4 +86,13 @@ def make_best_resolver(path):
        logging.info('Adding Archive Path Source: ' + path)
        return PrefixResolver(path)

+import utils
+#=================================================================
+if __name__ == "__main__" or utils.enable_doctests():
+
+    def class_name(obj):
+        return obj.__class__.__name__
+
+    import doctest
+    doctest.testmod()

--- a/pywb/utils.py
+++ b/pywb/utils.py
@ -168,6 +168,7 @@ def timestamp_to_sec(string):

    return calendar.timegm(timestamp_to_datetime(string))

+#=================================================================
 # adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
 # allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
 # explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
@ -193,7 +194,36 @@ def rel_request_uri(environ, include_query=1):



-#============================================
+#=================================================================
+def unsurt(surt):
+    """
+    # Simple surt
+    >>> unsurt('com,example)/')
+    'example.com)/'
+
+    # Broken surt
+    >>> unsurt('com,example)')
+    'com,example)'
+
+    # Long surt
+    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
+    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
+    """
+
+    try:
+        index = surt.index(')/')
+        parts = surt[0:index].split(',')
+        parts.reverse()
+        host = '.'.join(parts)
+        host += surt[index:]
+        return host
+
+    except ValueError:
+        # May not be a valid surt
+        return surt
+
+
+#=================================================================
 # Support for bulk doctest testing via nose
 # nosetests --with-doctest

@ -207,7 +237,7 @@ def test_data_dir():
    import os
    return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'

-#============================================
+#=================================================================

 if __name__ == "__main__" or enable_doctests():
    import doctest