mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
support non-surt ordered cdx
add unsurt() util func and surt_ordered init param to LocalCDXServer; add doctests for make_best_resolver()
This commit is contained in:
parent
9a3449dfd5
commit
7a20d26d5f
@ -64,8 +64,10 @@ class LocalCDXServer(IndexReader):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, sources):
|
def __init__(self, sources, surt_ordered = True):
|
||||||
self.sources = []
|
self.sources = []
|
||||||
|
self.surt_ordered = surt_ordered
|
||||||
|
logging.info('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||||
|
|
||||||
for src in sources:
|
for src in sources:
|
||||||
if os.path.isdir(src):
|
if os.path.isdir(src):
|
||||||
@ -80,8 +82,13 @@ class LocalCDXServer(IndexReader):
|
|||||||
|
|
||||||
|
|
||||||
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
|
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
|
||||||
# convert to surt
|
# canonicalize to surt (canonicalization is part of surt conversion)
|
||||||
key = surt.surt(url)
|
key = surt.surt(url)
|
||||||
|
|
||||||
|
# if not surt, unsurt the surt to get canonicalized non-surt url
|
||||||
|
if not self.surt_ordered:
|
||||||
|
key = utils.unsurt(key)
|
||||||
|
|
||||||
match_func = binsearch.iter_exact
|
match_func = binsearch.iter_exact
|
||||||
|
|
||||||
params.update(**kwvalues)
|
params.update(**kwvalues)
|
||||||
|
@ -88,11 +88,14 @@ def pywb_config(config_file = None):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def yaml_parse_index_loader(index_config):
|
def yaml_parse_index_loader(config):
|
||||||
|
index_config = config['index_paths']
|
||||||
|
surt_ordered = config.get('surt_ordered', True)
|
||||||
|
|
||||||
# support mixed cdx streams and remote servers?
|
# support mixed cdx streams and remote servers?
|
||||||
# for now, list implies local sources
|
# for now, list implies local sources
|
||||||
if isinstance(index_config, list):
|
if isinstance(index_config, list):
|
||||||
return indexreader.LocalCDXServer(index_config)
|
return indexreader.LocalCDXServer(index_config, surt_ordered)
|
||||||
|
|
||||||
if isinstance(index_config, str):
|
if isinstance(index_config, str):
|
||||||
uri = index_config
|
uri = index_config
|
||||||
@ -139,7 +142,7 @@ def yaml_parse_route(config):
|
|||||||
|
|
||||||
archive_loader = archiveloader.ArchiveLoader()
|
archive_loader = archiveloader.ArchiveLoader()
|
||||||
|
|
||||||
index_loader = yaml_parse_index_loader(config['index_paths'])
|
index_loader = yaml_parse_index_loader(config)
|
||||||
|
|
||||||
archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths'])
|
archive_resolvers = map(replay_resolvers.make_best_resolver, config['archive_paths'])
|
||||||
|
|
||||||
|
@ -8,12 +8,13 @@ import logging
|
|||||||
#======================================
|
#======================================
|
||||||
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
# PrefixResolver - convert cdx file entry to url with prefix if url contains specified string
|
||||||
#======================================
|
#======================================
|
||||||
class PrefixResolver:
    """
    Callable resolver that maps an archive filename to a candidate url.

    prefix   -- string prepended to the filename to form the full path/url
    contains -- substring the filename must contain for this resolver to
                apply; the default '' matches every filename
    """

    def __init__(self, prefix, contains = ''):
        self.prefix = prefix
        self.contains = contains

    def __call__(self, filename):
        """
        Return a one-element list with prefix + filename when the filename
        contains the required substring, otherwise an empty list.
        """
        # the check must be against the filename argument; the name 'url'
        # does not exist in this scope and raised NameError on every call
        if self.contains in filename:
            return [self.prefix + filename]

        return []
|
||||||
|
|
||||||
#======================================
|
#======================================
|
||||||
class RedisResolver:
|
class RedisResolver:
|
||||||
@ -50,6 +51,23 @@ class PathIndexResolver:
|
|||||||
#TODO: more options (remote files, contains param, etc..)
|
#TODO: more options (remote files, contains param, etc..)
|
||||||
# find best resolver given the path
|
# find best resolver given the path
|
||||||
def make_best_resolver(path):
|
def make_best_resolver(path):
|
||||||
|
"""
|
||||||
|
# http path
|
||||||
|
>>> class_name(make_best_resolver('http://myhost.example.com/warcs/'))
|
||||||
|
'PrefixResolver'
|
||||||
|
|
||||||
|
# redis path
|
||||||
|
>>> class_name(make_best_resolver('redis://myhost.example.com:1234/1'))
|
||||||
|
'RedisResolver'
|
||||||
|
|
||||||
|
# a file
|
||||||
|
>>> class_name(make_best_resolver('file://' + os.path.realpath(__file__)))
|
||||||
|
'PathIndexResolver'
|
||||||
|
|
||||||
|
# a dir
|
||||||
|
>>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__))))
|
||||||
|
'PrefixResolver'
|
||||||
|
"""
|
||||||
url_parts = urlparse.urlsplit(path)
|
url_parts = urlparse.urlsplit(path)
|
||||||
|
|
||||||
if url_parts.scheme == 'redis':
|
if url_parts.scheme == 'redis':
|
||||||
@ -68,4 +86,13 @@ def make_best_resolver(path):
|
|||||||
logging.info('Adding Archive Path Source: ' + path)
|
logging.info('Adding Archive Path Source: ' + path)
|
||||||
return PrefixResolver(path)
|
return PrefixResolver(path)
|
||||||
|
|
||||||
|
import utils
|
||||||
|
#=================================================================
|
||||||
|
if __name__ == "__main__" or utils.enable_doctests():
|
||||||
|
|
||||||
|
def class_name(obj):
|
||||||
|
return obj.__class__.__name__
|
||||||
|
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
||||||
|
|
||||||
|
@ -168,6 +168,7 @@ def timestamp_to_sec(string):
|
|||||||
|
|
||||||
return calendar.timegm(timestamp_to_datetime(string))
|
return calendar.timegm(timestamp_to_datetime(string))
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
|
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
|
||||||
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
||||||
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
|
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
|
||||||
@ -193,7 +194,36 @@ def rel_request_uri(environ, include_query=1):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
#============================================
|
#=================================================================
|
||||||
|
def unsurt(surt):
    """
    Reverse the host portion of a surt key back into dotted host order.

    Everything before the first ')/' is split on ',', reversed and
    re-joined with '.'; the remainder of the string, starting at that
    ')/', is appended unchanged. A string containing no ')/' is returned
    unmodified.

    # Simple surt
    >>> unsurt('com,example)/')
    'example.com)/'

    # Broken surt
    >>> unsurt('com,example)')
    'com,example)'

    # Long surt
    >>> unsurt('suffix,domain,sub,subsub,another,subdomain)/path/file/index.html?a=b?c=)/')
    'subdomain.another.subsub.sub.domain.suffix)/path/file/index.html?a=b?c=)/'
    """

    # only the index() lookup can raise ValueError, so keep the try minimal
    try:
        index = surt.index(')/')
    except ValueError:
        # May not be a valid surt
        return surt

    host_parts = surt[:index].split(',')
    host_parts.reverse()

    # re-attach the ')/' marker and everything after it as-is
    return '.'.join(host_parts) + surt[index:]
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
# Support for bulk doctest testing via nose
|
# Support for bulk doctest testing via nose
|
||||||
# nosetests --with-doctest
|
# nosetests --with-doctest
|
||||||
|
|
||||||
@ -207,7 +237,7 @@ def test_data_dir():
|
|||||||
import os
|
import os
|
||||||
return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
|
return os.path.dirname(os.path.realpath(__file__)) + '/../sample_archive/'
|
||||||
|
|
||||||
#============================================
|
#=================================================================
|
||||||
|
|
||||||
if __name__ == "__main__" or enable_doctests():
|
if __name__ == "__main__" or enable_doctests():
|
||||||
import doctest
|
import doctest
|
||||||
|
Loading…
x
Reference in New Issue
Block a user