From 8d3d326c9e3ce58ceeac32e9832e47648c75d7a6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 2 Apr 2014 11:41:20 -0700 Subject: [PATCH] tests: add pathresolver tests for RedisResolver and PathIndexResolver --- pywb/warc/pathresolvers.py | 54 ++-------------- pywb/warc/recordloader.py | 3 +- pywb/warc/test/test_pathresolvers.py | 77 +++++++++++++++++++++++ sample_archive/text_content/pathindex.txt | 2 + 4 files changed, 87 insertions(+), 49 deletions(-) create mode 100644 pywb/warc/test/test_pathresolvers.py create mode 100644 sample_archive/text_content/pathindex.txt diff --git a/pywb/warc/pathresolvers.py b/pywb/warc/pathresolvers.py index 7b275c0c..5419eeb9 100644 --- a/pywb/warc/pathresolvers.py +++ b/pywb/warc/pathresolvers.py @@ -46,12 +46,8 @@ class RedisResolver: self.redis = redis.StrictRedis.from_url(redis_url) def __call__(self, filename): - try: - redis_val = self.redis.hget(self.key_prefix + filename, 'path') - return [redis_val] if redis_val else None - except Exception as e: - print e - return None + redis_val = self.redis.hget(self.key_prefix + filename, 'path') + return [redis_val] if redis_val else [] def __repr__(self): return "RedisResolver('{0}')".format(self.redis_url) @@ -68,13 +64,13 @@ class PathIndexResolver: def gen_list(result): for pathline in result: - path = pathline.split('\t') - if len(path) == 2: - yield path[1] + paths = pathline.split('\t')[1:] + for path in paths: + yield path return gen_list(result) - def __repr__(self): + def __repr__(self): # pragma: no cover return "PathIndexResolver('{0}')".format(self.pathindex_file) @@ -82,32 +78,6 @@ class PathIndexResolver: #TODO: more options (remote files, contains param, etc..) # find best resolver given the path def make_best_resolver(param): - """ - # http path - >>> make_best_resolver('http://myhost.example.com/warcs/') - PrefixResolver('http://myhost.example.com/warcs/') - - # http path w/ contains param - >>> make_best_resolver(['http://myhost.example.com/warcs/', '/']) - PrefixResolver('http://myhost.example.com/warcs/', contains = '/') - - # redis path - >>> make_best_resolver('redis://myhost.example.com:1234/1') - RedisResolver('redis://myhost.example.com:1234/1') - - # a file - >>> r = make_best_resolver('file://' + os.path.realpath(__file__)) - >>> r.__class__.__name__ - 'PathIndexResolver' - - # a dir - >>> path = os.path.realpath(__file__) - >>> r = make_best_resolver('file://' + os.path.dirname(path)) - >>> r.__class__.__name__ - 'PrefixResolver' - - """ - if isinstance(param, list): path = param[0] arg = param[1] @@ -136,19 +106,7 @@ def make_best_resolver(param): #================================================================= def make_best_resolvers(paths): - """ - >>> r = make_best_resolvers(['http://example.com/warcs/',\ - 'redis://example.com:1234/1']) - >>> map(lambda x: x.__class__.__name__, r) - ['PrefixResolver', 'RedisResolver'] - """ if hasattr(paths, '__iter__'): return map(make_best_resolver, paths) else: return [make_best_resolver(paths)] - - -#================================================================= -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 9189b2c7..11524fed 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -31,7 +31,8 @@ class ArchiveLoadFailed(WbException): #================================================================= class ArcWarcRecordLoader: - # Standard ARC headers + # Standard ARC v1.0 headers + # TODO: support ARV v2.0 also? ARC_HEADERS = ["uri", "ip-address", "archive-date", "content-type", "length"] diff --git a/pywb/warc/test/test_pathresolvers.py b/pywb/warc/test/test_pathresolvers.py new file mode 100644 index 00000000..923c0ce1 --- /dev/null +++ b/pywb/warc/test/test_pathresolvers.py @@ -0,0 +1,77 @@ +""" +# PathIndexResolver tests +>>> list(PathIndexResolver(get_test_dir() + 'text_content/pathindex.txt')('example.warc.gz')) +['invalid_path', 'sample_archive/warcs/example.warc.gz'] + +>>> list(PathIndexResolver(get_test_dir() + 'text_content/pathindex.txt')('iana.warc.gz')) +['sample_archive/warcs/iana.warc.gz'] + +>>> list(PathIndexResolver(get_test_dir() + 'text_content/pathindex.txt')('not-found.gz')) +[] + +# RedisResolver tests +# not set, no match +>>> redis_resolver('example.warc.gz') +[] + +>>> hset_path('example.warc.gz', 'some_path/example.warc.gz') +>>> redis_resolver('example.warc.gz') +['some_path/example.warc.gz'] + + +make_best_resolver tests +# http path +>>> make_best_resolver('http://myhost.example.com/warcs/') +PrefixResolver('http://myhost.example.com/warcs/') + +# http path w/ contains param +>>> make_best_resolver(['http://myhost.example.com/warcs/', '/']) +PrefixResolver('http://myhost.example.com/warcs/', contains = '/') + +# redis path +>>> make_best_resolver('redis://myhost.example.com:1234/1') +RedisResolver('redis://myhost.example.com:1234/1') + +# a file +>>> r = make_best_resolver('file://' + os.path.realpath(__file__)) +>>> r.__class__.__name__ +'PathIndexResolver' + +# a dir +>>> path = os.path.realpath(__file__) +>>> r = make_best_resolver('file://' + os.path.dirname(path)) +>>> r.__class__.__name__ +'PrefixResolver' + + +# make_best_resolvers +>>> r = make_best_resolvers(['http://example.com/warcs/',\ + 'redis://example.com:1234/1']) +>>> map(lambda x: x.__class__.__name__, r) +['PrefixResolver', 'RedisResolver'] +""" + +from pywb import get_test_dir +from pywb.warc.pathresolvers import PrefixResolver, PathIndexResolver, RedisResolver +from pywb.warc.pathresolvers import make_best_resolver, make_best_resolvers +import os + + +from fakeredis import FakeStrictRedis +from mock import patch + +@patch('redis.StrictRedis', FakeStrictRedis) +def init_redis_resolver(): + return RedisResolver('redis://127.0.0.1:6379/0') + + +def hset_path(filename, path): + redis_resolver.redis.hset(redis_resolver.key_prefix + filename, 'path', path) + + +redis_resolver = init_redis_resolver() + +#================================================================= +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/sample_archive/text_content/pathindex.txt b/sample_archive/text_content/pathindex.txt new file mode 100644 index 00000000..6b33a3bb --- /dev/null +++ b/sample_archive/text_content/pathindex.txt @@ -0,0 +1,2 @@ +example.warc.gz invalid_path sample_archive/warcs/example.warc.gz +iana.warc.gz sample_archive/warcs/iana.warc.gz