diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index c323d943..77506ab8 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -245,9 +245,10 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato #============================================================================= class BaseDirectoryIndexSource(BaseAggregator): - def __init__(self, base_prefix, base_dir=''): + def __init__(self, base_prefix, base_dir='', name=''): self.base_prefix = base_prefix self.base_dir = base_dir + self.name = name def _iter_sources(self, params): the_dir = res_template(self.base_dir, params) @@ -276,6 +277,9 @@ class BaseDirectoryIndexSource(BaseAggregator): else: full_name = os.path.join(rel_path, name) + if self.name: + full_name = self.name + ':' + full_name + yield full_name, FileIndexSource(filename) def __repr__(self): diff --git a/pywb/warcserver/index/test/test_dir_agg.py b/pywb/warcserver/index/test/test_dir_agg.py index b5e615fe..139a28f4 100644 --- a/pywb/warcserver/index/test/test_dir_agg.py +++ b/pywb/warcserver/index/test/test_dir_agg.py @@ -36,8 +36,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): os.makedirs(coll_B) os.makedirs(coll_C) - dir_prefix = to_path(cls.root_dir) - dir_path ='colls/{coll}/indexes' + dir_prefix = os.path.join(cls.root_dir, 'colls') + dir_path = '{coll}/indexes' + dir_name = 'colls' shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A) shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B) @@ -46,8 +47,8 @@ class TestDirAgg(TempDirTests, BaseTestClass): with open(to_path(cls.root_dir) + '/somefile', 'w') as fh: fh.write('foo') - cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path) - cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path) + cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path, dir_name) + cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path, dir_name) def test_agg_no_coll_set(self): res, errs = self.dir_loader(dict(url='example.com/')) @@ -57,7 +58,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_collA_found(self): res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'}) - exp = [{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}] + exp = [{'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -73,7 +74,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_collB_found(self): res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'}) - exp = [{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + exp = [{'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -83,7 +84,7 @@ class TestDirAgg(TempDirTests, BaseTestClass): agg_source = SimpleAggregator({'dir': self.dir_loader}) res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'}) - exp = [{'source': to_path('dir:colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + exp = [{'source': to_path('dir:colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -93,9 +94,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'}) exp = [ - {'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, - {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, - {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, + {'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, ] assert(to_json_list(res) == exp) @@ -106,9 +107,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'}) exp = [ - {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} + {'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} ] assert(to_json_list(res) == exp) @@ -126,9 +127,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'}, - {'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': to_path('local:colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} + {'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': to_path('local:colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'} ] assert(to_json_list(res) == exp) @@ -156,18 +157,17 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_agg_dir_sources_1(self): res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) - exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file', - to_path('colls/B/indexes/iana.cdxj'): 'file', - to_path('colls/C/indexes/dupes.cdxj'): 'file'} + exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file', + to_path('colls:B/indexes/iana.cdxj'): 'file', + to_path('colls:C/indexes/dupes.cdxj'): 'file'} } assert(res == exp) - def test_agg_dir_sources_2(self): res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'}) - exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file', - to_path('colls/C/indexes/dupes.cdxj'): 'file'} + exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file', + to_path('colls:C/indexes/dupes.cdxj'): 'file'} } assert(res == exp) @@ -193,9 +193,9 @@ class TestDirAgg(TempDirTests, BaseTestClass): def test_cache_dir_sources_1(self): - exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file', - to_path('colls/B/indexes/iana.cdxj'): 'file', - to_path('colls/C/indexes/dupes.cdxj'): 'file'} + exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file', + to_path('colls:B/indexes/iana.cdxj'): 'file', + to_path('colls:C/indexes/dupes.cdxj'): 'file'} } res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) @@ -215,5 +215,5 @@ class TestDirAgg(TempDirTests, BaseTestClass): res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) # New File Included - exp['sources'][to_path('colls/C/indexes/empty.cdxj')] = 'file' + exp['sources'][to_path('colls:C/indexes/empty.cdxj')] = 'file' assert(res == exp) diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 1cce7193..96fb63e7 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -86,7 +86,9 @@ class WarcServer(BaseWarcServer): #indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep) - dir_source = CacheDirectoryIndexSource(self.root_dir, self.indexes_templ) + dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir, + base_dir=self.indexes_templ, + name=self.root_dir) self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep) self.archive_templ = os.path.join(self.root_dir, self.archive_templ)