mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
warcserver: DirectoryAggregator:
- support naming the directory aggregator so that the source is reported as '<name>:<path/to/index>' when the optional name is present
- for the default WarcServer, use the colls dir as the name, so 'source' entries default to 'collections:<coll/indexes/index.cdxj>'
- tests: update tests to pass a name to the directory aggregator for more consistent source names
This commit is contained in:
parent
01597c1060
commit
5791980132
@ -245,9 +245,10 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato
|
||||
|
||||
#=============================================================================
|
||||
class BaseDirectoryIndexSource(BaseAggregator):
|
||||
def __init__(self, base_prefix, base_dir=''):
|
||||
def __init__(self, base_prefix, base_dir='', name=''):
|
||||
self.base_prefix = base_prefix
|
||||
self.base_dir = base_dir
|
||||
self.name = name
|
||||
|
||||
def _iter_sources(self, params):
|
||||
the_dir = res_template(self.base_dir, params)
|
||||
@ -276,6 +277,9 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
||||
else:
|
||||
full_name = os.path.join(rel_path, name)
|
||||
|
||||
if self.name:
|
||||
full_name = self.name + ':' + full_name
|
||||
|
||||
yield full_name, FileIndexSource(filename)
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -36,8 +36,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
os.makedirs(coll_B)
|
||||
os.makedirs(coll_C)
|
||||
|
||||
dir_prefix = to_path(cls.root_dir)
|
||||
dir_path ='colls/{coll}/indexes'
|
||||
dir_prefix = os.path.join(cls.root_dir, 'colls')
|
||||
dir_path = '{coll}/indexes'
|
||||
dir_name = 'colls'
|
||||
|
||||
shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A)
|
||||
shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B)
|
||||
@ -46,8 +47,8 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
|
||||
fh.write('foo')
|
||||
|
||||
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
|
||||
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
|
||||
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path, dir_name)
|
||||
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path, dir_name)
|
||||
|
||||
def test_agg_no_coll_set(self):
|
||||
res, errs = self.dir_loader(dict(url='example.com/'))
|
||||
@ -57,7 +58,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
def test_agg_collA_found(self):
|
||||
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
|
||||
|
||||
exp = [{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
|
||||
exp = [{'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
@ -73,7 +74,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
def test_agg_collB_found(self):
|
||||
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
|
||||
exp = [{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
exp = [{'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
@ -83,7 +84,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
agg_source = SimpleAggregator({'dir': self.dir_loader})
|
||||
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
|
||||
exp = [{'source': to_path('dir:colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
exp = [{'source': to_path('dir:colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
@ -93,9 +94,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
|
||||
|
||||
exp = [
|
||||
{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
|
||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
|
||||
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
@ -106,9 +107,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
|
||||
|
||||
exp = [
|
||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
@ -126,9 +127,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
|
||||
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('local:colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||
{'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': to_path('local:colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
@ -156,18 +157,17 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
|
||||
def test_agg_dir_sources_1(self):
|
||||
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
|
||||
to_path('colls/B/indexes/iana.cdxj'): 'file',
|
||||
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
|
||||
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
|
||||
to_path('colls:B/indexes/iana.cdxj'): 'file',
|
||||
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
|
||||
}
|
||||
|
||||
assert(res == exp)
|
||||
|
||||
|
||||
def test_agg_dir_sources_2(self):
|
||||
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
|
||||
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
|
||||
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
|
||||
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
|
||||
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
|
||||
}
|
||||
|
||||
assert(res == exp)
|
||||
@ -193,9 +193,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
|
||||
|
||||
def test_cache_dir_sources_1(self):
|
||||
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
|
||||
to_path('colls/B/indexes/iana.cdxj'): 'file',
|
||||
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
|
||||
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
|
||||
to_path('colls:B/indexes/iana.cdxj'): 'file',
|
||||
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
|
||||
}
|
||||
|
||||
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||
@ -215,5 +215,5 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
||||
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||
|
||||
# New File Included
|
||||
exp['sources'][to_path('colls/C/indexes/empty.cdxj')] = 'file'
|
||||
exp['sources'][to_path('colls:C/indexes/empty.cdxj')] = 'file'
|
||||
assert(res == exp)
|
||||
|
@ -86,7 +86,9 @@ class WarcServer(BaseWarcServer):
|
||||
|
||||
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
|
||||
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
||||
dir_source = CacheDirectoryIndexSource(self.root_dir, self.indexes_templ)
|
||||
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
|
||||
base_dir=self.indexes_templ,
|
||||
name=self.root_dir)
|
||||
|
||||
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
||||
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
|
||||
|
Loading…
x
Reference in New Issue
Block a user