1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

warcserver: DirectoryAggregator:

- support naming a directory aggregator, so that each source is reported as '<name>:<path/to/index>' when the optional name is provided
- for the default WarcServer, use the colls dir as the name, so that 'source' entries default to 'collections:<coll/indexes/index.cdxj>'
- tests: update the directory aggregator tests to pass a name, for more consistent source names
This commit is contained in:
Ilya Kreymer 2017-09-28 01:52:07 -07:00
parent 01597c1060
commit 5791980132
3 changed files with 34 additions and 28 deletions

View File

@ -245,9 +245,10 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato
#=============================================================================
class BaseDirectoryIndexSource(BaseAggregator):
def __init__(self, base_prefix, base_dir=''):
def __init__(self, base_prefix, base_dir='', name=''):
self.base_prefix = base_prefix
self.base_dir = base_dir
self.name = name
def _iter_sources(self, params):
the_dir = res_template(self.base_dir, params)
@ -276,6 +277,9 @@ class BaseDirectoryIndexSource(BaseAggregator):
else:
full_name = os.path.join(rel_path, name)
if self.name:
full_name = self.name + ':' + full_name
yield full_name, FileIndexSource(filename)
def __repr__(self):

View File

@ -36,8 +36,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
os.makedirs(coll_B)
os.makedirs(coll_C)
dir_prefix = to_path(cls.root_dir)
dir_path ='colls/{coll}/indexes'
dir_prefix = os.path.join(cls.root_dir, 'colls')
dir_path = '{coll}/indexes'
dir_name = 'colls'
shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A)
shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B)
@ -46,8 +47,8 @@ class TestDirAgg(TempDirTests, BaseTestClass):
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
fh.write('foo')
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path, dir_name)
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path, dir_name)
def test_agg_no_coll_set(self):
res, errs = self.dir_loader(dict(url='example.com/'))
@ -57,7 +58,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_collA_found(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
exp = [{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
exp = [{'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@ -73,7 +74,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_collB_found(self):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
exp = [{'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@ -83,7 +84,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
agg_source = SimpleAggregator({'dir': self.dir_loader})
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': to_path('dir:colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
exp = [{'source': to_path('dir:colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
@ -93,9 +94,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
exp = [
{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
]
assert(to_json_list(res) == exp)
@ -106,9 +107,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
exp = [
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
@ -126,9 +127,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': to_path('local:colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
{'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': to_path('local:colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
]
assert(to_json_list(res) == exp)
@ -156,18 +157,17 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_agg_dir_sources_1(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
to_path('colls/B/indexes/iana.cdxj'): 'file',
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
to_path('colls:B/indexes/iana.cdxj'): 'file',
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
}
assert(res == exp)
def test_agg_dir_sources_2(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
}
assert(res == exp)
@ -193,9 +193,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
def test_cache_dir_sources_1(self):
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
to_path('colls/B/indexes/iana.cdxj'): 'file',
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
to_path('colls:B/indexes/iana.cdxj'): 'file',
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
}
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
@ -215,5 +215,5 @@ class TestDirAgg(TempDirTests, BaseTestClass):
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
# New File Included
exp['sources'][to_path('colls/C/indexes/empty.cdxj')] = 'file'
exp['sources'][to_path('colls:C/indexes/empty.cdxj')] = 'file'
assert(res == exp)

View File

@ -86,7 +86,9 @@ class WarcServer(BaseWarcServer):
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
dir_source = CacheDirectoryIndexSource(self.root_dir, self.indexes_templ)
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
base_dir=self.indexes_templ,
name=self.root_dir)
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)