mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-16 00:24:48 +01:00
warcserver: DirectoryAggregator:
- Support naming the directory aggregator: if the optional name is present, each source is reported as '<name>:<path/to/index>'.
- For the default WarcServer, use the colls dir as the name, so that 'source' entries default to 'collections:<coll/indexes/index.cdxj>'.
- tests: update the tests to pass a name to the directory aggregator, for more consistent source names.
This commit is contained in:
parent
01597c1060
commit
5791980132
pywb/warcserver
@ -245,9 +245,10 @@ class GeventTimeoutAggregator(TimeoutMixin, GeventMixin, BaseSourceListAggregato
|
|||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class BaseDirectoryIndexSource(BaseAggregator):
|
class BaseDirectoryIndexSource(BaseAggregator):
|
||||||
def __init__(self, base_prefix, base_dir=''):
|
def __init__(self, base_prefix, base_dir='', name=''):
|
||||||
self.base_prefix = base_prefix
|
self.base_prefix = base_prefix
|
||||||
self.base_dir = base_dir
|
self.base_dir = base_dir
|
||||||
|
self.name = name
|
||||||
|
|
||||||
def _iter_sources(self, params):
|
def _iter_sources(self, params):
|
||||||
the_dir = res_template(self.base_dir, params)
|
the_dir = res_template(self.base_dir, params)
|
||||||
@ -276,6 +277,9 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
|||||||
else:
|
else:
|
||||||
full_name = os.path.join(rel_path, name)
|
full_name = os.path.join(rel_path, name)
|
||||||
|
|
||||||
|
if self.name:
|
||||||
|
full_name = self.name + ':' + full_name
|
||||||
|
|
||||||
yield full_name, FileIndexSource(filename)
|
yield full_name, FileIndexSource(filename)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -36,8 +36,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
os.makedirs(coll_B)
|
os.makedirs(coll_B)
|
||||||
os.makedirs(coll_C)
|
os.makedirs(coll_C)
|
||||||
|
|
||||||
dir_prefix = to_path(cls.root_dir)
|
dir_prefix = os.path.join(cls.root_dir, 'colls')
|
||||||
dir_path ='colls/{coll}/indexes'
|
dir_path = '{coll}/indexes'
|
||||||
|
dir_name = 'colls'
|
||||||
|
|
||||||
shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A)
|
shutil.copy(to_path(TEST_CDX_PATH + 'example2.cdxj'), coll_A)
|
||||||
shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B)
|
shutil.copy(to_path(TEST_CDX_PATH + 'iana.cdxj'), coll_B)
|
||||||
@ -46,8 +47,8 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
|
with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
|
||||||
fh.write('foo')
|
fh.write('foo')
|
||||||
|
|
||||||
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
|
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path, dir_name)
|
||||||
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)
|
cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path, dir_name)
|
||||||
|
|
||||||
def test_agg_no_coll_set(self):
|
def test_agg_no_coll_set(self):
|
||||||
res, errs = self.dir_loader(dict(url='example.com/'))
|
res, errs = self.dir_loader(dict(url='example.com/'))
|
||||||
@ -57,7 +58,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
def test_agg_collA_found(self):
|
def test_agg_collA_found(self):
|
||||||
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
|
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
|
||||||
|
|
||||||
exp = [{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
|
exp = [{'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}]
|
||||||
|
|
||||||
assert(to_json_list(res) == exp)
|
assert(to_json_list(res) == exp)
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
@ -73,7 +74,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
def test_agg_collB_found(self):
|
def test_agg_collB_found(self):
|
||||||
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
|
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
|
||||||
|
|
||||||
exp = [{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
exp = [{'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||||
|
|
||||||
assert(to_json_list(res) == exp)
|
assert(to_json_list(res) == exp)
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
@ -83,7 +84,7 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
agg_source = SimpleAggregator({'dir': self.dir_loader})
|
agg_source = SimpleAggregator({'dir': self.dir_loader})
|
||||||
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
|
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
|
||||||
|
|
||||||
exp = [{'source': to_path('dir:colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
exp = [{'source': to_path('dir:colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||||
|
|
||||||
assert(to_json_list(res) == exp)
|
assert(to_json_list(res) == exp)
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
@ -93,9 +94,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
|
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
|
||||||
|
|
||||||
exp = [
|
exp = [
|
||||||
{'source': to_path('colls/B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
|
{'source': to_path('colls:B/indexes/iana.cdxj'), 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
|
||||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||||
]
|
]
|
||||||
|
|
||||||
assert(to_json_list(res) == exp)
|
assert(to_json_list(res) == exp)
|
||||||
@ -106,9 +107,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
|
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
|
||||||
|
|
||||||
exp = [
|
exp = [
|
||||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||||
{'source': to_path('colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
{'source': to_path('colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||||
{'source': to_path('colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
{'source': to_path('colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||||
]
|
]
|
||||||
|
|
||||||
assert(to_json_list(res) == exp)
|
assert(to_json_list(res) == exp)
|
||||||
@ -126,9 +127,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
|
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
|
||||||
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
|
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
|
||||||
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
|
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
|
||||||
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
{'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||||
{'source': to_path('local:colls/C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
{'source': to_path('local:colls:C/indexes/dupes.cdxj'), 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||||
{'source': to_path('local:colls/A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
{'source': to_path('local:colls:A/indexes/example2.cdxj'), 'timestamp': '20160225042329', 'filename': 'example2.warc.gz'}
|
||||||
]
|
]
|
||||||
|
|
||||||
assert(to_json_list(res) == exp)
|
assert(to_json_list(res) == exp)
|
||||||
@ -156,18 +157,17 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
def test_agg_dir_sources_1(self):
|
def test_agg_dir_sources_1(self):
|
||||||
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||||
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
|
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
|
||||||
to_path('colls/B/indexes/iana.cdxj'): 'file',
|
to_path('colls:B/indexes/iana.cdxj'): 'file',
|
||||||
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
|
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(res == exp)
|
assert(res == exp)
|
||||||
|
|
||||||
|
|
||||||
def test_agg_dir_sources_2(self):
|
def test_agg_dir_sources_2(self):
|
||||||
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
|
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
|
||||||
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
|
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
|
||||||
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
|
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(res == exp)
|
assert(res == exp)
|
||||||
@ -193,9 +193,9 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
|
|
||||||
|
|
||||||
def test_cache_dir_sources_1(self):
|
def test_cache_dir_sources_1(self):
|
||||||
exp = {'sources': {to_path('colls/A/indexes/example2.cdxj'): 'file',
|
exp = {'sources': {to_path('colls:A/indexes/example2.cdxj'): 'file',
|
||||||
to_path('colls/B/indexes/iana.cdxj'): 'file',
|
to_path('colls:B/indexes/iana.cdxj'): 'file',
|
||||||
to_path('colls/C/indexes/dupes.cdxj'): 'file'}
|
to_path('colls:C/indexes/dupes.cdxj'): 'file'}
|
||||||
}
|
}
|
||||||
|
|
||||||
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||||
@ -215,5 +215,5 @@ class TestDirAgg(TempDirTests, BaseTestClass):
|
|||||||
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
res = self.cache_dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||||
|
|
||||||
# New File Included
|
# New File Included
|
||||||
exp['sources'][to_path('colls/C/indexes/empty.cdxj')] = 'file'
|
exp['sources'][to_path('colls:C/indexes/empty.cdxj')] = 'file'
|
||||||
assert(res == exp)
|
assert(res == exp)
|
||||||
|
@ -86,7 +86,9 @@ class WarcServer(BaseWarcServer):
|
|||||||
|
|
||||||
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
|
#indexes_templ = os.path.join('{coll}', 'indexes') + os.path.sep
|
||||||
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
self.indexes_templ = self.AUTO_DIR_INDEX_PATH.replace('/', os.path.sep)
|
||||||
dir_source = CacheDirectoryIndexSource(self.root_dir, self.indexes_templ)
|
dir_source = CacheDirectoryIndexSource(base_prefix=self.root_dir,
|
||||||
|
base_dir=self.indexes_templ,
|
||||||
|
name=self.root_dir)
|
||||||
|
|
||||||
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
self.archive_templ = self.AUTO_DIR_ARCHIVE_PATH.replace('/', os.path.sep)
|
||||||
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
|
self.archive_templ = os.path.join(self.root_dir, self.archive_templ)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user