mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
dir agg: include filename in dir source name
This commit is contained in:
parent
31fb2f926f
commit
7b847311d5
@ -228,8 +228,11 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
||||
print('Adding ' + filename)
|
||||
rel_path = os.path.relpath(the_dir, self.base_prefix)
|
||||
if rel_path == '.':
|
||||
rel_path = ''
|
||||
yield rel_path, FileIndexSource(filename)
|
||||
full_name = name
|
||||
else:
|
||||
full_name = rel_path + '/' + name
|
||||
|
||||
yield full_name, FileIndexSource(filename)
|
||||
|
||||
def __str__(self):
|
||||
return 'file_dir'
|
||||
|
@ -45,16 +45,16 @@ def setup_module():
|
||||
global dir_loader
|
||||
dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
|
||||
|
||||
global orig_cwd
|
||||
orig_cwd = os.getcwd()
|
||||
os.chdir(root_dir)
|
||||
#global orig_cwd
|
||||
#orig_cwd = os.getcwd()
|
||||
#os.chdir(root_dir)
|
||||
|
||||
# use actually set dir
|
||||
root_dir = os.getcwd()
|
||||
#root_dir = os.getcwd()
|
||||
|
||||
def teardown_module():
|
||||
global orig_cwd
|
||||
os.chdir(orig_cwd)
|
||||
#global orig_cwd
|
||||
#os.chdir(orig_cwd)
|
||||
|
||||
global root_dir
|
||||
shutil.rmtree(root_dir)
|
||||
@ -72,7 +72,7 @@ def test_agg_no_coll_set():
|
||||
def test_agg_collA_found():
|
||||
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'})
|
||||
|
||||
exp = [{'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
|
||||
exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
@ -88,7 +88,7 @@ def test_agg_collB():
|
||||
def test_agg_collB_found():
|
||||
res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
|
||||
exp = [{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
@ -98,7 +98,7 @@ def test_extra_agg_collB():
|
||||
agg_source = SimpleAggregator({'dir': dir_loader})
|
||||
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
|
||||
|
||||
exp = [{'source': 'dir:colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
assert(errs == {})
|
||||
@ -108,9 +108,9 @@ def test_agg_all_found_1():
|
||||
res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'})
|
||||
|
||||
exp = [
|
||||
{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
|
||||
{'source': 'colls/C/indexes', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/C/indexes', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
|
||||
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
@ -121,9 +121,9 @@ def test_agg_all_found_2():
|
||||
res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'})
|
||||
|
||||
exp = [
|
||||
{'source': 'colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
@ -145,9 +145,9 @@ def test_agg_dir_and_memento():
|
||||
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
|
||||
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
|
||||
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
|
||||
{'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
|
||||
]
|
||||
|
||||
assert(to_json_list(res) == exp)
|
||||
@ -175,9 +175,9 @@ def test_agg_no_dir_2():
|
||||
|
||||
def test_agg_dir_sources_1():
|
||||
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
|
||||
exp = {'sources': {'colls/A/indexes': 'file',
|
||||
'colls/B/indexes': 'file',
|
||||
'colls/C/indexes': 'file'}
|
||||
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
|
||||
'colls/B/indexes/iana.cdxj': 'file',
|
||||
'colls/C/indexes/dupes.cdxj': 'file'}
|
||||
}
|
||||
|
||||
assert(res == exp)
|
||||
@ -185,15 +185,24 @@ def test_agg_dir_sources_1():
|
||||
|
||||
def test_agg_dir_sources_2():
|
||||
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
|
||||
exp = {'sources': {'colls/A/indexes': 'file',
|
||||
'colls/C/indexes': 'file'}
|
||||
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
|
||||
'colls/C/indexes/dupes.cdxj': 'file'}
|
||||
}
|
||||
|
||||
assert(res == exp)
|
||||
|
||||
|
||||
def test_agg_dir_sources_single_dir():
|
||||
loader = DirectoryIndexSource('testdata/', '')
|
||||
loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'A', 'indexes'), '')
|
||||
res = loader.get_source_list({'url': 'example.com/'})
|
||||
|
||||
exp = {'sources': {'example.cdxj': 'file'}}
|
||||
|
||||
assert(res == exp)
|
||||
|
||||
|
||||
def test_agg_dir_sources_not_found_dir():
|
||||
loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'Z', 'indexes'), '')
|
||||
res = loader.get_source_list({'url': 'example.com/'})
|
||||
|
||||
exp = {'sources': {}}
|
||||
@ -201,3 +210,4 @@ def test_agg_dir_sources_single_dir():
|
||||
assert(res == exp)
|
||||
|
||||
|
||||
|
||||
|
@ -203,7 +203,7 @@ class TestResAgg(object):
|
||||
def test_agg_select_local(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624')
|
||||
|
||||
assert resp.headers['WebAgg-Source-Coll'] == 'local'
|
||||
assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'
|
||||
|
||||
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
|
||||
|
||||
@ -222,7 +222,7 @@ Host: iana.org
|
||||
|
||||
resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data)
|
||||
|
||||
assert resp.headers['WebAgg-Source-Coll'] == 'local'
|
||||
assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj'
|
||||
|
||||
self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z')
|
||||
|
||||
@ -336,7 +336,7 @@ foo=bar&test=abc"""
|
||||
def test_agg_local_revisit(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local')
|
||||
|
||||
assert resp.headers['WebAgg-Source-Coll'] == 'local'
|
||||
assert resp.headers['WebAgg-Source-Coll'] == 'local:dupes.cdxj'
|
||||
|
||||
buff = BytesIO(resp.body)
|
||||
status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)
|
||||
|
Loading…
x
Reference in New Issue
Block a user