diff --git a/webagg/aggregator.py b/webagg/aggregator.py index 8a810a63..0f148492 100644 --- a/webagg/aggregator.py +++ b/webagg/aggregator.py @@ -228,8 +228,11 @@ class BaseDirectoryIndexSource(BaseAggregator): print('Adding ' + filename) rel_path = os.path.relpath(the_dir, self.base_prefix) if rel_path == '.': - rel_path = '' - yield rel_path, FileIndexSource(filename) + full_name = name + else: + full_name = rel_path + '/' + name + + yield full_name, FileIndexSource(filename) def __str__(self): return 'file_dir' diff --git a/webagg/test/test_dir_agg.py b/webagg/test/test_dir_agg.py index 9d2db560..14d011aa 100644 --- a/webagg/test/test_dir_agg.py +++ b/webagg/test/test_dir_agg.py @@ -45,16 +45,16 @@ def setup_module(): global dir_loader dir_loader = DirectoryIndexSource(dir_prefix, dir_path) - global orig_cwd - orig_cwd = os.getcwd() - os.chdir(root_dir) + #global orig_cwd + #orig_cwd = os.getcwd() + #os.chdir(root_dir) # use actually set dir - root_dir = os.getcwd() + #root_dir = os.getcwd() def teardown_module(): - global orig_cwd - os.chdir(orig_cwd) + #global orig_cwd + #os.chdir(orig_cwd) global root_dir shutil.rmtree(root_dir) @@ -72,7 +72,7 @@ def test_agg_no_coll_set(): def test_agg_collA_found(): res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'}) - exp = [{'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}] + exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -88,7 +88,7 @@ def test_agg_collB(): def test_agg_collB_found(): res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'}) - exp = [{'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -98,7 +98,7 @@ def test_extra_agg_collB(): agg_source = SimpleAggregator({'dir': dir_loader}) res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'}) - exp = [{'source': 'dir:colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] assert(to_json_list(res) == exp) assert(errs == {}) @@ -108,9 +108,9 @@ def test_agg_all_found_1(): res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'}) exp = [ - {'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, - {'source': 'colls/C/indexes', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/C/indexes', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, ] assert(to_json_list(res) == exp) @@ -121,9 +121,9 @@ def test_agg_all_found_2(): res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'}) exp = [ - {'source': 'colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} ] assert(to_json_list(res) == exp) @@ -145,9 +145,9 @@ def test_agg_dir_and_memento(): {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'}, {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'}, - {'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} ] assert(to_json_list(res) == exp) @@ -175,9 +175,9 @@ def test_agg_no_dir_2(): def test_agg_dir_sources_1(): res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) - exp = {'sources': {'colls/A/indexes': 'file', - 'colls/B/indexes': 'file', - 'colls/C/indexes': 'file'} + exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + 'colls/B/indexes/iana.cdxj': 'file', + 'colls/C/indexes/dupes.cdxj': 'file'} } assert(res == exp) @@ -185,15 +185,24 @@ def test_agg_dir_sources_1(): def test_agg_dir_sources_2(): res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'}) - exp = {'sources': {'colls/A/indexes': 'file', - 'colls/C/indexes': 'file'} + exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + 'colls/C/indexes/dupes.cdxj': 'file'} } assert(res == exp) def test_agg_dir_sources_single_dir(): - loader = DirectoryIndexSource('testdata/', '') + loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'A', 'indexes'), '') + res = loader.get_source_list({'url': 'example.com/'}) + + exp = {'sources': {'example.cdxj': 'file'}} + + assert(res == exp) + + +def test_agg_dir_sources_not_found_dir(): + loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'Z', 'indexes'), '') res = loader.get_source_list({'url': 'example.com/'}) exp = {'sources': {}} @@ -201,3 +210,4 @@ def test_agg_dir_sources_single_dir(): assert(res == exp) + diff --git a/webagg/test/test_handlers.py b/webagg/test/test_handlers.py index c5577c5a..138584d6 100644 --- a/webagg/test/test_handlers.py +++ b/webagg/test/test_handlers.py @@ -203,7 +203,7 @@ class TestResAgg(object): def test_agg_select_local(self): resp = self.testapp.get('/many/resource?url=http://iana.org/&closest=20140126200624') - assert resp.headers['WebAgg-Source-Coll'] == 'local' + assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj' self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') @@ -222,7 +222,7 @@ Host: iana.org resp = self.testapp.post('/many/resource/postreq?url=http://iana.org/&closest=20140126200624', req_data) - assert resp.headers['WebAgg-Source-Coll'] == 'local' + assert resp.headers['WebAgg-Source-Coll'] == 'local:iana.cdxj' self._check_uri_date(resp, 'http://www.iana.org/', '2014-01-26T20:06:24Z') @@ -336,7 +336,7 @@ foo=bar&test=abc""" def test_agg_local_revisit(self): resp = self.testapp.get('/many/resource?url=http://www.example.com/&closest=20140127171251&sources=local') - assert resp.headers['WebAgg-Source-Coll'] == 'local' + assert resp.headers['WebAgg-Source-Coll'] == 'local:dupes.cdxj' buff = BytesIO(resp.body) status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(buff)