diff --git a/webagg/test/test_dir_agg.py b/webagg/test/test_dir_agg.py index 14d011aa..165b6346 100644 --- a/webagg/test/test_dir_agg.py +++ b/webagg/test/test_dir_agg.py @@ -3,7 +3,7 @@ import os import shutil import json -from .testutils import to_path +from .testutils import to_path, to_json_list, TempDirTests from mock import patch @@ -12,202 +12,179 @@ from webagg.indexsource import MementoIndexSource #============================================================================= -root_dir = None -orig_cwd = None -dir_loader = None - linkheader = """\ ; rel="original", ; rel="timemap"; type="application/link-format", ; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", ; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", ; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", ; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", ; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\ """ -def setup_module(): - global root_dir - root_dir = tempfile.mkdtemp() - - coll_A = to_path(root_dir + '/colls/A/indexes') - coll_B = to_path(root_dir + '/colls/B/indexes') - coll_C = to_path(root_dir + '/colls/C/indexes') - - os.makedirs(coll_A) - os.makedirs(coll_B) - os.makedirs(coll_C) - - dir_prefix = to_path(root_dir) - dir_path ='colls/{coll}/indexes' - - shutil.copy(to_path('testdata/example.cdxj'), coll_A) - shutil.copy(to_path('testdata/iana.cdxj'), coll_B) - shutil.copy(to_path('testdata/dupes.cdxj'), coll_C) - - with open(to_path(root_dir) + 'somefile', 'w') as fh: - fh.write('foo') - - global dir_loader - dir_loader = DirectoryIndexSource(dir_prefix, dir_path) - - #global orig_cwd - #orig_cwd = os.getcwd() - #os.chdir(root_dir) - - # use actually set dir - #root_dir = os.getcwd() - -def teardown_module(): - #global orig_cwd - #os.chdir(orig_cwd) - - global root_dir - shutil.rmtree(root_dir) - - -def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']): - return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist]) - - -def test_agg_no_coll_set(): - res, errs = dir_loader(dict(url='example.com/')) - assert(to_json_list(res) == []) - assert(errs == {}) - -def test_agg_collA_found(): - res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'}) - - exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}] - - assert(to_json_list(res) == exp) - assert(errs == {}) - -def test_agg_collB(): - res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'B'}) - - exp = [] - - assert(to_json_list(res) == exp) - assert(errs == {}) - -def test_agg_collB_found(): - res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'}) - - exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] - - assert(to_json_list(res) == exp) - assert(errs == {}) - - -def test_extra_agg_collB(): - agg_source = SimpleAggregator({'dir': dir_loader}) - res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'}) - - exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] - - assert(to_json_list(res) == exp) - assert(errs == {}) - - -def test_agg_all_found_1(): - res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'}) - - exp = [ - {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, - ] - - assert(to_json_list(res) == exp) - assert(errs == {}) - - -def test_agg_all_found_2(): - res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'}) - - exp = [ - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} - ] - - assert(to_json_list(res) == exp) - assert(errs == {}) - def mock_link_header(*args, **kwargs): return linkheader -@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header) -def test_agg_dir_and_memento(): - sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), - 'local': dir_loader} - agg_source = SimpleAggregator(sources) - res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6}) +class TestDirAgg(TempDirTests): + @classmethod + def setup_class(cls): + super(TestDirAgg, cls).setup_class() + coll_A = to_path(cls.root_dir + '/colls/A/indexes') + coll_B = to_path(cls.root_dir + '/colls/B/indexes') + coll_C = to_path(cls.root_dir + '/colls/C/indexes') - exp = [ - {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'}, - {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'}, - {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'}, - {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, - {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, - {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} - ] + os.makedirs(coll_A) + os.makedirs(coll_B) + os.makedirs(coll_C) - assert(to_json_list(res) == exp) - assert(errs == {}) + dir_prefix = to_path(cls.root_dir) + dir_path ='colls/{coll}/indexes' + + shutil.copy(to_path('testdata/example.cdxj'), coll_A) + shutil.copy(to_path('testdata/iana.cdxj'), coll_B) + shutil.copy(to_path('testdata/dupes.cdxj'), coll_C) + + with open(to_path(cls.root_dir) + 'somefile', 'w') as fh: + fh.write('foo') + + cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path) + + def test_agg_no_coll_set(self): + res, errs = self.dir_loader(dict(url='example.com/')) + assert(to_json_list(res) == []) + assert(errs == {}) + + def test_agg_collA_found(self): + res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'}) + + exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + def test_agg_collB(self): + res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'}) + + exp = [] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + def test_agg_collB_found(self): + res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'}) + + exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] + + assert(to_json_list(res) == exp) + assert(errs == {}) -def test_agg_no_dir_1(): - res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'X'}) + def test_extra_agg_collB(self): + agg_source = SimpleAggregator({'dir': self.dir_loader}) + res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'}) - exp = [] + exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}] - assert(to_json_list(res) == exp) - assert(errs == {}) + assert(to_json_list(res) == exp) + assert(errs == {}) -def test_agg_no_dir_2(): - loader = DirectoryIndexSource(root_dir, '') - res, errs = loader({'url': 'example.com/', 'param.coll': 'X'}) + def test_agg_all_found_1(self): + res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'}) - exp = [] + exp = [ + {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'}, + ] - assert(to_json_list(res) == exp) - assert(errs == {}) + assert(to_json_list(res) == exp) + assert(errs == {}) -def test_agg_dir_sources_1(): - res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) - exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', - 'colls/B/indexes/iana.cdxj': 'file', - 'colls/C/indexes/dupes.cdxj': 'file'} - } + def test_agg_all_found_2(self): + res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'}) - assert(res == exp) + exp = [ + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + ] + + assert(to_json_list(res) == exp) + assert(errs == {}) + + @patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header) + def test_agg_dir_and_memento(self): + sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'), + 'local': self.dir_loader} + agg_source = SimpleAggregator(sources) + + res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6}) + + exp = [ + {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'}, + {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'}, + {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'}, + {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'}, + {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'}, + {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'} + ] + + assert(to_json_list(res) == exp) + assert(errs == {}) -def test_agg_dir_sources_2(): - res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'}) - exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', - 'colls/C/indexes/dupes.cdxj': 'file'} - } + def test_agg_no_dir_1(self): + res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'X'}) - assert(res == exp) + exp = [] + + assert(to_json_list(res) == exp) + assert(errs == {}) -def test_agg_dir_sources_single_dir(): - loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'A', 'indexes'), '') - res = loader.get_source_list({'url': 'example.com/'}) + def test_agg_no_dir_2(self): + loader = DirectoryIndexSource(self.root_dir, '') + res, errs = loader({'url': 'example.com/', 'param.coll': 'X'}) - exp = {'sources': {'example.cdxj': 'file'}} + exp = [] - assert(res == exp) + assert(to_json_list(res) == exp) + assert(errs == {}) -def test_agg_dir_sources_not_found_dir(): - loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'Z', 'indexes'), '') - res = loader.get_source_list({'url': 'example.com/'}) + def test_agg_dir_sources_1(self): + res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'}) + exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + 'colls/B/indexes/iana.cdxj': 'file', + 'colls/C/indexes/dupes.cdxj': 'file'} + } - exp = {'sources': {}} + assert(res == exp) - assert(res == exp) + + def test_agg_dir_sources_2(self): + res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'}) + exp = {'sources': {'colls/A/indexes/example.cdxj': 'file', + 'colls/C/indexes/dupes.cdxj': 'file'} + } + + assert(res == exp) + + + def test_agg_dir_sources_single_dir(self): + loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '') + res = loader.get_source_list({'url': 'example.com/'}) + + exp = {'sources': {'example.cdxj': 'file'}} + + assert(res == exp) + + + def test_agg_dir_sources_not_found_dir(self): + loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'Z', 'indexes'), '') + res = loader.get_source_list({'url': 'example.com/'}) + + exp = {'sources': {}} + + assert(res == exp) diff --git a/webagg/test/test_memento_agg.py b/webagg/test/test_memento_agg.py index 52dc79da..784bf785 100644 --- a/webagg/test/test_memento_agg.py +++ b/webagg/test/test_memento_agg.py @@ -4,7 +4,7 @@ from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator from webagg.aggregator import BaseAggregator from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource -from .testutils import json_list, to_path +from .testutils import to_json_list, to_path import json import pytest @@ -48,7 +48,7 @@ def test_mem_agg_index_1(agg): {"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source": "ait"} ] - assert(json_list(res) == exp) + assert(to_json_list(res) == exp) assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)", 'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"}) @@ -65,7 +65,7 @@ def test_mem_agg_index_2(agg): {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"}, {"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}] - assert(json_list(res) == exp) + assert(to_json_list(res) == exp) assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"}) @@ -80,7 +80,7 @@ def test_mem_agg_index_3(agg): {"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"}, {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}] - assert(json_list(res) == exp) + assert(to_json_list(res) == exp) assert(errs == {}) @@ -92,7 +92,7 @@ def test_mem_agg_index_4(agg): exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}, {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}] - assert(json_list(res) == exp) + assert(to_json_list(res) == exp) assert(errs == {}) @@ -101,7 +101,7 @@ def test_mem_agg_not_found(agg): url = 'http://vvork.com/' res, errs = agg(dict(url=url, closest='20141001', limit=2)) - assert(json_list(res) == []) + assert(to_json_list(res) == []) assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"}) @@ -118,7 +118,7 @@ def test_mem_agg_timeout(agg): res, errs = agg(dict(url=url, closest='20141001', limit=2)) BaseAggregator.load_child_source = orig_source - assert(json_list(res) == []) + assert(to_json_list(res) == []) assert(errs == {'local': 'timeout', 'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'}) diff --git a/webagg/test/test_timeouts.py b/webagg/test/test_timeouts.py index 04370c5d..60080ce6 100644 --- a/webagg/test/test_timeouts.py +++ b/webagg/test/test_timeouts.py @@ -5,7 +5,7 @@ from webagg.indexsource import FileIndexSource from webagg.aggregator import SimpleAggregator, TimeoutMixin from webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator -from .testutils import json_list +from .testutils import to_json_list class TimeoutFileSource(FileIndexSource): @@ -41,7 +41,7 @@ def test_timeout_long_all_pass(): {'source': 'slower', 'timestamp': '20140127171251'}, {'source': 'slow', 'timestamp': '20160225042329'}] - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(errs == {}) @@ -53,7 +53,7 @@ def test_timeout_slower_skipped_1(): exp = [{'source': 'slow', 'timestamp': '20160225042329'}] - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(errs == {'slower': 'timeout'}) @@ -65,7 +65,7 @@ def test_timeout_slower_skipped_2(): exp = [] - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(errs == {'slower': 'timeout', 'slow': 'timeout'}) @@ -80,28 +80,28 @@ def test_timeout_skipping(): exp = [{'source': 'slow', 'timestamp': '20160225042329'}] res, errs = agg(dict(url='http://example.com/')) - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(sources['slow'].calls == 4) assert(sources['slower'].calls == 4) assert(errs == {'slower': 'timeout'}) res, errs = agg(dict(url='http://example.com/')) - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(sources['slow'].calls == 5) assert(sources['slower'].calls == 5) assert(errs == {'slower': 'timeout'}) res, errs = agg(dict(url='http://example.com/')) - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(sources['slow'].calls == 6) assert(sources['slower'].calls == 5) assert(errs == {}) res, errs = agg(dict(url='http://example.com/')) - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(sources['slow'].calls == 7) assert(sources['slower'].calls == 5) @@ -110,7 +110,7 @@ def test_timeout_skipping(): time.sleep(2.01) res, errs = agg(dict(url='http://example.com/')) - assert(json_list(res, fields=['source', 'timestamp']) == exp) + assert(to_json_list(res, fields=['source', 'timestamp']) == exp) assert(sources['slow'].calls == 8) assert(sources['slower'].calls == 6) diff --git a/webagg/test/testutils.py b/webagg/test/testutils.py index b9f8ab98..61f8b155 100644 --- a/webagg/test/testutils.py +++ b/webagg/test/testutils.py @@ -1,7 +1,9 @@ import json import os +import tempfile +import shutil -def json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']): +def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']): return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist]) def key_ts_res(cdxlist, extra='filename'): @@ -14,3 +16,12 @@ def to_path(path): return path +class TempDirTests(object): + @classmethod + def setup_class(cls): + cls.root_dir = tempfile.mkdtemp() + + @classmethod + def teardown_class(cls): + shutil.rmtree(cls.root_dir) +