"""Tests for directory-based index aggregation.

Exercises DirectoryIndexSource / CacheDirectoryIndexSource lookup across
per-collection cdxj indexes, plus aggregation combined with a (mocked)
remote Memento timegate source.
"""
import tempfile
import os
import shutil
import json
import time

from mock import patch

from .testutils import to_path, to_json_list, TempDirTests, BaseTestClass
from webagg.aggregator import DirectoryIndexSource, CacheDirectoryIndexSource
from webagg.aggregator import SimpleAggregator
from webagg.indexsource import MementoIndexSource


#=============================================================================
# Canned Memento timegate "Link" response header (RFC 7089 link-format).
# NOTE(review): the <URL> targets are reconstructed from the datetime values,
# which correspond 1:1 to the archive.org timestamps asserted in
# test_agg_dir_and_memento (e.g. "Fri, 14 May 2010 23:18:57 GMT"
# -> 20100514231857).
linkheader = """\
<http://example.com/>; rel="original",
<http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format",
<http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT",
<http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT",
<http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT",
<http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT",
<http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
"""


def mock_link_header(*args, **kwargs):
    """Stand-in for MementoIndexSource.get_timegate_links in patched tests."""
    return linkheader


#=============================================================================
class TestDirAgg(TempDirTests, BaseTestClass):
    """Directory aggregator tests over a temp colls/{coll}/indexes layout."""

    @classmethod
    def setup_class(cls):
        """Create colls A/B/C under the temp root and seed one cdxj each."""
        super(TestDirAgg, cls).setup_class()
        coll_A = to_path(cls.root_dir + '/colls/A/indexes')
        coll_B = to_path(cls.root_dir + '/colls/B/indexes')
        coll_C = to_path(cls.root_dir + '/colls/C/indexes')

        os.makedirs(coll_A)
        os.makedirs(coll_B)
        os.makedirs(coll_C)

        dir_prefix = to_path(cls.root_dir)
        dir_path = 'colls/{coll}/indexes'

        shutil.copy(to_path('testdata/example.cdxj'), coll_A)
        shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
        shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)

        # A stray non-directory entry at the root; must be ignored by the
        # directory scan.
        with open(to_path(cls.root_dir) + '/somefile', 'w') as fh:
            fh.write('foo')

        cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
        cls.cache_dir_loader = CacheDirectoryIndexSource(dir_prefix, dir_path)

    def test_agg_no_coll_set(self):
        """No 'param.coll' set: no collection dir resolves, empty result."""
        res, errs = self.dir_loader(dict(url='example.com/'))
        assert(to_json_list(res) == [])
        assert(errs == {})

    def test_agg_collA_found(self):
        """Lookup restricted to coll A finds the example.com capture."""
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})

        exp = [{'source': 'colls/A/indexes/example.cdxj',
                'timestamp': '20160225042329',
                'filename': 'example.warc.gz'}]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_collB(self):
        """example.com is not indexed in coll B: empty result, no errors."""
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'})

        exp = []

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_collB_found(self):
        """iana.org is indexed in coll B."""
        res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})

        exp = [{'source': 'colls/B/indexes/iana.cdxj',
                'timestamp': '20140126200624',
                'filename': 'iana.warc.gz'}]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_extra_agg_collB(self):
        """Wrapping in SimpleAggregator prefixes the source with its name."""
        agg_source = SimpleAggregator({'dir': self.dir_loader})
        res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})

        exp = [{'source': 'dir:colls/B/indexes/iana.cdxj',
                'timestamp': '20140126200624',
                'filename': 'iana.warc.gz'}]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_all_found_1(self):
        """Wildcard coll '*' searches every collection; dupes listed twice."""
        res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})

        exp = [
            {'source': 'colls/B/indexes/iana.cdxj',
             'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
            {'source': 'colls/C/indexes/dupes.cdxj',
             'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
            {'source': 'colls/C/indexes/dupes.cdxj',
             'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
        ]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_all_found_2(self):
        """Wildcard search for example.com merges colls A and C by timestamp."""
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})

        exp = [
            {'source': 'colls/C/indexes/dupes.cdxj',
             'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': 'colls/C/indexes/dupes.cdxj',
             'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
            {'source': 'colls/A/indexes/example.cdxj',
             'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
        ]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    @patch('webagg.indexsource.MementoIndexSource.get_timegate_links',
           mock_link_header)
    def test_agg_dir_and_memento(self):
        """Mix local directory results with a mocked remote Memento source.

        The 'ia' entries come from parsing the canned linkheader; 'closest'
        ordering puts the 20100514 memento first.
        """
        sources = {'ia': MementoIndexSource.from_timegate_url(
                       'http://web.archive.org/web/'),
                   'local': self.dir_loader}
        agg_source = SimpleAggregator(sources)

        res, errs = agg_source({'url': 'example.com/',
                                'param.local.coll': '*',
                                'closest': '20100512',
                                'limit': 6})

        exp = [
            {'source': 'ia', 'timestamp': '20100514231857',
             'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100519202418',
             'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
            {'source': 'ia', 'timestamp': '20100501123414',
             'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
            {'source': 'local:colls/C/indexes/dupes.cdxj',
             'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
            {'source': 'local:colls/C/indexes/dupes.cdxj',
             'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
            {'source': 'local:colls/A/indexes/example.cdxj',
             'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
        ]

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_no_dir_1(self):
        """Nonexistent collection 'X': empty result, not an error."""
        res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'X'})

        exp = []

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_no_dir_2(self):
        """Empty dir_path template with an unknown coll also yields nothing."""
        loader = DirectoryIndexSource(self.root_dir, '')
        res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})

        exp = []

        assert(to_json_list(res) == exp)
        assert(errs == {})

    def test_agg_dir_sources_1(self):
        """get_source_list with '*' enumerates every collection's cdxj."""
        res = self.dir_loader.get_source_list({'url': 'example.com/',
                                               'param.coll': '*'})
        exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                           'colls/B/indexes/iana.cdxj': 'file',
                           'colls/C/indexes/dupes.cdxj': 'file'}
              }

        assert(res == exp)

    def test_agg_dir_sources_2(self):
        """'[A,C]' glob-style coll selector limits the source listing."""
        res = self.dir_loader.get_source_list({'url': 'example.com/',
                                               'param.coll': '[A,C]'})
        exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                           'colls/C/indexes/dupes.cdxj': 'file'}
              }

        assert(res == exp)

    def test_agg_dir_sources_single_dir(self):
        """A loader pointed directly at one index dir lists bare filenames."""
        loader = DirectoryIndexSource(
            os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
        res = loader.get_source_list({'url': 'example.com/'})

        exp = {'sources': {'example.cdxj': 'file'}}

        assert(res == exp)

    def test_agg_dir_sources_not_found_dir(self):
        """A missing directory produces an empty source listing."""
        loader = DirectoryIndexSource(
            os.path.join(self.root_dir, 'colls', 'Z', 'indexes'), '')
        res = loader.get_source_list({'url': 'example.com/'})

        exp = {'sources': {}}

        assert(res == exp)

    def test_cache_dir_sources_1(self):
        """Cached loader returns stable listings and picks up new files."""
        exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
                           'colls/B/indexes/iana.cdxj': 'file',
                           'colls/C/indexes/dupes.cdxj': 'file'}
              }

        res = self.cache_dir_loader.get_source_list({'url': 'example.com/',
                                                     'param.coll': '*'})
        assert(res == exp)

        # Second call should hit the cache and return the same listing.
        res = self.cache_dir_loader.get_source_list({'url': 'example.com/',
                                                     'param.coll': '*'})
        assert(res == exp)

        # Touch a new (empty) index file; the cache must notice the change.
        new_file = os.path.join(self.root_dir, 'colls/C/indexes/empty.cdxj')
        with open(new_file, 'a') as fh:
            os.utime(new_file, None)

        res = self.cache_dir_loader.get_source_list({'url': 'example.com/',
                                                     'param.coll': '*'})

        # New File Included
        exp['sources']['colls/C/indexes/empty.cdxj'] = 'file'
        assert(res == exp)