diff --git a/webagg/test/test_dir_agg.py b/webagg/test/test_dir_agg.py
index 14d011aa..165b6346 100644
--- a/webagg/test/test_dir_agg.py
+++ b/webagg/test/test_dir_agg.py
@@ -3,7 +3,7 @@ import os
import shutil
import json
-from .testutils import to_path
+from .testutils import to_path, to_json_list, TempDirTests
from mock import patch
@@ -12,202 +12,179 @@ from webagg.indexsource import MementoIndexSource
#=============================================================================
-root_dir = None
-orig_cwd = None
-dir_loader = None
-
linkheader = """\
<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
"""
-def setup_module():
- global root_dir
- root_dir = tempfile.mkdtemp()
-
- coll_A = to_path(root_dir + '/colls/A/indexes')
- coll_B = to_path(root_dir + '/colls/B/indexes')
- coll_C = to_path(root_dir + '/colls/C/indexes')
-
- os.makedirs(coll_A)
- os.makedirs(coll_B)
- os.makedirs(coll_C)
-
- dir_prefix = to_path(root_dir)
- dir_path ='colls/{coll}/indexes'
-
- shutil.copy(to_path('testdata/example.cdxj'), coll_A)
- shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
- shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
-
- with open(to_path(root_dir) + 'somefile', 'w') as fh:
- fh.write('foo')
-
- global dir_loader
- dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
-
- #global orig_cwd
- #orig_cwd = os.getcwd()
- #os.chdir(root_dir)
-
- # use actually set dir
- #root_dir = os.getcwd()
-
-def teardown_module():
- #global orig_cwd
- #os.chdir(orig_cwd)
-
- global root_dir
- shutil.rmtree(root_dir)
-
-
-def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
- return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist])
-
-
-def test_agg_no_coll_set():
- res, errs = dir_loader(dict(url='example.com/'))
- assert(to_json_list(res) == [])
- assert(errs == {})
-
-def test_agg_collA_found():
- res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'})
-
- exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
-
- assert(to_json_list(res) == exp)
- assert(errs == {})
-
-def test_agg_collB():
- res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'B'})
-
- exp = []
-
- assert(to_json_list(res) == exp)
- assert(errs == {})
-
-def test_agg_collB_found():
- res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
-
- exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
-
- assert(to_json_list(res) == exp)
- assert(errs == {})
-
-
-def test_extra_agg_collB():
- agg_source = SimpleAggregator({'dir': dir_loader})
- res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
-
- exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
-
- assert(to_json_list(res) == exp)
- assert(errs == {})
-
-
-def test_agg_all_found_1():
- res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'})
-
- exp = [
- {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
- {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
- {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
- ]
-
- assert(to_json_list(res) == exp)
- assert(errs == {})
-
-
-def test_agg_all_found_2():
- res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'})
-
- exp = [
- {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
- {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
- {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
- ]
-
- assert(to_json_list(res) == exp)
- assert(errs == {})
-
def mock_link_header(*args, **kwargs):
return linkheader
-@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
-def test_agg_dir_and_memento():
- sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
- 'local': dir_loader}
- agg_source = SimpleAggregator(sources)
- res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
+class TestDirAgg(TempDirTests):
+ @classmethod
+ def setup_class(cls):
+ super(TestDirAgg, cls).setup_class()
+ coll_A = to_path(cls.root_dir + '/colls/A/indexes')
+ coll_B = to_path(cls.root_dir + '/colls/B/indexes')
+ coll_C = to_path(cls.root_dir + '/colls/C/indexes')
- exp = [
- {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
- {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
- {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
- {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
- {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
- {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
- ]
+ os.makedirs(coll_A)
+ os.makedirs(coll_B)
+ os.makedirs(coll_C)
- assert(to_json_list(res) == exp)
- assert(errs == {})
+ dir_prefix = to_path(cls.root_dir)
+        dir_path = 'colls/{coll}/indexes'
+
+ shutil.copy(to_path('testdata/example.cdxj'), coll_A)
+ shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
+ shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
+
+ with open(to_path(cls.root_dir) + 'somefile', 'w') as fh:
+ fh.write('foo')
+
+ cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
+
+ def test_agg_no_coll_set(self):
+ res, errs = self.dir_loader(dict(url='example.com/'))
+ assert(to_json_list(res) == [])
+ assert(errs == {})
+
+ def test_agg_collA_found(self):
+ res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
+
+ exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
+
+ def test_agg_collB(self):
+ res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'})
+
+ exp = []
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
+
+ def test_agg_collB_found(self):
+ res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
+
+ exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
-def test_agg_no_dir_1():
- res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'X'})
+ def test_extra_agg_collB(self):
+ agg_source = SimpleAggregator({'dir': self.dir_loader})
+ res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
- exp = []
+ exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
- assert(to_json_list(res) == exp)
- assert(errs == {})
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
-def test_agg_no_dir_2():
- loader = DirectoryIndexSource(root_dir, '')
- res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
+ def test_agg_all_found_1(self):
+ res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
- exp = []
+ exp = [
+ {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
+ {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
+ {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
+ ]
- assert(to_json_list(res) == exp)
- assert(errs == {})
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
-def test_agg_dir_sources_1():
- res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
- exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
- 'colls/B/indexes/iana.cdxj': 'file',
- 'colls/C/indexes/dupes.cdxj': 'file'}
- }
+ def test_agg_all_found_2(self):
+ res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
- assert(res == exp)
+ exp = [
+ {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
+ {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
+ {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
+ ]
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
+
+ @patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
+ def test_agg_dir_and_memento(self):
+ sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
+ 'local': self.dir_loader}
+ agg_source = SimpleAggregator(sources)
+
+ res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
+
+ exp = [
+ {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
+ {'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
+ {'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
+ {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
+ {'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
+ {'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
+ ]
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
-def test_agg_dir_sources_2():
- res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
- exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
- 'colls/C/indexes/dupes.cdxj': 'file'}
- }
+ def test_agg_no_dir_1(self):
+ res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'X'})
- assert(res == exp)
+ exp = []
+
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
-def test_agg_dir_sources_single_dir():
- loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'A', 'indexes'), '')
- res = loader.get_source_list({'url': 'example.com/'})
+ def test_agg_no_dir_2(self):
+ loader = DirectoryIndexSource(self.root_dir, '')
+ res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
- exp = {'sources': {'example.cdxj': 'file'}}
+ exp = []
- assert(res == exp)
+ assert(to_json_list(res) == exp)
+ assert(errs == {})
-def test_agg_dir_sources_not_found_dir():
- loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'Z', 'indexes'), '')
- res = loader.get_source_list({'url': 'example.com/'})
+ def test_agg_dir_sources_1(self):
+ res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
+ exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
+ 'colls/B/indexes/iana.cdxj': 'file',
+ 'colls/C/indexes/dupes.cdxj': 'file'}
+ }
- exp = {'sources': {}}
+ assert(res == exp)
- assert(res == exp)
+
+ def test_agg_dir_sources_2(self):
+ res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
+ exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
+ 'colls/C/indexes/dupes.cdxj': 'file'}
+ }
+
+ assert(res == exp)
+
+
+ def test_agg_dir_sources_single_dir(self):
+ loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
+ res = loader.get_source_list({'url': 'example.com/'})
+
+ exp = {'sources': {'example.cdxj': 'file'}}
+
+ assert(res == exp)
+
+
+ def test_agg_dir_sources_not_found_dir(self):
+ loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'Z', 'indexes'), '')
+ res = loader.get_source_list({'url': 'example.com/'})
+
+ exp = {'sources': {}}
+
+ assert(res == exp)
diff --git a/webagg/test/test_memento_agg.py b/webagg/test/test_memento_agg.py
index 52dc79da..784bf785 100644
--- a/webagg/test/test_memento_agg.py
+++ b/webagg/test/test_memento_agg.py
@@ -4,7 +4,7 @@ from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
from webagg.aggregator import BaseAggregator
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
-from .testutils import json_list, to_path
+from .testutils import to_json_list, to_path
import json
import pytest
@@ -48,7 +48,7 @@ def test_mem_agg_index_1(agg):
{"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source": "ait"}
]
- assert(json_list(res) == exp)
+ assert(to_json_list(res) == exp)
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
@@ -65,7 +65,7 @@ def test_mem_agg_index_2(agg):
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
{"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}]
- assert(json_list(res) == exp)
+ assert(to_json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
@@ -80,7 +80,7 @@ def test_mem_agg_index_3(agg):
{"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"},
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
- assert(json_list(res) == exp)
+ assert(to_json_list(res) == exp)
assert(errs == {})
@@ -92,7 +92,7 @@ def test_mem_agg_index_4(agg):
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
- assert(json_list(res) == exp)
+ assert(to_json_list(res) == exp)
assert(errs == {})
@@ -101,7 +101,7 @@ def test_mem_agg_not_found(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2))
- assert(json_list(res) == [])
+ assert(to_json_list(res) == [])
assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"})
@@ -118,7 +118,7 @@ def test_mem_agg_timeout(agg):
res, errs = agg(dict(url=url, closest='20141001', limit=2))
BaseAggregator.load_child_source = orig_source
- assert(json_list(res) == [])
+ assert(to_json_list(res) == [])
assert(errs == {'local': 'timeout',
'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'})
diff --git a/webagg/test/test_timeouts.py b/webagg/test/test_timeouts.py
index 04370c5d..60080ce6 100644
--- a/webagg/test/test_timeouts.py
+++ b/webagg/test/test_timeouts.py
@@ -5,7 +5,7 @@ from webagg.indexsource import FileIndexSource
from webagg.aggregator import SimpleAggregator, TimeoutMixin
from webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
-from .testutils import json_list
+from .testutils import to_json_list
class TimeoutFileSource(FileIndexSource):
@@ -41,7 +41,7 @@ def test_timeout_long_all_pass():
{'source': 'slower', 'timestamp': '20140127171251'},
{'source': 'slow', 'timestamp': '20160225042329'}]
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {})
@@ -53,7 +53,7 @@ def test_timeout_slower_skipped_1():
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout'})
@@ -65,7 +65,7 @@ def test_timeout_slower_skipped_2():
exp = []
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout', 'slow': 'timeout'})
@@ -80,28 +80,28 @@ def test_timeout_skipping():
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
res, errs = agg(dict(url='http://example.com/'))
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 4)
assert(sources['slower'].calls == 4)
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 5)
assert(sources['slower'].calls == 5)
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 6)
assert(sources['slower'].calls == 5)
assert(errs == {})
res, errs = agg(dict(url='http://example.com/'))
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 7)
assert(sources['slower'].calls == 5)
@@ -110,7 +110,7 @@ def test_timeout_skipping():
time.sleep(2.01)
res, errs = agg(dict(url='http://example.com/'))
- assert(json_list(res, fields=['source', 'timestamp']) == exp)
+ assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 8)
assert(sources['slower'].calls == 6)
diff --git a/webagg/test/testutils.py b/webagg/test/testutils.py
index b9f8ab98..61f8b155 100644
--- a/webagg/test/testutils.py
+++ b/webagg/test/testutils.py
@@ -1,7 +1,9 @@
import json
import os
+import tempfile
+import shutil
-def json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
+def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist])
def key_ts_res(cdxlist, extra='filename'):
@@ -14,3 +16,12 @@ def to_path(path):
return path
+class TempDirTests(object):
+ @classmethod
+ def setup_class(cls):
+ cls.root_dir = tempfile.mkdtemp()
+
+ @classmethod
+ def teardown_class(cls):
+ shutil.rmtree(cls.root_dir)
+