1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

tests: webagg test tweaks, create TempDirTests for sharing tests that require a temp dir

This commit is contained in:
Ilya Kreymer 2016-03-10 16:04:27 -08:00
parent 7b847311d5
commit c309637a3a
4 changed files with 169 additions and 181 deletions

View File

@ -3,7 +3,7 @@ import os
import shutil
import json
from .testutils import to_path
from .testutils import to_path, to_json_list, TempDirTests
from mock import patch
@ -12,202 +12,179 @@ from webagg.indexsource import MementoIndexSource
#=============================================================================
root_dir = None
orig_cwd = None
dir_loader = None
linkheader = """\
<http://example.com/>; rel="original", <http://web.archive.org/web/timemap/link/http://example.com/>; rel="timemap"; type="application/link-format", <http://web.archive.org/web/20020120142510/http://example.com/>; rel="first memento"; datetime="Sun, 20 Jan 2002 14:25:10 GMT", <http://web.archive.org/web/20100501123414/http://example.com/>; rel="prev memento"; datetime="Sat, 01 May 2010 12:34:14 GMT", <http://web.archive.org/web/20100514231857/http://example.com/>; rel="memento"; datetime="Fri, 14 May 2010 23:18:57 GMT", <http://web.archive.org/web/20100519202418/http://example.com/>; rel="next memento"; datetime="Wed, 19 May 2010 20:24:18 GMT", <http://web.archive.org/web/20160307200619/http://example.com/>; rel="last memento"; datetime="Mon, 07 Mar 2016 20:06:19 GMT"\
"""
def setup_module():
    """Build the test fixture tree: a temp dir holding three collections
    (A, B, C) populated with sample cdxj indexes, plus the module-level
    DirectoryIndexSource (``dir_loader``) the tests query."""
    global root_dir
    root_dir = tempfile.mkdtemp()

    coll_A = to_path(root_dir + '/colls/A/indexes')
    coll_B = to_path(root_dir + '/colls/B/indexes')
    coll_C = to_path(root_dir + '/colls/C/indexes')

    os.makedirs(coll_A)
    os.makedirs(coll_B)
    os.makedirs(coll_C)

    dir_prefix = to_path(root_dir)
    dir_path = 'colls/{coll}/indexes'

    shutil.copy(to_path('testdata/example.cdxj'), coll_A)
    shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
    shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)

    # Stray plain file alongside the colls/ tree.
    # BUG FIX: the original did ``to_path(root_dir) + 'somefile'`` with no
    # separator, creating the file *next to* the temp dir rather than inside
    # it — and leaking it, since teardown_module() only removes root_dir.
    with open(os.path.join(root_dir, 'somefile'), 'w') as fh:
        fh.write('foo')

    global dir_loader
    dir_loader = DirectoryIndexSource(dir_prefix, dir_path)

    #global orig_cwd
    #orig_cwd = os.getcwd()
    #os.chdir(root_dir)

    # use actually set dir
    #root_dir = os.getcwd()
def teardown_module():
    """Remove the temporary collection tree created in setup_module()."""
    #global orig_cwd
    #os.chdir(orig_cwd)
    global root_dir
    shutil.rmtree(root_dir)
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
    """Serialize each CDX entry to JSON (restricted to *fields*) and parse it
    back, yielding a list of plain dicts that compare cleanly in asserts."""
    return [json.loads(entry.to_json(fields)) for entry in cdxlist]
def test_agg_no_coll_set():
    """Without a 'param.coll' parameter the directory source matches nothing."""
    res, errs = dir_loader(dict(url='example.com/'))
    assert(to_json_list(res) == [])
    assert(errs == {})
def test_agg_collA_found():
    """Querying coll A for example.com/ returns the single example.cdxj hit."""
    res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'A'})

    exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]

    assert(to_json_list(res) == exp)
    assert(errs == {})
def test_agg_collB():
    """Coll B (iana.cdxj only) has no captures of example.com/."""
    res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'B'})

    exp = []

    assert(to_json_list(res) == exp)
    assert(errs == {})
def test_agg_collB_found():
    """Coll B does match iana.org/ via its iana.cdxj index."""
    res, errs = dir_loader({'url': 'iana.org/', 'param.coll': 'B'})

    exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]

    assert(to_json_list(res) == exp)
    assert(errs == {})
def test_extra_agg_collB():
    """Wrapping the dir source in a SimpleAggregator prefixes 'dir:' onto the
    reported source name."""
    agg_source = SimpleAggregator({'dir': dir_loader})
    res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})

    exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]

    assert(to_json_list(res) == exp)
    assert(errs == {})
def test_agg_all_found_1():
    """coll '*' searches every collection: iana.org/ hits B plus two dupe
    entries from C."""
    res, errs = dir_loader({'url': 'iana.org/', 'param.coll': '*'})

    exp = [
        {'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
        {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
        {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
    ]

    assert(to_json_list(res) == exp)
    assert(errs == {})
def test_agg_all_found_2():
    """coll '*' for example.com/ merges results across colls C and A."""
    res, errs = dir_loader({'url': 'example.com/', 'param.coll': '*'})

    exp = [
        {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
        {'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
        {'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
    ]

    assert(to_json_list(res) == exp)
    assert(errs == {})
def mock_link_header(*args, **kwargs):
    # Patch target for MementoIndexSource.get_timegate_links: ignores all
    # arguments and returns the canned ``linkheader`` defined above, so the
    # memento tests never touch the network.
    return linkheader
@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
def test_agg_dir_and_memento():
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'local': dir_loader}
agg_source = SimpleAggregator(sources)
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
class TestDirAgg(TempDirTests):
@classmethod
def setup_class(cls):
super(TestDirAgg, cls).setup_class()
coll_A = to_path(cls.root_dir + '/colls/A/indexes')
coll_B = to_path(cls.root_dir + '/colls/B/indexes')
coll_C = to_path(cls.root_dir + '/colls/C/indexes')
exp = [
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
]
os.makedirs(coll_A)
os.makedirs(coll_B)
os.makedirs(coll_C)
assert(to_json_list(res) == exp)
assert(errs == {})
dir_prefix = to_path(cls.root_dir)
dir_path ='colls/{coll}/indexes'
shutil.copy(to_path('testdata/example.cdxj'), coll_A)
shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)
with open(to_path(cls.root_dir) + 'somefile', 'w') as fh:
fh.write('foo')
cls.dir_loader = DirectoryIndexSource(dir_prefix, dir_path)
def test_agg_no_coll_set(self):
res, errs = self.dir_loader(dict(url='example.com/'))
assert(to_json_list(res) == [])
assert(errs == {})
def test_agg_collA_found(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'A'})
exp = [{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'B'})
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_collB_found(self):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': 'B'})
exp = [{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_1():
res, errs = dir_loader({'url': 'example.com/', 'param.coll': 'X'})
def test_extra_agg_collB(self):
agg_source = SimpleAggregator({'dir': self.dir_loader})
res, errs = agg_source({'url': 'iana.org/', 'param.coll': 'B'})
exp = []
exp = [{'source': 'dir:colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'}]
assert(to_json_list(res) == exp)
assert(errs == {})
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_no_dir_2():
loader = DirectoryIndexSource(root_dir, '')
res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
def test_agg_all_found_1(self):
res, errs = self.dir_loader({'url': 'iana.org/', 'param.coll': '*'})
exp = []
exp = [
{'source': 'colls/B/indexes/iana.cdxj', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
]
assert(to_json_list(res) == exp)
assert(errs == {})
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_dir_sources_1():
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
def test_agg_all_found_2(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': '*'})
assert(res == exp)
exp = [
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
]
assert(to_json_list(res) == exp)
assert(errs == {})
@patch('webagg.indexsource.MementoIndexSource.get_timegate_links', mock_link_header)
def test_agg_dir_and_memento(self):
sources = {'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
'local': self.dir_loader}
agg_source = SimpleAggregator(sources)
res, errs = agg_source({'url': 'example.com/', 'param.local.coll': '*', 'closest': '20100512', 'limit': 6})
exp = [
{'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100519202418', 'load_url': 'http://web.archive.org/web/20100519202418id_/http://example.com/'},
{'source': 'ia', 'timestamp': '20100501123414', 'load_url': 'http://web.archive.org/web/20100501123414id_/http://example.com/'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/C/indexes/dupes.cdxj', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
{'source': 'local:colls/A/indexes/example.cdxj', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'}
]
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_dir_sources_2():
res = dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
def test_agg_no_dir_1(self):
res, errs = self.dir_loader({'url': 'example.com/', 'param.coll': 'X'})
assert(res == exp)
exp = []
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_dir_sources_single_dir():
loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'A', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
def test_agg_no_dir_2(self):
loader = DirectoryIndexSource(self.root_dir, '')
res, errs = loader({'url': 'example.com/', 'param.coll': 'X'})
exp = {'sources': {'example.cdxj': 'file'}}
exp = []
assert(res == exp)
assert(to_json_list(res) == exp)
assert(errs == {})
def test_agg_dir_sources_not_found_dir():
loader = DirectoryIndexSource(os.path.join(root_dir, 'colls', 'Z', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
def test_agg_dir_sources_1(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '*'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
'colls/B/indexes/iana.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
exp = {'sources': {}}
assert(res == exp)
assert(res == exp)
def test_agg_dir_sources_2(self):
res = self.dir_loader.get_source_list({'url': 'example.com/', 'param.coll': '[A,C]'})
exp = {'sources': {'colls/A/indexes/example.cdxj': 'file',
'colls/C/indexes/dupes.cdxj': 'file'}
}
assert(res == exp)
def test_agg_dir_sources_single_dir(self):
loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'A', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
exp = {'sources': {'example.cdxj': 'file'}}
assert(res == exp)
def test_agg_dir_sources_not_found_dir(self):
loader = DirectoryIndexSource(os.path.join(self.root_dir, 'colls', 'Z', 'indexes'), '')
res = loader.get_source_list({'url': 'example.com/'})
exp = {'sources': {}}
assert(res == exp)

View File

@ -4,7 +4,7 @@ from webagg.aggregator import SimpleAggregator, GeventTimeoutAggregator
from webagg.aggregator import BaseAggregator
from webagg.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
from .testutils import json_list, to_path
from .testutils import to_json_list, to_path
import json
import pytest
@ -48,7 +48,7 @@ def test_mem_agg_index_1(agg):
{"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source": "ait"}
]
assert(json_list(res) == exp)
assert(to_json_list(res) == exp)
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://iana.org/',)"})
@ -65,7 +65,7 @@ def test_mem_agg_index_2(agg):
{"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
{"timestamp": "20100519202418", "load_url": "http://web.archive.org/web/20100519202418id_/http://example.com/", "source": "ia"}]
assert(json_list(res) == exp)
assert(to_json_list(res) == exp)
assert(errs == {'rhiz': "NotFoundException('http://webenact.rhizome.org/vvork/http://example.com/',)"})
@ -80,7 +80,7 @@ def test_mem_agg_index_3(agg):
{"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"},
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
assert(json_list(res) == exp)
assert(to_json_list(res) == exp)
assert(errs == {})
@ -92,7 +92,7 @@ def test_mem_agg_index_4(agg):
exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
assert(json_list(res) == exp)
assert(to_json_list(res) == exp)
assert(errs == {})
@ -101,7 +101,7 @@ def test_mem_agg_not_found(agg):
url = 'http://vvork.com/'
res, errs = agg(dict(url=url, closest='20141001', limit=2))
assert(json_list(res) == [])
assert(to_json_list(res) == [])
assert(errs == {'notfound': "NotFoundException('testdata/not-found-x',)"})
@ -118,7 +118,7 @@ def test_mem_agg_timeout(agg):
res, errs = agg(dict(url=url, closest='20141001', limit=2))
BaseAggregator.load_child_source = orig_source
assert(json_list(res) == [])
assert(to_json_list(res) == [])
assert(errs == {'local': 'timeout',
'ait': 'timeout', 'bl': 'timeout', 'ia': 'timeout', 'rhiz': 'timeout'})

View File

@ -5,7 +5,7 @@ from webagg.indexsource import FileIndexSource
from webagg.aggregator import SimpleAggregator, TimeoutMixin
from webagg.aggregator import GeventTimeoutAggregator, GeventTimeoutAggregator
from .testutils import json_list
from .testutils import to_json_list
class TimeoutFileSource(FileIndexSource):
@ -41,7 +41,7 @@ def test_timeout_long_all_pass():
{'source': 'slower', 'timestamp': '20140127171251'},
{'source': 'slow', 'timestamp': '20160225042329'}]
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {})
@ -53,7 +53,7 @@ def test_timeout_slower_skipped_1():
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout'})
@ -65,7 +65,7 @@ def test_timeout_slower_skipped_2():
exp = []
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(errs == {'slower': 'timeout', 'slow': 'timeout'})
@ -80,28 +80,28 @@ def test_timeout_skipping():
exp = [{'source': 'slow', 'timestamp': '20160225042329'}]
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 4)
assert(sources['slower'].calls == 4)
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 5)
assert(sources['slower'].calls == 5)
assert(errs == {'slower': 'timeout'})
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 6)
assert(sources['slower'].calls == 5)
assert(errs == {})
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 7)
assert(sources['slower'].calls == 5)
@ -110,7 +110,7 @@ def test_timeout_skipping():
time.sleep(2.01)
res, errs = agg(dict(url='http://example.com/'))
assert(json_list(res, fields=['source', 'timestamp']) == exp)
assert(to_json_list(res, fields=['source', 'timestamp']) == exp)
assert(sources['slow'].calls == 8)
assert(sources['slower'].calls == 6)

View File

@ -1,7 +1,9 @@
import json
import os
import tempfile
import shutil
def json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
def to_json_list(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist])
def key_ts_res(cdxlist, extra='filename'):
@ -14,3 +16,12 @@ def to_path(path):
return path
class TempDirTests(object):
    """Test mixin that provides a per-class temporary directory.

    setup_class creates a fresh scratch dir exposed as ``cls.root_dir``;
    teardown_class deletes it (and everything written under it). Subclasses
    that override setup_class should call super().setup_class() first.
    """

    @classmethod
    def setup_class(cls):
        # Unique empty directory for this test class's fixtures.
        cls.root_dir = tempfile.mkdtemp()

    @classmethod
    def teardown_class(cls):
        # Recursively remove the scratch directory.
        shutil.rmtree(cls.root_dir)