mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-22 14:24:27 +01:00
195 lines
5.7 KiB
Python
195 lines
5.7 KiB
Python
import tempfile
|
|
import os
|
|
import shutil
|
|
import json
|
|
|
|
from .testutils import to_path
|
|
|
|
from rezag.aggindexsource import DirectoryIndexSource, SimpleAggregator
|
|
from rezag.indexsource import MementoIndexSource
|
|
|
|
|
|
#=============================================================================
|
|
# Shared module-level fixtures; all start unset and are assigned by
# setup_module() before any test runs.
#   root_dir   - temporary directory holding the test collections
#   orig_cwd   - working directory to restore in teardown_module()
#   dir_loader - DirectoryIndexSource instance shared by the tests
root_dir = orig_cwd = dir_loader = None
|
|
|
|
def setup_module():
    """Create the temporary collection tree and the shared directory loader.

    Builds ``colls/A/indexes``, ``colls/B/indexes`` and ``colls/C/indexes``
    under a fresh temp directory, copies one sample CDXJ file into each,
    constructs the module-wide ``dir_loader`` and finally switches the
    working directory to the temp root (remembering the previous one in
    ``orig_cwd`` so teardown_module() can restore it).
    """
    global root_dir, orig_cwd, dir_loader

    root_dir = tempfile.mkdtemp()

    coll_A = to_path(root_dir + '/colls/A/indexes')
    coll_B = to_path(root_dir + '/colls/B/indexes')
    coll_C = to_path(root_dir + '/colls/C/indexes')

    os.makedirs(coll_A)
    os.makedirs(coll_B)
    os.makedirs(coll_C)

    dir_prefix = to_path(root_dir)
    dir_path = 'colls/{coll}/indexes'

    shutil.copy(to_path('testdata/example.cdxj'), coll_A)
    shutil.copy(to_path('testdata/iana.cdxj'), coll_B)
    shutil.copy(to_path('testdata/dupes.cdxj'), coll_C)

    # Stray non-index file placed inside the temp root. Joining with
    # os.path.join keeps it inside root_dir so teardown's rmtree removes it;
    # plain string concatenation would create '<root_dir>somefile' next to
    # the temp directory and leave it behind after the test run.
    with open(os.path.join(to_path(root_dir), 'somefile'), 'w') as fh:
        fh.write('foo')

    dir_loader = DirectoryIndexSource(dir_prefix, dir_path)

    orig_cwd = os.getcwd()
    os.chdir(root_dir)

    # use actually set dir
    root_dir = os.getcwd()
|
|
|
|
def teardown_module():
    """Restore the saved working directory, then delete the temp tree."""
    global orig_cwd, root_dir

    # Step out of the temp directory before removing it.
    os.chdir(orig_cwd)
    shutil.rmtree(root_dir)
|
|
|
|
|
|
def to_json_list(cdxlist, fields=None):
    """Convert an iterable of cdx objects into a list of plain dicts.

    Each item's ``to_json(fields)`` output is parsed back with ``json.loads``.

    :param cdxlist: iterable of objects exposing ``to_json(fields)``
    :param fields: field names passed to ``to_json``; when omitted,
        ``['timestamp', 'load_url', 'filename', 'source']`` is used
    :return: list of decoded JSON objects, one per input item
    """
    # Build the default per call instead of sharing one mutable list object
    # across every invocation of this helper.
    if fields is None:
        fields = ['timestamp', 'load_url', 'filename', 'source']

    return [json.loads(cdx.to_json(fields)) for cdx in cdxlist]
|
|
|
|
|
|
def test_agg_no_coll_set():
    """A query without ``param.coll`` is expected to give no results or errors."""
    query = {'url': 'example.com/'}
    results, errors = dir_loader(query)

    assert to_json_list(results) == []
    assert errors == {}
|
|
|
|
def test_agg_collA_found():
    """Querying collection A for example.com/ expects the single example capture."""
    query = {'url': 'example.com/', 'param.coll': 'A'}
    results, errors = dir_loader(query)

    expected = [
        {'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'},
    ]

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
def test_agg_collB():
    """Querying collection B for example.com/ expects an empty result."""
    query = {'url': 'example.com/', 'param.coll': 'B'}
    results, errors = dir_loader(query)

    expected = []

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
def test_agg_collB_found():
    """Querying collection B for iana.org/ expects the single iana capture."""
    query = {'url': 'iana.org/', 'param.coll': 'B'}
    results, errors = dir_loader(query)

    expected = [
        {'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
    ]

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
|
|
def test_extra_agg_collB():
    """Wrap the directory loader in a SimpleAggregator under the name 'dir'.

    The expected ``source`` value carries the ``dir:`` prefix.
    """
    aggregator = SimpleAggregator({'dir': dir_loader})
    query = {'url': 'iana.org/', 'param.coll': 'B'}
    results, errors = aggregator(query)

    expected = [
        {'source': 'dir:colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
    ]

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
|
|
def test_agg_all_found_1():
    """Wildcard collection query for iana.org/ expects hits from B and C."""
    query = {'url': 'iana.org/', 'param.coll': '*'}
    results, errors = dir_loader(query)

    expected = [
        {'source': 'colls/B/indexes', 'timestamp': '20140126200624', 'filename': 'iana.warc.gz'},
        {'source': 'colls/C/indexes', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
        {'source': 'colls/C/indexes', 'timestamp': '20140127171238', 'filename': 'dupes.warc.gz'},
    ]

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
|
|
def test_agg_all_found_2():
    """Wildcard collection query for example.com/ expects hits from C and A."""
    query = {'url': 'example.com/', 'param.coll': '*'}
    results, errors = dir_loader(query)

    expected = [
        {'source': 'colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
        {'source': 'colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
        {'source': 'colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'},
    ]

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
|
|
|
|
def test_agg_dir_and_memento():
    """Aggregate a Memento source ('ia') with the local directory loader.

    NOTE(review): the 'ia' source is built from the web.archive.org timegate
    URL, so this test presumably needs live network access -- confirm.
    """
    ia_source = MementoIndexSource.from_timegate_url('http://web.archive.org/web/')
    aggregator = SimpleAggregator({'ia': ia_source, 'local': dir_loader})

    query = {
        'url': 'example.com/',
        'param.local.coll': '*',
        'closest': '20100512',
        'limit': 6,
    }
    results, errors = aggregator(query)

    expected = [
        {'source': 'ia', 'timestamp': '20100513052358', 'load_url': 'http://web.archive.org/web/20100513052358id_/http://example.com/'},
        {'source': 'ia', 'timestamp': '20100514231857', 'load_url': 'http://web.archive.org/web/20100514231857id_/http://example.com/'},
        {'source': 'ia', 'timestamp': '20100506013442', 'load_url': 'http://web.archive.org/web/20100506013442id_/http://example.com/'},
        {'source': 'local:colls/C/indexes', 'timestamp': '20140127171200', 'filename': 'dupes.warc.gz'},
        {'source': 'local:colls/C/indexes', 'timestamp': '20140127171251', 'filename': 'dupes.warc.gz'},
        {'source': 'local:colls/A/indexes', 'timestamp': '20160225042329', 'filename': 'example.warc.gz'},
    ]

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
|
|
def test_agg_no_dir_1():
    """Querying collection 'X', which setup never creates, expects nothing."""
    query = {'url': 'example.com/', 'param.coll': 'X'}
    results, errors = dir_loader(query)

    expected = []

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
|
|
def test_agg_no_dir_2():
    """A loader rooted at ``root_dir`` with an empty path expects no results."""
    root_loader = DirectoryIndexSource(root_dir, '')
    query = {'url': 'example.com/', 'param.coll': 'X'}
    results, errors = root_loader(query)

    expected = []

    assert to_json_list(results) == expected
    assert errors == {}
|
|
|
|
|
|
def test_agg_dir_sources_1():
    """Wildcard source listing expects all three collection directories."""
    query = {'url': 'example.com/', 'param.coll': '*'}
    listing = dir_loader.get_source_list(query)

    expected = {
        'sources': {
            'colls/A/indexes': 'file',
            'colls/B/indexes': 'file',
            'colls/C/indexes': 'file',
        }
    }

    assert listing == expected
|
|
|
|
|
|
def test_agg_dir_sources_2():
    """Source listing for the '[A,C]' pattern expects only A and C."""
    query = {'url': 'example.com/', 'param.coll': '[A,C]'}
    listing = dir_loader.get_source_list(query)

    expected = {
        'sources': {
            'colls/A/indexes': 'file',
            'colls/C/indexes': 'file',
        }
    }

    assert listing == expected
|
|
|
|
|
|
def test_agg_dir_sources_single_dir():
    """A loader over 'testdata/' with an empty path expects no listed sources."""
    single_dir_loader = DirectoryIndexSource('testdata/', '')
    listing = single_dir_loader.get_source_list({'url': 'example.com/'})

    expected = {'sources': {}}

    assert listing == expected
|
|
|
|
|