mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
tests: add test for directory auto collection loader,
collection manager and new 6-field minimal cdx format
This commit is contained in:
parent
1ba24de357
commit
759d151551
@ -33,10 +33,8 @@ directory structure expected by pywb
|
||||
self.default_config['paths'][name])
|
||||
|
||||
def _create_dir(self, dirname):
|
||||
try:
|
||||
if not os.path.isdir(dirname):
|
||||
os.mkdir(dirname)
|
||||
except:
|
||||
pass
|
||||
|
||||
logging.info('Created Dir: ' + dirname)
|
||||
|
||||
@ -54,7 +52,7 @@ directory structure expected by pywb
|
||||
raise Exception('Directory ' + warcdir + ' does not exist')
|
||||
|
||||
if not warcs:
|
||||
print('No WARCs specified')
|
||||
logging.info('No WARCs specified')
|
||||
return
|
||||
|
||||
for filename in warcs:
|
||||
@ -68,7 +66,7 @@ directory structure expected by pywb
|
||||
logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file)
|
||||
cdxindexer_main(['-p', '-s', '-r', cdx_file, self.warc_dir])
|
||||
|
||||
def main():
|
||||
def main(args=None):
|
||||
description = """
|
||||
Create manage file based web archive collections
|
||||
"""
|
||||
@ -99,7 +97,7 @@ Some examples:
|
||||
parser.add_argument('name')
|
||||
parser.add_argument('files', nargs='*')
|
||||
|
||||
r = parser.parse_args()
|
||||
r = parser.parse_args(args=args)
|
||||
|
||||
m = CollectionsManager(r.name)
|
||||
if r.init:
|
||||
|
@ -8,6 +8,13 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
|
||||
# warc.gz -- minimal cdx
|
||||
>>> print_cdx_index('example.warc.gz', minimal=True)
|
||||
CDX N b a S V g
|
||||
com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz
|
||||
com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz
|
||||
|
||||
# warc.gz -- parse all
|
||||
>>> print_cdx_index('example.warc.gz', include_all=True)
|
||||
CDX N b a m s k r M S V g
|
||||
@ -122,7 +129,7 @@ com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYA
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 warcs/example-url-agnostic-orig.warc.gz
|
||||
Total: 206
|
||||
|
||||
# test sort, multiple inputs, all records + post query
|
||||
# test sort, 9-field, multiple inputs, all records + post query
|
||||
>>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR])
|
||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz
|
||||
org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz
|
||||
|
@ -154,17 +154,10 @@ class DirectoryCollsLoader(object):
|
||||
# already set
|
||||
return False
|
||||
|
||||
thedir = self.config.get('paths').get(dir_key)
|
||||
|
||||
if not thedir:
|
||||
msg = 'No "{0}" for collection {1}'.format(dir_key, root_dir)
|
||||
if required:
|
||||
raise Exception(msg)
|
||||
else:
|
||||
logging.warn(msg)
|
||||
return False
|
||||
thedir = self.config.get('paths')[dir_key]
|
||||
|
||||
fulldir = os.path.join(root_dir, thedir)
|
||||
|
||||
if os.path.isdir(fulldir):
|
||||
fulldir = os.path.abspath(fulldir) + os.path.sep
|
||||
coll[dir_key] = fulldir
|
||||
@ -172,8 +165,8 @@ class DirectoryCollsLoader(object):
|
||||
elif required:
|
||||
msg = 'Dir "{0}" does not exist for "{1}"'.format(fulldir, dir_key)
|
||||
raise Exception(msg)
|
||||
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
def load_dir(self, root_dir, name):
|
||||
config_file = os.path.join(root_dir, 'config.yaml')
|
||||
|
1
setup.py
1
setup.py
@ -50,6 +50,7 @@ setup(
|
||||
'pywb.warc',
|
||||
'pywb.rewrite',
|
||||
'pywb.framework',
|
||||
'pywb.manager',
|
||||
'pywb.perms',
|
||||
'pywb.webapp',
|
||||
'pywb.apps'
|
||||
|
148
tests/test_auto_colls.py
Normal file
148
tests/test_auto_colls.py
Normal file
@ -0,0 +1,148 @@
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
|
||||
import webtest
|
||||
|
||||
from pywb.webapp.pywb_init import create_wb_router
|
||||
from pywb.manager.manager import main
|
||||
|
||||
from pywb import get_test_dir
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
|
||||
from pytest import raises
|
||||
|
||||
|
||||
#=============================================================================
|
||||
# Scratch directory created for this test module (set by setup_module).
root_dir = None

# Working directory that was active before setup_module changed it.
orig_cwd = None


def setup_module():
    """Create a fresh temporary directory and make it the working directory.

    The directory path is stored in the module-level ``root_dir`` and the
    previous working directory in ``orig_cwd`` so that ``teardown_module``
    can undo both steps.
    """
    global root_dir
    global orig_cwd

    root_dir = tempfile.mkdtemp()
    orig_cwd = os.getcwd()
    os.chdir(root_dir)


def teardown_module():
    """Restore the original working directory and delete the scratch tree.

    Safe to call when ``setup_module`` never ran, and safe to call more
    than once: the globals are reset to ``None`` and ``None`` values are
    skipped.
    """
    global root_dir
    global orig_cwd

    scratch, previous = root_dir, orig_cwd
    root_dir = orig_cwd = None

    # Leave the scratch directory *before* removing it: deleting the
    # current working directory fails on some platforms, and if rmtree
    # raises we still want the process back in its original directory.
    if previous is not None:
        os.chdir(previous)

    if scratch is not None:
        shutil.rmtree(scratch)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class TestManagedColls(object):
    """Exercise the collection manager CLI together with the auto-loaded app.

    The tests run in definition order and build on each other's on-disk
    state inside the module's scratch directory: the first test creates
    the ``test`` collection, later tests add to it, and the last one
    removes it.
    """

    def setup(self):
        """Expose the module-level scratch directory on the instance."""
        global root_dir
        self.root_dir = root_dir

    def _create_app(self):
        """(Re)build the WSGI app and wrap it in a webtest client."""
        self.app = init_app(create_wb_router)
        self.testapp = webtest.TestApp(self.app)

    def _check_dirs(self, base, dirlist):
        """Assert that every name in ``dirlist`` is a directory under ``base``."""
        for dir_ in dirlist:
            assert os.path.isdir(os.path.join(base, dir_))

    def test_create_first_coll(self):
        """``--init`` creates the collection and its standard subdirectories."""
        main(['--init', 'test'])

        colls = os.path.join(self.root_dir, 'collections')
        assert os.path.isdir(colls)

        test = os.path.join(colls, 'test')
        assert os.path.isdir(test)

        self._check_dirs(test, ['cdx', 'warcs', 'static', 'templates'])

    def test_add_warcs(self):
        """A WARC added with ``--addwarc`` is served by the app."""
        warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz')

        main(['--addwarc', 'test', warc1])

        self._create_app()
        resp = self.testapp.get('/test/20140103030321/http://example.com?example=1')
        assert resp.status_int == 200

    def test_add_more_warcs(self):
        """Add several WARCs, hit the error/no-op paths, reindex, then replay."""
        warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz')
        warc2 = os.path.join(get_test_dir(), 'warcs', 'example-extra.warc')

        main(['--addwarc', 'test', warc1, warc2])

        # Spurious file in collections
        # (binary mode, so the payload must be bytes to work on Python 3)
        with open(os.path.join(self.root_dir, 'collections', 'blah'), 'wb') as fh:
            fh.write(b'foo\n')

        with raises(IOError):
            main(['--addwarc', 'test', 'non-existent-file.warc.gz'])

        # No WARCs given at all
        main(['--addwarc', 'test'])

        main(['--reindex', 'test'])

        self._create_app()
        resp = self.testapp.get('/test/20140103030321/http://example.com?example=1')
        assert resp.status_int == 200

    def test_add_static(self):
        """A file dropped in the collection's ``static`` dir is served."""
        a_static = os.path.join(self.root_dir, 'collections', 'test', 'static', 'abc.js')

        with open(a_static, 'wb') as fh:
            fh.write(b'/* Some JS File */')

        self._create_app()
        resp = self.testapp.get('/static/test/abc.js')
        assert resp.status_int == 200
        assert resp.content_type == 'application/javascript'
        # resp.body is bytes, so compare against a bytes literal
        assert b'/* Some JS File */' in resp.body

    def test_custom_search(self):
        """A ``search.html`` in the collection's ``templates`` dir is used."""
        a_static = os.path.join(self.root_dir, 'collections', 'test', 'templates', 'search.html')

        with open(a_static, 'wb') as fh:
            fh.write(b'pywb custom search page')

        self._create_app()
        resp = self.testapp.get('/test/')
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        assert b'pywb custom search page' in resp.body

    def test_no_templates(self):
        """Without a ``templates`` dir the custom search page is not served."""
        shutil.rmtree(os.path.join(self.root_dir, 'collections', 'test', 'templates'))

        self._create_app()

        resp = self.testapp.get('/test/')
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        assert b'pywb custom search page' not in resp.body

    def test_err_missing_dirs(self):
        """A missing or non-directory ``cdx`` path raises; no collections means 404."""
        colls = os.path.join(self.root_dir, 'collections')

        # No CDX
        cdx_path = os.path.join(colls, 'test', 'cdx')
        shutil.rmtree(cdx_path)

        with raises(Exception):
            self._create_app()

        # CDX a file not a dir
        with open(cdx_path, 'wb') as fh:
            fh.write(b'foo\n')

        with raises(Exception):
            self._create_app()

        shutil.rmtree(colls)

        # No Collections
        self._create_app()
        resp = self.testapp.get('/test/', status=404)
        assert resp.status_int == 404
|
||||
|
Loading…
x
Reference in New Issue
Block a user