diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 9b7de7b9..e746d277 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -33,10 +33,8 @@ directory structure expected by pywb self.default_config['paths'][name]) def _create_dir(self, dirname): - try: + if not os.path.isdir(dirname): os.mkdir(dirname) - except: - pass logging.info('Created Dir: ' + dirname) @@ -54,7 +52,7 @@ directory structure expected by pywb raise Exception('Directory ' + warcdir + ' does not exist') if not warcs: - print('No WARCs specified') + logging.info('No WARCs specified') return for filename in warcs: @@ -68,7 +66,7 @@ directory structure expected by pywb logging.info('Indexing ' + self.warc_dir + ' to ' + cdx_file) cdxindexer_main(['-p', '-s', '-r', cdx_file, self.warc_dir]) -def main(): +def main(args=None): description = """ Create manage file based web archive collections """ @@ -99,7 +97,7 @@ Some examples: parser.add_argument('name') parser.add_argument('files', nargs='*') - r = parser.parse_args() + r = parser.parse_args(args=args) m = CollectionsManager(r.name) if r.init: diff --git a/pywb/warc/test/test_indexing.py b/pywb/warc/test/test_indexing.py index e64595a3..c01d3ed0 100644 --- a/pywb/warc/test/test_indexing.py +++ b/pywb/warc/test/test_indexing.py @@ -8,6 +8,13 @@ com,example)/?example=1 20140103030321 http://example.com?example=1 text/html 20 com,example)/?example=1 20140103030341 http://example.com?example=1 warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 1864 example.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz +# warc.gz -- minimal cdx +>>> print_cdx_index('example.warc.gz', minimal=True) + CDX N b a S V g +com,example)/?example=1 20140103030321 http://example.com?example=1 1043 333 example.warc.gz +com,example)/?example=1 20140103030341 http://example.com?example=1 553 1864 example.warc.gz 
+org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example 577 2907 example.warc.gz + # warc.gz -- parse all >>> print_cdx_index('example.warc.gz', include_all=True) CDX N b a m s k r M S V g @@ -122,7 +129,7 @@ com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYA org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1001 353 warcs/example-url-agnostic-orig.warc.gz Total: 206 -# test sort, multiple inputs, all records + post query +# test sort, 9-field, multiple inputs, all records + post query >>> cli_lines(['--sort', '-a', '-p', '-9', TEST_WARC_DIR]) com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 355 example-url-agnostic-revisit.warc.gz org,iana,example)/ 20130702195402 http://example.iana.org/ text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - 353 example-url-agnostic-orig.warc.gz diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index 1b48eb44..d14600a7 100644 --- a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -154,17 +154,10 @@ class DirectoryCollsLoader(object): # already set return False - thedir = self.config.get('paths').get(dir_key) - - if not thedir: - msg = 'No "{0}" for collection {1}'.format(dir_key, root_dir) - if required: - raise Exception(msg) - else: - logging.warn(msg) - return False + thedir = self.config.get('paths')[dir_key] fulldir = os.path.join(root_dir, thedir) + if os.path.isdir(fulldir): fulldir = os.path.abspath(fulldir) + os.path.sep coll[dir_key] = fulldir @@ -172,8 +165,8 @@ class DirectoryCollsLoader(object): elif required: msg = 'Dir "{0}" does not exist for "{1}"'.format(fulldir, dir_key) raise Exception(msg) - - return False + else: + return False def load_dir(self, root_dir, name): config_file = os.path.join(root_dir, 'config.yaml') diff --git a/setup.py b/setup.py index 23f4c23f..1eeb424d 100755 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ 
setup( 'pywb.warc', 'pywb.rewrite', 'pywb.framework', + 'pywb.manager', 'pywb.perms', 'pywb.webapp', 'pywb.apps' diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py new file mode 100644 index 00000000..0b892d1b --- /dev/null +++ b/tests/test_auto_colls.py @@ -0,0 +1,149 @@ +import os +import tempfile +import shutil + +import webtest + +from pywb.webapp.pywb_init import create_wb_router +from pywb.manager.manager import main + +from pywb import get_test_dir +from pywb.framework.wsgi_wrappers import init_app + +from pytest import raises + + +#============================================================================= +root_dir = None +orig_cwd = None + +def setup_module(): + global root_dir + root_dir = tempfile.mkdtemp() + + global orig_cwd + orig_cwd = os.getcwd() + os.chdir(root_dir) + +def teardown_module(): + # Leave the temp dir before deleting it (the cwd cannot be removed on Windows) + global orig_cwd + os.chdir(orig_cwd) + + global root_dir + shutil.rmtree(root_dir) + + +#============================================================================= +class TestManagedColls(object): + def setup(self): + global root_dir + self.root_dir = root_dir + + def _create_app(self): + self.app = init_app(create_wb_router) + self.testapp = webtest.TestApp(self.app) + + def _check_dirs(self, base, dirlist): + for dir_ in dirlist: + assert os.path.isdir(os.path.join(base, dir_)) + + def test_create_first_coll(self): + main(['--init', 'test']) + + colls = os.path.join(self.root_dir, 'collections') + assert os.path.isdir(colls) + + test = os.path.join(colls, 'test') + assert os.path.isdir(test) + + self._check_dirs(test, ['cdx', 'warcs', 'static', 'templates']) + + def test_add_warcs(self): + warc1 = os.path.join(get_test_dir(), 'warcs', 'example.warc.gz') + + main(['--addwarc', 'test', warc1]) + + self._create_app() + resp = self.testapp.get('/test/20140103030321/http://example.com?example=1') + assert resp.status_int == 200 + + def test_add_more_warcs(self): + warc1 = os.path.join(get_test_dir(), 'warcs', 'iana.warc.gz') + warc2 = 
os.path.join(get_test_dir(), 'warcs', 'example-extra.warc') + + main(['--addwarc', 'test', warc1, warc2]) + + # Spurious file in collections + with open(os.path.join(self.root_dir, 'collections', 'blah'), 'w+b') as fh: + fh.write('foo\n') + + with raises(IOError): + main(['--addwarc', 'test', 'non-existent-file.warc.gz']) + + main(['--addwarc', 'test']) + + main(['--reindex', 'test']) + + self._create_app() + resp = self.testapp.get('/test/20140103030321/http://example.com?example=1') + assert resp.status_int == 200 + + def test_add_static(self): + a_static = os.path.join(self.root_dir, 'collections', 'test', 'static', 'abc.js') + + with open(a_static, 'w+b') as fh: + fh.write('/* Some JS File */') + + self._create_app() + resp = self.testapp.get('/static/test/abc.js') + assert resp.status_int == 200 + assert resp.content_type == 'application/javascript' + assert '/* Some JS File */' in resp.body + + def test_custom_search(self): + a_static = os.path.join(self.root_dir, 'collections', 'test', 'templates', 'search.html') + + with open(a_static, 'w+b') as fh: + fh.write('pywb custom search page') + + self._create_app() + resp = self.testapp.get('/test/') + assert resp.status_int == 200 + assert resp.content_type == 'text/html' + assert 'pywb custom search page' in resp.body + + def test_no_templates(self): + shutil.rmtree(os.path.join(self.root_dir, 'collections', 'test', 'templates')) + + self._create_app() + + resp = self.testapp.get('/test/') + assert resp.status_int == 200 + assert resp.content_type == 'text/html' + assert 'pywb custom search page' not in resp.body + + def test_err_missing_dirs(self): + colls = os.path.join(self.root_dir, 'collections') + + # No CDX + cdx_path = os.path.join(colls, 'test', 'cdx') + shutil.rmtree(cdx_path) + + with raises(Exception): + self._create_app() + + # CDX is a file, not a dir + with open(cdx_path, 'w+b') as fh: + fh.write('foo\n') + + with raises(Exception): + self._create_app() + + shutil.rmtree(colls) + + # No Collections 
+ self._create_app() + resp = self.testapp.get('/test/', status=404) + assert resp.status_int == 404 +