From a32c6f089c61499052a759a2651d934a3de4ada6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 28 Sep 2017 02:08:31 -0700 Subject: [PATCH] auto-all aggregate collection support: (#69) - enabled with 'all_coll' in config or --all-coll cli option, eg. --all-coll all to enable - supported for replay, timemap and cdx endpoints, uses wildcard '*' for coll name with directory aggregator - tests: record/replay tests updated to replay via all collection, check all collection cdxj --- pywb/apps/cli.py | 21 ++++++++---- pywb/apps/frontendapp.py | 12 +++++++ tests/config_test_record.yaml | 2 ++ tests/test_record_replay.py | 64 ++++++++++++++++++++++++++--------- 4 files changed, 77 insertions(+), 22 deletions(-) diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 101be519..53564921 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -34,12 +34,12 @@ class BaseCli(object): parser.add_argument('--debug', action='store_true') parser.add_argument('--profile', action='store_true') + parser.add_argument('--live', action='store_true', help='Add live-web handler at /live') + parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection') - parser.add_argument('--live', action='store_true', help='Add /live handler') - self.desc = desc - self.extra_config = None + self.extra_config = {} self._extend_parser(parser) @@ -62,9 +62,12 @@ class BaseCli(object): def load(self): if self.r.live: - self.extra_config = {'collections': - {'live': {'index': '$live', - 'use_js_obj_proxy': True}}} + self.extra_config['collections'] = {'live': + {'index': '$live', + 'use_js_obj_proxy': True}} + + if self.r.debug: + self.extra_config['debug'] = True def run(self): self.run_gevent() @@ -81,12 +84,18 @@ class ReplayCli(BaseCli): parser.add_argument('-a', '--autoindex', action='store_true') parser.add_argument('--auto-interval', type=int, default=30) + parser.add_argument('--all-coll', help='Set "all" collection') + help_dir='Specify root archive dir (default is current working directory)' parser.add_argument('-d', '--directory', help=help_dir) def load(self): super(ReplayCli, self).load() + + if self.r.all_coll: + self.extra_config['all_coll'] = self.r.all_coll + import os if self.r.directory: #pragma: no cover os.chdir(self.r.directory) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index a799846a..12f1aa56 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -46,6 +46,8 @@ class FrontEndApp(object): self.static_handler = StaticHandler('pywb/static/') + self.all_coll = config.get('all_coll', None) + self.url_map = Map() self.url_map.add(Rule('/static/_//', endpoint=self.serve_static)) self.url_map.add(Rule('/static/', endpoint=self.serve_static)) @@ -60,6 +62,7 @@ class FrontEndApp(object): self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page)) self.url_map.add(Rule(coll_prefix + '/timemap//', endpoint=self.serve_content)) self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx)) + if self.recorder: self.url_map.add(Rule(coll_prefix + '/record/', endpoint=self.serve_record)) @@ -134,6 +137,9 @@ class FrontEndApp(object): self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath)) def get_metadata(self, coll): + if coll == self.all_coll: + coll = '*' + metadata = {'coll': coll, 'type': 'replay'} @@ -170,6 +176,9 @@ class FrontEndApp(object): def serve_cdx(self, environ, coll='$root'): base_url = self.rewriterapp.paths['cdx-server'] + if coll == self.all_coll: + coll = '*' + cdx_url = base_url.format(coll=coll) if environ.get('QUERY_STRING'): @@ -242,6 +251,9 @@ class FrontEndApp(object): return WbResponse.json_response(result) def is_valid_coll(self, coll): + if coll == self.all_coll: + return True + return (coll in self.warcserver.list_fixed_routes() or coll in self.warcserver.list_dynamic_routes()) diff --git a/tests/config_test_record.yaml b/tests/config_test_record.yaml index a821311e..a98034db 100644 --- a/tests/config_test_record.yaml +++ b/tests/config_test_record.yaml @@ -2,6 +2,8 @@ debug: true collections_root: _test_colls +all_coll: all + recorder: live collections: diff --git a/tests/test_record_replay.py b/tests/test_record_replay.py index 57dd8397..47d8304a 100644 --- a/tests/test_record_replay.py +++ b/tests/test_record_replay.py @@ -1,8 +1,11 @@ from .base_config_test import BaseConfigTest, fmod, CollsDirMixin from pywb.manager.manager import main as manager from pywb.manager.autoindex import AutoIndexer +from pywb.warcserver.test.testutils import to_path + import os import time +import json # ============================================================================ @@ -10,7 +13,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): @classmethod def setup_class(cls): super(TestRecordReplay, cls).setup_class('config_test_record.yaml') - cls.indexer = AutoIndexer(interval=0.25) + cls.indexer = AutoIndexer(interval=0.1) cls.indexer.start() @classmethod @@ -25,45 +28,74 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest): manager(['init', 'test2']) assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test2', 'archive')) - def test_record_1(self, fmod): - fmod_slash = fmod + '/' if fmod else '' - res = self.get('/test/record/mp_/http://httpbin.org/get?A=B', fmod_slash) + def test_record_1(self): + res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?A=B') assert '"A": "B"' in res.text def test_replay_1(self, fmod): - time.sleep(0.5) + self.ensure_empty() fmod_slash = fmod + '/' if fmod else '' - res = self.get('/test/mp_/http://httpbin.org/get?A=B', fmod_slash) + res = self.get('/test/{0}http://httpbin.org/get?A=B', fmod_slash) assert '"A": "B"' in res.text - def test_record_2(self, fmod): - fmod_slash = fmod + '/' if fmod else '' - res = self.get('/test2/record/{0}http://httpbin.org/get?C=D', fmod_slash) + def test_record_2(self): + res = self.testapp.get('/test2/record/mp_/http://httpbin.org/get?C=D') assert '"C": "D"' in res.text def test_replay_2(self, fmod): - time.sleep(0.5) + self.ensure_empty() fmod_slash = fmod + '/' if fmod else '' res = self.get('/test2/{0}http://httpbin.org/get?C=D', fmod_slash) assert '"C": "D"' in res.text - def test_record_again_1(self, fmod): - fmod_slash = fmod + '/' if fmod else '' - res = self.get('/test/record/{0}http://httpbin.org/get?C=D', fmod_slash) + def test_record_again_1(self): + res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?C=D') assert '"C": "D"' in res.text def test_replay_again_1(self, fmod): - time.sleep(0.5) + self.ensure_empty() fmod_slash = fmod + '/' if fmod else '' res = self.get('/test/{0}http://httpbin.org/get?C=D', fmod_slash) assert '"C": "D"' in res.text - # two warcs, for framed and non-framed capture - assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 2 + assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 1 assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'indexes'))) == 1 + def ensure_empty(self): + while not self.app.recorder.write_queue.empty(): + time.sleep(0.1) + + time.sleep(0.4) + + def test_replay_all_coll(self, fmod): + self.ensure_empty() + + fmod_slash = fmod + '/' if fmod else '' + + res = self.get('/all/{0}http://httpbin.org/get?C=D', fmod_slash) + assert '"C": "D"' in res.text + + res = self.get('/all/mp_/http://httpbin.org/get?A=B', fmod_slash) + assert '"A": "B"' in res.text + + def test_cdx_all_coll(self): + res = self.testapp.get('/all/cdx?url=http://httpbin.org/get*&output=json') + + cdxj_lines = [json.loads(line) for line in res.text.rstrip().split('\n')] + + assert len(cdxj_lines) == 3 + + assert cdxj_lines[0]['url'] == 'http://httpbin.org/get?A=B' + assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D' + assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D' + + assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj') + assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj') + assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj') + + assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']