mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
auto-all aggregate collection support: (#69)
- enabled with 'all_coll' in config or the --all-coll CLI option, e.g. `--all-coll all` to enable
- supported for replay, timemap and cdx endpoints; uses wildcard '*' for the coll name with the directory aggregator
- tests: record/replay tests updated to replay via the 'all' collection and to check the 'all' collection cdxj
This commit is contained in:
parent
5791980132
commit
a32c6f089c
@ -34,12 +34,12 @@ class BaseCli(object):
|
||||
parser.add_argument('--debug', action='store_true')
|
||||
parser.add_argument('--profile', action='store_true')
|
||||
|
||||
parser.add_argument('--live', action='store_true', help='Add live-web handler at /live')
|
||||
|
||||
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
|
||||
|
||||
parser.add_argument('--live', action='store_true', help='Add /live handler')
|
||||
|
||||
self.desc = desc
|
||||
self.extra_config = None
|
||||
self.extra_config = {}
|
||||
|
||||
self._extend_parser(parser)
|
||||
|
||||
@ -62,9 +62,12 @@ class BaseCli(object):
|
||||
|
||||
def load(self):
|
||||
if self.r.live:
|
||||
self.extra_config = {'collections':
|
||||
{'live': {'index': '$live',
|
||||
'use_js_obj_proxy': True}}}
|
||||
self.extra_config['collections'] = {'live':
|
||||
{'index': '$live',
|
||||
'use_js_obj_proxy': True}}
|
||||
|
||||
if self.r.debug:
|
||||
self.extra_config['debug'] = True
|
||||
|
||||
def run(self):
|
||||
self.run_gevent()
|
||||
@ -81,12 +84,18 @@ class ReplayCli(BaseCli):
|
||||
parser.add_argument('-a', '--autoindex', action='store_true')
|
||||
parser.add_argument('--auto-interval', type=int, default=30)
|
||||
|
||||
parser.add_argument('--all-coll', help='Set "all" collection')
|
||||
|
||||
help_dir='Specify root archive dir (default is current working directory)'
|
||||
parser.add_argument('-d', '--directory', help=help_dir)
|
||||
|
||||
|
||||
def load(self):
|
||||
super(ReplayCli, self).load()
|
||||
|
||||
if self.r.all_coll:
|
||||
self.extra_config['all_coll'] = self.r.all_coll
|
||||
|
||||
import os
|
||||
if self.r.directory: #pragma: no cover
|
||||
os.chdir(self.r.directory)
|
||||
|
@ -46,6 +46,8 @@ class FrontEndApp(object):
|
||||
|
||||
self.static_handler = StaticHandler('pywb/static/')
|
||||
|
||||
self.all_coll = config.get('all_coll', None)
|
||||
|
||||
self.url_map = Map()
|
||||
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
|
||||
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
|
||||
@ -60,6 +62,7 @@ class FrontEndApp(object):
|
||||
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
|
||||
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
|
||||
|
||||
if self.recorder:
|
||||
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
|
||||
|
||||
@ -134,6 +137,9 @@ class FrontEndApp(object):
|
||||
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
|
||||
|
||||
def get_metadata(self, coll):
|
||||
if coll == self.all_coll:
|
||||
coll = '*'
|
||||
|
||||
metadata = {'coll': coll,
|
||||
'type': 'replay'}
|
||||
|
||||
@ -170,6 +176,9 @@ class FrontEndApp(object):
|
||||
def serve_cdx(self, environ, coll='$root'):
|
||||
base_url = self.rewriterapp.paths['cdx-server']
|
||||
|
||||
if coll == self.all_coll:
|
||||
coll = '*'
|
||||
|
||||
cdx_url = base_url.format(coll=coll)
|
||||
|
||||
if environ.get('QUERY_STRING'):
|
||||
@ -242,6 +251,9 @@ class FrontEndApp(object):
|
||||
return WbResponse.json_response(result)
|
||||
|
||||
def is_valid_coll(self, coll):
|
||||
if coll == self.all_coll:
|
||||
return True
|
||||
|
||||
return (coll in self.warcserver.list_fixed_routes() or
|
||||
coll in self.warcserver.list_dynamic_routes())
|
||||
|
||||
|
@ -2,6 +2,8 @@ debug: true
|
||||
|
||||
collections_root: _test_colls
|
||||
|
||||
all_coll: all
|
||||
|
||||
recorder: live
|
||||
|
||||
collections:
|
||||
|
@ -1,8 +1,11 @@
|
||||
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
||||
from pywb.manager.manager import main as manager
|
||||
from pywb.manager.autoindex import AutoIndexer
|
||||
from pywb.warcserver.test.testutils import to_path
|
||||
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -10,7 +13,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super(TestRecordReplay, cls).setup_class('config_test_record.yaml')
|
||||
cls.indexer = AutoIndexer(interval=0.25)
|
||||
cls.indexer = AutoIndexer(interval=0.1)
|
||||
cls.indexer.start()
|
||||
|
||||
@classmethod
|
||||
@ -25,45 +28,74 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
||||
manager(['init', 'test2'])
|
||||
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test2', 'archive'))
|
||||
|
||||
def test_record_1(self, fmod):
|
||||
fmod_slash = fmod + '/' if fmod else ''
|
||||
res = self.get('/test/record/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
||||
def test_record_1(self):
|
||||
res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?A=B')
|
||||
assert '"A": "B"' in res.text
|
||||
|
||||
def test_replay_1(self, fmod):
|
||||
time.sleep(0.5)
|
||||
self.ensure_empty()
|
||||
|
||||
fmod_slash = fmod + '/' if fmod else ''
|
||||
res = self.get('/test/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
||||
res = self.get('/test/{0}http://httpbin.org/get?A=B', fmod_slash)
|
||||
assert '"A": "B"' in res.text
|
||||
|
||||
def test_record_2(self, fmod):
|
||||
fmod_slash = fmod + '/' if fmod else ''
|
||||
res = self.get('/test2/record/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||
def test_record_2(self):
|
||||
res = self.testapp.get('/test2/record/mp_/http://httpbin.org/get?C=D')
|
||||
assert '"C": "D"' in res.text
|
||||
|
||||
def test_replay_2(self, fmod):
|
||||
time.sleep(0.5)
|
||||
self.ensure_empty()
|
||||
|
||||
fmod_slash = fmod + '/' if fmod else ''
|
||||
res = self.get('/test2/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||
assert '"C": "D"' in res.text
|
||||
|
||||
def test_record_again_1(self, fmod):
|
||||
fmod_slash = fmod + '/' if fmod else ''
|
||||
res = self.get('/test/record/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||
def test_record_again_1(self):
|
||||
res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?C=D')
|
||||
assert '"C": "D"' in res.text
|
||||
|
||||
def test_replay_again_1(self, fmod):
|
||||
time.sleep(0.5)
|
||||
self.ensure_empty()
|
||||
|
||||
fmod_slash = fmod + '/' if fmod else ''
|
||||
res = self.get('/test/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||
assert '"C": "D"' in res.text
|
||||
|
||||
# two warcs, for framed and non-framed capture
|
||||
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 2
|
||||
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 1
|
||||
|
||||
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'indexes'))) == 1
|
||||
|
||||
def ensure_empty(self):
|
||||
while not self.app.recorder.write_queue.empty():
|
||||
time.sleep(0.1)
|
||||
|
||||
time.sleep(0.4)
|
||||
|
||||
def test_replay_all_coll(self, fmod):
|
||||
self.ensure_empty()
|
||||
|
||||
fmod_slash = fmod + '/' if fmod else ''
|
||||
|
||||
res = self.get('/all/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||
assert '"C": "D"' in res.text
|
||||
|
||||
res = self.get('/all/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
||||
assert '"A": "B"' in res.text
|
||||
|
||||
def test_cdx_all_coll(self):
|
||||
res = self.testapp.get('/all/cdx?url=http://httpbin.org/get*&output=json')
|
||||
|
||||
cdxj_lines = [json.loads(line) for line in res.text.rstrip().split('\n')]
|
||||
|
||||
assert len(cdxj_lines) == 3
|
||||
|
||||
assert cdxj_lines[0]['url'] == 'http://httpbin.org/get?A=B'
|
||||
assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D'
|
||||
assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D'
|
||||
|
||||
assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj')
|
||||
assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
|
||||
|
||||
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user