mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
auto-all aggregate collection support: (#69)
- enabled with 'all_coll' in config or the --all-coll CLI option, e.g. '--all-coll all' to enable
- supported for replay, timemap and cdx endpoints; uses wildcard '*' for the coll name with the directory aggregator
- tests: record/replay tests updated to replay via the 'all' collection and to check the 'all' collection cdxj
This commit is contained in:
parent
5791980132
commit
a32c6f089c
@ -34,12 +34,12 @@ class BaseCli(object):
|
|||||||
parser.add_argument('--debug', action='store_true')
|
parser.add_argument('--debug', action='store_true')
|
||||||
parser.add_argument('--profile', action='store_true')
|
parser.add_argument('--profile', action='store_true')
|
||||||
|
|
||||||
|
parser.add_argument('--live', action='store_true', help='Add live-web handler at /live')
|
||||||
|
|
||||||
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
|
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
|
||||||
|
|
||||||
parser.add_argument('--live', action='store_true', help='Add /live handler')
|
|
||||||
|
|
||||||
self.desc = desc
|
self.desc = desc
|
||||||
self.extra_config = None
|
self.extra_config = {}
|
||||||
|
|
||||||
self._extend_parser(parser)
|
self._extend_parser(parser)
|
||||||
|
|
||||||
@ -62,9 +62,12 @@ class BaseCli(object):
|
|||||||
|
|
||||||
def load(self):
|
def load(self):
|
||||||
if self.r.live:
|
if self.r.live:
|
||||||
self.extra_config = {'collections':
|
self.extra_config['collections'] = {'live':
|
||||||
{'live': {'index': '$live',
|
{'index': '$live',
|
||||||
'use_js_obj_proxy': True}}}
|
'use_js_obj_proxy': True}}
|
||||||
|
|
||||||
|
if self.r.debug:
|
||||||
|
self.extra_config['debug'] = True
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.run_gevent()
|
self.run_gevent()
|
||||||
@ -81,12 +84,18 @@ class ReplayCli(BaseCli):
|
|||||||
parser.add_argument('-a', '--autoindex', action='store_true')
|
parser.add_argument('-a', '--autoindex', action='store_true')
|
||||||
parser.add_argument('--auto-interval', type=int, default=30)
|
parser.add_argument('--auto-interval', type=int, default=30)
|
||||||
|
|
||||||
|
parser.add_argument('--all-coll', help='Set "all" collection')
|
||||||
|
|
||||||
help_dir='Specify root archive dir (default is current working directory)'
|
help_dir='Specify root archive dir (default is current working directory)'
|
||||||
parser.add_argument('-d', '--directory', help=help_dir)
|
parser.add_argument('-d', '--directory', help=help_dir)
|
||||||
|
|
||||||
|
|
||||||
def load(self):
|
def load(self):
|
||||||
super(ReplayCli, self).load()
|
super(ReplayCli, self).load()
|
||||||
|
|
||||||
|
if self.r.all_coll:
|
||||||
|
self.extra_config['all_coll'] = self.r.all_coll
|
||||||
|
|
||||||
import os
|
import os
|
||||||
if self.r.directory: #pragma: no cover
|
if self.r.directory: #pragma: no cover
|
||||||
os.chdir(self.r.directory)
|
os.chdir(self.r.directory)
|
||||||
|
@ -46,6 +46,8 @@ class FrontEndApp(object):
|
|||||||
|
|
||||||
self.static_handler = StaticHandler('pywb/static/')
|
self.static_handler = StaticHandler('pywb/static/')
|
||||||
|
|
||||||
|
self.all_coll = config.get('all_coll', None)
|
||||||
|
|
||||||
self.url_map = Map()
|
self.url_map = Map()
|
||||||
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
|
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
|
||||||
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
|
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
|
||||||
@ -60,6 +62,7 @@ class FrontEndApp(object):
|
|||||||
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
|
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
|
||||||
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||||
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
|
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
|
||||||
|
|
||||||
if self.recorder:
|
if self.recorder:
|
||||||
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
|
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
|
||||||
|
|
||||||
@ -134,6 +137,9 @@ class FrontEndApp(object):
|
|||||||
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
|
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
|
||||||
|
|
||||||
def get_metadata(self, coll):
|
def get_metadata(self, coll):
|
||||||
|
if coll == self.all_coll:
|
||||||
|
coll = '*'
|
||||||
|
|
||||||
metadata = {'coll': coll,
|
metadata = {'coll': coll,
|
||||||
'type': 'replay'}
|
'type': 'replay'}
|
||||||
|
|
||||||
@ -170,6 +176,9 @@ class FrontEndApp(object):
|
|||||||
def serve_cdx(self, environ, coll='$root'):
|
def serve_cdx(self, environ, coll='$root'):
|
||||||
base_url = self.rewriterapp.paths['cdx-server']
|
base_url = self.rewriterapp.paths['cdx-server']
|
||||||
|
|
||||||
|
if coll == self.all_coll:
|
||||||
|
coll = '*'
|
||||||
|
|
||||||
cdx_url = base_url.format(coll=coll)
|
cdx_url = base_url.format(coll=coll)
|
||||||
|
|
||||||
if environ.get('QUERY_STRING'):
|
if environ.get('QUERY_STRING'):
|
||||||
@ -242,6 +251,9 @@ class FrontEndApp(object):
|
|||||||
return WbResponse.json_response(result)
|
return WbResponse.json_response(result)
|
||||||
|
|
||||||
def is_valid_coll(self, coll):
|
def is_valid_coll(self, coll):
|
||||||
|
if coll == self.all_coll:
|
||||||
|
return True
|
||||||
|
|
||||||
return (coll in self.warcserver.list_fixed_routes() or
|
return (coll in self.warcserver.list_fixed_routes() or
|
||||||
coll in self.warcserver.list_dynamic_routes())
|
coll in self.warcserver.list_dynamic_routes())
|
||||||
|
|
||||||
|
@ -2,6 +2,8 @@ debug: true
|
|||||||
|
|
||||||
collections_root: _test_colls
|
collections_root: _test_colls
|
||||||
|
|
||||||
|
all_coll: all
|
||||||
|
|
||||||
recorder: live
|
recorder: live
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
|
@ -1,8 +1,11 @@
|
|||||||
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
|
||||||
from pywb.manager.manager import main as manager
|
from pywb.manager.manager import main as manager
|
||||||
from pywb.manager.autoindex import AutoIndexer
|
from pywb.manager.autoindex import AutoIndexer
|
||||||
|
from pywb.warcserver.test.testutils import to_path
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -10,7 +13,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
super(TestRecordReplay, cls).setup_class('config_test_record.yaml')
|
super(TestRecordReplay, cls).setup_class('config_test_record.yaml')
|
||||||
cls.indexer = AutoIndexer(interval=0.25)
|
cls.indexer = AutoIndexer(interval=0.1)
|
||||||
cls.indexer.start()
|
cls.indexer.start()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -25,45 +28,74 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
|
|||||||
manager(['init', 'test2'])
|
manager(['init', 'test2'])
|
||||||
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test2', 'archive'))
|
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test2', 'archive'))
|
||||||
|
|
||||||
def test_record_1(self, fmod):
|
def test_record_1(self):
|
||||||
fmod_slash = fmod + '/' if fmod else ''
|
res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?A=B')
|
||||||
res = self.get('/test/record/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
|
||||||
assert '"A": "B"' in res.text
|
assert '"A": "B"' in res.text
|
||||||
|
|
||||||
def test_replay_1(self, fmod):
|
def test_replay_1(self, fmod):
|
||||||
time.sleep(0.5)
|
self.ensure_empty()
|
||||||
|
|
||||||
fmod_slash = fmod + '/' if fmod else ''
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
res = self.get('/test/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
res = self.get('/test/{0}http://httpbin.org/get?A=B', fmod_slash)
|
||||||
assert '"A": "B"' in res.text
|
assert '"A": "B"' in res.text
|
||||||
|
|
||||||
def test_record_2(self, fmod):
|
def test_record_2(self):
|
||||||
fmod_slash = fmod + '/' if fmod else ''
|
res = self.testapp.get('/test2/record/mp_/http://httpbin.org/get?C=D')
|
||||||
res = self.get('/test2/record/{0}http://httpbin.org/get?C=D', fmod_slash)
|
|
||||||
assert '"C": "D"' in res.text
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
def test_replay_2(self, fmod):
|
def test_replay_2(self, fmod):
|
||||||
time.sleep(0.5)
|
self.ensure_empty()
|
||||||
|
|
||||||
fmod_slash = fmod + '/' if fmod else ''
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
res = self.get('/test2/{0}http://httpbin.org/get?C=D', fmod_slash)
|
res = self.get('/test2/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||||
assert '"C": "D"' in res.text
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
def test_record_again_1(self, fmod):
|
def test_record_again_1(self):
|
||||||
fmod_slash = fmod + '/' if fmod else ''
|
res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?C=D')
|
||||||
res = self.get('/test/record/{0}http://httpbin.org/get?C=D', fmod_slash)
|
|
||||||
assert '"C": "D"' in res.text
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
def test_replay_again_1(self, fmod):
|
def test_replay_again_1(self, fmod):
|
||||||
time.sleep(0.5)
|
self.ensure_empty()
|
||||||
|
|
||||||
fmod_slash = fmod + '/' if fmod else ''
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
res = self.get('/test/{0}http://httpbin.org/get?C=D', fmod_slash)
|
res = self.get('/test/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||||
assert '"C": "D"' in res.text
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
# two warcs, for framed and non-framed capture
|
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 1
|
||||||
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 2
|
|
||||||
|
|
||||||
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'indexes'))) == 1
|
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'indexes'))) == 1
|
||||||
|
|
||||||
|
def ensure_empty(self):
|
||||||
|
while not self.app.recorder.write_queue.empty():
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
time.sleep(0.4)
|
||||||
|
|
||||||
|
def test_replay_all_coll(self, fmod):
|
||||||
|
self.ensure_empty()
|
||||||
|
|
||||||
|
fmod_slash = fmod + '/' if fmod else ''
|
||||||
|
|
||||||
|
res = self.get('/all/{0}http://httpbin.org/get?C=D', fmod_slash)
|
||||||
|
assert '"C": "D"' in res.text
|
||||||
|
|
||||||
|
res = self.get('/all/mp_/http://httpbin.org/get?A=B', fmod_slash)
|
||||||
|
assert '"A": "B"' in res.text
|
||||||
|
|
||||||
|
def test_cdx_all_coll(self):
|
||||||
|
res = self.testapp.get('/all/cdx?url=http://httpbin.org/get*&output=json')
|
||||||
|
|
||||||
|
cdxj_lines = [json.loads(line) for line in res.text.rstrip().split('\n')]
|
||||||
|
|
||||||
|
assert len(cdxj_lines) == 3
|
||||||
|
|
||||||
|
assert cdxj_lines[0]['url'] == 'http://httpbin.org/get?A=B'
|
||||||
|
assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D'
|
||||||
|
assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D'
|
||||||
|
|
||||||
|
assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
|
||||||
|
assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj')
|
||||||
|
assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
|
||||||
|
|
||||||
|
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user