1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

auto-all aggregate collection support: (#69)

- enabled with 'all_coll' in config or --all-coll cli option, e.g. --all-coll all to enable
- supported for replay, timemap and cdx endpoints, uses wildcard '*' for coll name with directory aggregator
- tests: record/replay tests updated to replay via all collection, check all collection cdxj
This commit is contained in:
Ilya Kreymer 2017-09-28 02:08:31 -07:00
parent 5791980132
commit a32c6f089c
4 changed files with 77 additions and 22 deletions

View File

@ -34,12 +34,12 @@ class BaseCli(object):
parser.add_argument('--debug', action='store_true')
parser.add_argument('--profile', action='store_true')
parser.add_argument('--live', action='store_true', help='Add live-web handler at /live')
parser.add_argument('--proxy', help='Enable HTTP/S Proxy on specified collection')
parser.add_argument('--live', action='store_true', help='Add /live handler')
self.desc = desc
self.extra_config = None
self.extra_config = {}
self._extend_parser(parser)
@ -62,9 +62,12 @@ class BaseCli(object):
def load(self):
if self.r.live:
self.extra_config = {'collections':
{'live': {'index': '$live',
'use_js_obj_proxy': True}}}
self.extra_config['collections'] = {'live':
{'index': '$live',
'use_js_obj_proxy': True}}
if self.r.debug:
self.extra_config['debug'] = True
def run(self):
self.run_gevent()
@ -81,12 +84,18 @@ class ReplayCli(BaseCli):
parser.add_argument('-a', '--autoindex', action='store_true')
parser.add_argument('--auto-interval', type=int, default=30)
parser.add_argument('--all-coll', help='Set "all" collection')
help_dir='Specify root archive dir (default is current working directory)'
parser.add_argument('-d', '--directory', help=help_dir)
def load(self):
super(ReplayCli, self).load()
if self.r.all_coll:
self.extra_config['all_coll'] = self.r.all_coll
import os
if self.r.directory: #pragma: no cover
os.chdir(self.r.directory)

View File

@ -46,6 +46,8 @@ class FrontEndApp(object):
self.static_handler = StaticHandler('pywb/static/')
self.all_coll = config.get('all_coll', None)
self.url_map = Map()
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
@ -60,6 +62,7 @@ class FrontEndApp(object):
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule(coll_prefix + '/cdx', endpoint=self.serve_cdx))
if self.recorder:
self.url_map.add(Rule(coll_prefix + '/record/<path:url>', endpoint=self.serve_record))
@ -134,6 +137,9 @@ class FrontEndApp(object):
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
def get_metadata(self, coll):
if coll == self.all_coll:
coll = '*'
metadata = {'coll': coll,
'type': 'replay'}
@ -170,6 +176,9 @@ class FrontEndApp(object):
def serve_cdx(self, environ, coll='$root'):
base_url = self.rewriterapp.paths['cdx-server']
if coll == self.all_coll:
coll = '*'
cdx_url = base_url.format(coll=coll)
if environ.get('QUERY_STRING'):
@ -242,6 +251,9 @@ class FrontEndApp(object):
return WbResponse.json_response(result)
def is_valid_coll(self, coll):
if coll == self.all_coll:
return True
return (coll in self.warcserver.list_fixed_routes() or
coll in self.warcserver.list_dynamic_routes())

View File

@ -2,6 +2,8 @@ debug: true
collections_root: _test_colls
all_coll: all
recorder: live
collections:

View File

@ -1,8 +1,11 @@
from .base_config_test import BaseConfigTest, fmod, CollsDirMixin
from pywb.manager.manager import main as manager
from pywb.manager.autoindex import AutoIndexer
from pywb.warcserver.test.testutils import to_path
import os
import time
import json
# ============================================================================
@ -10,7 +13,7 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
@classmethod
def setup_class(cls):
super(TestRecordReplay, cls).setup_class('config_test_record.yaml')
cls.indexer = AutoIndexer(interval=0.25)
cls.indexer = AutoIndexer(interval=0.1)
cls.indexer.start()
@classmethod
@ -25,45 +28,74 @@ class TestRecordReplay(CollsDirMixin, BaseConfigTest):
manager(['init', 'test2'])
assert os.path.isdir(os.path.join(self.root_dir, '_test_colls', 'test2', 'archive'))
def test_record_1(self, fmod):
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/record/mp_/http://httpbin.org/get?A=B', fmod_slash)
def test_record_1(self):
res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?A=B')
assert '"A": "B"' in res.text
def test_replay_1(self, fmod):
time.sleep(0.5)
self.ensure_empty()
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/mp_/http://httpbin.org/get?A=B', fmod_slash)
res = self.get('/test/{0}http://httpbin.org/get?A=B', fmod_slash)
assert '"A": "B"' in res.text
def test_record_2(self, fmod):
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test2/record/{0}http://httpbin.org/get?C=D', fmod_slash)
def test_record_2(self):
res = self.testapp.get('/test2/record/mp_/http://httpbin.org/get?C=D')
assert '"C": "D"' in res.text
def test_replay_2(self, fmod):
time.sleep(0.5)
self.ensure_empty()
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test2/{0}http://httpbin.org/get?C=D', fmod_slash)
assert '"C": "D"' in res.text
def test_record_again_1(self, fmod):
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/record/{0}http://httpbin.org/get?C=D', fmod_slash)
def test_record_again_1(self):
res = self.testapp.get('/test/record/mp_/http://httpbin.org/get?C=D')
assert '"C": "D"' in res.text
def test_replay_again_1(self, fmod):
time.sleep(0.5)
self.ensure_empty()
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/test/{0}http://httpbin.org/get?C=D', fmod_slash)
assert '"C": "D"' in res.text
# two warcs, for framed and non-framed capture
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 2
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'archive'))) == 1
assert len(os.listdir(os.path.join(self.root_dir, '_test_colls', 'test', 'indexes'))) == 1
def ensure_empty(self):
while not self.app.recorder.write_queue.empty():
time.sleep(0.1)
time.sleep(0.4)
def test_replay_all_coll(self, fmod):
self.ensure_empty()
fmod_slash = fmod + '/' if fmod else ''
res = self.get('/all/{0}http://httpbin.org/get?C=D', fmod_slash)
assert '"C": "D"' in res.text
res = self.get('/all/mp_/http://httpbin.org/get?A=B', fmod_slash)
assert '"A": "B"' in res.text
def test_cdx_all_coll(self):
res = self.testapp.get('/all/cdx?url=http://httpbin.org/get*&output=json')
cdxj_lines = [json.loads(line) for line in res.text.rstrip().split('\n')]
assert len(cdxj_lines) == 3
assert cdxj_lines[0]['url'] == 'http://httpbin.org/get?A=B'
assert cdxj_lines[1]['url'] == 'http://httpbin.org/get?C=D'
assert cdxj_lines[2]['url'] == 'http://httpbin.org/get?C=D'
assert cdxj_lines[0]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
assert cdxj_lines[1]['source'] == to_path('_test_colls:test2/indexes/autoindex.cdxj')
assert cdxj_lines[2]['source'] == to_path('_test_colls:test/indexes/autoindex.cdxj')
assert cdxj_lines[0]['filename'] == cdxj_lines[2]['filename']