From 33eb4a4ae119cd323ce90bc41ce14b9c3827cc3f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Sep 2017 23:25:30 -0700 Subject: [PATCH] cdx-server/frontendapp refactor: (#237) frontendapp/warcserver improvements: - support '/cdx' endpoint for every collection, exposing standard cdx-server api - remove '-cdx' endpoint in warcserver, redundant with index and frontend /cdx endpoint - warcserver: simplify paths! support static paths (/A, /B) + dynamic paths (/) on same endpoint --- pywb/apps/frontendapp.py | 36 ++++++++++++++++++++++++------- pywb/apps/rewriterapp.py | 4 ++-- pywb/warcserver/basewarcserver.py | 10 ++++++--- pywb/warcserver/warcserver.py | 19 +++------------- tests/config_test_root_coll.yaml | 2 +- tests/test_cdx_server_app.py | 2 +- tests/test_integration.py | 10 ++++----- tests/test_root_coll.py | 6 ++++++ 8 files changed, 53 insertions(+), 36 deletions(-) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index c96eef49..a730fb4b 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -22,6 +22,7 @@ from pywb.apps.wbrequestresponse import WbResponse import os import traceback +import requests # ============================================================================ @@ -33,8 +34,6 @@ class FrontEndApp(object): framed_replay = self.warcserver.config.get('framed_replay', True) - self.rewriterapp = RewriterApp(framed_replay, config=self.warcserver.config) - self.warcserver_server = GeventServer(self.warcserver, port=0) self.static_handler = StaticHandler('pywb/static/') @@ -47,16 +46,21 @@ class FrontEndApp(object): if self.is_valid_coll('$root'): self.url_map.add(Rule('/', endpoint=self.serve_coll_page)) self.url_map.add(Rule('/timemap//', endpoint=self.serve_content)) + self.url_map.add(Rule('/cdx', endpoint=self.serve_cdx)) self.url_map.add(Rule('/', endpoint=self.serve_content)) else: self.url_map.add(Rule('//', endpoint=self.serve_coll_page)) self.url_map.add(Rule('//timemap//', 
endpoint=self.serve_content)) + self.url_map.add(Rule('//cdx', endpoint=self.serve_cdx)) self.url_map.add(Rule('//', endpoint=self.serve_content)) self.url_map.add(Rule('/', endpoint=self.serve_home)) - self.rewriterapp.paths = self.get_upstream_paths(self.warcserver_server.port) + upstream_paths = self.get_upstream_paths(self.warcserver_server.port) + self.rewriterapp = RewriterApp(framed_replay, + config=self.warcserver.config, + paths=upstream_paths) self.templates_dir = self.warcserver.config.get('templates_dir', 'templates') self.static_dir = self.warcserver.config.get('static_dir', 'static') @@ -65,8 +69,9 @@ class FrontEndApp(object): self.metadata_cache = MetadataCache(metadata_templ) def get_upstream_paths(self, port): - return {'replay-dyn': 'http://localhost:%s/_/resource/postreq?param.coll={coll}' % port, - 'replay-fixed': 'http://localhost:%s/{coll}/resource/postreq' % port + return { + 'replay': 'http://localhost:%s/{coll}/resource/postreq' % port, + 'cdx-server': 'http://localhost:%s/{coll}/index' % port, } def serve_home(self, environ): @@ -98,17 +103,16 @@ class FrontEndApp(object): self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath)) def get_metadata(self, coll): - metadata = {'coll': coll} + metadata = {'coll': coll, + 'type': 'replay'} if self.warcserver.config.get('use_js_obj_proxy'): metadata['use_js_obj_proxy'] = True if coll in self.warcserver.list_fixed_routes(): metadata.update(self.warcserver.get_coll_config(coll)) - metadata['type'] = 'replay-fixed' else: metadata.update(self.metadata_cache.load(coll)) - metadata['type'] = 'replay-dyn' return metadata @@ -132,6 +136,22 @@ class FrontEndApp(object): return WbResponse.text_response(content, content_type='text/html; charset="utf-8"') + def serve_cdx(self, environ, coll='$root'): + base_url = self.rewriterapp.paths['cdx-server'] + + cdx_url = base_url.format(coll=coll) + + if environ.get('QUERY_STRING'): + cdx_url += '&' if '?' in cdx_url else '?' 
+ cdx_url += environ.get('QUERY_STRING') + + try: + res = requests.get(cdx_url, stream=True) + return WbResponse.bin_stream(res.raw, content_type=res.headers.get('Content-Type')) + + except Exception as e: + return WbResponse.text_content('Error: ' + str(e), status='400 Bad Request') + def serve_content(self, environ, coll='$root', url='', timemap_output=''): if not self.is_valid_coll(coll): self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll)) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 6531db49..49e29c63 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -48,11 +48,11 @@ class UpstreamException(WbException): class RewriterApp(object): VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json' - def __init__(self, framed_replay=False, jinja_env=None, config=None): + def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None): self.loader = ArcWarcRecordLoader() self.config = config or {} - self.paths = {} + self.paths = paths or {} self.framed_replay = framed_replay diff --git a/pywb/warcserver/basewarcserver.py b/pywb/warcserver/basewarcserver.py index d6e62530..5b159aaf 100644 --- a/pywb/warcserver/basewarcserver.py +++ b/pywb/warcserver/basewarcserver.py @@ -26,16 +26,20 @@ class BaseWarcServer(object): self.url_map.add(Rule('/', endpoint=list_routes)) - def add_route(self, path, handler): - def direct_input_request(environ, mode=''): + def add_route(self, path, handler, path_param_name=''): + def direct_input_request(environ, mode='', path_param_value=''): params = self.get_query_dict(environ) params['mode'] = mode + if path_param_value: + params[path_param_name] = path_param_value params['_input_req'] = DirectWSGIInputRequest(environ) return handler(params) - def post_fullrequest(environ, mode=''): + def post_fullrequest(environ, mode='', path_param_value=''): params = self.get_query_dict(environ) params['mode'] = mode + if path_param_value: + params[path_param_name] = 
path_param_value params['_input_req'] = POSTInputRequest(environ) return handler(params) diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index c4a0dcce..863c5028 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -55,16 +55,14 @@ class WarcServer(BaseWarcServer): super(WarcServer, self).__init__(debug=config.get('debug', False)) self.config = config - if self.config.get('enable_auto_colls', True): - auto_handler = self.load_auto_colls() - self.add_route('/_', auto_handler) - self.fixed_routes = self.load_colls() for name, route in iteritems(self.fixed_routes): self.add_route('/' + name, route) - self._add_simple_route('/-cdx', self.cdx_compat) + if self.config.get('enable_auto_colls', True): + auto_handler = self.load_auto_colls() + self.add_route('/', auto_handler, path_param_name='param.coll') def _lookup(self, environ, path): urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path) @@ -77,17 +75,6 @@ class WarcServer(BaseWarcServer): print(e) return None - def cdx_compat(self, environ, coll=''): - """ -cdx server api - """ - result = self._lookup(environ, '/{0}/index'.format(coll)) - if result: - return result - - environ['QUERY_STRING'] += '&param.coll=' + coll - result = self._lookup(environ, '/_/index') - return result - def load_auto_colls(self): self.root_dir = self.config.get('collections_root', '') if not self.root_dir: diff --git a/tests/config_test_root_coll.yaml b/tests/config_test_root_coll.yaml index 50f4e238..4c467f0f 100644 --- a/tests/config_test_root_coll.yaml +++ b/tests/config_test_root_coll.yaml @@ -3,6 +3,6 @@ debug: true collections_root: _test_colls collections: - '$root': $live + '$root': '$live' diff --git a/tests/test_cdx_server_app.py b/tests/test_cdx_server_app.py index 762efb62..6154bf90 100644 --- a/tests/test_cdx_server_app.py +++ b/tests/test_cdx_server_app.py @@ -24,7 +24,7 @@ class TestCDXApp(BaseTestClass): def query(self, url, is_error=False, **params): params['url'] = 
url - return self.testapp.get('/pywb-cdx?' + urlencode(params, doseq=1), expect_errors=is_error) + return self.testapp.get('/pywb/index?' + urlencode(params, doseq=1), expect_errors=is_error) def test_exact_url(self): """ diff --git a/tests/test_integration.py b/tests/test_integration.py index 10214419..faec4196 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -428,15 +428,15 @@ class TestWbIntegration(BaseConfigTest): resp = self.testapp.get('/static/notfound.css', status = 404) assert resp.status_int == 404 - def _test_cdx_server_filters(self): - resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz') - self._assert_basic_text(resp) + def test_cdx_server_filters(self): + resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz') + assert resp.content_type == 'text/x-cdxj' actual_len = len(resp.text.rstrip().split('\n')) assert actual_len == 1, actual_len - def _test_cdx_server_advanced(self): + def test_cdx_server_advanced(self): # combine collapsing, reversing and revisit resolving - resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true') + resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true') # convert back to CDXObject cdxs = list(map(CDXObject, resp.body.rstrip().split(b'\n'))) diff --git a/tests/test_root_coll.py b/tests/test_root_coll.py index 3639a604..94b64eef 100644 --- a/tests/test_root_coll.py +++ b/tests/test_root_coll.py @@ -37,3 +37,9 @@ class TestRootColl(BaseConfigTest): resp = self.testapp.get('/') assert 'Search' in resp.text + def test_root_cdx(self): + resp = self.testapp.get('/cdx?url=http://www.iana.org/&output=json&limit=1') + resp.content_type = 'application/json' + assert resp.json['is_live'] == 
'true' + assert resp.json['url'] == 'http://www.iana.org/' + assert resp.json['source'] == '$root'