mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdx-server/frontendapp refactor: (#237)
frontendapp/warcserver improvements:
- support '/cdx' endpoint for every collection, exposing standard cdx-server api
- remove '-cdx' endpoint in warcserver, redundant with index and frontend /cdx endpoint
- warcserver: simplify paths! support static paths (/A, /B) + dynamic paths (/<path>) on same endpoint
This commit is contained in:
parent
772993ba53
commit
33eb4a4ae1
@ -22,6 +22,7 @@ from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
import os
|
||||
import traceback
|
||||
import requests
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -33,8 +34,6 @@ class FrontEndApp(object):
|
||||
|
||||
framed_replay = self.warcserver.config.get('framed_replay', True)
|
||||
|
||||
self.rewriterapp = RewriterApp(framed_replay, config=self.warcserver.config)
|
||||
|
||||
self.warcserver_server = GeventServer(self.warcserver, port=0)
|
||||
|
||||
self.static_handler = StaticHandler('pywb/static/')
|
||||
@ -47,16 +46,21 @@ class FrontEndApp(object):
|
||||
if self.is_valid_coll('$root'):
|
||||
self.url_map.add(Rule('/', endpoint=self.serve_coll_page))
|
||||
self.url_map.add(Rule('/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
self.url_map.add(Rule('/cdx', endpoint=self.serve_cdx))
|
||||
self.url_map.add(Rule('/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
else:
|
||||
self.url_map.add(Rule('/<coll>/', endpoint=self.serve_coll_page))
|
||||
self.url_map.add(Rule('/<coll>/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
|
||||
self.url_map.add(Rule('/<coll>/cdx', endpoint=self.serve_cdx))
|
||||
self.url_map.add(Rule('/<coll>/<path:url>', endpoint=self.serve_content))
|
||||
|
||||
self.url_map.add(Rule('/', endpoint=self.serve_home))
|
||||
|
||||
self.rewriterapp.paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
|
||||
self.rewriterapp = RewriterApp(framed_replay,
|
||||
config=self.warcserver.config,
|
||||
paths=upstream_paths)
|
||||
|
||||
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
|
||||
self.static_dir = self.warcserver.config.get('static_dir', 'static')
|
||||
@ -65,8 +69,9 @@ class FrontEndApp(object):
|
||||
self.metadata_cache = MetadataCache(metadata_templ)
|
||||
|
||||
def get_upstream_paths(self, port):
|
||||
return {'replay-dyn': 'http://localhost:%s/_/resource/postreq?param.coll={coll}' % port,
|
||||
'replay-fixed': 'http://localhost:%s/{coll}/resource/postreq' % port
|
||||
return {
|
||||
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
|
||||
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
|
||||
}
|
||||
|
||||
def serve_home(self, environ):
|
||||
@ -98,17 +103,16 @@ class FrontEndApp(object):
|
||||
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
|
||||
|
||||
def get_metadata(self, coll):
|
||||
metadata = {'coll': coll}
|
||||
metadata = {'coll': coll,
|
||||
'type': 'replay'}
|
||||
|
||||
if self.warcserver.config.get('use_js_obj_proxy'):
|
||||
metadata['use_js_obj_proxy'] = True
|
||||
|
||||
if coll in self.warcserver.list_fixed_routes():
|
||||
metadata.update(self.warcserver.get_coll_config(coll))
|
||||
metadata['type'] = 'replay-fixed'
|
||||
else:
|
||||
metadata.update(self.metadata_cache.load(coll))
|
||||
metadata['type'] = 'replay-dyn'
|
||||
|
||||
return metadata
|
||||
|
||||
@ -132,6 +136,22 @@ class FrontEndApp(object):
|
||||
|
||||
return WbResponse.text_response(content, content_type='text/html; charset="utf-8"')
|
||||
|
||||
def serve_cdx(self, environ, coll='$root'):
|
||||
base_url = self.rewriterapp.paths['cdx-server']
|
||||
|
||||
cdx_url = base_url.format(coll=coll)
|
||||
|
||||
if environ.get('QUERY_STRING'):
|
||||
cdx_url += '&' if '?' in cdx_url else '?'
|
||||
cdx_url += environ.get('QUERY_STRING')
|
||||
|
||||
try:
|
||||
res = requests.get(cdx_url, stream=True)
|
||||
return WbResponse.bin_stream(res.raw, content_type=res.headers.get('Content-Type'))
|
||||
|
||||
except Exception as e:
|
||||
return WbResponse.text_content('Error: ' + str(e), status='400 Bad Request')
|
||||
|
||||
def serve_content(self, environ, coll='$root', url='', timemap_output=''):
|
||||
if not self.is_valid_coll(coll):
|
||||
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
|
||||
|
@ -48,11 +48,11 @@ class UpstreamException(WbException):
|
||||
class RewriterApp(object):
|
||||
VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
|
||||
|
||||
def __init__(self, framed_replay=False, jinja_env=None, config=None):
|
||||
def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
|
||||
self.loader = ArcWarcRecordLoader()
|
||||
|
||||
self.config = config or {}
|
||||
self.paths = {}
|
||||
self.paths = paths or {}
|
||||
|
||||
self.framed_replay = framed_replay
|
||||
|
||||
|
@ -26,16 +26,20 @@ class BaseWarcServer(object):
|
||||
|
||||
self.url_map.add(Rule('/', endpoint=list_routes))
|
||||
|
||||
def add_route(self, path, handler):
|
||||
def direct_input_request(environ, mode=''):
|
||||
def add_route(self, path, handler, path_param_name=''):
|
||||
def direct_input_request(environ, mode='', path_param_value=''):
|
||||
params = self.get_query_dict(environ)
|
||||
params['mode'] = mode
|
||||
if path_param_value:
|
||||
params[path_param_name] = path_param_value
|
||||
params['_input_req'] = DirectWSGIInputRequest(environ)
|
||||
return handler(params)
|
||||
|
||||
def post_fullrequest(environ, mode=''):
|
||||
def post_fullrequest(environ, mode='', path_param_value=''):
|
||||
params = self.get_query_dict(environ)
|
||||
params['mode'] = mode
|
||||
if path_param_value:
|
||||
params[path_param_name] = path_param_value
|
||||
params['_input_req'] = POSTInputRequest(environ)
|
||||
return handler(params)
|
||||
|
||||
|
@ -55,16 +55,14 @@ class WarcServer(BaseWarcServer):
|
||||
super(WarcServer, self).__init__(debug=config.get('debug', False))
|
||||
self.config = config
|
||||
|
||||
if self.config.get('enable_auto_colls', True):
|
||||
auto_handler = self.load_auto_colls()
|
||||
self.add_route('/_', auto_handler)
|
||||
|
||||
self.fixed_routes = self.load_colls()
|
||||
|
||||
for name, route in iteritems(self.fixed_routes):
|
||||
self.add_route('/' + name, route)
|
||||
|
||||
self._add_simple_route('/<coll>-cdx', self.cdx_compat)
|
||||
if self.config.get('enable_auto_colls', True):
|
||||
auto_handler = self.load_auto_colls()
|
||||
self.add_route('/<path:path_param_value>', auto_handler, path_param_name='param.coll')
|
||||
|
||||
def _lookup(self, environ, path):
|
||||
urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path)
|
||||
@ -77,17 +75,6 @@ class WarcServer(BaseWarcServer):
|
||||
print(e)
|
||||
return None
|
||||
|
||||
def cdx_compat(self, environ, coll=''):
|
||||
""" -cdx server api
|
||||
"""
|
||||
result = self._lookup(environ, '/{0}/index'.format(coll))
|
||||
if result:
|
||||
return result
|
||||
|
||||
environ['QUERY_STRING'] += '&param.coll=' + coll
|
||||
result = self._lookup(environ, '/_/index')
|
||||
return result
|
||||
|
||||
def load_auto_colls(self):
|
||||
self.root_dir = self.config.get('collections_root', '')
|
||||
if not self.root_dir:
|
||||
|
@ -3,6 +3,6 @@ debug: true
|
||||
collections_root: _test_colls
|
||||
|
||||
collections:
|
||||
'$root': $live
|
||||
'$root': '$live'
|
||||
|
||||
|
||||
|
@ -24,7 +24,7 @@ class TestCDXApp(BaseTestClass):
|
||||
|
||||
def query(self, url, is_error=False, **params):
|
||||
params['url'] = url
|
||||
return self.testapp.get('/pywb-cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
||||
return self.testapp.get('/pywb/index?' + urlencode(params, doseq=1), expect_errors=is_error)
|
||||
|
||||
def test_exact_url(self):
|
||||
"""
|
||||
|
@ -428,15 +428,15 @@ class TestWbIntegration(BaseConfigTest):
|
||||
resp = self.testapp.get('/static/notfound.css', status = 404)
|
||||
assert resp.status_int == 404
|
||||
|
||||
def _test_cdx_server_filters(self):
|
||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz')
|
||||
self._assert_basic_text(resp)
|
||||
def test_cdx_server_filters(self):
|
||||
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz')
|
||||
assert resp.content_type == 'text/x-cdxj'
|
||||
actual_len = len(resp.text.rstrip().split('\n'))
|
||||
assert actual_len == 1, actual_len
|
||||
|
||||
def _test_cdx_server_advanced(self):
|
||||
def test_cdx_server_advanced(self):
|
||||
# combine collapsing, reversing and revisit resolving
|
||||
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
|
||||
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
|
||||
|
||||
# convert back to CDXObject
|
||||
cdxs = list(map(CDXObject, resp.body.rstrip().split(b'\n')))
|
||||
|
@ -37,3 +37,9 @@ class TestRootColl(BaseConfigTest):
|
||||
resp = self.testapp.get('/')
|
||||
assert 'Search' in resp.text
|
||||
|
||||
def test_root_cdx(self):
|
||||
resp = self.testapp.get('/cdx?url=http://www.iana.org/&output=json&limit=1')
|
||||
resp.content_type = 'application/json'
|
||||
assert resp.json['is_live'] == 'true'
|
||||
assert resp.json['url'] == 'http://www.iana.org/'
|
||||
assert resp.json['source'] == '$root'
|
||||
|
Loading…
x
Reference in New Issue
Block a user