1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx-server/frontendapp refactor: (#237)

frontendapp/warcserver improvements:
- support '/cdx' endpoint for every collection, exposing standard cdx-server api
- remove '-cdx' endpoint in warcserver, redundant with index and frontend /cdx endpoint
- warcserver: simplify paths! support static paths (/A, /B) + dynamic paths (/<path>) on same endpoint
This commit is contained in:
Ilya Kreymer 2017-09-06 23:25:30 -07:00 committed by GitHub
parent 772993ba53
commit 33eb4a4ae1
8 changed files with 53 additions and 36 deletions

View File

@ -22,6 +22,7 @@ from pywb.apps.wbrequestresponse import WbResponse
import os
import traceback
import requests
# ============================================================================
@ -33,8 +34,6 @@ class FrontEndApp(object):
framed_replay = self.warcserver.config.get('framed_replay', True)
self.rewriterapp = RewriterApp(framed_replay, config=self.warcserver.config)
self.warcserver_server = GeventServer(self.warcserver, port=0)
self.static_handler = StaticHandler('pywb/static/')
@ -47,16 +46,21 @@ class FrontEndApp(object):
if self.is_valid_coll('$root'):
self.url_map.add(Rule('/', endpoint=self.serve_coll_page))
self.url_map.add(Rule('/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule('/cdx', endpoint=self.serve_cdx))
self.url_map.add(Rule('/<path:url>', endpoint=self.serve_content))
else:
self.url_map.add(Rule('/<coll>/', endpoint=self.serve_coll_page))
self.url_map.add(Rule('/<coll>/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule('/<coll>/cdx', endpoint=self.serve_cdx))
self.url_map.add(Rule('/<coll>/<path:url>', endpoint=self.serve_content))
self.url_map.add(Rule('/', endpoint=self.serve_home))
self.rewriterapp.paths = self.get_upstream_paths(self.warcserver_server.port)
upstream_paths = self.get_upstream_paths(self.warcserver_server.port)
self.rewriterapp = RewriterApp(framed_replay,
config=self.warcserver.config,
paths=upstream_paths)
self.templates_dir = self.warcserver.config.get('templates_dir', 'templates')
self.static_dir = self.warcserver.config.get('static_dir', 'static')
@ -65,8 +69,9 @@ class FrontEndApp(object):
self.metadata_cache = MetadataCache(metadata_templ)
def get_upstream_paths(self, port):
return {'replay-dyn': 'http://localhost:%s/_/resource/postreq?param.coll={coll}' % port,
'replay-fixed': 'http://localhost:%s/{coll}/resource/postreq' % port
return {
'replay': 'http://localhost:%s/{coll}/resource/postreq' % port,
'cdx-server': 'http://localhost:%s/{coll}/index' % port,
}
def serve_home(self, environ):
@ -98,17 +103,16 @@ class FrontEndApp(object):
self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
def get_metadata(self, coll):
metadata = {'coll': coll}
metadata = {'coll': coll,
'type': 'replay'}
if self.warcserver.config.get('use_js_obj_proxy'):
metadata['use_js_obj_proxy'] = True
if coll in self.warcserver.list_fixed_routes():
metadata.update(self.warcserver.get_coll_config(coll))
metadata['type'] = 'replay-fixed'
else:
metadata.update(self.metadata_cache.load(coll))
metadata['type'] = 'replay-dyn'
return metadata
@ -132,6 +136,22 @@ class FrontEndApp(object):
return WbResponse.text_response(content, content_type='text/html; charset="utf-8"')
def serve_cdx(self, environ, coll='$root'):
    """ Proxy a cdx-server api query for ``coll`` to the upstream warcserver.

    The upstream url is built from the 'cdx-server' template in
    ``self.rewriterapp.paths`` with ``coll`` substituted in, and the
    incoming query string (if any) is appended unchanged.

    :param dict environ: WSGI environ of the incoming request
    :param str coll: collection name substituted into the upstream url
    :return: a WbResponse streaming the upstream body with the upstream
        Content-Type, or a '400 Bad Request' text response describing the
        error if the upstream request raised
    """
    base_url = self.rewriterapp.paths['cdx-server']
    cdx_url = base_url.format(coll=coll)

    # Forward the caller's query string verbatim; pick the separator based
    # on whether the upstream template already carries a query.
    query = environ.get('QUERY_STRING')
    if query:
        cdx_url += '&' if '?' in cdx_url else '?'
        cdx_url += query

    try:
        res = requests.get(cdx_url, stream=True)
        return WbResponse.bin_stream(res.raw, content_type=res.headers.get('Content-Type'))
    except Exception as e:
        # Use the text_response factory so the error is actually reported
        # as a 400 rather than failing inside the handler itself.
        return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request')
def serve_content(self, environ, coll='$root', url='', timemap_output=''):
if not self.is_valid_coll(coll):
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))

View File

@ -48,11 +48,11 @@ class UpstreamException(WbException):
class RewriterApp(object):
VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
def __init__(self, framed_replay=False, jinja_env=None, config=None):
def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
self.loader = ArcWarcRecordLoader()
self.config = config or {}
self.paths = {}
self.paths = paths or {}
self.framed_replay = framed_replay

View File

@ -26,16 +26,20 @@ class BaseWarcServer(object):
self.url_map.add(Rule('/', endpoint=list_routes))
def add_route(self, path, handler):
def direct_input_request(environ, mode=''):
def add_route(self, path, handler, path_param_name=''):
def direct_input_request(environ, mode='', path_param_value=''):
params = self.get_query_dict(environ)
params['mode'] = mode
if path_param_value:
params[path_param_name] = path_param_value
params['_input_req'] = DirectWSGIInputRequest(environ)
return handler(params)
def post_fullrequest(environ, mode=''):
def post_fullrequest(environ, mode='', path_param_value=''):
params = self.get_query_dict(environ)
params['mode'] = mode
if path_param_value:
params[path_param_name] = path_param_value
params['_input_req'] = POSTInputRequest(environ)
return handler(params)

View File

@ -55,16 +55,14 @@ class WarcServer(BaseWarcServer):
super(WarcServer, self).__init__(debug=config.get('debug', False))
self.config = config
if self.config.get('enable_auto_colls', True):
auto_handler = self.load_auto_colls()
self.add_route('/_', auto_handler)
self.fixed_routes = self.load_colls()
for name, route in iteritems(self.fixed_routes):
self.add_route('/' + name, route)
self._add_simple_route('/<coll>-cdx', self.cdx_compat)
if self.config.get('enable_auto_colls', True):
auto_handler = self.load_auto_colls()
self.add_route('/<path:path_param_value>', auto_handler, path_param_name='param.coll')
def _lookup(self, environ, path):
urls = self.url_map.bind(environ['HTTP_HOST'], path_info=path)
@ -77,17 +75,6 @@ class WarcServer(BaseWarcServer):
print(e)
return None
def cdx_compat(self, environ, coll=''):
""" -cdx server api
"""
result = self._lookup(environ, '/{0}/index'.format(coll))
if result:
return result
environ['QUERY_STRING'] += '&param.coll=' + coll
result = self._lookup(environ, '/_/index')
return result
def load_auto_colls(self):
self.root_dir = self.config.get('collections_root', '')
if not self.root_dir:

View File

@ -3,6 +3,6 @@ debug: true
collections_root: _test_colls
collections:
'$root': $live
'$root': '$live'

View File

@ -24,7 +24,7 @@ class TestCDXApp(BaseTestClass):
def query(self, url, is_error=False, **params):
params['url'] = url
return self.testapp.get('/pywb-cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
return self.testapp.get('/pywb/index?' + urlencode(params, doseq=1), expect_errors=is_error)
def test_exact_url(self):
"""

View File

@ -428,15 +428,15 @@ class TestWbIntegration(BaseConfigTest):
resp = self.testapp.get('/static/notfound.css', status = 404)
assert resp.status_int == 404
def _test_cdx_server_filters(self):
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz')
self._assert_basic_text(resp)
def test_cdx_server_filters(self):
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mime:warc/revisit&filter=filename:dupes.warc.gz')
assert resp.content_type == 'text/x-cdxj'
actual_len = len(resp.text.rstrip().split('\n'))
assert actual_len == 1, actual_len
def _test_cdx_server_advanced(self):
def test_cdx_server_advanced(self):
# combine collapsing, reversing and revisit resolving
resp = self.testapp.get('/pywb-cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapseTime=11&resolveRevisits=true&reverse=true')
# convert back to CDXObject
cdxs = list(map(CDXObject, resp.body.rstrip().split(b'\n')))

View File

@ -37,3 +37,9 @@ class TestRootColl(BaseConfigTest):
resp = self.testapp.get('/')
assert 'Search' in resp.text
def test_root_cdx(self):
    """ Query the root collection's /cdx endpoint and check the single json result
    """
    resp = self.testapp.get('/cdx?url=http://www.iana.org/&output=json&limit=1')

    # NOTE(review): this is an assignment, not an assertion -- presumably it
    # forces a json content type so that resp.json will parse the body; confirm.
    resp.content_type = 'application/json'

    expected_fields = (
        ('is_live', 'true'),
        ('url', 'http://www.iana.org/'),
        ('source', '$root'),
    )
    for field, expected in expected_fields:
        assert resp.json[field] == expected