mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor: move configs/config.yaml to root again
remove cdx-server specific config, instead make cdx server api-only path configurable from regular config
This commit is contained in:
parent
8bdafeb040
commit
80f2da9548
@ -4,14 +4,12 @@ pywb 0.2.2 changelist
|
|||||||
* Generate cdx indexes via command-line `cdx-indexer` script. Optionally sorting, and output to either a single combined file or a file per-directory.
|
* Generate cdx indexes via command-line `cdx-indexer` script. Optionally sorting, and output to either a single combined file or a file per-directory.
|
||||||
Refer to ``cdx-indexer -h`` for more info.
|
Refer to ``cdx-indexer -h`` for more info.
|
||||||
|
|
||||||
* Initial support for prefix url queries, eg: http://localhost:8080/pywb/*/http://example.com\* to query all captures from http://example.com
|
* Initial support for prefix url queries, eg: http://localhost:8080/pywb/\*/http://example.com\* to query all captures from http://example.com
|
||||||
|
|
||||||
* Support for optional LXML html-based parser for fastest possible parsing. If lxml is installed on the system (e.g. via ``pip install lxml``), the lxml parser is enabled by default.
|
* Support for optional LXML html-based parser for fastest possible parsing. If lxml is installed on the system (e.g. via ``pip install lxml``), the lxml parser is enabled by default.
|
||||||
(This can be turned off by setting ``use_lxml_parser: false`` in the config)
|
(This can be turned off by setting ``use_lxml_parser: false`` in the config)
|
||||||
|
|
||||||
* Memento: TimeMaps in ``application/link-format`` provided via the ``/timemap/*/`` query.. eg: http://localhost:8080/pywb/timemap/\*/http://example.com
|
* Full support for `Memento Protocol RFC7089 <http://www.mementoweb.org/guide/rfc/>`_ Memento, TimeGate and TimeMaps. Memento: TimeMaps in ``application/link-format`` provided via the ``/timemap/*/`` query, e.g.: http://localhost:8080/pywb/timemap/\*/http://example.com
|
||||||
|
|
||||||
* Basic support for `Memento Protocol RFC7089 <http://www.mementoweb.org/guide/rfc/>`_ Memento, TimeGate and now TimeMaps.
|
|
||||||
|
|
||||||
* pywb now features new `domain-specific rules <https://github.com/ikreymer/pywb/blob/master/pywb/rules.yaml>`_ which are applied to resolve and render certain difficult and dynamic content, in order to make accurate web replay work.
|
* pywb now features new `domain-specific rules <https://github.com/ikreymer/pywb/blob/master/pywb/rules.yaml>`_ which are applied to resolve and render certain difficult and dynamic content, in order to make accurate web replay work.
|
||||||
This ruleset will be under further iteration to address further challenges as the web evolves.
|
This ruleset will be under further iteration to address further challenges as the web evolves.
|
||||||
|
@ -132,7 +132,7 @@ Sample Setup
|
|||||||
|
|
||||||
pywb is configurable via yaml.
|
pywb is configurable via yaml.
|
||||||
|
|
||||||
The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs/config.yaml>`_ is roughly as follows:
|
The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ is roughly as follows:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
@ -145,7 +145,7 @@ The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs
|
|||||||
|
|
||||||
This sets up pywb with a single route for collection /pywb
|
This sets up pywb with a single route for collection /pywb
|
||||||
|
|
||||||
(The latest version of `config.yaml <https://github.com/ikreymer/pywb/blob/master/config.yaml>`_ contains
|
(The latest version of `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ contains
|
||||||
additional documentation and specifies all the optional properties, such
|
additional documentation and specifies all the optional properties, such
|
||||||
as ui filenames for Jinja2/html template files.)
|
as ui filenames for Jinja2/html template files.)
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ Given an archive of warcs at ``myarchive/warcs``
|
|||||||
2. Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each
|
2. Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each
|
||||||
warc/arc file in ``myarchive/warcs``
|
warc/arc file in ``myarchive/warcs``
|
||||||
|
|
||||||
3. Edit ``config.yaml`` to contain the following. You may replace ``pywb`` with
|
3. Edit `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ to contain the following. You may replace ``pywb`` with
|
||||||
a name of your choice -- it will be the path to your collection. (Multiple collections can be added
|
a name of your choice -- it will be the path to your collection. (Multiple collections can be added
|
||||||
for different sets of .cdx files as well)
|
for different sets of .cdx files as well)
|
||||||
|
|
||||||
@ -71,7 +71,7 @@ If you already have .cdx files for your archive, you can skip the first two step
|
|||||||
pywb recommends using `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_ (Sort-friendly URI Reordering Transform)
|
pywb recommends using `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_ (Sort-friendly URI Reordering Transform)
|
||||||
sorted urls and the ``cdx-indexer`` automatically generates indexes in this format.
|
sorted urls and the ``cdx-indexer`` automatically generates indexes in this format.
|
||||||
|
|
||||||
However, pywb is compatible with regular url keyed indexs.
|
However, pywb is compatible with regular url keyed indexes also.
|
||||||
If you would like to use non-SURT ordered .cdx files, simply add this field to the config:
|
If you would like to use non-SURT ordered .cdx files, simply add this field to the config:
|
||||||
|
|
||||||
::
|
::
|
||||||
@ -112,7 +112,7 @@ Additional Documentation
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
- For additional/up-to-date configuration details, consult the current
|
- For additional/up-to-date configuration details, consult the current
|
||||||
`config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs/config.yaml>`_
|
`config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_
|
||||||
|
|
||||||
- The `wiki <https://github.com/ikreymer/pywb/wiki>`_ will have
|
- The `wiki <https://github.com/ikreymer/pywb/wiki>`_ will have
|
||||||
additional technical documentation about various aspects of pywb
|
additional technical documentation about various aspects of pywb
|
||||||
|
@ -98,9 +98,6 @@ enable_cdx_api: true
|
|||||||
# set to false to disable
|
# set to false to disable
|
||||||
#domain_specific_rules: rules.yaml
|
#domain_specific_rules: rules.yaml
|
||||||
|
|
||||||
# Permissions checker
|
|
||||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
|
||||||
|
|
||||||
# Memento support, enable
|
# Memento support, enable
|
||||||
enable_memento: true
|
enable_memento: true
|
||||||
|
|
@ -1,21 +1,18 @@
|
|||||||
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||||
|
|
||||||
from pywb.core.cdx_api_handler import create_cdx_server_app
|
#from pywb.core.cdx_api_handler import create_cdx_server_app
|
||||||
|
from pywb.core.pywb_init import create_cdx_server_app
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# init cdx server app
|
# init cdx server app
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
|
||||||
# cdx-server only config
|
|
||||||
DEFAULT_CONFIG = 'pywb/configs/cdx-server-config.yaml'
|
|
||||||
|
|
||||||
application = init_app(create_cdx_server_app,
|
application = init_app(create_cdx_server_app,
|
||||||
load_yaml=True,
|
load_yaml=True)
|
||||||
config_file=DEFAULT_CONFIG)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
start_wsgi_server(application, 'CDX Server')
|
start_wsgi_server(application, 'CDX Server', default_port=8090)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
@ -14,7 +14,7 @@ from pywb import get_test_dir
|
|||||||
|
|
||||||
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
||||||
|
|
||||||
CDX_SERVER_URL = 'http://localhost/cdx'
|
CDX_SERVER_URL = 'http://localhost/pywb-cdx'
|
||||||
|
|
||||||
CDX_RESULT = [
|
CDX_RESULT = [
|
||||||
('urlkey', 'com,example)/'),
|
('urlkey', 'com,example)/'),
|
||||||
|
@ -1,3 +0,0 @@
|
|||||||
#CDX Server WSGI App Config
|
|
||||||
index_paths: ./sample_archive/cdx/
|
|
||||||
port: 8090
|
|
@ -49,17 +49,3 @@ class CDXAPIHandler(BaseHandler):
|
|||||||
params['output'] = 'text'
|
params['output'] = 'text'
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def create_cdx_server_app(config):
|
|
||||||
"""
|
|
||||||
Create a cdx server config to be wrapped in a wsgi app
|
|
||||||
Currently using single access point '/cdx' to expose the api
|
|
||||||
TODO: more complex example with multiple collections?
|
|
||||||
"""
|
|
||||||
query_handler = QueryHandler.init_from_config(config)
|
|
||||||
|
|
||||||
port = config.get('port')
|
|
||||||
routes = [Route('cdx', CDXAPIHandler(query_handler))]
|
|
||||||
return ArchivalRouter(routes, port=port)
|
|
||||||
|
@ -96,13 +96,67 @@ def create_wb_handler(query_handler, config, ds_rules_file=None):
|
|||||||
wb_handler = wb_handler_class(
|
wb_handler = wb_handler_class(
|
||||||
query_handler,
|
query_handler,
|
||||||
replayer,
|
replayer,
|
||||||
#html_view=html_view,
|
|
||||||
search_view=search_view,
|
search_view=search_view,
|
||||||
)
|
)
|
||||||
|
|
||||||
return wb_handler
|
return wb_handler
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def init_collection(value, config):
|
||||||
|
if isinstance(value, str):
|
||||||
|
value = {'index_paths': value}
|
||||||
|
|
||||||
|
route_config = DictChain(value, config)
|
||||||
|
|
||||||
|
ds_rules_file = route_config.get('domain_specific_rules', None)
|
||||||
|
|
||||||
|
html_view = load_query_template(config.get('query_html'),
|
||||||
|
'Captures Page')
|
||||||
|
|
||||||
|
query_handler = QueryHandler.init_from_config(route_config,
|
||||||
|
ds_rules_file,
|
||||||
|
html_view)
|
||||||
|
|
||||||
|
return route_config, query_handler, ds_rules_file
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler):
|
||||||
|
# if bool, use -cdx suffix, else use custom string
|
||||||
|
# as the suffix
|
||||||
|
if isinstance(cdx_api_suffix, bool):
|
||||||
|
name += '-cdx'
|
||||||
|
else:
|
||||||
|
name += str(cdx_api_suffix)
|
||||||
|
|
||||||
|
routes.append(Route(name, CDXAPIHandler(query_handler)))
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def create_cdx_server_app(passed_config):
|
||||||
|
"""
|
||||||
|
Create a cdx server api-only app
|
||||||
|
For each collection, create a /<coll>-cdx access point
|
||||||
|
which follows the cdx api
|
||||||
|
"""
|
||||||
|
config = DictChain(passed_config, DEFAULTS)
|
||||||
|
|
||||||
|
collections = config.get('collections')
|
||||||
|
|
||||||
|
routes = []
|
||||||
|
|
||||||
|
for name, value in collections.iteritems():
|
||||||
|
result = init_collection(value, config)
|
||||||
|
route_config, query_handler, ds_rules_file = result
|
||||||
|
|
||||||
|
cdx_api_suffix = route_config.get('enable_cdx_api', True)
|
||||||
|
|
||||||
|
add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler)
|
||||||
|
|
||||||
|
return ArchivalRouter(routes)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def create_wb_router(passed_config={}):
|
def create_wb_router(passed_config={}):
|
||||||
|
|
||||||
@ -131,24 +185,9 @@ def create_wb_router(passed_config={}):
|
|||||||
use_lxml_parser()
|
use_lxml_parser()
|
||||||
|
|
||||||
for name, value in collections.iteritems():
|
for name, value in collections.iteritems():
|
||||||
if isinstance(value, str):
|
|
||||||
value = {'index_paths': value}
|
|
||||||
|
|
||||||
route_config = DictChain(value, config)
|
result = init_collection(value, config)
|
||||||
|
route_config, query_handler, ds_rules_file = result
|
||||||
ds_rules_file = route_config.get('domain_specific_rules', None)
|
|
||||||
|
|
||||||
#perms_policy = route_config.get('perms_policy', None)
|
|
||||||
#
|
|
||||||
#cdx_server = create_cdx_server(route_config,
|
|
||||||
# ds_rules_file)
|
|
||||||
#
|
|
||||||
html_view = load_query_template(config.get('query_html'),
|
|
||||||
'Captures Page')
|
|
||||||
|
|
||||||
query_handler = QueryHandler.init_from_config(route_config,
|
|
||||||
ds_rules_file,
|
|
||||||
html_view)
|
|
||||||
|
|
||||||
wb_handler = create_wb_handler(
|
wb_handler = create_wb_handler(
|
||||||
query_handler=query_handler,
|
query_handler=query_handler,
|
||||||
@ -168,13 +207,7 @@ def create_wb_router(passed_config={}):
|
|||||||
cdx_api_suffix = route_config.get('enable_cdx_api', False)
|
cdx_api_suffix = route_config.get('enable_cdx_api', False)
|
||||||
|
|
||||||
if cdx_api_suffix:
|
if cdx_api_suffix:
|
||||||
# if bool, use -cdx suffix, else use custom string
|
add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler)
|
||||||
# as the suffix
|
|
||||||
if isinstance(cdx_api_suffix, bool):
|
|
||||||
cdx_api_suffix = '-cdx'
|
|
||||||
|
|
||||||
routes.append(Route(name + str(cdx_api_suffix),
|
|
||||||
CDXAPIHandler(query_handler)))
|
|
||||||
|
|
||||||
if config.get('debug_echo_env', False):
|
if config.get('debug_echo_env', False):
|
||||||
routes.append(Route('echo_env', DebugEchoEnvHandler()))
|
routes.append(Route('echo_env', DebugEchoEnvHandler()))
|
||||||
|
@ -103,7 +103,7 @@ def handle_exception(env, wb_router, exc, print_trace):
|
|||||||
status=status)
|
status=status)
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
DEFAULT_CONFIG_FILE = 'pywb/configs/config.yaml'
|
DEFAULT_CONFIG_FILE = 'config.yaml'
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -139,7 +139,7 @@ def init_app(init_func, load_yaml=True, config_file=None):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def start_wsgi_server(the_app, name): # pragma: no cover
|
def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover
|
||||||
from wsgiref.simple_server import make_server
|
from wsgiref.simple_server import make_server
|
||||||
from optparse import OptionParser
|
from optparse import OptionParser
|
||||||
|
|
||||||
@ -154,7 +154,10 @@ def start_wsgi_server(the_app, name): # pragma: no cover
|
|||||||
port = the_app.port
|
port = the_app.port
|
||||||
|
|
||||||
if not port:
|
if not port:
|
||||||
port = DEFAULT_PORT
|
if default_port:
|
||||||
|
port = default_port
|
||||||
|
else:
|
||||||
|
port = DEFAULT_PORT
|
||||||
|
|
||||||
logging.info('Starting %s on port %s', name, port)
|
logging.info('Starting %s on port %s', name, port)
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@ -54,7 +54,7 @@ setup(
|
|||||||
'pywb.apps'
|
'pywb.apps'
|
||||||
],
|
],
|
||||||
package_data={
|
package_data={
|
||||||
'pywb': ['ui/*', 'static/*', '*.yaml', 'configs/*'],
|
'pywb': ['ui/*', 'static/*', '*.yaml'],
|
||||||
},
|
},
|
||||||
data_files=[
|
data_files=[
|
||||||
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||||
|
@ -17,7 +17,7 @@ def client():
|
|||||||
#================================================================
|
#================================================================
|
||||||
def query(client, url, is_error=False, **params):
|
def query(client, url, is_error=False, **params):
|
||||||
params['url'] = url
|
params['url'] = url
|
||||||
return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
return client.get('/pywb-cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
||||||
|
|
||||||
|
|
||||||
#================================================================
|
#================================================================
|
||||||
|
Loading…
x
Reference in New Issue
Block a user