mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
refactor: move configs/config.yaml to root again
remove cdx-server specific config, instead make cdx server api-only path configurable from regular config
This commit is contained in:
parent
8bdafeb040
commit
80f2da9548
@ -4,14 +4,12 @@ pywb 0.2.2 changelist
|
||||
* Generate cdx indexes via command-line `cdx-indexer` script. Optionally sorting, and output to either a single combined file or a file per-directory.
|
||||
Refer to ``cdx-indexer -h`` for more info.
|
||||
|
||||
* Initial support for prefix url queries, eg: http://localhost:8080/pywb/*/http://example.com\* to query all captures from http://example.com
|
||||
* Initial support for prefix url queries, eg: http://localhost:8080/pywb/\*/http://example.com\* to query all captures from http://example.com
|
||||
|
||||
* Support for optional LXML html-based parser for fastest possible parsing. If lxml is installed on the system (e.g. via ``pip install lxml``), the lxml parser is enabled by default.
|
||||
(This can be turned off by setting ``use_lxml_parser: false`` in the config)
|
||||
|
||||
* Memento: TimeMaps in ``application/link-format`` provided via the ``/timemap/*/`` query.. eg: http://localhost:8080/pywb/timemap/\*/http://example.com
|
||||
|
||||
* Basic support for `Memento Protocol RFC7089 <http://www.mementoweb.org/guide/rfc/>`_ Memento, TimeGate and now TimeMaps.
|
||||
* Full support for `Memento Protocol RFC7089 <http://www.mementoweb.org/guide/rfc/>`_ Memento, TimeGate and TimeMaps. Memento: TimeMaps in ``application/link-format`` are provided via the ``/timemap/*/`` query, e.g.: http://localhost:8080/pywb/timemap/\*/http://example.com
|
||||
|
||||
* pywb now features new `domain-specific rules <https://github.com/ikreymer/pywb/blob/master/pywb/rules.yaml>`_ which are applied to resolve and render certain difficult and dynamic content, in order to make accurate web replay work.
|
||||
This ruleset will be under further iteration to address further challenges as the web evolves.
|
||||
|
@ -132,7 +132,7 @@ Sample Setup
|
||||
|
||||
pywb is configurable via yaml.
|
||||
|
||||
The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs/config.yaml>`_ is roughly as follows:
|
||||
The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ is roughly as follows:
|
||||
|
||||
::
|
||||
|
||||
@ -145,7 +145,7 @@ The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs
|
||||
|
||||
This sets up pywb with a single route for collection /pywb
|
||||
|
||||
(The the latest version of `config.yaml <https://github.com/ikreymer/pywb/blob/master/config.yaml>`_ contains
|
||||
(The latest version of `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ contains
|
||||
additional documentation and specifies all the optional properties, such
|
||||
as ui filenames for Jinja2/html template files.)
|
||||
|
||||
|
@ -43,7 +43,7 @@ Given an archive of warcs at ``myarchive/warcs``
|
||||
2. Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each
|
||||
warc/arc file in ``myarchive/warcs``
|
||||
|
||||
3. Edit ``config.yaml`` to contain the following. You may replace ``pywb`` with
|
||||
3. Edit `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ to contain the following. You may replace ``pywb`` with
|
||||
a name of your choice -- it will be the path to your collection. (Multiple collections can be added
|
||||
for different sets of .cdx files as well)
|
||||
|
||||
@ -71,7 +71,7 @@ If you already have .cdx files for your archive, you can skip the first two step
|
||||
pywb recommends using `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_ (Sort-friendly URI Reordering Transform)
|
||||
sorted urls and the ``cdx-indexer`` automatically generates indexes in this format.
|
||||
|
||||
However, pywb is compatible with regular url keyed indexs.
|
||||
However, pywb is compatible with regular url keyed indexes also.
|
||||
If you would like to use non-SURT ordered .cdx files, simply add this field to the config:
|
||||
|
||||
::
|
||||
@ -112,7 +112,7 @@ Additional Documentation
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
- For additional/up-to-date configuration details, consult the current
|
||||
`config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs/config.yaml>`_
|
||||
`config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_
|
||||
|
||||
- The `wiki <https://github.com/ikreymer/pywb/wiki>`_ will have
|
||||
additional technical documentation about various aspects of pywb
|
||||
|
@ -98,9 +98,6 @@ enable_cdx_api: true
|
||||
# set to false to disable
|
||||
#domain_specific_rules: rules.yaml
|
||||
|
||||
# Permissions checker
|
||||
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
|
||||
|
||||
# Memento support, enable
|
||||
enable_memento: true
|
||||
|
@ -1,21 +1,18 @@
|
||||
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||
|
||||
from pywb.core.cdx_api_handler import create_cdx_server_app
|
||||
#from pywb.core.cdx_api_handler import create_cdx_server_app
|
||||
from pywb.core.pywb_init import create_cdx_server_app
|
||||
|
||||
#=================================================================
|
||||
# init cdx server app
|
||||
#=================================================================
|
||||
|
||||
# cdx-server only config
|
||||
DEFAULT_CONFIG = 'pywb/configs/cdx-server-config.yaml'
|
||||
|
||||
application = init_app(create_cdx_server_app,
|
||||
load_yaml=True,
|
||||
config_file=DEFAULT_CONFIG)
|
||||
load_yaml=True)
|
||||
|
||||
|
||||
def main():
|
||||
start_wsgi_server(application, 'CDX Server')
|
||||
start_wsgi_server(application, 'CDX Server', default_port=8090)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -14,7 +14,7 @@ from pywb import get_test_dir
|
||||
|
||||
TEST_CDX_DIR = get_test_dir() + 'cdx/'
|
||||
|
||||
CDX_SERVER_URL = 'http://localhost/cdx'
|
||||
CDX_SERVER_URL = 'http://localhost/pywb-cdx'
|
||||
|
||||
CDX_RESULT = [
|
||||
('urlkey', 'com,example)/'),
|
||||
|
@ -1,3 +0,0 @@
|
||||
#CDX Server WSGI App Config
|
||||
index_paths: ./sample_archive/cdx/
|
||||
port: 8090
|
@ -49,17 +49,3 @@ class CDXAPIHandler(BaseHandler):
|
||||
params['output'] = 'text'
|
||||
|
||||
return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_server_app(config):
|
||||
"""
|
||||
Create a cdx server config to be wrapped in a wsgi app
|
||||
Currently using single access point '/cdx' to expose the api
|
||||
TODO: more complex example with multiple collections?
|
||||
"""
|
||||
query_handler = QueryHandler.init_from_config(config)
|
||||
|
||||
port = config.get('port')
|
||||
routes = [Route('cdx', CDXAPIHandler(query_handler))]
|
||||
return ArchivalRouter(routes, port=port)
|
||||
|
@ -96,13 +96,67 @@ def create_wb_handler(query_handler, config, ds_rules_file=None):
|
||||
wb_handler = wb_handler_class(
|
||||
query_handler,
|
||||
replayer,
|
||||
#html_view=html_view,
|
||||
search_view=search_view,
|
||||
)
|
||||
|
||||
return wb_handler
|
||||
|
||||
|
||||
#=================================================================
|
||||
def init_collection(value, config):
|
||||
if isinstance(value, str):
|
||||
value = {'index_paths': value}
|
||||
|
||||
route_config = DictChain(value, config)
|
||||
|
||||
ds_rules_file = route_config.get('domain_specific_rules', None)
|
||||
|
||||
html_view = load_query_template(config.get('query_html'),
|
||||
'Captures Page')
|
||||
|
||||
query_handler = QueryHandler.init_from_config(route_config,
|
||||
ds_rules_file,
|
||||
html_view)
|
||||
|
||||
return route_config, query_handler, ds_rules_file
|
||||
|
||||
|
||||
#=================================================================
|
||||
def add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler):
|
||||
# if bool, use -cdx suffix, else use custom string
|
||||
# as the suffix
|
||||
if isinstance(cdx_api_suffix, bool):
|
||||
name += '-cdx'
|
||||
else:
|
||||
name += str(cdx_api_suffix)
|
||||
|
||||
routes.append(Route(name, CDXAPIHandler(query_handler)))
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_server_app(passed_config):
|
||||
"""
|
||||
Create a cdx server api-only app
|
||||
For each collection, create a /<coll>-cdx access point
|
||||
which follows the cdx api
|
||||
"""
|
||||
config = DictChain(passed_config, DEFAULTS)
|
||||
|
||||
collections = config.get('collections')
|
||||
|
||||
routes = []
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
result = init_collection(value, config)
|
||||
route_config, query_handler, ds_rules_file = result
|
||||
|
||||
cdx_api_suffix = route_config.get('enable_cdx_api', True)
|
||||
|
||||
add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler)
|
||||
|
||||
return ArchivalRouter(routes)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_wb_router(passed_config={}):
|
||||
|
||||
@ -131,24 +185,9 @@ def create_wb_router(passed_config={}):
|
||||
use_lxml_parser()
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
if isinstance(value, str):
|
||||
value = {'index_paths': value}
|
||||
|
||||
route_config = DictChain(value, config)
|
||||
|
||||
ds_rules_file = route_config.get('domain_specific_rules', None)
|
||||
|
||||
#perms_policy = route_config.get('perms_policy', None)
|
||||
#
|
||||
#cdx_server = create_cdx_server(route_config,
|
||||
# ds_rules_file)
|
||||
#
|
||||
html_view = load_query_template(config.get('query_html'),
|
||||
'Captures Page')
|
||||
|
||||
query_handler = QueryHandler.init_from_config(route_config,
|
||||
ds_rules_file,
|
||||
html_view)
|
||||
result = init_collection(value, config)
|
||||
route_config, query_handler, ds_rules_file = result
|
||||
|
||||
wb_handler = create_wb_handler(
|
||||
query_handler=query_handler,
|
||||
@ -168,13 +207,7 @@ def create_wb_router(passed_config={}):
|
||||
cdx_api_suffix = route_config.get('enable_cdx_api', False)
|
||||
|
||||
if cdx_api_suffix:
|
||||
# if bool, use -cdx suffix, else use custom string
|
||||
# as the suffix
|
||||
if isinstance(cdx_api_suffix, bool):
|
||||
cdx_api_suffix = '-cdx'
|
||||
|
||||
routes.append(Route(name + str(cdx_api_suffix),
|
||||
CDXAPIHandler(query_handler)))
|
||||
add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler)
|
||||
|
||||
if config.get('debug_echo_env', False):
|
||||
routes.append(Route('echo_env', DebugEchoEnvHandler()))
|
||||
|
@ -103,7 +103,7 @@ def handle_exception(env, wb_router, exc, print_trace):
|
||||
status=status)
|
||||
|
||||
#=================================================================
|
||||
DEFAULT_CONFIG_FILE = 'pywb/configs/config.yaml'
|
||||
DEFAULT_CONFIG_FILE = 'config.yaml'
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -139,7 +139,7 @@ def init_app(init_func, load_yaml=True, config_file=None):
|
||||
|
||||
|
||||
#=================================================================
|
||||
def start_wsgi_server(the_app, name): # pragma: no cover
|
||||
def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover
|
||||
from wsgiref.simple_server import make_server
|
||||
from optparse import OptionParser
|
||||
|
||||
@ -154,7 +154,10 @@ def start_wsgi_server(the_app, name): # pragma: no cover
|
||||
port = the_app.port
|
||||
|
||||
if not port:
|
||||
port = DEFAULT_PORT
|
||||
if default_port:
|
||||
port = default_port
|
||||
else:
|
||||
port = DEFAULT_PORT
|
||||
|
||||
logging.info('Starting %s on port %s', name, port)
|
||||
|
||||
|
2
setup.py
2
setup.py
@ -54,7 +54,7 @@ setup(
|
||||
'pywb.apps'
|
||||
],
|
||||
package_data={
|
||||
'pywb': ['ui/*', 'static/*', '*.yaml', 'configs/*'],
|
||||
'pywb': ['ui/*', 'static/*', '*.yaml'],
|
||||
},
|
||||
data_files=[
|
||||
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||
|
@ -17,7 +17,7 @@ def client():
|
||||
#================================================================
|
||||
def query(client, url, is_error=False, **params):
|
||||
params['url'] = url
|
||||
return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
||||
return client.get('/pywb-cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
||||
|
||||
|
||||
#================================================================
|
||||
|
Loading…
x
Reference in New Issue
Block a user