1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor: move configs/config.yaml to root again

remove cdx-server specific config, instead make cdx server api-only
path configurable from regular config
This commit is contained in:
Ilya Kreymer 2014-04-02 21:26:53 -07:00
parent 8bdafeb040
commit 80f2da9548
12 changed files with 78 additions and 67 deletions

View File

@ -4,14 +4,12 @@ pywb 0.2.2 changelist
* Generate cdx indexes via command-line `cdx-indexer` script. Optionally sorting, and output to either a single combined file or a file per-directory.
Refer to ``cdx-indexer -h`` for more info.
* Initial support for prefix url queries, eg: http://localhost:8080/pywb/*/http://example.com\* to query all captures from http://example.com
* Initial support for prefix url queries, eg: http://localhost:8080/pywb/\*/http://example.com\* to query all captures from http://example.com
* Support for optional LXML html-based parser for fastest possible parsing. If lxml is installed on the system (e.g. via ``pip install lxml``), the lxml parser is enabled by default.
(This can be turned off by setting ``use_lxml_parser: false`` in the config)
* Memento: TimeMaps in ``application/link-format`` provided via the ``/timemap/*/`` query.. eg: http://localhost:8080/pywb/timemap/\*/http://example.com
* Basic support for `Memento Protocol RFC7089 <http://www.mementoweb.org/guide/rfc/>`_ Memento, TimeGate and now TimeMaps.
* Full support for `Memento Protocol RFC7089 <http://www.mementoweb.org/guide/rfc/>`_ Memento, TimeGate and TimeMaps. Memento: TimeMaps in ``application/link-format`` provided via the ``/timemap/*/`` query, e.g.: http://localhost:8080/pywb/timemap/\*/http://example.com
* pywb now features new `domain-specific rules <https://github.com/ikreymer/pywb/blob/master/pywb/rules.yaml>`_ which are applied to resolve and render certain difficult and dynamic content, in order to make accurate web replay work.
This ruleset will be under further iteration to address further challenges as the web evolves.

View File

@ -132,7 +132,7 @@ Sample Setup
pywb is configurable via yaml.
The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs/config.yaml>`_ is roughly as follows:
The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ is roughly as follows:
::
@ -145,7 +145,7 @@ The simplest `config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs
This sets up pywb with a single route for collection /pywb
(The latest version of `config.yaml <https://github.com/ikreymer/pywb/blob/master/config.yaml>`_ contains
(The latest version of `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ contains
additional documentation and specifies all the optional properties, such
as ui filenames for Jinja2/html template files.)

View File

@ -43,7 +43,7 @@ Given an archive of warcs at ``myarchive/warcs``
2. Run ``cdx-indexer --sort myarchive/cdx myarchive/warcs`` to generate .cdx files for each
warc/arc file in ``myarchive/warcs``
3. Edit ``config.yaml`` to contain the following. You may replace ``pywb`` with
3. Edit `config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_ to contain the following. You may replace ``pywb`` with
a name of your choice -- it will be the path to your collection. (Multiple collections can be added
for different sets of .cdx files as well)
@ -71,7 +71,7 @@ If you already have .cdx files for your archive, you can skip the first two step
pywb recommends using `SURT <http://crawler.archive.org/articles/user_manual/glossary.html#surt>`_ (Sort-friendly URI Reordering Transform)
sorted urls and the ``cdx-indexer`` automatically generates indexes in this format.
However, pywb is compatible with regular url keyed indexs.
However, pywb is compatible with regular url keyed indexes also.
If you would like to use non-SURT ordered .cdx files, simply add this field to the config:
::
@ -112,7 +112,7 @@ Additional Documentation
~~~~~~~~~~~~~~~~~~~~~~~~
- For additional/up-to-date configuration details, consult the current
`config.yaml <https://github.com/ikreymer/pywb/blob/develop/configs/config.yaml>`_
`config.yaml <https://github.com/ikreymer/pywb/blob/develop/config.yaml>`_
- The `wiki <https://github.com/ikreymer/pywb/wiki>`_ will have
additional technical documentation about various aspects of pywb

View File

@ -98,9 +98,6 @@ enable_cdx_api: true
# set to false to disable
#domain_specific_rules: rules.yaml
# Permissions checker
#perms_checker: !!python/object/new:pywb.cdx.perms.AllowAllPerms []
# Memento support, enable
enable_memento: true

View File

@ -1,21 +1,18 @@
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.core.cdx_api_handler import create_cdx_server_app
#from pywb.core.cdx_api_handler import create_cdx_server_app
from pywb.core.pywb_init import create_cdx_server_app
#=================================================================
# init cdx server app
#=================================================================
# cdx-server only config
DEFAULT_CONFIG = 'pywb/configs/cdx-server-config.yaml'
application = init_app(create_cdx_server_app,
load_yaml=True,
config_file=DEFAULT_CONFIG)
load_yaml=True)
def main():
start_wsgi_server(application, 'CDX Server')
start_wsgi_server(application, 'CDX Server', default_port=8090)
if __name__ == "__main__":
main()

View File

@ -14,7 +14,7 @@ from pywb import get_test_dir
TEST_CDX_DIR = get_test_dir() + 'cdx/'
CDX_SERVER_URL = 'http://localhost/cdx'
CDX_SERVER_URL = 'http://localhost/pywb-cdx'
CDX_RESULT = [
('urlkey', 'com,example)/'),

View File

@ -1,3 +0,0 @@
#CDX Server WSGI App Config
index_paths: ./sample_archive/cdx/
port: 8090

View File

@ -49,17 +49,3 @@ class CDXAPIHandler(BaseHandler):
params['output'] = 'text'
return params
#=================================================================
def create_cdx_server_app(config):
    """
    Build the router for a cdx server, to be wrapped in a wsgi app.

    The cdx api is exposed through a single route named 'cdx', served
    by a CDXAPIHandler around the query handler created from ``config``.
    The router receives whatever ``config.get('port')`` returns as its
    port.

    TODO: more complex example with multiple collections?
    """
    handler = CDXAPIHandler(QueryHandler.init_from_config(config))
    return ArchivalRouter([Route('cdx', handler)], port=config.get('port'))

View File

@ -96,13 +96,67 @@ def create_wb_handler(query_handler, config, ds_rules_file=None):
wb_handler = wb_handler_class(
query_handler,
replayer,
#html_view=html_view,
search_view=search_view,
)
return wb_handler
#=================================================================
def init_collection(value, config):
    """
    Set up the per-collection config and query handler.

    ``value`` is the collection's entry from the 'collections' setting:
    either a dict of settings, or a plain string, which is treated as
    shorthand for ``{'index_paths': value}``.

    Returns a ``(route_config, query_handler, ds_rules_file)`` tuple.
    """
    # a bare string is shorthand for a collection with only index paths
    coll_settings = {'index_paths': value} if isinstance(value, str) else value

    # layer the collection settings with the global config
    # (presumably the first dict takes precedence -- confirm in DictChain)
    route_config = DictChain(coll_settings, config)

    ds_rules_file = route_config.get('domain_specific_rules', None)

    # note: the query template is read from the global config,
    # not from the per-collection route_config
    html_view = load_query_template(config.get('query_html'), 'Captures Page')

    query_handler = QueryHandler.init_from_config(
        route_config, ds_rules_file, html_view)

    return route_config, query_handler, ds_rules_file
#=================================================================
def add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler):
    """
    Append a cdx api route for the collection ``name`` to ``routes``.

    The route name is ``name`` plus a suffix: '-cdx' when
    ``cdx_api_suffix`` is a bool (either True or False), otherwise
    ``str(cdx_api_suffix)``. The route is served by a CDXAPIHandler
    wrapping ``query_handler``.
    """
    if isinstance(cdx_api_suffix, bool):
        suffix = '-cdx'
    else:
        suffix = str(cdx_api_suffix)

    cdx_route = Route(name + suffix, CDXAPIHandler(query_handler))
    routes.append(cdx_route)
#=================================================================
def create_cdx_server_app(passed_config):
    """
    Create a cdx server api-only app.

    ``passed_config`` is chained with DEFAULTS; for every entry in its
    'collections' setting a cdx api route is added. With the default
    'enable_cdx_api' value of True the route is ``<coll>-cdx``; a
    non-bool value is used as a custom suffix instead.

    Returns an ArchivalRouter holding only these cdx api routes.
    """
    config = DictChain(passed_config, DEFAULTS)
    routes = []

    for coll_name, coll_value in config.get('collections').iteritems():
        # the domain-specific rules file is not needed for an api-only app
        route_config, query_handler, _ = init_collection(coll_value, config)

        add_cdx_api_handler(coll_name,
                            route_config.get('enable_cdx_api', True),
                            routes,
                            query_handler)

    return ArchivalRouter(routes)
#=================================================================
def create_wb_router(passed_config={}):
@ -131,24 +185,9 @@ def create_wb_router(passed_config={}):
use_lxml_parser()
for name, value in collections.iteritems():
if isinstance(value, str):
value = {'index_paths': value}
route_config = DictChain(value, config)
ds_rules_file = route_config.get('domain_specific_rules', None)
#perms_policy = route_config.get('perms_policy', None)
#
#cdx_server = create_cdx_server(route_config,
# ds_rules_file)
#
html_view = load_query_template(config.get('query_html'),
'Captures Page')
query_handler = QueryHandler.init_from_config(route_config,
ds_rules_file,
html_view)
result = init_collection(value, config)
route_config, query_handler, ds_rules_file = result
wb_handler = create_wb_handler(
query_handler=query_handler,
@ -168,13 +207,7 @@ def create_wb_router(passed_config={}):
cdx_api_suffix = route_config.get('enable_cdx_api', False)
if cdx_api_suffix:
# if bool, use -cdx suffix, else use custom string
# as the suffix
if isinstance(cdx_api_suffix, bool):
cdx_api_suffix = '-cdx'
routes.append(Route(name + str(cdx_api_suffix),
CDXAPIHandler(query_handler)))
add_cdx_api_handler(name, cdx_api_suffix, routes, query_handler)
if config.get('debug_echo_env', False):
routes.append(Route('echo_env', DebugEchoEnvHandler()))

View File

@ -103,7 +103,7 @@ def handle_exception(env, wb_router, exc, print_trace):
status=status)
#=================================================================
DEFAULT_CONFIG_FILE = 'pywb/configs/config.yaml'
DEFAULT_CONFIG_FILE = 'config.yaml'
#=================================================================
@ -139,7 +139,7 @@ def init_app(init_func, load_yaml=True, config_file=None):
#=================================================================
def start_wsgi_server(the_app, name): # pragma: no cover
def start_wsgi_server(the_app, name, default_port=None): # pragma: no cover
from wsgiref.simple_server import make_server
from optparse import OptionParser
@ -154,6 +154,9 @@ def start_wsgi_server(the_app, name): # pragma: no cover
port = the_app.port
if not port:
if default_port:
port = default_port
else:
port = DEFAULT_PORT
logging.info('Starting %s on port %s', name, port)

View File

@ -54,7 +54,7 @@ setup(
'pywb.apps'
],
package_data={
'pywb': ['ui/*', 'static/*', '*.yaml', 'configs/*'],
'pywb': ['ui/*', 'static/*', '*.yaml'],
},
data_files=[
('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),

View File

@ -17,7 +17,7 @@ def client():
#================================================================
def query(client, url, is_error=False, **params):
params['url'] = url
return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
return client.get('/pywb-cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
#================================================================