From 6f79840b798424c9d3e5c4cf40d50c03fe78f2c9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 27 Oct 2019 01:39:52 +0100 Subject: [PATCH] Docs, custom metadata improvements (#509) * metadata/coll_config: don't confuse user metadata with collection config, don't display collection config settings as metadata (ukwa/ukwa-pywb#47) - for collection template, add separate 'coll_config' dict, keep user metadata only in 'metadata' dict (default to empty) - for static collections, assume metadata is in the 'metadata' dict of collection config - for dynamic collections, load metadata.yaml into 'metadata' dict - ensure 'metadata' key is passed to frame_insert - ensure 'metadata' added consistently in framed and non-framed mode - tests: update tests to ensure metadata is added consistently - fuzzymatch: don't match 204 OPTIONS responses, update fuzzymatcher test * documentation - add documentation for metadata in ui-customization, rebuild docs, - add link to ui customization from configuring - work on access control docs * fixed small typo's in ui-customization.rst * frontendapp: fix doc string - misc: remove warning on urllib3 Retry init - set version to pywb 2.4.0rc0 Co-Authored-By: John Berlin --- README.rst | 10 +- docs/code/pywb.manager.rst | 8 ++ docs/code/pywb.rewrite.rst | 8 ++ docs/code/pywb.rst | 12 +++ docs/code/pywb.utils.rst | 8 ++ docs/code/pywb.warcserver.rst | 16 +++ docs/manual/access-control.rst | 148 ++++++++++++++++++++++++++ docs/manual/configuring.rst | 5 + docs/manual/ui-customization.rst | 62 +++++++---- pywb/apps/frontendapp.py | 37 ++++--- pywb/apps/rewriterapp.py | 3 +- pywb/templates/frame_insert.html | 1 + pywb/version.py | 2 +- pywb/warcserver/http.py | 5 +- pywb/warcserver/index/fuzzymatcher.py | 5 + tests/test_auto_colls.py | 8 ++ 16 files changed, 290 insertions(+), 48 deletions(-) create mode 100644 docs/manual/access-control.rst diff --git a/README.rst b/README.rst index 2e545a0e..e150bc98 100644 --- a/README.rst +++ b/README.rst @@ 
-1,4 +1,4 @@ -Webrecorder pywb 2.3 +Webrecorder pywb 2.4 ==================== .. image:: https://travis-ci.org/webrecorder/pywb.svg?branch=master @@ -41,6 +41,8 @@ The 2.x release included a major overhaul of pywb and introduces many new featur * Improved 'calendar' query UI, grouping results by year and month, and updated replay banner. +* New with 2.4: An extensible access control system. + Please see the `full documentation `_ for more detailed info on all these features. @@ -67,12 +69,6 @@ Contributions & Bug Reports Users are encouraged to fork and contribute to this project to keep improving web archiving tools. -A few key features are high on list of priorities, but have not yet been implemented, including: - -* Url Exclusion System - -* UI Improvements - If you are interested in contributing, especially to any of these areas, please let us know! Otherwise, please take a look at `list of current issues `_ and feel free to open new ones about any aspect of pywb, including the new documentation. diff --git a/docs/code/pywb.manager.rst b/docs/code/pywb.manager.rst index d2ef00c1..93bb3bdc 100644 --- a/docs/code/pywb.manager.rst +++ b/docs/code/pywb.manager.rst @@ -4,6 +4,14 @@ pywb\.manager package Submodules ---------- +pywb\.manager\.aclmanager module +-------------------------------- + +.. automodule:: pywb.manager.aclmanager + :members: + :undoc-members: + :show-inheritance: + pywb\.manager\.autoindex module ------------------------------- diff --git a/docs/code/pywb.rewrite.rst b/docs/code/pywb.rewrite.rst index 6e551540..914354df 100644 --- a/docs/code/pywb.rewrite.rst +++ b/docs/code/pywb.rewrite.rst @@ -100,6 +100,14 @@ pywb\.rewrite\.rewrite\_hls module :undoc-members: :show-inheritance: +pywb\.rewrite\.rewrite\_js\_workers module +------------------------------------------ + +.. 
automodule:: pywb.rewrite.rewrite_js_workers + :members: + :undoc-members: + :show-inheritance: + pywb\.rewrite\.rewriteinputreq module ------------------------------------- diff --git a/docs/code/pywb.rst b/docs/code/pywb.rst index 7344991b..7910f946 100644 --- a/docs/code/pywb.rst +++ b/docs/code/pywb.rst @@ -14,6 +14,18 @@ Subpackages pywb.utils pywb.warcserver +Submodules +---------- + +pywb\.version module +-------------------- + +.. automodule:: pywb.version + :members: + :undoc-members: + :show-inheritance: + + Module contents --------------- diff --git a/docs/code/pywb.utils.rst b/docs/code/pywb.utils.rst index aebce3a3..a228d07b 100644 --- a/docs/code/pywb.utils.rst +++ b/docs/code/pywb.utils.rst @@ -60,6 +60,14 @@ pywb\.utils\.memento module :undoc-members: :show-inheritance: +pywb\.utils\.merge module +------------------------- + +.. automodule:: pywb.utils.merge + :members: + :undoc-members: + :show-inheritance: + pywb\.utils\.wbexception module ------------------------------- diff --git a/docs/code/pywb.warcserver.rst b/docs/code/pywb.warcserver.rst index 0498fabe..f7595ed1 100644 --- a/docs/code/pywb.warcserver.rst +++ b/docs/code/pywb.warcserver.rst @@ -12,6 +12,22 @@ Subpackages Submodules ---------- +pywb\.warcserver\.access\_checker module +---------------------------------------- + +.. automodule:: pywb.warcserver.access_checker + :members: + :undoc-members: + :show-inheritance: + +pywb\.warcserver\.amf module +---------------------------- + +.. automodule:: pywb.warcserver.amf + :members: + :undoc-members: + :show-inheritance: + pywb\.warcserver\.basewarcserver module --------------------------------------- diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst new file mode 100644 index 00000000..f9dcc61f --- /dev/null +++ b/docs/manual/access-control.rst @@ -0,0 +1,148 @@ +.. 
_access-control: + +Access Control System +--------------------- + +The access control system allows for a flexible configuration of rules to allow, +block or exclude access to individual urls by longest-prefix match. + +Access Control Files (.aclj) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Access controls are set in one or more access control json files (.aclj), sorted in reverse alphabetical order. +To determine the best match, a binary search is used (similar to CDXJ lookup) and then the best match is found forward. + +An .aclj file may look as follows:: + + org,httpbin)/anything/something - {"access": "allow", "url": "http://httpbin.org/anything/something"} + org,httpbin)/anything - {"access": "exclude", "url": "http://httpbin.org/anything"} + org,httpbin)/ - {"access": "block", "url": "httpbin.org/"} + com, - {"access": "allow", "url": "com,"} + + +Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any). + +The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later) + +Given these rules, a user would: +* be allowed to visit ``http://httpbin.org/anything/something`` (allow) +* but would receive an 'access blocked' error message when viewing ``http://httpbin.org/`` (block) +* would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude) + + +Access Types: allow, block, exclude +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The available access types are as follows: + +- ``exclude`` - when matched, results are excluded from the index, as if they do not exist. User will receive a 404. +- ``block`` - when matched, results are not excluded from the index, marked with ``access: block``, but access to the actual resource is blocked. User will see a 451 error. +- ``allow`` - full access to the index and the resource. 
+ +The difference between ``exclude`` and ``block`` is that when blocked, the user can be notified that access is blocked, while +with exclude, no trace of the resource is presented to the user. + +The use of ``allow`` is useful to provide access to more specific resources within a broader block/exclude rule. + +Access Error Messages +^^^^^^^^^^^^^^^^^^^^^ + +The special error code 451 is used to indicate that a resource has been blocked (access setting ``block``) + +The `error.html <https://github.com/webrecorder/pywb/blob/master/pywb/templates/error.html>`_ template contains a special message for this access and can be customized further. + +By design, resources that are ``exclude``-ed simply appear as 404 not found and no special error is provided. + + +Managing Access Lists via Command-Line +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The .aclj files need not ever be added or edited manually. + +The pywb ``wb-manager`` utility has been extended to provide tools for adding, removing and checking access control rules. + +The access rules are written to ``<collection>/acl/access-rules.aclj`` for a given collection ``<collection>`` for automatic collections. + +For example, to add the first line to an ACL file ``access.aclj``, one could run:: + + wb-manager acl add http://httpbin.org/anything/something exclude + + +The URL supplied can be a URL or a SURT prefix. If a SURT is supplied, it is used as is:: + + wb-manager acl add com, allow + + +By default, access control rules apply to a prefix of a given URL or SURT. + +To have the rule apply only to the exact match, use:: + + wb-manager acl add http://httpbin.org/anything/something allow --exact-match + +Rules added with and without the ``--exact-match`` flag are considered distinct rules, and can be added +and removed separately. + +With the above rules, ``http://httpbin.org/anything/something`` would be allowed, but +``http://httpbin.org/anything/something/subpath`` would be excluded for any ``subpath``. 
+ +To remove a rule, one can run:: + + wb-manager acl remove http://httpbin.org/anything/something + +To import rules in bulk, such as from an OpenWayback-style excludes.txt and mark them as ``exclude``:: + + wb-manager acl importtxt ./excludes.txt exclude + + +See ``wb-manager acl -h`` for a list of additional commands such as for validating rules files and running a match against +an existing rule set. + + + +Access Controls for Custom Collections +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For manually configured collections, there are additional options for configuring access controls. +The access control files can be specified explicitly using the ``acl_paths`` key, which allows specifying multiple ACL files +and sharing access control files between different collections. + +Single ACLJ:: + + collections: + test: + acl_paths: ./path/to/file.aclj + default_access: block + + + +Multiple ACLJ:: + + collections: + test: + acl_paths: + - ./path/to/allows.aclj + - ./path/to/blocks.aclj + - ./path/to/other.aclj + - ./path/to/directory + + default_access: block + +The ``acl_paths`` can be a single entry or a list, and can also include directories. If a directory is specified, all ``.aclj`` files +in the directory are checked. + +When finding the best rule from multiple ``.aclj`` files, each file is binary searched and the result +set merge-sorted to find the best match (very similar to the CDXJ index lookup). + +Note: It might make sense to separate ``allows.aclj`` and ``blocks.aclj`` into individual files for organizational reasons, +but there is no specific need to keep more than one access control file. + +Default Access +^^^^^^^^^^^^^^ + +An additional ``default_access`` setting can be added to specify the default rule if no other rules match for custom collections. +If omitted, this setting is ``default_access: allow``, which is usually the desired default. 
+ +Setting ``default_access: block`` and providing a list of ``allow`` rules provides a flexible way to allow access +to only a limited set of resources, and block access to anything out of scope by default. + + diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index 8d568436..0fb47f54 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -105,6 +105,11 @@ When resolving a ``example.warc.gz``, pywb will then check (in order): * Then, ``http://remote-backup.example.com/collections//example.warc.gz`` (if first lookup unsuccessful) +UI Customizations +----------------- + +See :ref:`ui-customizations` for more details on how to customize the UI. + Special and Custom Collections ------------------------------ diff --git a/docs/manual/ui-customization.rst b/docs/manual/ui-customization.rst index c61ca9fd..49f1e9a9 100644 --- a/docs/manual/ui-customization.rst +++ b/docs/manual/ui-customization.rst @@ -1,34 +1,26 @@ -.. _configuring-pywb-ui: +.. _ui-customizations: UI Customizations ----------------- pywb supports UI customizations, either for an entire archive, -or per-collection. - -Static Files -^^^^^^^^^^^^ - -The replay server will automatically support static files placed under the following directories: - -* Files under the root ``static`` directory can be accessed via ``http://my-archive.example.com/static/`` - -* Files under the per-collection ``./collections//static`` directory can be accessed via ``http://my-archive.example.com/static/_//`` +or per-collection. Jinja2 templates are used for rendering all views, +and static files can also be added as needed. Templates ^^^^^^^^^ -pywb users Jinja2 templates to render HTML to render the HTML for all aspects of the application. +Default templates, listed below, are found in the ``./pywb/templates/`` directory. -A version placed in the ``templates`` directory, either in the root or per collection, will override that template. 
+ +Custom template files placed in the ``templates`` directory, either in the root or per collection, will override that template. -To copy the default pywb template to the template directory run: +To copy the default pywb template to the template directory using the cli tools, run: ``wb-manager template --add search_html`` -The following templates are available: +The following page-level templates are available, corresponding to home page, collection page or search results: - * ``home.html`` -- Home Page Template, used for ``http://my-archive.example.com/`` + * ``index.html`` -- Home Page Template, used for ``http://my-archive.example.com/`` * ``search.html`` -- Collection Template, used for each collection page ``http://my-archive.example.com//`` @@ -50,8 +42,8 @@ Replay and Banner templates: * ``banner.html`` -- The banner used for frameless replay. Can be set to blank to disable the banner. -For those looking to customize the default template(s) when deploying pywb, the following templates located in the -pywb/templates directory. +To customize the default pywb UI across multiple pages, the following generic templates +can also be overridden: * ``base.html`` -- The base template used for non-replay related pages. @@ -74,6 +66,40 @@ The ``base.html`` template also provides five blocks that can be supplied by tem * ``footer`` -- Block for adding content to the ```` after the ``body`` block, includes the ``footer.html`` template +Static Files +^^^^^^^^^^^^ + +The pywb server will automatically support static files placed under the following directories: + +* Files under the root ``static`` directory can be accessed via ``http://my-archive.example.com/static/`` + +* Files under the per-collection ``./collections//static`` directory can be accessed via ``http://my-archive.example.com/static/_//`` + + +Custom Metadata +^^^^^^^^^^^^^^^ + +It is possible to also add custom metadata that will be available in the Jinja2 template. 
+ +For dynamic collections, any fields placed in the ``/metadata.yaml`` file can be accessed + +via the ``{{ metadata }}`` variable. + +For example, if the metadata file contains: + +.. ex-block:: yaml + + somedata: value + +Accessing ``{{ metadata.somedata }}`` will resolve to ``value`` + +The metadata can also be added via commandline: ``wb-manager metadata myCollection --set somedata=value`` + + + +The default collection UI template (search.html) currently lists all of the available metadata fields. + + Custom Outer Replay Frame ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 9068b4ee..1bc4c5f4 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -289,25 +289,22 @@ class FrontEndApp(object): except Exception: self.raise_not_found(environ, 'static_file_not_found', filepath) - def get_metadata(self, coll): - """Retrieve the metadata associated with a collection + def get_coll_config(self, coll): + """Retrieve the collection config, including metadata, associated with a collection - :param str coll: The name of the collection to receive metadata for - :return: The collections metadata if it exists + :param str coll: The name of the collection to receive config info for + :return: The collections config :rtype: dict """ - # if coll == self.all_coll: - # coll = '*' - - metadata = {'coll': coll, - 'type': 'replay'} + coll_config = {'coll': coll, + 'type': 'replay'} if coll in self.warcserver.list_fixed_routes(): - metadata.update(self.warcserver.get_coll_config(coll)) + coll_config.update(self.warcserver.get_coll_config(coll)) else: - metadata.update(self.metadata_cache.load(coll)) + coll_config['metadata'] = self.metadata_cache.load(coll) or {} - return metadata + return coll_config def serve_coll_page(self, environ, coll='$root'): """Render and serve a collections search page (search.html). 
@@ -322,7 +319,8 @@ class FrontEndApp(object): self.setup_paths(environ, coll) - metadata = self.get_metadata(coll) + coll_config = self.get_coll_config(coll) + metadata = coll_config.get('metadata') view = BaseInsertView(self.rewriterapp.jinja_env, 'search.html') @@ -332,8 +330,9 @@ class FrontEndApp(object): content = view.render_to_string(environ, wb_prefix=wb_prefix, - metadata=metadata, - coll=coll) + coll=coll, + coll_config=coll_config, + metadata=metadata) return WbResponse.text_response(content, content_type='text/html; charset="utf-8"') @@ -409,16 +408,16 @@ class FrontEndApp(object): if environ.get('QUERY_STRING'): wb_url_str += '?' + environ.get('QUERY_STRING') - metadata = self.get_metadata(coll) + coll_config = self.get_coll_config(coll) if record: - metadata['type'] = 'record' + coll_config['type'] = 'record' if timemap_output: - metadata['output'] = timemap_output + coll_config['output'] = timemap_output # ensure that the timemap path information is not included wb_url_str = wb_url_str.replace('timemap/{0}/'.format(timemap_output), '') - return self.rewriterapp.render_content(wb_url_str, metadata, environ) + return self.rewriterapp.render_content(wb_url_str, coll_config, environ) def setup_paths(self, environ, coll, record=False): """Populates the WSGI environment dictionary with the path information necessary to perform a response for diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 7810f214..600bade2 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -493,6 +493,7 @@ class RewriterApp(object): framed_replay, coll=kwargs.get('coll', ''), replay_mod=self.replay_mod, + metadata=kwargs.get('metadata', {}), config=self.config)) cookie_rewriter = None @@ -835,7 +836,7 @@ class RewriterApp(object): pass def get_top_frame_params(self, wb_url, kwargs): - return None + return {'metadata': kwargs.get('metadata', {})} def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs): if 
kwargs.get('output'): diff --git a/pywb/templates/frame_insert.html b/pywb/templates/frame_insert.html index 1f7e74e1..169b1704 100644 --- a/pywb/templates/frame_insert.html +++ b/pywb/templates/frame_insert.html @@ -18,6 +18,7 @@ html, body +
diff --git a/pywb/version.py b/pywb/version.py index 86f56f5b..9cd4096b 100644 --- a/pywb/version.py +++ b/pywb/version.py @@ -1,4 +1,4 @@ -__version__ = '2.4.0' +__version__ = '2.4.0rc0' if __name__ == '__main__': print(__version__) diff --git a/pywb/warcserver/http.py b/pywb/warcserver/http.py index 4c2aa8c3..68e8711a 100644 --- a/pywb/warcserver/http.py +++ b/pywb/warcserver/http.py @@ -4,6 +4,7 @@ import requests import six.moves.http_client from requests.adapters import DEFAULT_POOLBLOCK, HTTPAdapter from urllib3.poolmanager import PoolManager +from urllib3.util.retry import Retry six.moves.http_client._MAXHEADERS = 10000 six.moves.http_client._MAXLINE = 131072 @@ -41,8 +42,8 @@ class PywbHttpAdapter(HTTPAdapter): # ============================================================================= class DefaultAdapters(object): - live_adapter = PywbHttpAdapter(max_retries=3) - remote_adapter = PywbHttpAdapter(max_retries=3) + live_adapter = PywbHttpAdapter(max_retries=Retry(3)) + remote_adapter = PywbHttpAdapter(max_retries=Retry(3)) requests.packages.urllib3.disable_warnings() diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index d47d00ab..618b64f2 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -194,6 +194,11 @@ class FuzzyMatcher(object): check_query = False url_no_query, ext = self.get_ext(url) + # don't fuzzy match to 204 + if cdx.get('status') == '204': + if '__pywb_method=options' in cdx['urlkey']: + return False + # check ext if ext and ext not in self.default_filters['not_exts']: check_query = True diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index b734907c..61efc764 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -283,6 +283,7 @@ class TestManagedColls(CollsDirMixin, BaseConfigTest): with open(banner_file, 'w+b') as fh: fh.write(b'
Custom Banner Here!
') + fh.write(b'\n{{ metadata | tojson }}') def test_add_custom_banner_replay(self, fmod): resp = self.get('/test/20140103030321/http://example.com/?example=1', fmod) @@ -314,6 +315,13 @@ class TestManagedColls(CollsDirMixin, BaseConfigTest): assert 'overriden search page: ' in resp.text assert '"some":"value"' in resp.text + def test_replay_banner_metadata(self, fmod): + """ Test adding metadata in replay banner (both framed and non-frame) + """ + resp = self.get('/test/20140103030321{0}/http://example.com/?example=1', fmod) + assert '
Custom Banner Here!
' in resp.text + assert '"some":"value"' in resp.text + def test_more_custom_templates_replay(self, fmod): resp = self.get('/test/20140103030321{0}/http://example.com/?example=1', fmod) assert resp.status_int == 200