From 8d98b9111eaf8b5a5232e6bf0c38340602acaebd Mon Sep 17 00:00:00 2001
From: John Berlin <n0tan3rd@gmail.com>
Date: Wed, 10 Apr 2019 14:00:53 -0400
Subject: [PATCH] added additional code documentation in order to meet the
 documentation requirements of pywb

---
 pywb/apps/frontendapp.py          | 27 ++++++++--
 pywb/apps/rewriterapp.py          | 26 ++++++++++
 pywb/utils/loaders.py             | 14 +++++
 pywb/utils/memento.py             | 20 ++++++++
 pywb/utils/wbexception.py         | 70 ++++++++++++++++++++++++-
 pywb/warcserver/access_checker.py | 85 +++++++++++++++++++++++++++++--
 6 files changed, 231 insertions(+), 11 deletions(-)

diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py
index cba2dff6..124eeae5 100644
--- a/pywb/apps/frontendapp.py
+++ b/pywb/apps/frontendapp.py
@@ -1,12 +1,10 @@
 from gevent.monkey import patch_all; patch_all()
 
-#from bottle import run, Bottle, request, response, debug
 from werkzeug.routing import Map, Rule
 from werkzeug.exceptions import HTTPException, NotFound
 from werkzeug.wsgi import pop_path_info
 from six.moves.urllib.parse import urljoin
 from six import iteritems
-from warcio.statusandheaders import StatusAndHeaders
 from warcio.utils import to_native_str
 from warcio.timeutils import iso_date_to_timestamp
 from wsgiprox.wsgiprox import WSGIProxMiddleware
@@ -17,14 +15,14 @@ from pywb.recorder.recorderapp import RecorderApp
 from pywb.utils.loaders import load_yaml_config
 from pywb.utils.geventserver import GeventServer
 from pywb.utils.io import StreamIter
-from pywb.utils.wbexception import NotFoundException, WbException, AppPageNotFound
+from pywb.utils.wbexception import WbException, AppPageNotFound
 
 from pywb.warcserver.warcserver import WarcServer
 
 from pywb.rewrite.templateview import BaseInsertView
 
 from pywb.apps.static_handler import StaticHandler
-from pywb.apps.rewriterapp import RewriterApp, UpstreamException
+from pywb.apps.rewriterapp import RewriterApp
 from pywb.apps.wbrequestresponse import WbResponse
 
 import os
@@ -71,6 +69,8 @@ class FrontEndApp(object):
         self.handler = self.handle_request
         self.warcserver = WarcServer(config_file=config_file,
                                      custom_config=custom_config)
+        self.recorder = None
+        self.recorder_path = None
 
         config = self.warcserver.config
 
@@ -151,7 +151,11 @@ class FrontEndApp(object):
         return base_paths
 
     def init_recorder(self, recorder_config):
-        """Initialize the recording functionality of pywb. If recording_config is None this function is a no op"""
+        """Initialize the recording functionality of pywb. If recorder_config is None this function is a no op
+
+        :param str|dict|None recorder_config: The configuration for the recorder app
+        :rtype: None
+        """
         if not recorder_config:
             self.recorder = None
             self.recorder_path = None
@@ -204,6 +208,12 @@ class FrontEndApp(object):
         indexer.start()
 
     def is_proxy_enabled(self, environ):
+        """Returns T/F indicating if proxy mode is enabled
+
+        :param dict environ: The WSGI environment dictionary for the request
+        :return: T/F indicating if proxy mode is enabled
+        :rtype: bool
+        """
         return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ
 
     def serve_home(self, environ):
@@ -485,6 +495,13 @@ class FrontEndApp(object):
         return WbResponse.redir_response(full_url, '307 Redirect')
 
     def __call__(self, environ, start_response):
+        """Handles a request
+
+        :param dict environ: The WSGI environment dictionary for the request
+        :param start_response: The WSGI start_response callable
+        :return: The WbResponse for the request
+        :rtype: WbResponse
+        """
         return self.handler(environ, start_response)
 
     def handle_request(self, environ, start_response):
diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py
index c7f313e6..c60d068e 100644
--- a/pywb/apps/rewriterapp.py
+++ b/pywb/apps/rewriterapp.py
@@ -147,6 +147,17 @@ class RewriterApp(object):
         return is_timegate
 
     def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
+        """Returns the default rewrite modifier and rewrite modifier based on the
+        value of the Prefer HTTP header if it is present
+
+        :param WbUrl wb_url: The WbUrl for the URL being rewritten
+        :param dict environ: The WSGI environment dictionary for the request
+        :param content_rw: The content rewriter instance
+        :param bool is_proxy: Is the rewrite operating in proxy mode
+        :return: A tuple containing the default rewrite modifier and rewrite modifier based
+        on the value of the Prefer HTTP header if it is present
+        :rtype: tuple[str|None, str|None]
+        """
         if not self.enable_prefer:
             return None, None
 
@@ -516,6 +527,21 @@ class RewriterApp(object):
     def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
                            status_headers, is_timegate, is_proxy, coll=None,
                            pref_applied=None, mod=None, is_memento=True):
+        """
+
+        :param str url: The URI-R being rewritten
+        :param str full_prefix: The replay prefix
+        :param str|None memento_dt: The memento datetime for the URI-R being rewritten
+        :param str memento_ts: The memento timestamp
+        :param warcio.StatusAndHeaders status_headers:
+        :param bool is_timegate: Are we returning a response for a timegate
+        :param bool is_proxy: Are we operating in proxy mode
+        :param str|None coll: The collection the URI-R is from
+        :param str|None pref_applied:
+        :param str|None mod: The rewrite modifier
+        :param bool is_memento:
+        :rtype: None
+        """
 
         replay_mod = mod or self.replay_mod
 
diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index 6ca5b613..e2933811 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -35,6 +35,11 @@ except ImportError:  # pragma: no cover
 
 # ============================================================================
 def init_yaml_env_vars():
+    """Initializes the yaml parser to be able to set
+    the value of fields from environment variables
+
+    :rtype: None
+    """
     env_rx = re.compile(r'\$\{[^}]+\}')
 
     yaml.add_implicit_resolver('!envvar', env_rx)
@@ -421,9 +426,18 @@ class S3Loader(BaseLoader):
 
 # =================================================================
 class WebHDFSLoader(HttpLoader):
+    """Loader class specifically for loading webhdfs content"""
+
     HTTP_URL = 'http://{host}/webhdfs/v1{path}?'
 
     def load(self, url, offset, length):
+        """Loads the supplied web hdfs content
+
+        :param str url: The URL to the web hdfs content to be loaded
+        :param int|float offset: The offset of the content to be loaded
+        :param int|float length: The length of the content to be loaded
+        :return: The raw response content
+        """
         parts = urlsplit(url)
 
         http_url = self.HTTP_URL.format(host=parts.netloc,
diff --git a/pywb/utils/memento.py b/pywb/utils/memento.py
index f55c2dc6..a20319cf 100644
--- a/pywb/utils/memento.py
+++ b/pywb/utils/memento.py
@@ -66,6 +66,16 @@ class MementoUtils(object):
 
     @classmethod
     def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None):
+        """Creates a memento link string for a timemap
+
+        :param dict cdx: The cdx object
+        :param str|None datetime: The datetime
+        :param str rel: The rel type
+        :param str end: Optional string appended to the end of the created link string
+        :param str|None memento_format: Optional string used to format the URL
+        :return: A memento link string
+        :rtype: str
+        """
         url = cdx.get('url')
         if not url:
             url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
@@ -113,6 +123,16 @@ class MementoUtils(object):
 
     @classmethod
     def make_memento_link(cls, url, type, dt, coll=None, memento_format=None):
+        """Creates a memento link string
+
+        :param str url: A URL
+        :param str type: The rel type
+        :param str dt: The datetime of the URL
+        :param str|None coll: Optional name of a collection
+        :param str|None memento_format: Optional string used to format the supplied URL
+        :return: A memento link string
+        :rtype: str
+        """
         if memento_format:
             memento_format = memento_format.format(url=url,
                                                    timestamp=http_date_to_timestamp(dt))
diff --git a/pywb/utils/wbexception.py b/pywb/utils/wbexception.py
index 228df9f2..9d986dd1 100644
--- a/pywb/utils/wbexception.py
+++ b/pywb/utils/wbexception.py
@@ -3,16 +3,34 @@ from werkzeug.http import HTTP_STATUS_CODES
 
 #=================================================================
 class WbException(Exception):
+    """Base class for exceptions raised by Pywb"""
+
     def __init__(self, msg=None, url=None):
-        Exception.__init__(self, msg)
+        """Initialize a new WbException
+
+        :param str|None msg: The message for the error response
+        :param str|None url: The URL that caused the error
+        :rtype: None
+        """
+        super(WbException, self).__init__(msg)
         self.msg = msg
         self.url = url
 
     @property
     def status_code(self):
+        """Returns the status code to be used for the error response
+
+        :return: The status code for the error response (500)
+        :rtype: int
+        """
         return 500
 
     def status(self):
+        """Returns the HTTP status line for the error response
+
+        :return: The HTTP status line for the error response
+        :rtype: str
+        """
         return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error')
 
     def __repr__(self):
@@ -25,46 +43,96 @@ class WbException(Exception):
 
 #=================================================================
 class AccessException(WbException):
+    """An Exception used to indicate an access control violation"""
+
     @property
     def status_code(self):
+        """Returns the status code to be used for the error response
+
+        :return: The status code for the error response (451)
+        :rtype: int
+        """
         return 451
 
 
 #=================================================================
 class BadRequestException(WbException):
+    """An Exception used to indicate that request was bad"""
+
     @property
     def status_code(self):
+        """Returns the status code to be used for the error response
+
+        :return: The status code for the error response (400)
+        :rtype: int
+        """
         return 400
 
 
 #=================================================================
 class NotFoundException(WbException):
+    """An Exception used to indicate that a resource was not found"""
+
     @property
     def status_code(self):
+        """Returns the status code to be used for the error response
+
+        :return: The status code for the error response (404)
+        :rtype: int
+        """
         return 404
 
 
 #=================================================================
 class LiveResourceException(WbException):
+    """An Exception used to indicate that an error was encountered during the
+    retrieval of a live web resource"""
+
     @property
     def status_code(self):
+        """Returns the status code to be used for the error response
+
+        :return: The status code for the error response (400)
+        :rtype: int
+        """
         return 400
 
 
 # ============================================================================
 class UpstreamException(WbException):
+    """An Exception used to indicate that an error was encountered from an upstream endpoint"""
+
     def __init__(self, status_code, url, details):
+        """Initialize a new UpstreamException
+
+        :param int status_code: The status code for the error response
+        :param str url: The URL that caused the error
+        :param str details: The details of the error encountered
+        :rtype: None
+        """
         super(UpstreamException, self).__init__(url=url, msg=details)
         self._status_code = status_code
 
     @property
     def status_code(self):
+        """Returns the status code to be used for the error response
+
+        :return: The status code for the error response
+        :rtype: int
+        """
         return self._status_code
 
 
 # ============================================================================
 class AppPageNotFound(WbException):
+    """An Exception used to indicate that a page was not found"""
+
     @property
     def status_code(self):
+        """Returns the status code to be used for the error response
+
+        :return: The status code for the error response (404)
+        :rtype: int
+        """
         return 404
 
diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py
index a0eb4abf..9cd2790e 100644
--- a/pywb/warcserver/access_checker.py
+++ b/pywb/warcserver/access_checker.py
@@ -11,11 +11,28 @@ import os
 
 # ============================================================================
 class FileAccessIndexSource(FileIndexSource):
+    """An Index Source class specific to access control lists"""
+
     @staticmethod
     def rev_cmp(a, b):
+        """Performs a reversed comparison between two items, i.e. the
+        negation of the result of the removed builtin cmp
+
+        :param a: A value to be compared
+        :param b: A value to be compared
+        :return: The result of the comparison
+        :rtype: int
+        """
         return (a < b) - (a > b)
 
     def _do_iter(self, fh, params):
+        """Iterates over the supplied file handle to an access control list
+        yielding the results of the search for the params key
+
+        :param TextIO fh: The file handle to an access control list
+        :param dict params: The query params; 'key' and optionally 'exact_match_suffix' are read
+        :return: A generator yielding the results of the param search
+        """
         exact_suffix = params.get('exact_match_suffix')
         key = params['key']
         if exact_suffix:
@@ -27,31 +44,47 @@ class FileAccessIndexSource(FileIndexSource):
 
 # ============================================================================
 class ReverseMergeMixin(object):
+    """A mixin that provides reverse merge functionality"""
+
     def _merge(self, iter_list):
+        """Merges the supplied list of iterators in reverse
+
+        :param iter_list: The list of iterators to be merged
+        :return: An iterator that yields the results of the reverse merge
+        """
         return merge(*(iter_list), reverse=True)
 
 
 # ============================================================================
 class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
-    pass
+    """An Aggregator specific to access control"""
 
 
 # ============================================================================
 class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
-    INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
+    """A directory index source specific to access control"""
+
+    INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]  # type: list[tuple]
 
 
 # ============================================================================
 class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
-    pass
+    """A cache directory index source specific to access control"""
 
 
 # ============================================================================
 class AccessChecker(object):
-    EXACT_SUFFIX = '###'
-    EXACT_SUFFIX_B = b'###'
+    """An access checker class"""
+
+    EXACT_SUFFIX = '###'  # type: str
+    EXACT_SUFFIX_B = b'###'  # type: bytes
 
     def __init__(self, access_source, default_access='allow'):
+        """Initialize a new AccessChecker
+
+        :param str|list[str]|AccessRulesAggregator access_source: An access source
+        :param str default_access: The default access action (allow)
+        """
         if isinstance(access_source, str):
             self.aggregator = self.create_access_aggregator([access_source])
         elif isinstance(access_source, list):
@@ -66,6 +99,13 @@ class AccessChecker(object):
         self.default_rule['default'] = 'true'
 
     def create_access_aggregator(self, source_files):
+        """Creates a new AccessRulesAggregator using the supplied list
+        of access control file names
+
+        :param list[str] source_files: The list of access control file names
+        :return: The created AccessRulesAggregator
+        :rtype: AccessRulesAggregator
+        """
         sources = {}
         for filename in source_files:
             sources[filename] = self.create_access_source(filename)
@@ -74,6 +114,17 @@ class AccessChecker(object):
         return aggregator
 
     def create_access_source(self, filename):
+        """Creates a new access source for the supplied filename.
+
+        If the filename is for a directory a CacheDirectoryAccessSource
+        instance is returned otherwise a FileAccessIndexSource instance
+
+        :param str filename: The name of a file/directory
+        :return: An instance of CacheDirectoryAccessSource or FileAccessIndexSource
+        depending on if the supplied filename is for a directory or file
+        :rtype: CacheDirectoryAccessSource|FileAccessIndexSource
+        :raises Exception: Indicates an invalid access source was supplied
+        """
         if os.path.isdir(filename):
             return CacheDirectoryAccessSource(filename)
 
@@ -84,6 +135,16 @@ class AccessChecker(object):
             raise Exception('Invalid Access Source: ' + filename)
 
     def find_access_rule(self, url, ts=None, urlkey=None):
+        """Attempts to find the access control rule for the
+        supplied URL otherwise returns the default rule
+
+        :param str url: The URL for the rule to be found
+        :param str|None ts: A timestamp (not used)
+        :param str|None urlkey: The access control url key
+        :return: The access control rule for the supplied URL
+        if one exists otherwise the default rule
+        :rtype: CDXObject
+        """
         params = {'url': url,
                   'urlkey': urlkey,
                   'nosource': 'true',
@@ -121,10 +182,24 @@ class AccessChecker(object):
         return self.default_rule
 
     def __call__(self, res):
+        """Wraps the cdx iter in the supplied tuple, returning
+        the wrapped cdx iter and the other member of the supplied
+        tuple in the same order
+
+        :param tuple res: The result tuple
+        :return: A tuple containing the wrapped cdx iter and the errors
+        """
         cdx_iter, errs = res
         return self.wrap_iter(cdx_iter), errs
 
     def wrap_iter(self, cdx_iter):
+        """Wraps the supplied cdx iter and yields cdx objects
+        that contain the access control results for the cdx object
+        being yielded
+
+        :param cdx_iter: The cdx object iterator to be wrapped
+        :return: The wrapped cdx object iterator
+        """
         last_rule = None
         last_url = None