From 8d98b9111eaf8b5a5232e6bf0c38340602acaebd Mon Sep 17 00:00:00 2001 From: John Berlin Date: Wed, 10 Apr 2019 14:00:53 -0400 Subject: [PATCH] added additional code documentation in order to meet the documentation requirements of pywb --- pywb/apps/frontendapp.py | 27 ++++++++-- pywb/apps/rewriterapp.py | 26 ++++++++++ pywb/utils/loaders.py | 14 +++++ pywb/utils/memento.py | 20 ++++++++ pywb/utils/wbexception.py | 70 ++++++++++++++++++++++++- pywb/warcserver/access_checker.py | 85 +++++++++++++++++++++++++++++-- 6 files changed, 231 insertions(+), 11 deletions(-) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index cba2dff6..124eeae5 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -1,12 +1,10 @@ from gevent.monkey import patch_all; patch_all() -#from bottle import run, Bottle, request, response, debug from werkzeug.routing import Map, Rule from werkzeug.exceptions import HTTPException, NotFound from werkzeug.wsgi import pop_path_info from six.moves.urllib.parse import urljoin from six import iteritems -from warcio.statusandheaders import StatusAndHeaders from warcio.utils import to_native_str from warcio.timeutils import iso_date_to_timestamp from wsgiprox.wsgiprox import WSGIProxMiddleware @@ -17,14 +15,14 @@ from pywb.recorder.recorderapp import RecorderApp from pywb.utils.loaders import load_yaml_config from pywb.utils.geventserver import GeventServer from pywb.utils.io import StreamIter -from pywb.utils.wbexception import NotFoundException, WbException, AppPageNotFound +from pywb.utils.wbexception import WbException, AppPageNotFound from pywb.warcserver.warcserver import WarcServer from pywb.rewrite.templateview import BaseInsertView from pywb.apps.static_handler import StaticHandler -from pywb.apps.rewriterapp import RewriterApp, UpstreamException +from pywb.apps.rewriterapp import RewriterApp from pywb.apps.wbrequestresponse import WbResponse import os @@ -71,6 +69,8 @@ class FrontEndApp(object): self.handler = 
self.handle_request self.warcserver = WarcServer(config_file=config_file, custom_config=custom_config) + self.recorder = None + self.recorder_path = None config = self.warcserver.config @@ -151,7 +151,11 @@ class FrontEndApp(object): return base_paths def init_recorder(self, recorder_config): - """Initialize the recording functionality of pywb. If recording_config is None this function is a no op""" + """Initialize the recording functionality of pywb. If recorder_config is None this function is a no op + + :param str|dict|None recorder_config: The configuration for the recorder app + :rtype: None + """ if not recorder_config: self.recorder = None self.recorder_path = None @@ -204,6 +208,12 @@ class FrontEndApp(object): indexer.start() def is_proxy_enabled(self, environ): + """Returns True/False indicating if proxy mode is enabled + + :param dict environ: The WSGI environment dictionary for the request + :return: True/False indicating if proxy mode is enabled + :rtype: bool + """ return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ def serve_home(self, environ): @@ -485,6 +495,13 @@ class FrontEndApp(object): return WbResponse.redir_response(full_url, '307 Redirect') def __call__(self, environ, start_response): + """Handles a request + + :param dict environ: The WSGI environment dictionary for the request + :param start_response: + :return: The WbResponse for the request + :rtype: WbResponse + """ return self.handler(environ, start_response) def handle_request(self, environ, start_response): diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index c7f313e6..c60d068e 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -147,6 +147,17 @@ class RewriterApp(object): return is_timegate def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy): + """Returns the default rewrite modifier and rewrite modifier based on the + value of the Prefer HTTP header if it is present + + :param WbUrl wb_url: The WbUrl for the URL being 
rewritten + :param dict environ: The WSGI environment dictionary for the request + :param content_rw: The content rewriter instance + :param bool is_proxy: Is the rewrite operating in proxy mode + :return: A tuple containing the default rewrite modifier and rewrite modifier based + on the value of the Prefer HTTP header if it is present + :rtype: tuple[str|None, str|None] + """ if not self.enable_prefer: return None, None @@ -516,6 +527,21 @@ class RewriterApp(object): def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, status_headers, is_timegate, is_proxy, coll=None, pref_applied=None, mod=None, is_memento=True): + """ + + :param str url: The URI-R being rewritten + :param str full_prefix: The replay prefix + :param str|None memento_dt: The memento datetime for the URI-R being rewritten + :param str memento_ts: The memento timestamp + :param warcio.StatusAndHeaders status_headers: + :param bool is_timegate: Are we returning a response for a timegate + :param bool is_proxy: Are we operating in proxy mode + :param str|None coll: The collection the URI-R is from + :param str|None pref_applied: + :param str|None mod: The rewrite modifier + :param bool is_memento: + :rtype: None + """ replay_mod = mod or self.replay_mod diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 6ca5b613..e2933811 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -35,6 +35,11 @@ except ImportError: # pragma: no cover # ============================================================================ def init_yaml_env_vars(): + """Initializes the yaml parser to be able to set + the value of fields from environment variables + + :rtype: None + """ env_rx = re.compile(r'\$\{[^}]+\}') yaml.add_implicit_resolver('!envvar', env_rx) @@ -421,9 +426,18 @@ class S3Loader(BaseLoader): # ================================================================= class WebHDFSLoader(HttpLoader): + """Loader class specifically for loading webhdfs content""" + HTTP_URL = 
'http://{host}/webhdfs/v1{path}?' def load(self, url, offset, length): + """Loads the supplied web hdfs content + + :param str url: The URL to the web hdfs content to be loaded + :param int|float|double offset: The offset of the content to be loaded + :param int|float|double length: The length of the content to be loaded + :return: The raw response content + """ parts = urlsplit(url) http_url = self.HTTP_URL.format(host=parts.netloc, diff --git a/pywb/utils/memento.py b/pywb/utils/memento.py index f55c2dc6..a20319cf 100644 --- a/pywb/utils/memento.py +++ b/pywb/utils/memento.py @@ -66,6 +66,16 @@ class MementoUtils(object): @classmethod def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None): + """Creates a memento link string for a timemap + + :param dict cdx: The cdx object + :param str|None datetime: The datetime + :param str rel: The rel type + :param str end: Optional string appended to the end of the created link string + :param str|None memento_format: Optional string used to format the URL + :return: A memento link string + :rtype: str + """ url = cdx.get('url') if not url: url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) @@ -113,6 +123,16 @@ class MementoUtils(object): @classmethod def make_memento_link(cls, url, type, dt, coll=None, memento_format=None): + """Creates a memento link string + + :param str url: A URL + :param str type: The rel type + :param str dt: The datetime of the URL + :param str|None coll: Optional name of a collection + :param str|None memento_format: Optional string used to format the supplied URL + :return: A memento link string + :rtype: str + """ if memento_format: memento_format = memento_format.format(url=url, timestamp=http_date_to_timestamp(dt)) diff --git a/pywb/utils/wbexception.py b/pywb/utils/wbexception.py index 228df9f2..9d986dd1 100644 --- a/pywb/utils/wbexception.py +++ b/pywb/utils/wbexception.py @@ -3,16 +3,34 @@ from 
werkzeug.http import HTTP_STATUS_CODES #================================================================= class WbException(Exception): + """Base class for exceptions raised by Pywb""" + def __init__(self, msg=None, url=None): - Exception.__init__(self, msg) + """Initialize a new WbException + + :param str|None msg: The message for the error response + :param str|None url: The URL that caused the error + :rtype: None + """ + super(WbException, self).__init__(msg) self.msg = msg self.url = url @property def status_code(self): + """Returns the status code to be used for the error response + + :return: The status code for the error response (500) + :rtype: int + """ return 500 def status(self): + """Returns the HTTP status line for the error response + + :return: The HTTP status line for the error response + :rtype: str + """ return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error') def __repr__(self): @@ -25,46 +43,96 @@ class WbException(Exception): #================================================================= class AccessException(WbException): + """An Exception used to indicate an access control violation""" + @property def status_code(self): + """Returns the status code to be used for the error response + + :return: The status code for the error response (451) + :rtype: int + """ return 451 #================================================================= class BadRequestException(WbException): + """An Exception used to indicate that request was bad""" + @property def status_code(self): + """Returns the status code to be used for the error response + + :return: The status code for the error response (400) + :rtype: int + """ return 400 #================================================================= class NotFoundException(WbException): + """An Exception used to indicate that a resource was not found""" + @property def status_code(self): + """Returns the status code to be used for the error response + + :return: The 
status code for the error response (404) + :rtype: int + """ return 404 #================================================================= class LiveResourceException(WbException): + """An Exception used to indicate that an error was encountered during the + retrieval of a live web resource""" + @property def status_code(self): + """Returns the status code to be used for the error response + + :return: The status code for the error response (400) + :rtype: int + """ return 400 # ============================================================================ class UpstreamException(WbException): + """An Exception used to indicate that an error was encountered from an upstream endpoint""" + def __init__(self, status_code, url, details): + """Initialize a new UpstreamException + + :param int status_code: The status code for the error response + :param str url: The URL that caused the error + :param str details: The details of the error encountered + :rtype: None + """ super(UpstreamException, self).__init__(url=url, msg=details) self._status_code = status_code @property def status_code(self): + """Returns the status code to be used for the error response + + :return: The status code for the error response + :rtype: int + """ return self._status_code # ============================================================================ class AppPageNotFound(WbException): + """An Exception used to indicate that a page was not found""" + @property def status_code(self): + """Returns the status code to be used for the error response + + :return: The status code for the error response (404) + :rtype: int + """ return 404 diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index a0eb4abf..9cd2790e 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -11,11 +11,28 @@ import os # ============================================================================ class FileAccessIndexSource(FileIndexSource): + """An Index Source 
class specific to access control lists""" + @staticmethod def rev_cmp(a, b): + """Performs a reversed comparison between two items, i.e. the + negation of the removed builtin cmp + + :param a: A value to be compared + :param b: A value to be compared + :return: The result of the comparison + :rtype: int + """ return (a < b) - (a > b) def _do_iter(self, fh, params): + """Iterates over the supplied file handle to an access control list + yielding the results of the search for the params key + + :param TextIO fh: The file handle to an access control list + :param dict params: The params of the search + :return: A generator yielding the results of the param search + """ exact_suffix = params.get('exact_match_suffix') key = params['key'] if exact_suffix: @@ -27,31 +44,47 @@ class FileAccessIndexSource(FileIndexSource): # ============================================================================ class ReverseMergeMixin(object): + """A mixin that provides reversed merge functionality""" + def _merge(self, iter_list): + """Merges the supplied list of iterators in reverse + + :param iter_list: The list of iterators to be merged + :return: An iterator that yields the results of the reverse merge + """ return merge(*(iter_list), reverse=True) # ============================================================================ class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator): - pass + """An Aggregator specific to access control""" # ============================================================================ class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource): - INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] + """A directory index source specific to access control""" + + INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] # type: list[tuple] # ============================================================================ class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource): - pass + """A cache directory index source specific to access 
control""" # ============================================================================ class AccessChecker(object): - EXACT_SUFFIX = '###' - EXACT_SUFFIX_B = b'###' + """An access checker class""" + + EXACT_SUFFIX = '###' # type: str + EXACT_SUFFIX_B = b'###' # type: bytes def __init__(self, access_source, default_access='allow'): + """Initialize a new AccessChecker + + :param str|list[str]|AccessRulesAggregator access_source: An access source + :param str default_access: The default access action (allow) + """ if isinstance(access_source, str): self.aggregator = self.create_access_aggregator([access_source]) elif isinstance(access_source, list): @@ -66,6 +99,13 @@ class AccessChecker(object): self.default_rule['default'] = 'true' def create_access_aggregator(self, source_files): + """Creates a new AccessRulesAggregator using the supplied list + of access control file names + + :param list[str] source_files: The list of access control file names + :return: The created AccessRulesAggregator + :rtype: AccessRulesAggregator + """ sources = {} for filename in source_files: sources[filename] = self.create_access_source(filename) @@ -74,6 +114,17 @@ class AccessChecker(object): return aggregator def create_access_source(self, filename): + """Creates a new access source for the supplied filename. 
+ + If the filename is for a directory a CacheDirectoryAccessSource + instance is returned otherwise a FileAccessIndexSource instance + + :param str filename: The name of a file/directory + :return: An instance of CacheDirectoryAccessSource or FileAccessIndexSource + depending on if the supplied filename is for a directory or file + :rtype: CacheDirectoryAccessSource|FileAccessIndexSource + :raises Exception: Indicates an invalid access source was supplied + """ if os.path.isdir(filename): return CacheDirectoryAccessSource(filename) @@ -84,6 +135,16 @@ class AccessChecker(object): raise Exception('Invalid Access Source: ' + filename) def find_access_rule(self, url, ts=None, urlkey=None): + """Attempts to find the access control rule for the + supplied URL otherwise returns the default rule + + :param str url: The URL for the rule to be found + :param str|None ts: A timestamp (not used) + :param str|None urlkey: The access control url key + :return: The access control rule for the supplied URL + if one exists otherwise the default rule + :rtype: CDXObject + """ params = {'url': url, 'urlkey': urlkey, 'nosource': 'true', @@ -121,10 +182,24 @@ class AccessChecker(object): return self.default_rule def __call__(self, res): + """Wraps the cdx iter in the supplied tuple returning + the wrapped cdx iter and the other members of the supplied + tuple in same order + + :param tuple res: The result tuple + :return: A tuple + """ cdx_iter, errs = res return self.wrap_iter(cdx_iter), errs def wrap_iter(self, cdx_iter): + """Wraps the supplied cdx iter and yields cdx objects + that contain the access control results for the cdx object + being yielded + + :param cdx_iter: The cdx object iterator to be wrapped + :return: The wrapped cdx object iterator + """ last_rule = None last_url = None