1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

added additional code documentation in order to meet the documentation requirements of pywb

This commit is contained in:
John Berlin 2019-04-10 14:00:53 -04:00
parent 9a40d29ac3
commit 8d98b9111e
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
6 changed files with 231 additions and 11 deletions

View File

@ -1,12 +1,10 @@
from gevent.monkey import patch_all; patch_all() from gevent.monkey import patch_all; patch_all()
#from bottle import run, Bottle, request, response, debug
from werkzeug.routing import Map, Rule from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException, NotFound from werkzeug.exceptions import HTTPException, NotFound
from werkzeug.wsgi import pop_path_info from werkzeug.wsgi import pop_path_info
from six.moves.urllib.parse import urljoin from six.moves.urllib.parse import urljoin
from six import iteritems from six import iteritems
from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str from warcio.utils import to_native_str
from warcio.timeutils import iso_date_to_timestamp from warcio.timeutils import iso_date_to_timestamp
from wsgiprox.wsgiprox import WSGIProxMiddleware from wsgiprox.wsgiprox import WSGIProxMiddleware
@ -17,14 +15,14 @@ from pywb.recorder.recorderapp import RecorderApp
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer from pywb.utils.geventserver import GeventServer
from pywb.utils.io import StreamIter from pywb.utils.io import StreamIter
from pywb.utils.wbexception import NotFoundException, WbException, AppPageNotFound from pywb.utils.wbexception import WbException, AppPageNotFound
from pywb.warcserver.warcserver import WarcServer from pywb.warcserver.warcserver import WarcServer
from pywb.rewrite.templateview import BaseInsertView from pywb.rewrite.templateview import BaseInsertView
from pywb.apps.static_handler import StaticHandler from pywb.apps.static_handler import StaticHandler
from pywb.apps.rewriterapp import RewriterApp, UpstreamException from pywb.apps.rewriterapp import RewriterApp
from pywb.apps.wbrequestresponse import WbResponse from pywb.apps.wbrequestresponse import WbResponse
import os import os
@ -71,6 +69,8 @@ class FrontEndApp(object):
self.handler = self.handle_request self.handler = self.handle_request
self.warcserver = WarcServer(config_file=config_file, self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config) custom_config=custom_config)
self.recorder = None
self.recorder_path = None
config = self.warcserver.config config = self.warcserver.config
@ -151,7 +151,11 @@ class FrontEndApp(object):
return base_paths return base_paths
def init_recorder(self, recorder_config): def init_recorder(self, recorder_config):
"""Initialize the recording functionality of pywb. If recording_config is None this function is a no op""" """Initialize the recording functionality of pywb. If recording_config is None this function is a no op
:param str|dict|None recorder_config: The configuration for the recorder app
:rtype: None
"""
if not recorder_config: if not recorder_config:
self.recorder = None self.recorder = None
self.recorder_path = None self.recorder_path = None
@ -204,6 +208,12 @@ class FrontEndApp(object):
indexer.start() indexer.start()
def is_proxy_enabled(self, environ): def is_proxy_enabled(self, environ):
"""Returns T/F indicating if proxy mode is enabled
:param dict environ: The WSGI environment dictionary for the request
:return: T/F indicating if proxy mode is enabled
:rtype: bool
"""
return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ
def serve_home(self, environ): def serve_home(self, environ):
@ -485,6 +495,13 @@ class FrontEndApp(object):
return WbResponse.redir_response(full_url, '307 Redirect') return WbResponse.redir_response(full_url, '307 Redirect')
def __call__(self, environ, start_response): def __call__(self, environ, start_response):
"""Handles a request
:param dict environ: The WSGI environment dictionary for the request
:param start_response:
:return: The WbResponse for the request
:rtype: WbResponse
"""
return self.handler(environ, start_response) return self.handler(environ, start_response)
def handle_request(self, environ, start_response): def handle_request(self, environ, start_response):

View File

@ -147,6 +147,17 @@ class RewriterApp(object):
return is_timegate return is_timegate
def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy): def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
"""Returns the default rewrite modifier and rewrite modifier based on the
value of the Prefer HTTP header if it is present
:param WbUrl wb_url: The WbUrl for the URL being rewritten
:param dict environ: The WSGI environment dictionary for the request
:param content_rw: The content rewriter instance
:param bool is_proxy: Is the rewrite operating in proxy mode
:return: A tuple containing the default rewrite modifier and rewrite modifier based
on the value of the Prefer HTTP header if it is present
:rtype: tuple[str|None, str|None]
"""
if not self.enable_prefer: if not self.enable_prefer:
return None, None return None, None
@ -516,6 +527,21 @@ class RewriterApp(object):
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate, is_proxy, coll=None, status_headers, is_timegate, is_proxy, coll=None,
pref_applied=None, mod=None, is_memento=True): pref_applied=None, mod=None, is_memento=True):
"""
:param str url: The URI-R being rewritten
:param str full_prefix: The replay prefix
:param str|None memento_dt: The memento datetime for the URI-R being rewritten
:param str memento_ts: The memento timestamp
:param warcio.StatusAndHeaders status_headers:
:param bool is_timegate: Are we returning a response for a timegate
:param bool is_proxy: Are we operating in proxy mode
:param str|None coll: The collection the URI-R is from
:param str|None pref_applied:
:param str|None mod: The rewrite modifier
:param bool is_memento:
:rtype: None
"""
replay_mod = mod or self.replay_mod replay_mod = mod or self.replay_mod

View File

@ -35,6 +35,11 @@ except ImportError: # pragma: no cover
# ============================================================================ # ============================================================================
def init_yaml_env_vars(): def init_yaml_env_vars():
"""Initializes the yaml parser to be able to set
the value of fields from environment variables
:rtype: None
"""
env_rx = re.compile(r'\$\{[^}]+\}') env_rx = re.compile(r'\$\{[^}]+\}')
yaml.add_implicit_resolver('!envvar', env_rx) yaml.add_implicit_resolver('!envvar', env_rx)
@ -421,9 +426,18 @@ class S3Loader(BaseLoader):
# ================================================================= # =================================================================
class WebHDFSLoader(HttpLoader): class WebHDFSLoader(HttpLoader):
"""Loader class specifically for loading webhdfs content"""
HTTP_URL = 'http://{host}/webhdfs/v1{path}?' HTTP_URL = 'http://{host}/webhdfs/v1{path}?'
def load(self, url, offset, length): def load(self, url, offset, length):
"""Loads the supplied web hdfs content
:param str url: The URL to the web hdfs content to be loaded
:param int|float|double offset: The offset of the content to be loaded
:param int|float|double length: The length of the content to be loaded
:return: The raw response content
"""
parts = urlsplit(url) parts = urlsplit(url)
http_url = self.HTTP_URL.format(host=parts.netloc, http_url = self.HTTP_URL.format(host=parts.netloc,

View File

@ -66,6 +66,16 @@ class MementoUtils(object):
@classmethod @classmethod
def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None): def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None):
"""Creates a memento link string for a timemap
:param dict cdx: The cdx object
:param str|None datetime: The datetime
:param str rel: The rel type
:param str end: Optional string appended to the end of the created link string
:param str|None memento_format: Optional string used to format the URL
:return: A memento link string
:rtype: str
"""
url = cdx.get('url') url = cdx.get('url')
if not url: if not url:
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length')) url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
@ -113,6 +123,16 @@ class MementoUtils(object):
@classmethod @classmethod
def make_memento_link(cls, url, type, dt, coll=None, memento_format=None): def make_memento_link(cls, url, type, dt, coll=None, memento_format=None):
"""Creates a memento link string
:param str url: A URL
:param str type: The rel type
:param str dt: The datetime of the URL
:param str|None coll: Optional name of a collection
:param str|None memento_format: Optional string used to format the supplied URL
:return: A memento link string
:rtype: str
"""
if memento_format: if memento_format:
memento_format = memento_format.format(url=url, memento_format = memento_format.format(url=url,
timestamp=http_date_to_timestamp(dt)) timestamp=http_date_to_timestamp(dt))

View File

@ -3,16 +3,34 @@ from werkzeug.http import HTTP_STATUS_CODES
#================================================================= #=================================================================
class WbException(Exception): class WbException(Exception):
"""Base class for exceptions raised by Pywb"""
def __init__(self, msg=None, url=None): def __init__(self, msg=None, url=None):
Exception.__init__(self, msg) """Initialize a new WbException
:param str|None msg: The message for the error response
:param str|None url: The URL that caused the error
:rtype: None
"""
super(WbException, self).__init__(msg)
self.msg = msg self.msg = msg
self.url = url self.url = url
@property @property
def status_code(self): def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (500)
:rtype: int
"""
return 500 return 500
def status(self): def status(self):
"""Returns the HTTP status line for the error response
:return: The HTTP status line for the error response
:rtype: str
"""
return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error') return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error')
def __repr__(self): def __repr__(self):
@ -25,46 +43,96 @@ class WbException(Exception):
#================================================================= #=================================================================
class AccessException(WbException): class AccessException(WbException):
"""An Exception used to indicate an access control violation"""
@property @property
def status_code(self): def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (451)
:rtype: int
"""
return 451 return 451
#================================================================= #=================================================================
class BadRequestException(WbException): class BadRequestException(WbException):
"""An Exception used to indicate that the request was bad"""
@property @property
def status_code(self): def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (400)
:rtype: int
"""
return 400 return 400
#================================================================= #=================================================================
class NotFoundException(WbException): class NotFoundException(WbException):
"""An Exception used to indicate that a resource was not found"""
@property @property
def status_code(self): def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (404)
:rtype: int
"""
return 404 return 404
#================================================================= #=================================================================
class LiveResourceException(WbException): class LiveResourceException(WbException):
"""An Exception used to indicate that an error was encountered during the
retrieval of a live web resource"""
@property @property
def status_code(self): def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (400)
:rtype: int
"""
return 400 return 400
# ============================================================================ # ============================================================================
class UpstreamException(WbException): class UpstreamException(WbException):
"""An Exception used to indicate that an error was encountered from an upstream endpoint"""
def __init__(self, status_code, url, details): def __init__(self, status_code, url, details):
"""Initialize a new UpstreamException
:param int status_code: The status code for the error response
:param str url: The URL that caused the error
:param str details: The details of the error encountered
:rtype: None
"""
super(UpstreamException, self).__init__(url=url, msg=details) super(UpstreamException, self).__init__(url=url, msg=details)
self._status_code = status_code self._status_code = status_code
@property @property
def status_code(self): def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response
:rtype: int
"""
return self._status_code return self._status_code
# ============================================================================ # ============================================================================
class AppPageNotFound(WbException): class AppPageNotFound(WbException):
"""An Exception used to indicate that a page was not found"""
@property @property
def status_code(self): def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (404)
:rtype: int
"""
return 404 return 404

View File

@ -11,11 +11,28 @@ import os
# ============================================================================ # ============================================================================
class FileAccessIndexSource(FileIndexSource): class FileAccessIndexSource(FileIndexSource):
"""An Index Source class specific to access control lists"""
@staticmethod @staticmethod
def rev_cmp(a, b): def rev_cmp(a, b):
"""Performs a comparison between two items using the
algorithm of the removed builtin cmp
:param a: A value to be compared
:param b: A value to be compared
:return: The result of the comparison
:rtype: int
"""
return (a < b) - (a > b) return (a < b) - (a > b)
def _do_iter(self, fh, params): def _do_iter(self, fh, params):
"""Iterates over the supplied file handle to an access control list
yielding the results of the search for the params key
:param TextIO fh: The file handle to an access control list
:param dict params: The params of the search (reads 'key' and, optionally, 'exact_match_suffix')
:return: A generator yielding the results of the param search
"""
exact_suffix = params.get('exact_match_suffix') exact_suffix = params.get('exact_match_suffix')
key = params['key'] key = params['key']
if exact_suffix: if exact_suffix:
@ -27,31 +44,47 @@ class FileAccessIndexSource(FileIndexSource):
# ============================================================================ # ============================================================================
class ReverseMergeMixin(object): class ReverseMergeMixin(object):
"""A mixin that provides reversed merge functionality"""
def _merge(self, iter_list): def _merge(self, iter_list):
"""Merges the supplied list of iterators in reverse
:param iter_list: The list of iterators to be merged
:return: An iterator that yields the results of the reverse merge
"""
return merge(*(iter_list), reverse=True) return merge(*(iter_list), reverse=True)
# ============================================================================ # ============================================================================
class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator): class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
pass """An Aggregator specific to access control"""
# ============================================================================ # ============================================================================
class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource): class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] """A directory index source specific to access control"""
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] # type: list[tuple]
# ============================================================================ # ============================================================================
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource): class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
pass """A cache directory index source specific to access control"""
# ============================================================================ # ============================================================================
class AccessChecker(object): class AccessChecker(object):
EXACT_SUFFIX = '###' """An access checker class"""
EXACT_SUFFIX_B = b'###'
EXACT_SUFFIX = '###' # type: str
EXACT_SUFFIX_B = b'###' # type: bytes
def __init__(self, access_source, default_access='allow'): def __init__(self, access_source, default_access='allow'):
"""Initialize a new AccessChecker
:param str|list[str]|AccessRulesAggregator access_source: An access source
:param str default_access: The default access action (allow)
"""
if isinstance(access_source, str): if isinstance(access_source, str):
self.aggregator = self.create_access_aggregator([access_source]) self.aggregator = self.create_access_aggregator([access_source])
elif isinstance(access_source, list): elif isinstance(access_source, list):
@ -66,6 +99,13 @@ class AccessChecker(object):
self.default_rule['default'] = 'true' self.default_rule['default'] = 'true'
def create_access_aggregator(self, source_files): def create_access_aggregator(self, source_files):
"""Creates a new AccessRulesAggregator using the supplied list
of access control file names
:param list[str] source_files: The list of access control file names
:return: The created AccessRulesAggregator
:rtype: AccessRulesAggregator
"""
sources = {} sources = {}
for filename in source_files: for filename in source_files:
sources[filename] = self.create_access_source(filename) sources[filename] = self.create_access_source(filename)
@ -74,6 +114,17 @@ class AccessChecker(object):
return aggregator return aggregator
def create_access_source(self, filename): def create_access_source(self, filename):
"""Creates a new access source for the supplied filename.
If the filename is for a directory a CacheDirectoryAccessSource
instance is returned otherwise a FileAccessIndexSource instance
:param str filename: The name of a file/directory
:return: An instance of CacheDirectoryAccessSource or FileAccessIndexSource
depending on if the supplied filename is for a directory or file
:rtype: CacheDirectoryAccessSource|FileAccessIndexSource
:raises Exception: Indicates an invalid access source was supplied
"""
if os.path.isdir(filename): if os.path.isdir(filename):
return CacheDirectoryAccessSource(filename) return CacheDirectoryAccessSource(filename)
@ -84,6 +135,16 @@ class AccessChecker(object):
raise Exception('Invalid Access Source: ' + filename) raise Exception('Invalid Access Source: ' + filename)
def find_access_rule(self, url, ts=None, urlkey=None): def find_access_rule(self, url, ts=None, urlkey=None):
"""Attempts to find the access control rule for the
supplied URL otherwise returns the default rule
:param str url: The URL for the rule to be found
:param str|None ts: A timestamp (not used)
:param str|None urlkey: The access control url key
:return: The access control rule for the supplied URL
if one exists otherwise the default rule
:rtype: CDXObject
"""
params = {'url': url, params = {'url': url,
'urlkey': urlkey, 'urlkey': urlkey,
'nosource': 'true', 'nosource': 'true',
@ -121,10 +182,24 @@ class AccessChecker(object):
return self.default_rule return self.default_rule
def __call__(self, res): def __call__(self, res):
"""Wraps the cdx iter in the supplied tuple, returning
the wrapped cdx iter and the other members of the supplied
tuple in the same order
:param tuple res: The result tuple
:return: A tuple
"""
cdx_iter, errs = res cdx_iter, errs = res
return self.wrap_iter(cdx_iter), errs return self.wrap_iter(cdx_iter), errs
def wrap_iter(self, cdx_iter): def wrap_iter(self, cdx_iter):
"""Wraps the supplied cdx iter and yields cdx objects
that contain the access control results for the cdx object
being yielded
:param cdx_iter: The cdx object iterator to be wrapped
:return: The wrapped cdx object iterator
"""
last_rule = None last_rule = None
last_url = None last_url = None