1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

added additional code documentation in order to meet the documentation requirements of pywb

This commit is contained in:
John Berlin 2019-04-10 14:00:53 -04:00
parent 9a40d29ac3
commit 8d98b9111e
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
6 changed files with 231 additions and 11 deletions

View File

@ -1,12 +1,10 @@
from gevent.monkey import patch_all; patch_all()
#from bottle import run, Bottle, request, response, debug
from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException, NotFound
from werkzeug.wsgi import pop_path_info
from six.moves.urllib.parse import urljoin
from six import iteritems
from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str
from warcio.timeutils import iso_date_to_timestamp
from wsgiprox.wsgiprox import WSGIProxMiddleware
@ -17,14 +15,14 @@ from pywb.recorder.recorderapp import RecorderApp
from pywb.utils.loaders import load_yaml_config
from pywb.utils.geventserver import GeventServer
from pywb.utils.io import StreamIter
from pywb.utils.wbexception import NotFoundException, WbException, AppPageNotFound
from pywb.utils.wbexception import WbException, AppPageNotFound
from pywb.warcserver.warcserver import WarcServer
from pywb.rewrite.templateview import BaseInsertView
from pywb.apps.static_handler import StaticHandler
from pywb.apps.rewriterapp import RewriterApp, UpstreamException
from pywb.apps.rewriterapp import RewriterApp
from pywb.apps.wbrequestresponse import WbResponse
import os
@ -71,6 +69,8 @@ class FrontEndApp(object):
self.handler = self.handle_request
self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config)
self.recorder = None
self.recorder_path = None
config = self.warcserver.config
@ -151,7 +151,11 @@ class FrontEndApp(object):
return base_paths
def init_recorder(self, recorder_config):
"""Initialize the recording functionality of pywb. If recording_config is None this function is a no op"""
"""Initialize the recording functionality of pywb. If recording_config is None this function is a no op
:param str|dict|None recorder_config: The configuration for the recorder app
:rtype: None
"""
if not recorder_config:
self.recorder = None
self.recorder_path = None
@ -204,6 +208,12 @@ class FrontEndApp(object):
indexer.start()
def is_proxy_enabled(self, environ):
"""Returns True/False indicating if proxy mode is enabled for the request.
Proxy mode is considered enabled only when self.proxy_prefix is not None
and the 'wsgiprox.proxy_host' key is present in environ
:param dict environ: The WSGI environment dictionary for the request
:return: True if proxy mode is enabled for the request, False otherwise
:rtype: bool
"""
return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ
def serve_home(self, environ):
@ -485,6 +495,13 @@ class FrontEndApp(object):
return WbResponse.redir_response(full_url, '307 Redirect')
def __call__(self, environ, start_response):
"""Handles a request by delegating to self.handler
:param dict environ: The WSGI environment dictionary for the request
:param start_response: The WSGI start_response callable, passed through to self.handler unchanged
:return: The WbResponse for the request (NOTE(review): this is whatever self.handler returns - confirm the type)
:rtype: WbResponse
"""
return self.handler(environ, start_response)
def handle_request(self, environ, start_response):

View File

@ -147,6 +147,17 @@ class RewriterApp(object):
return is_timegate
def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
"""Returns the default rewrite modifier and rewrite modifier based on the
value of the Prefer HTTP header if it is present
:param WbUrl wb_url: The WbUrl for the URL being rewritten
:param dict environ: The WSGI environment dictionary for the request
:param content_rw: The content rewriter instance
:param bool is_proxy: Is the rewrite operating in proxy mode
:return: A tuple containing the default rewrite modifier and rewrite modifier based
on the value of the Prefer HTTP header if it is present
:rtype: tuple[str|None, str|None]
"""
if not self.enable_prefer:
return None, None
@ -516,6 +527,21 @@ class RewriterApp(object):
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate, is_proxy, coll=None,
pref_applied=None, mod=None, is_memento=True):
"""
:param str url: The URI-R being rewritten
:param str full_prefix: The replay prefix
:param str|None memento_dt: The memento datetime for the URI-R being rewritten
:param str memento_ts: The memento timestamp
:param warcio.StatusAndHeaders status_headers:
:param bool is_timegate: Are we returning a response for a timegate
:param bool is_proxy: Are we operating in proxy mode
:param str|None coll: The collection the URI-R is from
:param str|None pref_applied:
:param str|None mod: The rewrite modifier
:param bool is_memento:
:rtype: None
"""
replay_mod = mod or self.replay_mod

View File

@ -35,6 +35,11 @@ except ImportError: # pragma: no cover
# ============================================================================
def init_yaml_env_vars():
"""Initializes the yaml parser to be able to set
the value of fields from environment variables
:rtype: None
"""
env_rx = re.compile(r'\$\{[^}]+\}')
yaml.add_implicit_resolver('!envvar', env_rx)
@ -421,9 +426,18 @@ class S3Loader(BaseLoader):
# =================================================================
class WebHDFSLoader(HttpLoader):
"""Loader class specifically for loading webhdfs content"""
HTTP_URL = 'http://{host}/webhdfs/v1{path}?'
def load(self, url, offset, length):
"""Loads the supplied web hdfs content
:param str url: The URL to the web hdfs content to be loaded
:param int|float|double offset: The offset of the content to be loaded
:param int|float|double length: The length of the content to be loaded
:return: The raw response content
"""
parts = urlsplit(url)
http_url = self.HTTP_URL.format(host=parts.netloc,

View File

@ -66,6 +66,16 @@ class MementoUtils(object):
@classmethod
def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None):
"""Creates a memento link string for a timemap
:param dict cdx: The cdx object
:param str|None datetime: The datetime
:param str rel: The rel type
:param str end: Optional string appended to the end of the created link string
:param str|None memento_format: Optional string used to format the URL
:return: A memento link string
:rtype: str
"""
url = cdx.get('url')
if not url:
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
@ -113,6 +123,16 @@ class MementoUtils(object):
@classmethod
def make_memento_link(cls, url, type, dt, coll=None, memento_format=None):
"""Creates a memento link string
:param str url: A URL
:param str type: The rel type
:param str dt: The datetime of the URL
:param str|None coll: Optional name of a collection
:param str|None memento_format: Optional string used to format the supplied URL
:return: A memento link string
:rtype: str
"""
if memento_format:
memento_format = memento_format.format(url=url,
timestamp=http_date_to_timestamp(dt))

View File

@ -3,16 +3,34 @@ from werkzeug.http import HTTP_STATUS_CODES
#=================================================================
class WbException(Exception):
"""Base class for exceptions raised by Pywb"""
def __init__(self, msg=None, url=None):
Exception.__init__(self, msg)
"""Initialize a new WbException
:param str|None msg: The message for the error response
:param str|None url: The URL that caused the error
:rtype: None
"""
super(WbException, self).__init__(msg)
self.msg = msg
self.url = url
@property
def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (500)
:rtype: int
"""
return 500
def status(self):
"""Returns the HTTP status line for the error response
:return: The HTTP status line for the error response
:rtype: str
"""
return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error')
def __repr__(self):
@ -25,46 +43,96 @@ class WbException(Exception):
#=================================================================
class AccessException(WbException):
"""An Exception used to indicate an access control violation (reported with status code 451)"""
@property
def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (451)
:rtype: int
"""
return 451
#=================================================================
class BadRequestException(WbException):
"""An Exception used to indicate that the request was bad"""
@property
def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (400)
:rtype: int
"""
return 400
#=================================================================
class NotFoundException(WbException):
"""An Exception used to indicate that a resource was not found (reported with status code 404)"""
@property
def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (404)
:rtype: int
"""
return 404
#=================================================================
class LiveResourceException(WbException):
"""An Exception used to indicate that an error was encountered during the
retrieval of a live web resource"""
@property
def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (400)
:rtype: int
"""
return 400
# ============================================================================
class UpstreamException(WbException):
"""An Exception used to indicate that an error was encountered from an upstream endpoint"""
def __init__(self, status_code, url, details):
"""Initialize a new UpstreamException
:param int status_code: The status code for the error response
:param str url: The URL that caused the error
:param str details: The details of the error encountered, passed to WbException as its msg
:rtype: None
"""
super(UpstreamException, self).__init__(url=url, msg=details)
self._status_code = status_code
@property
def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code that was supplied when the exception was created
:rtype: int
"""
return self._status_code
# ============================================================================
class AppPageNotFound(WbException):
"""An Exception used to indicate that a page was not found"""
@property
def status_code(self):
"""Returns the status code to be used for the error response
:return: The status code for the error response (404)
:rtype: int
"""
return 404

View File

@ -11,11 +11,28 @@ import os
# ============================================================================
class FileAccessIndexSource(FileIndexSource):
"""An Index Source class specific to access control lists"""
@staticmethod
def rev_cmp(a, b):
"""Performs a reversed comparison between two items, i.e. the
negation of the result of the removed builtin cmp
:param a: A value to be compared
:param b: A value to be compared
:return: 1 if a < b, -1 if a > b, otherwise 0
:rtype: int
"""
return (a < b) - (a > b)
def _do_iter(self, fh, params):
"""Iterates over the supplied file handle to an access control list
yielding the results of the search for the params key
:param TextIO fh: The file handle to an access control list
:param dict params: The params of the search; 'key' is required and 'exact_match_suffix' is optional
:return: A generator yielding the results of the param search
"""
exact_suffix = params.get('exact_match_suffix')
key = params['key']
if exact_suffix:
@ -27,31 +44,47 @@ class FileAccessIndexSource(FileIndexSource):
# ============================================================================
class ReverseMergeMixin(object):
"""A mixin that provides reversed merge functionality"""
def _merge(self, iter_list):
"""Merges the supplied list of iterators in reverse
by passing reverse=True to merge
:param iter_list: The list of iterators to be merged
:return: An iterator that yields the results of the reverse merge
"""
return merge(*(iter_list), reverse=True)
# ============================================================================
class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
pass
"""An Aggregator specific to access control"""
# ============================================================================
class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
"""An directory index source specific to access control"""
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] # type: list[tuple]
# ============================================================================
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
pass
"""An cache directory index source specific to access control"""
# ============================================================================
class AccessChecker(object):
EXACT_SUFFIX = '###'
EXACT_SUFFIX_B = b'###'
"""An access checker class"""
EXACT_SUFFIX = '###' # type: str
EXACT_SUFFIX_B = b'###' # type: bytes
def __init__(self, access_source, default_access='allow'):
"""Initialize a new AccessChecker
:param str|list[str]|AccessRulesAggregator access_source: An access source
:param str default_access: The default access action (allow)
"""
if isinstance(access_source, str):
self.aggregator = self.create_access_aggregator([access_source])
elif isinstance(access_source, list):
@ -66,6 +99,13 @@ class AccessChecker(object):
self.default_rule['default'] = 'true'
def create_access_aggregator(self, source_files):
"""Creates a new AccessRulesAggregator using the supplied list
of access control file names
:param list[str] source_files: The list of access control file names
:return: The created AccessRulesAggregator
:rtype: AccessRulesAggregator
"""
sources = {}
for filename in source_files:
sources[filename] = self.create_access_source(filename)
@ -74,6 +114,17 @@ class AccessChecker(object):
return aggregator
def create_access_source(self, filename):
"""Creates a new access source for the supplied filename.
If the filename is for a directory a CacheDirectoryAccessSource
instance is returned, otherwise a FileAccessIndexSource instance
:param str filename: The name of a file/directory
:return: An instance of CacheDirectoryAccessSource or FileAccessIndexSource
depending on if the supplied filename is for a directory or file
:rtype: CacheDirectoryAccessSource|FileAccessIndexSource
:raises Exception: Indicates an invalid access source was supplied
"""
if os.path.isdir(filename):
return CacheDirectoryAccessSource(filename)
@ -84,6 +135,16 @@ class AccessChecker(object):
raise Exception('Invalid Access Source: ' + filename)
def find_access_rule(self, url, ts=None, urlkey=None):
"""Attempts to find the access control rule for the
supplied URL otherwise returns the default rule
:param str url: The URL for the rule to be found
:param str|None ts: A timestamp (not used)
:param str|None urlkey: The access control url key
:return: The access control rule for the supplied URL
if one exists otherwise the default rule
:rtype: CDXObject
"""
params = {'url': url,
'urlkey': urlkey,
'nosource': 'true',
@ -121,10 +182,24 @@ class AccessChecker(object):
return self.default_rule
def __call__(self, res):
"""Wraps the cdx iter in the supplied tuple, returning the
wrapped cdx iter and the other member of the supplied
tuple in the same order
:param tuple res: The result tuple, unpacked as (cdx_iter, errs)
:return: A tuple of (wrapped cdx iter, errs)
:rtype: tuple
"""
cdx_iter, errs = res
return self.wrap_iter(cdx_iter), errs
def wrap_iter(self, cdx_iter):
"""Wraps the supplied cdx iter and yields cdx objects
that contain the access control results for the cdx object
being yielded
:param cdx_iter: The cdx object iterator to be wrapped
:return: The wrapped cdx object iterator
"""
last_rule = None
last_url = None