mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
added additional code documentation in order to meet the documentation requirements of pywb
This commit is contained in:
parent
9a40d29ac3
commit
8d98b9111e
@ -1,12 +1,10 @@
|
||||
from gevent.monkey import patch_all; patch_all()
|
||||
|
||||
#from bottle import run, Bottle, request, response, debug
|
||||
from werkzeug.routing import Map, Rule
|
||||
from werkzeug.exceptions import HTTPException, NotFound
|
||||
from werkzeug.wsgi import pop_path_info
|
||||
from six.moves.urllib.parse import urljoin
|
||||
from six import iteritems
|
||||
from warcio.statusandheaders import StatusAndHeaders
|
||||
from warcio.utils import to_native_str
|
||||
from warcio.timeutils import iso_date_to_timestamp
|
||||
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
||||
@ -17,14 +15,14 @@ from pywb.recorder.recorderapp import RecorderApp
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from pywb.utils.geventserver import GeventServer
|
||||
from pywb.utils.io import StreamIter
|
||||
from pywb.utils.wbexception import NotFoundException, WbException, AppPageNotFound
|
||||
from pywb.utils.wbexception import WbException, AppPageNotFound
|
||||
|
||||
from pywb.warcserver.warcserver import WarcServer
|
||||
|
||||
from pywb.rewrite.templateview import BaseInsertView
|
||||
|
||||
from pywb.apps.static_handler import StaticHandler
|
||||
from pywb.apps.rewriterapp import RewriterApp, UpstreamException
|
||||
from pywb.apps.rewriterapp import RewriterApp
|
||||
from pywb.apps.wbrequestresponse import WbResponse
|
||||
|
||||
import os
|
||||
@ -71,6 +69,8 @@ class FrontEndApp(object):
|
||||
self.handler = self.handle_request
|
||||
self.warcserver = WarcServer(config_file=config_file,
|
||||
custom_config=custom_config)
|
||||
self.recorder = None
|
||||
self.recorder_path = None
|
||||
|
||||
config = self.warcserver.config
|
||||
|
||||
@ -151,7 +151,11 @@ class FrontEndApp(object):
|
||||
return base_paths
|
||||
|
||||
def init_recorder(self, recorder_config):
|
||||
"""Initialize the recording functionality of pywb. If recording_config is None this function is a no op"""
|
||||
"""Initialize the recording functionality of pywb. If recorder_config is None this function is a no op
|
||||
|
||||
:param str|dict|None recorder_config: The configuration for the recorder app
|
||||
:rtype: None
|
||||
"""
|
||||
if not recorder_config:
|
||||
self.recorder = None
|
||||
self.recorder_path = None
|
||||
@ -204,6 +208,12 @@ class FrontEndApp(object):
|
||||
indexer.start()
|
||||
|
||||
def is_proxy_enabled(self, environ):
|
||||
"""Returns T/F indicating if proxy mode is enabled
|
||||
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:return: T/F indicating if proxy mode is enabled
|
||||
:rtype: bool
|
||||
"""
|
||||
return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ
|
||||
|
||||
def serve_home(self, environ):
|
||||
@ -485,6 +495,13 @@ class FrontEndApp(object):
|
||||
return WbResponse.redir_response(full_url, '307 Redirect')
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
"""Handles a request
|
||||
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:param start_response:
|
||||
:return: The WbResponse for the request
|
||||
:rtype: WbResponse
|
||||
"""
|
||||
return self.handler(environ, start_response)
|
||||
|
||||
def handle_request(self, environ, start_response):
|
||||
|
@ -147,6 +147,17 @@ class RewriterApp(object):
|
||||
return is_timegate
|
||||
|
||||
def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
|
||||
"""Returns the default rewrite modifier and rewrite modifier based on the
|
||||
value of the Prefer HTTP header if it is present
|
||||
|
||||
:param WbUrl wb_url: The WbUrl for the URL being rewritten
|
||||
:param dict environ: The WSGI environment dictionary for the request
|
||||
:param content_rw: The content rewriter instance
|
||||
:param bool is_proxy: Is the rewrite operating in proxy mode
|
||||
:return: A tuple containing the default rewrite modifier and rewrite modifier based
|
||||
on the value of the Prefer HTTP header if it is present
|
||||
:rtype: tuple[str|None, str|None]
|
||||
"""
|
||||
if not self.enable_prefer:
|
||||
return None, None
|
||||
|
||||
@ -516,6 +527,21 @@ class RewriterApp(object):
|
||||
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
|
||||
status_headers, is_timegate, is_proxy, coll=None,
|
||||
pref_applied=None, mod=None, is_memento=True):
|
||||
"""
|
||||
|
||||
:param str url: The URI-R being rewritten
|
||||
:param str full_prefix: The replay prefix
|
||||
:param str|None memento_dt: The memento datetime for the URI-R being rewritten
|
||||
:param str memento_ts: The memento timestamp
|
||||
:param warcio.StatusAndHeaders status_headers:
|
||||
:param bool is_timegate: Are we returning a response for a timegate
|
||||
:param bool is_proxy: Are we operating in proxy mode
|
||||
:param str|None coll: The collection the URI-R is from
|
||||
:param str|None pref_applied:
|
||||
:param str|None mod: The rewrite modifier
|
||||
:param bool is_memento:
|
||||
:rtype: None
|
||||
"""
|
||||
|
||||
replay_mod = mod or self.replay_mod
|
||||
|
||||
|
@ -35,6 +35,11 @@ except ImportError: # pragma: no cover
|
||||
|
||||
# ============================================================================
|
||||
def init_yaml_env_vars():
|
||||
"""Initializes the yaml parser to be able to set
|
||||
the value of fields from environment variables
|
||||
|
||||
:rtype: None
|
||||
"""
|
||||
env_rx = re.compile(r'\$\{[^}]+\}')
|
||||
|
||||
yaml.add_implicit_resolver('!envvar', env_rx)
|
||||
@ -421,9 +426,18 @@ class S3Loader(BaseLoader):
|
||||
|
||||
# =================================================================
|
||||
class WebHDFSLoader(HttpLoader):
|
||||
"""Loader class specifically for loading webhdfs content"""
|
||||
|
||||
HTTP_URL = 'http://{host}/webhdfs/v1{path}?'
|
||||
|
||||
def load(self, url, offset, length):
|
||||
"""Loads the supplied web hdfs content
|
||||
|
||||
:param str url: The URL to the web hdfs content to be loaded
|
||||
:param int|float|double offset: The offset of the content to be loaded
|
||||
:param int|float|double length: The length of the content to be loaded
|
||||
:return: The raw response content
|
||||
"""
|
||||
parts = urlsplit(url)
|
||||
|
||||
http_url = self.HTTP_URL.format(host=parts.netloc,
|
||||
|
@ -66,6 +66,16 @@ class MementoUtils(object):
|
||||
|
||||
@classmethod
|
||||
def make_timemap_memento_link(cls, cdx, datetime=None, rel='memento', end=',\n', memento_format=None):
|
||||
"""Creates a memento link string for a timemap
|
||||
|
||||
:param dict cdx: The cdx object
|
||||
:param str|None datetime: The datetime
|
||||
:param str rel: The rel type
|
||||
:param str end: Optional string appended to the end of the created link string
|
||||
:param str|None memento_format: Optional string used to format the URL
|
||||
:return: A memento link string
|
||||
:rtype: str
|
||||
"""
|
||||
url = cdx.get('url')
|
||||
if not url:
|
||||
url = 'file://{0}:{1}:{2}'.format(cdx.get('filename'), cdx.get('offset'), cdx.get('length'))
|
||||
@ -113,6 +123,16 @@ class MementoUtils(object):
|
||||
|
||||
@classmethod
|
||||
def make_memento_link(cls, url, type, dt, coll=None, memento_format=None):
|
||||
"""Creates a memento link string
|
||||
|
||||
:param str url: A URL
|
||||
:param str type: The rel type
|
||||
:param str dt: The datetime of the URL
|
||||
:param str|None coll: Optional name of a collection
|
||||
:param str|None memento_format: Optional string used to format the supplied URL
|
||||
:return: A memento link string
|
||||
:rtype: str
|
||||
"""
|
||||
if memento_format:
|
||||
memento_format = memento_format.format(url=url,
|
||||
timestamp=http_date_to_timestamp(dt))
|
||||
|
@ -3,16 +3,34 @@ from werkzeug.http import HTTP_STATUS_CODES
|
||||
|
||||
#=================================================================
|
||||
class WbException(Exception):
|
||||
"""Base class for exceptions raised by Pywb"""
|
||||
|
||||
def __init__(self, msg=None, url=None):
|
||||
Exception.__init__(self, msg)
|
||||
"""Initialize a new WbException
|
||||
|
||||
:param str|None msg: The message for the error response
|
||||
:param str|None url: The URL that caused the error
|
||||
:rtype: None
|
||||
"""
|
||||
super(WbException, self).__init__(msg)
|
||||
self.msg = msg
|
||||
self.url = url
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
"""Returns the status code to be used for the error response
|
||||
|
||||
:return: The status code for the error response (500)
|
||||
:rtype: int
|
||||
"""
|
||||
return 500
|
||||
|
||||
def status(self):
|
||||
"""Returns the HTTP status line for the error response
|
||||
|
||||
:return: The HTTP status line for the error response
|
||||
:rtype: str
|
||||
"""
|
||||
return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error')
|
||||
|
||||
def __repr__(self):
|
||||
@ -25,46 +43,96 @@ class WbException(Exception):
|
||||
|
||||
#=================================================================
|
||||
class AccessException(WbException):
|
||||
"""An Exception used to indicate an access control violation"""
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
"""Returns the status code to be used for the error response
|
||||
|
||||
:return: The status code for the error response (451)
|
||||
:rtype: int
|
||||
"""
|
||||
return 451
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BadRequestException(WbException):
|
||||
"""An Exception used to indicate that request was bad"""
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
"""Returns the status code to be used for the error response
|
||||
|
||||
:return: The status code for the error response (400)
|
||||
:rtype: int
|
||||
"""
|
||||
return 400
|
||||
|
||||
|
||||
#=================================================================
|
||||
class NotFoundException(WbException):
|
||||
"""An Exception used to indicate that a resource was not found"""
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
"""Returns the status code to be used for the error response
|
||||
|
||||
:return: The status code for the error response (404)
|
||||
:rtype: int
|
||||
"""
|
||||
return 404
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LiveResourceException(WbException):
|
||||
"""An Exception used to indicate that an error was encountered during the
|
||||
retrieval of a live web resource"""
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
"""Returns the status code to be used for the error response
|
||||
|
||||
:return: The status code for the error response (400)
|
||||
:rtype: int
|
||||
"""
|
||||
return 400
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class UpstreamException(WbException):
|
||||
"""An Exception used to indicate that an error was encountered from an upstream endpoint"""
|
||||
|
||||
def __init__(self, status_code, url, details):
|
||||
"""Initialize a new UpstreamException
|
||||
|
||||
:param int status_code: The status code for the error response
|
||||
:param str url: The URL that caused the error
|
||||
:param str details: The details of the error encountered
|
||||
:rtype: None
|
||||
"""
|
||||
super(UpstreamException, self).__init__(url=url, msg=details)
|
||||
self._status_code = status_code
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
"""Returns the status code to be used for the error response
|
||||
|
||||
:return: The status code for the error response
|
||||
:rtype: int
|
||||
"""
|
||||
return self._status_code
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class AppPageNotFound(WbException):
|
||||
"""An Exception used to indicate that a page was not found"""
|
||||
|
||||
@property
|
||||
def status_code(self):
|
||||
"""Returns the status code to be used for the error response
|
||||
|
||||
:return: The status code for the error response (404)
|
||||
:rtype: int
|
||||
"""
|
||||
return 404
|
||||
|
||||
|
@ -11,11 +11,28 @@ import os
|
||||
|
||||
# ============================================================================
|
||||
class FileAccessIndexSource(FileIndexSource):
|
||||
"""An Index Source class specific to access control lists"""
|
||||
|
||||
@staticmethod
|
||||
def rev_cmp(a, b):
|
||||
"""Performs a comparison between two items using the
|
||||
algorithm of the removed builtin cmp, with the sign of the result reversed
|
||||
|
||||
:param a: A value to be compared
|
||||
:param b: A value to be compared
|
||||
:return: The result of the comparison
|
||||
:rtype: int
|
||||
"""
|
||||
return (a < b) - (a > b)
|
||||
|
||||
def _do_iter(self, fh, params):
|
||||
"""Iterates over the supplied file handle to an access control list
|
||||
yielding the results of the search for the params key
|
||||
|
||||
:param TextIO fh: The file handle to an access control list
|
||||
:param dict params: The params of the search (must contain 'key', may contain 'exact_match_suffix')
|
||||
:return: A generator yielding the results of the param search
|
||||
"""
|
||||
exact_suffix = params.get('exact_match_suffix')
|
||||
key = params['key']
|
||||
if exact_suffix:
|
||||
@ -27,31 +44,47 @@ class FileAccessIndexSource(FileIndexSource):
|
||||
|
||||
# ============================================================================
|
||||
class ReverseMergeMixin(object):
|
||||
"""A mixin that provides reversed merge functionality"""
|
||||
|
||||
def _merge(self, iter_list):
|
||||
"""Merges the supplied list of iterators in reverse
|
||||
|
||||
:param iter_list: The list of iterators to be merged
|
||||
:return: An iterator that yields the results of the reverse merge
|
||||
"""
|
||||
return merge(*(iter_list), reverse=True)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class AccessRulesAggregator(ReverseMergeMixin, SimpleAggregator):
|
||||
pass
|
||||
"""An Aggregator specific to access control"""
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class DirectoryAccessSource(ReverseMergeMixin, DirectoryIndexSource):
|
||||
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)]
|
||||
"""A directory index source specific to access control"""
|
||||
|
||||
INDEX_SOURCES = [('.aclj', FileAccessIndexSource)] # type: list[tuple]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class CacheDirectoryAccessSource(CacheDirectoryMixin, DirectoryAccessSource):
|
||||
pass
|
||||
"""A cache directory index source specific to access control"""
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class AccessChecker(object):
|
||||
EXACT_SUFFIX = '###'
|
||||
EXACT_SUFFIX_B = b'###'
|
||||
"""An access checker class"""
|
||||
|
||||
EXACT_SUFFIX = '###' # type: str
|
||||
EXACT_SUFFIX_B = b'###' # type: bytes
|
||||
|
||||
def __init__(self, access_source, default_access='allow'):
|
||||
"""Initialize a new AccessChecker
|
||||
|
||||
:param str|list[str]|AccessRulesAggregator access_source: An access source
|
||||
:param str default_access: The default access action (allow)
|
||||
"""
|
||||
if isinstance(access_source, str):
|
||||
self.aggregator = self.create_access_aggregator([access_source])
|
||||
elif isinstance(access_source, list):
|
||||
@ -66,6 +99,13 @@ class AccessChecker(object):
|
||||
self.default_rule['default'] = 'true'
|
||||
|
||||
def create_access_aggregator(self, source_files):
|
||||
"""Creates a new AccessRulesAggregator using the supplied list
|
||||
of access control file names
|
||||
|
||||
:param list[str] source_files: The list of access control file names
|
||||
:return: The created AccessRulesAggregator
|
||||
:rtype: AccessRulesAggregator
|
||||
"""
|
||||
sources = {}
|
||||
for filename in source_files:
|
||||
sources[filename] = self.create_access_source(filename)
|
||||
@ -74,6 +114,17 @@ class AccessChecker(object):
|
||||
return aggregator
|
||||
|
||||
def create_access_source(self, filename):
|
||||
"""Creates a new access source for the supplied filename.
|
||||
|
||||
If the filename is for a directory a CacheDirectoryAccessSource
|
||||
instance is returned otherwise a FileAccessIndexSource instance
|
||||
|
||||
:param str filename: The name of a file/directory
|
||||
:return: An instance of CacheDirectoryAccessSource or FileAccessIndexSource
|
||||
depending on if the supplied filename is for a directory or file
|
||||
:rtype: CacheDirectoryAccessSource|FileAccessIndexSource
|
||||
:raises Exception: Indicates an invalid access source was supplied
|
||||
"""
|
||||
if os.path.isdir(filename):
|
||||
return CacheDirectoryAccessSource(filename)
|
||||
|
||||
@ -84,6 +135,16 @@ class AccessChecker(object):
|
||||
raise Exception('Invalid Access Source: ' + filename)
|
||||
|
||||
def find_access_rule(self, url, ts=None, urlkey=None):
|
||||
"""Attempts to find the access control rule for the
|
||||
supplied URL otherwise returns the default rule
|
||||
|
||||
:param str url: The URL for the rule to be found
|
||||
:param str|None ts: A timestamp (not used)
|
||||
:param str|None urlkey: The access control url key
|
||||
:return: The access control rule for the supplied URL
|
||||
if one exists otherwise the default rule
|
||||
:rtype: CDXObject
|
||||
"""
|
||||
params = {'url': url,
|
||||
'urlkey': urlkey,
|
||||
'nosource': 'true',
|
||||
@ -121,10 +182,24 @@ class AccessChecker(object):
|
||||
return self.default_rule
|
||||
|
||||
def __call__(self, res):
|
||||
"""Wraps the cdx iter in the supplied tuple returning a
|
||||
tuple of the wrapped cdx iter and the other members of the supplied
|
||||
tuple in same order
|
||||
|
||||
:param tuple res: The result tuple
|
||||
:return: A tuple of the wrapped cdx iter and the errs from the supplied tuple
|
||||
"""
|
||||
cdx_iter, errs = res
|
||||
return self.wrap_iter(cdx_iter), errs
|
||||
|
||||
def wrap_iter(self, cdx_iter):
|
||||
"""Wraps the supplied cdx iter and yields cdx objects
|
||||
that contain the access control results for the cdx object
|
||||
being yielded
|
||||
|
||||
:param cdx_iter: The cdx object iterator to be wrapped
|
||||
:return: The wrapped cdx object iterator
|
||||
"""
|
||||
last_rule = None
|
||||
last_url = None
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user