From 802b9fa4f5d4f04a1490c15a7648084cf1eb51a0 Mon Sep 17 00:00:00 2001 From: John Berlin Date: Tue, 10 Sep 2019 14:45:05 -0400 Subject: [PATCH] apps: - frontendapp.py: restored the pulling out of collection route creation into its own function - rewriterapp.py: reformated file and added documentation utils: - geventserver.py: added documentation - wbexception.py: updated documentation --- pywb/apps/frontendapp.py | 75 ++++++++++++++-------- pywb/apps/rewriterapp.py | 126 +++++++++++++++++++++++++------------ pywb/utils/geventserver.py | 46 +++++++++++++- pywb/utils/wbexception.py | 21 +++---- 4 files changed, 187 insertions(+), 81 deletions(-) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 124eeae5..98b3dcff 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -1,7 +1,7 @@ from gevent.monkey import patch_all; patch_all() from werkzeug.routing import Map, Rule -from werkzeug.exceptions import HTTPException, NotFound +from werkzeug.exceptions import HTTPException from werkzeug.wsgi import pop_path_info from six.moves.urllib.parse import urljoin from six import iteritems @@ -43,6 +43,8 @@ class FrontEndApp(object): - WSGIProxMiddleware (Optional): If proxy mode is enabled, performs pywb's HTTP(s) proxy functionality - AutoIndexer (Optional): If auto-indexing is enabled for the collections it is started here - RecorderApp (Optional): Recording functionality, available when recording mode is enabled + + The RewriterApp is configurable and can be set via the class var `REWRITER_APP_CLS`, defaults to RewriterApp """ REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq' @@ -62,8 +64,8 @@ class FrontEndApp(object): def __init__(self, config_file=None, custom_config=None): """ - :param str config_file: Path to the config file - :param dict custom_config: Dictionary containing additional configuration information + :param str|None config_file: Path to the config file + :param dict|None custom_config: Dictionary containing additional configuration information """ config_file = config_file or './config.yaml' self.handler = self.handle_request @@ -71,6 +73,7 @@ class FrontEndApp(object): custom_config=custom_config) self.recorder = None self.recorder_path = None + self.proxy_default_timestamp = None config = self.warcserver.config @@ -108,7 +111,8 @@ class FrontEndApp(object): def _init_routes(self): """Initialize the routes and based on the configuration file makes available - specific routes (proxy mode, record)""" + specific routes (proxy mode, record) + """ self.url_map = Map() self.url_map.add(Rule('/static/_//', endpoint=self.serve_static)) self.url_map.add(Rule('/static/', endpoint=self.serve_static)) @@ -120,18 +124,42 @@ class FrontEndApp(object): coll_prefix = '/' self.url_map.add(Rule('/', endpoint=self.serve_home)) - self.url_map.add(Rule(coll_prefix + self.cdx_api_endpoint, endpoint=self.serve_cdx)) - self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page)) - self.url_map.add(Rule(coll_prefix + '/timemap//', endpoint=self.serve_content)) - - if self.recorder_path: - self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/', endpoint=self.serve_record)) + self._init_coll_routes(coll_prefix) if self.proxy_prefix is not None: # Add the proxy-fetch endpoint to enable PreservationWorker to make CORS fetches worry free in proxy mode self.url_map.add(Rule('/proxy-fetch/', endpoint=self.proxy_fetch, methods=['GET', 'HEAD', 'OPTIONS'])) - self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_content)) + + def _init_coll_routes(self, coll_prefix): + """Initialize and register the routes for specified collection path + + :param str coll_prefix: The collection path + :rtype: None + """ + routes = self._make_coll_routes(coll_prefix) + for route in routes: + self.url_map.add(route) + + def _make_coll_routes(self, coll_prefix): + """Creates a list of standard collection routes for the + specified collection path + + :param str coll_prefix: The collection path + :return: A list of route rules for the supplied collection + :rtype: list[Rule] + """ + routes = [ + Rule(coll_prefix + self.cdx_api_endpoint, endpoint=self.serve_cdx), + Rule(coll_prefix + '/', endpoint=self.serve_coll_page), + Rule(coll_prefix + '/timemap//', endpoint=self.serve_content), + Rule(coll_prefix + '/', endpoint=self.serve_content) + ] + + if self.recorder_path: + routes.append(Rule(coll_prefix + self.RECORD_ROUTE + '/', endpoint=self.serve_record)) + + return routes def get_upstream_paths(self, port): """Retrieve a dictionary containing the full URLs of the upstream apps @@ -141,9 +169,9 @@ class FrontEndApp(object): :rtype: dict[str, str] """ base_paths = { - 'replay': self.REPLAY_API % port, - 'cdx-server': self.CDX_API % port, - } + 'replay': self.REPLAY_API % port, + 'cdx-server': self.CDX_API % port, + } if self.recorder_path: base_paths['record'] = self.recorder_path @@ -178,7 +206,6 @@ class FrontEndApp(object): self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer, accept_colls=recorder_config.get('source_filter')) - recorder_server = GeventServer(self.recorder, port=0) self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll) @@ -260,7 +287,7 @@ class FrontEndApp(object): if proxy_enabled: response.add_access_control_headers(env=environ) return response - except: + except Exception: self.raise_not_found(environ, 'static_file_not_found', filepath) def get_metadata(self, coll): @@ -270,7 +297,7 @@ class FrontEndApp(object): :return: The collections metadata if it exists :rtype: dict """ - #if coll == self.all_coll: + # if coll == self.all_coll: # coll = '*' metadata = {'coll': coll, @@ -321,7 +348,7 @@ class FrontEndApp(object): """ base_url = self.rewriterapp.paths['cdx-server'] - #if coll == self.all_coll: + # if coll == self.all_coll: # coll = '*' cdx_url = base_url.format(coll=coll) @@ -433,7 +460,7 @@ class FrontEndApp(object): """ result = {'fixed': self.warcserver.list_fixed_routes(), 'dynamic': self.warcserver.list_dynamic_routes() - } + } return WbResponse.json_response(result) @@ -444,7 +471,7 @@ class FrontEndApp(object): :return: True if the collection is valid, false otherwise :rtype: bool """ - #if coll == self.all_coll: + # if coll == self.all_coll: # return True return (coll in self.warcserver.list_fixed_routes() or @@ -478,8 +505,6 @@ class FrontEndApp(object): inx = referer[1:].find('http') if not inx: inx = referer[1:].find('///') - if inx > 0: - inx + 1 if inx < 0: return @@ -607,7 +632,7 @@ class FrontEndApp(object): if not self.ALL_DIGITS.match(self.proxy_default_timestamp): try: self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp) - except: + except Exception: raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format') self.proxy_coll = proxy_coll @@ -691,7 +716,7 @@ class MetadataCache(object): try: mtime = os.path.getmtime(path) obj = self.cache.get(path) - except: + except Exception: return {} if not obj: @@ -733,5 +758,3 @@ class MetadataCache(object): if __name__ == "__main__": app_server = FrontEndApp.create_app(port=8080) app_server.join() - - diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index c60d068e..7810f214 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -2,53 +2,45 @@ from io import BytesIO import requests from fakeredis import FakeStrictRedis - -from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit, unquote +from six.moves.urllib.parse import unquote, urlencode, urlsplit, urlunsplit from warcio.bufferedreaders import BufferedReader from warcio.recordloader import ArcWarcRecordLoader from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date -from werkzeug.http import HTTP_STATUS_CODES from pywb.apps.wbrequestresponse import WbResponse +from pywb.rewrite.cookies import CookieTracker from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy from pywb.rewrite.rewriteinputreq import RewriteInputRequest from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter from pywb.rewrite.wburl import WbUrl -from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter - -from pywb.rewrite.cookies import CookieTracker -from pywb.utils.wbexception import WbException, NotFoundException, UpstreamException from pywb.utils.canonicalize import canonicalize from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close from pywb.utils.memento import MementoUtils -from pywb.utils.wbexception import WbException +from pywb.utils.wbexception import NotFoundException, UpstreamException from pywb.warcserver.index.cdxobject import CDXObject -# ============================================================================ -class UpstreamException(WbException): - def __init__(self, status_code, url, details): - super(UpstreamException, self).__init__(url=url, msg=details) - self._status_code = status_code - - @property - def status_code(self): - return self._status_code - - -# ============================================================================ -# class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent): -# pass - - # ============================================================================ class RewriterApp(object): + """Primary application for rewriting the content served by pywb (if it is to be rewritten). + + This class is also responsible rendering the archives templates + """ VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json' DEFAULT_CSP = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'" def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None): + """Initialize a new instance of RewriterApp + + :param bool framed_replay: Is rewriting happening in framed replay mode + :param JinjaEnv|None jinja_env: Optional JinjaEnv instance to be used for + rendering static files + :param dict|None config: Optional config dictionary + :param dict|None paths: Optional dictionary containing a mapping + of path names to URLs + """ self.loader = ArcWarcRecordLoader() self.config = config or {} @@ -108,25 +100,65 @@ class RewriterApp(object): # deprecated: Use X-Forwarded-Proto header instead! self.force_scheme = config.get('force_scheme') - def _init_cookie_tracker(self): - return CookieTracker(FakeStrictRedis()) + def _init_cookie_tracker(self, redis=None): + """Initialize the CookieTracker + + :param redis: Optional redis instance to be used + Defaults to FakeStrictRedis + :return: The initialized cookie tracker + :rtype: CookieTracker + """ + if redis is None: + redis = FakeStrictRedis() + return CookieTracker(redis) def add_csp_header(self, wb_url, status_headers): + """Adds Content-Security-Policy headers to the supplied + StatusAndHeaders instance if the wb_url's mod is equal + to the replay mod + + :param WbUrl wb_url: The WbUrl for the URL being operated on + :param warcio.StatusAndHeaders status_headers: The status and + headers instance for the reply to the URL + """ if self.csp_header and wb_url.mod == self.replay_mod: status_headers.headers.append(self.csp_header) def _html_templ(self, name): + """Returns the html file name for the supplied + html template name. + + :param str name: The name of the html template + :return: The file name for the template + :rtype: str|None + """ value = self.config.get(name) if not value: value = name.replace('_html', '.html') return value def is_framed_replay(self, wb_url): + """Returns T/F indicating if the rewriter app is configured to + be operating in framed replay mode and the supplied WbUrl + is also operating in framed replay mode + + :param WbUrl wb_url: The WbUrl instance to check + :return: T/F if in framed replay mode + :rtype: bool + """ return (self.framed_replay and wb_url.mod == self.frame_mod and wb_url.is_replay()) def _check_accept_dt(self, wb_url, environ): + """Returns T/F indicating if the supplied WbUrl instance + is for a timegate request + + :param WbUrl wb_url: The URL to be checked + :param dict environ: The wsgi environment object for the request + :return: T/F indicating if the WbUrl is for timegate request + :rtype: bool + """ is_timegate = False if wb_url.is_latest_replay(): accept_dt = environ.get('HTTP_ACCEPT_DATETIME') @@ -177,6 +209,15 @@ class RewriterApp(object): return mod, prefer def _check_range(self, inputreq, wb_url): + """Checks the input request if it is a range request returning + the start and end of the range as well as T/F if the request should + be skipped as a tuple. + + :param RewriteInputRequest inputreq: The input request to check range + :param WbUrl wb_url: The WbUrl associated with the request + :return: A tuple with the start, end, and T/F should skip request + :rtype: tuple[int|None, int|None, bool] + """ skip_record = False range_start = None range_end = None @@ -527,7 +568,7 @@ class RewriterApp(object): def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, status_headers, is_timegate, is_proxy, coll=None, pref_applied=None, mod=None, is_memento=True): - """ + """Adds the memento link headers to supplied StatusAndHeaders instance :param str url: The URI-R being rewritten :param str full_prefix: The replay prefix @@ -601,8 +642,7 @@ class RewriterApp(object): return timegate_url, timemap_url def get_top_url(self, full_prefix, wb_url, cdx, kwargs): - top_url = full_prefix - top_url += wb_url.to_str(mod='') + top_url = full_prefix + wb_url.to_str(mod='') return top_url def handle_error(self, environ, wbe): @@ -640,10 +680,7 @@ class RewriterApp(object): else: closest = wb_url.timestamp - params = {} - params['url'] = wb_url.url - params['closest'] = closest - params['matchType'] = 'exact' + params = {'url': wb_url.url, 'closest': closest, 'matchType': 'exact'} if wb_url.mod == 'vi_': params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE @@ -658,11 +695,20 @@ class RewriterApp(object): return r def do_query(self, wb_url, kwargs): - params = {} - params['url'] = wb_url.url - params['output'] = kwargs.get('output', 'json') - params['from'] = wb_url.timestamp - params['to'] = wb_url.end_timestamp + """Performs the timemap query request for the supplied WbUrl + returning the response + + :param WbUrl wb_url: The WbUrl to be queried + :param dict kwargs: Optional keyword arguments + :return: The queries response + :rtype: requests.Response + """ + params = { + 'url': wb_url.url, + 'output': kwargs.get('output', 'json'), + 'from': wb_url.timestamp, + 'to': wb_url.end_timestamp + } if 'memento_format' in kwargs: params['memento_format'] = kwargs['memento_format'] @@ -763,8 +809,8 @@ class RewriterApp(object): return False def get_base_url(self, wb_url, kwargs): - type = kwargs.get('type') - return self.paths[type].format(**kwargs) + type_ = kwargs.get('type') + return self.paths[type_].format(**kwargs) def get_upstream_url(self, wb_url, kwargs, params): base_url = self.get_base_url(wb_url, kwargs) diff --git a/pywb/utils/geventserver.py b/pywb/utils/geventserver.py index 1db2cf96..8301c513 100644 --- a/pywb/utils/geventserver.py +++ b/pywb/utils/geventserver.py @@ -7,17 +7,36 @@ from gevent.pywsgi import WSGIHandler, WSGIServer # ============================================================================ class GeventServer(object): + """Class for optionally running a WSGI application in a greenlet""" + def __init__(self, app, port=0, hostname='localhost', handler_class=None, - direct=False): + direct=False): + """Initialize a new GeventServer instance + + :param app: The WSGI application instance to be used + :param int port: The port the server is to listen on + :param str hostname: The hostname the server is to use + :param handler_class: The class to be used for handling WSGI requests + :param bool direct: T/F indicating if the server should be run in a greenlet + or in current thread + """ self.port = port + self.server = None + self.ge = None self.make_server(app, port, hostname, handler_class, direct=direct) def stop(self): + """Stops the running server if it was started""" if self.server: logging.debug('stopping server on ' + str(self.port)) self.server.stop() def _run(self, server, port): + """Start running the server forever + + :param server: The server to be run + :param int port: The port the server is to listen on + """ logging.debug('starting server on ' + str(port)) try: server.serve_forever() @@ -26,6 +45,16 @@ class GeventServer(object): traceback.print_exc() def make_server(self, app, port, hostname, handler_class, direct=False): + """Creates and starts the server. If direct is true the server is run + in the current thread otherwise in a greenlet. + + :param app: The WSGI application instance to be used + :param int port: The port the server is to listen on + :param str hostname: The hostname the server is to use + :param handler_class: The class to be used for handling WSGI requests + :param bool direct: T/F indicating if the server should be run in a greenlet + or in current thread + """ server = WSGIServer((hostname, port), app, handler_class=handler_class) server.init_socket() self.port = server.address[1] @@ -38,12 +67,25 @@ class GeventServer(object): self.ge = spawn(self._run, server, self.port) def join(self): - self.ge.join() + """Joins the greenlet spawned for running the server if it was started + in non-direct mode""" + if self.ge: + self.ge.join() # ============================================================================ class RequestURIWSGIHandler(WSGIHandler): + """A specific WSGIHandler subclass that adds `REQUEST_URI` to the environ dictionary + for every request + """ + def get_environ(self): + """Returns the WSGI environ dictionary with the + `REQUEST_URI` added to it + + :return: The WSGI environ dictionary for the request + :rtype: dict + """ environ = super(RequestURIWSGIHandler, self).get_environ() environ['REQUEST_URI'] = self.path return environ diff --git a/pywb/utils/wbexception.py b/pywb/utils/wbexception.py index 9d986dd1..9f2f4e86 100644 --- a/pywb/utils/wbexception.py +++ b/pywb/utils/wbexception.py @@ -1,14 +1,14 @@ from werkzeug.http import HTTP_STATUS_CODES -#================================================================= +# ================================================================= class WbException(Exception): """Base class for exceptions raised by Pywb""" def __init__(self, msg=None, url=None): """Initialize a new WbException - :param str|None msg: The message for the error response + :param str|dict|None msg: The message for the error response :param str|None url: The URL that caused the error :rtype: None """ @@ -34,14 +34,10 @@ class WbException(Exception): return str(self.status_code) + ' ' + HTTP_STATUS_CODES.get(self.status_code, 'Unknown Error') def __repr__(self): - return "{0}('{1}',)".format(self.__class__.__name__, self.msg) - -# Default Error Code -# def status(self): -# return '500 Internal Server Error' + return "{0}('{1}',)".format(self.__class__.__name__, self.msg) -#================================================================= +# ================================================================= class AccessException(WbException): """An Exception used to indicate an access control violation""" @@ -55,7 +51,7 @@ class AccessException(WbException): return 451 -#================================================================= +# ================================================================= class BadRequestException(WbException): """An Exception used to indicate that request was bad""" @@ -69,7 +65,7 @@ class BadRequestException(WbException): return 400 -#================================================================= +# ================================================================= class NotFoundException(WbException): """An Exception used to indicate that a resource was not found""" @@ -83,7 +79,7 @@ class NotFoundException(WbException): return 404 -#================================================================= +# ================================================================= class LiveResourceException(WbException): """An Exception used to indicate that an error was encountered during the retrial of a live web resource""" @@ -107,7 +103,7 @@ class UpstreamException(WbException): :param int status_code: The status code for the error response :param str url: The URL that caused the error - :param str details: The details of the error encountered + :param str|dict details: The details of the error encountered :rtype: None """ super(UpstreamException, self).__init__(url=url, msg=details) @@ -135,4 +131,3 @@ class AppPageNotFound(WbException): :rtype: int """ return 404 -