1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00
- frontendapp.py: restored the pulling out of collection route creation into its own function
 - rewriterapp.py: reformatted file and added documentation

 utils:
  - geventserver.py: added documentation
  - wbexception.py: updated documentation
This commit is contained in:
John Berlin 2019-09-10 14:45:05 -04:00
parent 379f7de1ba
commit 802b9fa4f5
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
4 changed files with 187 additions and 81 deletions

View File

@ -1,7 +1,7 @@
from gevent.monkey import patch_all; patch_all()
from werkzeug.routing import Map, Rule
from werkzeug.exceptions import HTTPException, NotFound
from werkzeug.exceptions import HTTPException
from werkzeug.wsgi import pop_path_info
from six.moves.urllib.parse import urljoin
from six import iteritems
@ -43,6 +43,8 @@ class FrontEndApp(object):
- WSGIProxMiddleware (Optional): If proxy mode is enabled, performs pywb's HTTP(s) proxy functionality
- AutoIndexer (Optional): If auto-indexing is enabled for the collections it is started here
- RecorderApp (Optional): Recording functionality, available when recording mode is enabled
The RewriterApp is configurable and can be set via the class var `REWRITER_APP_CLS`, defaults to RewriterApp
"""
REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq'
@ -62,8 +64,8 @@ class FrontEndApp(object):
def __init__(self, config_file=None, custom_config=None):
"""
:param str config_file: Path to the config file
:param dict custom_config: Dictionary containing additional configuration information
:param str|None config_file: Path to the config file
:param dict|None custom_config: Dictionary containing additional configuration information
"""
config_file = config_file or './config.yaml'
self.handler = self.handle_request
@ -71,6 +73,7 @@ class FrontEndApp(object):
custom_config=custom_config)
self.recorder = None
self.recorder_path = None
self.proxy_default_timestamp = None
config = self.warcserver.config
@ -108,7 +111,8 @@ class FrontEndApp(object):
def _init_routes(self):
"""Initialize the routes and based on the configuration file makes available
specific routes (proxy mode, record)"""
specific routes (proxy mode, record)
"""
self.url_map = Map()
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
@ -120,18 +124,42 @@ class FrontEndApp(object):
coll_prefix = '/<coll>'
self.url_map.add(Rule('/', endpoint=self.serve_home))
self.url_map.add(Rule(coll_prefix + self.cdx_api_endpoint, endpoint=self.serve_cdx))
self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_coll_page))
self.url_map.add(Rule(coll_prefix + '/timemap/<timemap_output>/<path:url>', endpoint=self.serve_content))
if self.recorder_path:
self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
self._init_coll_routes(coll_prefix)
if self.proxy_prefix is not None:
# Add the proxy-fetch endpoint to enable PreservationWorker to make CORS fetches worry free in proxy mode
self.url_map.add(Rule('/proxy-fetch/<path:url>', endpoint=self.proxy_fetch,
methods=['GET', 'HEAD', 'OPTIONS']))
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
def _init_coll_routes(self, coll_prefix):
"""Initialize and register the routes for specified collection path
:param str coll_prefix: The collection path
:rtype: None
"""
routes = self._make_coll_routes(coll_prefix)
for route in routes:
self.url_map.add(route)
def _make_coll_routes(self, coll_prefix):
    """Build the list of standard route rules for one collection path.

    The CDX API, collection page, timemap and content routes are always
    created, in that order. The record route is appended last and only
    when ``self.recorder_path`` is set.

    :param str coll_prefix: The collection path
    :return: A list of route rules for the supplied collection
    :rtype: list[Rule]
    """
    # (path suffix, endpoint) pairs, kept in registration order
    standard = (
        (self.cdx_api_endpoint, self.serve_cdx),
        ('/', self.serve_coll_page),
        ('/timemap/<timemap_output>/<path:url>', self.serve_content),
        ('/<path:url>', self.serve_content),
    )
    routes = [Rule(coll_prefix + suffix, endpoint=handler)
              for suffix, handler in standard]

    if self.recorder_path:
        record_path = coll_prefix + self.RECORD_ROUTE + '/<path:url>'
        routes.append(Rule(record_path, endpoint=self.serve_record))

    return routes
def get_upstream_paths(self, port):
"""Retrieve a dictionary containing the full URLs of the upstream apps
@ -178,7 +206,6 @@ class FrontEndApp(object):
self.recorder = RecorderApp(self.RECORD_SERVER % str(self.warcserver_server.port), warc_writer,
accept_colls=recorder_config.get('source_filter'))
recorder_server = GeventServer(self.recorder, port=0)
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
@ -260,7 +287,7 @@ class FrontEndApp(object):
if proxy_enabled:
response.add_access_control_headers(env=environ)
return response
except:
except Exception:
self.raise_not_found(environ, 'static_file_not_found', filepath)
def get_metadata(self, coll):
@ -478,8 +505,6 @@ class FrontEndApp(object):
inx = referer[1:].find('http')
if not inx:
inx = referer[1:].find('///')
if inx > 0:
inx + 1
if inx < 0:
return
@ -607,7 +632,7 @@ class FrontEndApp(object):
if not self.ALL_DIGITS.match(self.proxy_default_timestamp):
try:
self.proxy_default_timestamp = iso_date_to_timestamp(self.proxy_default_timestamp)
except:
except Exception:
raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format')
self.proxy_coll = proxy_coll
@ -691,7 +716,7 @@ class MetadataCache(object):
try:
mtime = os.path.getmtime(path)
obj = self.cache.get(path)
except:
except Exception:
return {}
if not obj:
@ -733,5 +758,3 @@ class MetadataCache(object):
if __name__ == "__main__":
app_server = FrontEndApp.create_app(port=8080)
app_server.join()

View File

@ -2,53 +2,45 @@ from io import BytesIO
import requests
from fakeredis import FakeStrictRedis
from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit, unquote
from six.moves.urllib.parse import unquote, urlencode, urlsplit, urlunsplit
from warcio.bufferedreaders import BufferedReader
from warcio.recordloader import ArcWarcRecordLoader
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
from werkzeug.http import HTTP_STATUS_CODES
from pywb.apps.wbrequestresponse import WbResponse
from pywb.rewrite.cookies import CookieTracker
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
from pywb.rewrite.rewriteinputreq import RewriteInputRequest
from pywb.rewrite.templateview import BaseInsertView, HeadInsertView, JinjaEnv, TopFrameView
from pywb.rewrite.url_rewriter import IdentityUrlRewriter, UrlRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
from pywb.rewrite.cookies import CookieTracker
from pywb.utils.wbexception import WbException, NotFoundException, UpstreamException
from pywb.utils.canonicalize import canonicalize
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader, no_except_close
from pywb.utils.memento import MementoUtils
from pywb.utils.wbexception import WbException
from pywb.utils.wbexception import NotFoundException, UpstreamException
from pywb.warcserver.index.cdxobject import CDXObject
# ============================================================================
class UpstreamException(WbException):
    """WbException subclass that additionally carries a status code."""

    def __init__(self, status_code, url, details):
        """Initialize a new UpstreamException

        :param status_code: The status code exposed through ``status_code``
        :param url: The URL forwarded to WbException as ``url``
        :param details: The details forwarded to WbException as ``msg``
        """
        super(UpstreamException, self).__init__(msg=details, url=url)
        self._status_code = status_code

    @property
    def status_code(self):
        """The status code supplied when the exception was created"""
        return self._status_code
# ============================================================================
# class Rewriter(RewriteDASHMixin, RewriteAMFMixin, RewriteContent):
# pass
# ============================================================================
class RewriterApp(object):
"""Primary application for rewriting the content served by pywb (if it is to be rewritten).
This class is also responsible rendering the archives templates
"""
VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'
DEFAULT_CSP = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"
def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
"""Initialize a new instance of RewriterApp
:param bool framed_replay: Is rewriting happening in framed replay mode
:param JinjaEnv|None jinja_env: Optional JinjaEnv instance to be used for
rendering static files
:param dict|None config: Optional config dictionary
:param dict|None paths: Optional dictionary containing a mapping
of path names to URLs
"""
self.loader = ArcWarcRecordLoader()
self.config = config or {}
@ -108,25 +100,65 @@ class RewriterApp(object):
# deprecated: Use X-Forwarded-Proto header instead!
self.force_scheme = config.get('force_scheme')
def _init_cookie_tracker(self, redis=None):
    """Initialize the CookieTracker

    Calling this method without arguments keeps working: the tracker is
    then backed by a freshly created FakeStrictRedis.

    :param redis: Optional redis instance to be used.
        Defaults to a new FakeStrictRedis instance
    :return: The initialized cookie tracker
    :rtype: CookieTracker
    """
    if redis is None:
        # Build the default lazily so a new instance is created per call
        redis = FakeStrictRedis()

    return CookieTracker(redis)
def add_csp_header(self, wb_url, status_headers):
    """Append the configured Content-Security-Policy header when the
    request is a replay request.

    Nothing is added if no CSP header is configured (``self.csp_header``
    is falsy) or if the wb_url's mod differs from ``self.replay_mod``.

    :param WbUrl wb_url: The WbUrl for the URL being operated on
    :param warcio.StatusAndHeaders status_headers: The status and
        headers instance for the reply to the URL
    :rtype: None
    """
    if not self.csp_header:
        return

    is_replay_mod = wb_url.mod == self.replay_mod
    if is_replay_mod:
        status_headers.headers.append(self.csp_header)
def _html_templ(self, name):
"""Returns the html file name for the supplied
html template name.
:param str name: The name of the html template
:return: The file name for the template
:rtype: str|None
"""
value = self.config.get(name)
if not value:
value = name.replace('_html', '.html')
return value
def is_framed_replay(self, wb_url):
    """Tell whether the supplied WbUrl is a framed replay request.

    All three conditions must hold: the app is configured for framed
    replay, the WbUrl's mod equals ``self.frame_mod`` and the WbUrl
    reports itself as a replay request.

    :param WbUrl wb_url: The WbUrl instance to check
    :return: T/F if in framed replay mode
    :rtype: bool
    """
    if not self.framed_replay:
        # hand back the configured (falsy) value itself
        return self.framed_replay

    mod_matches = wb_url.mod == self.frame_mod
    if not mod_matches:
        return mod_matches

    return wb_url.is_replay()
def _check_accept_dt(self, wb_url, environ):
"""Returns T/F indicating if the supplied WbUrl instance
is for a timegate request
:param WbUrl wb_url: The URL to be checked
:param dict environ: The wsgi environment object for the request
:return: T/F indicating if the WbUrl is for timegate request
:rtype: bool
"""
is_timegate = False
if wb_url.is_latest_replay():
accept_dt = environ.get('HTTP_ACCEPT_DATETIME')
@ -177,6 +209,15 @@ class RewriterApp(object):
return mod, prefer
def _check_range(self, inputreq, wb_url):
"""Checks the input request if it is a range request returning
the start and end of the range as well as T/F if the request should
be skipped as a tuple.
:param RewriteInputRequest inputreq: The input request to check range
:param WbUrl wb_url: The WbUrl associated with the request
:return: A tuple with the start, end, and T/F should skip request
:rtype: tuple[int|None, int|None, bool]
"""
skip_record = False
range_start = None
range_end = None
@ -527,7 +568,7 @@ class RewriterApp(object):
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate, is_proxy, coll=None,
pref_applied=None, mod=None, is_memento=True):
"""
"""Adds the memento link headers to supplied StatusAndHeaders instance
:param str url: The URI-R being rewritten
:param str full_prefix: The replay prefix
@ -601,8 +642,7 @@ class RewriterApp(object):
return timegate_url, timemap_url
def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
    """Build the top-level URL for the supplied WbUrl.

    The result is ``full_prefix`` followed by the WbUrl rendered with an
    empty modifier. The WbUrl is rendered exactly once.

    :param str full_prefix: The replay prefix
    :param WbUrl wb_url: The WbUrl to be rendered
    :param cdx: Not used by this implementation
    :param dict kwargs: Not used by this implementation
    :return: The top-level URL
    :rtype: str
    """
    return full_prefix + wb_url.to_str(mod='')
def handle_error(self, environ, wbe):
@ -640,10 +680,7 @@ class RewriterApp(object):
else:
closest = wb_url.timestamp
params = {}
params['url'] = wb_url.url
params['closest'] = closest
params['matchType'] = 'exact'
params = {'url': wb_url.url, 'closest': closest, 'matchType': 'exact'}
if wb_url.mod == 'vi_':
params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE
@ -658,11 +695,20 @@ class RewriterApp(object):
return r
def do_query(self, wb_url, kwargs):
params = {}
params['url'] = wb_url.url
params['output'] = kwargs.get('output', 'json')
params['from'] = wb_url.timestamp
params['to'] = wb_url.end_timestamp
"""Performs the timemap query request for the supplied WbUrl
returning the response
:param WbUrl wb_url: The WbUrl to be queried
:param dict kwargs: Optional keyword arguments
:return: The queries response
:rtype: requests.Response
"""
params = {
'url': wb_url.url,
'output': kwargs.get('output', 'json'),
'from': wb_url.timestamp,
'to': wb_url.end_timestamp
}
if 'memento_format' in kwargs:
params['memento_format'] = kwargs['memento_format']
@ -763,8 +809,8 @@ class RewriterApp(object):
return False
def get_base_url(self, wb_url, kwargs):
    """Return the upstream base URL for the request type in ``kwargs``.

    The template registered in ``self.paths`` under ``kwargs['type']`` is
    formatted with all of ``kwargs``.

    :param WbUrl wb_url: Not used by this implementation
    :param dict kwargs: The request keyword arguments; ``type`` selects
        the path template and every entry is available to the template
    :return: The formatted base URL
    :rtype: str
    :raises KeyError: If no path is registered for the type, or the
        template references a key missing from ``kwargs``
    """
    # named path_type so the builtin `type` is not shadowed
    path_type = kwargs.get('type')
    return self.paths[path_type].format(**kwargs)
def get_upstream_url(self, wb_url, kwargs, params):
base_url = self.get_base_url(wb_url, kwargs)

View File

@ -7,17 +7,36 @@ from gevent.pywsgi import WSGIHandler, WSGIServer
# ============================================================================
class GeventServer(object):
"""Class for optionally running a WSGI application in a greenlet"""
def __init__(self, app, port=0, hostname='localhost', handler_class=None,
direct=False):
"""Initialize a new GeventServer instance
:param app: The WSGI application instance to be used
:param int port: The port the server is to listen on
:param str hostname: The hostname the server is to use
:param handler_class: The class to be used for handling WSGI requests
:param bool direct: T/F indicating if the server should be run in a greenlet
or in current thread
"""
self.port = port
self.server = None
self.ge = None
self.make_server(app, port, hostname, handler_class, direct=direct)
def stop(self):
    """Stop the server; does nothing when no server is set"""
    if not self.server:
        return

    logging.debug('stopping server on ' + str(self.port))
    self.server.stop()
def _run(self, server, port):
"""Start running the server forever
:param server: The server to be run
:param int port: The port the server is to listen on
"""
logging.debug('starting server on ' + str(port))
try:
server.serve_forever()
@ -26,6 +45,16 @@ class GeventServer(object):
traceback.print_exc()
def make_server(self, app, port, hostname, handler_class, direct=False):
"""Creates and starts the server. If direct is true the server is run
in the current thread otherwise in a greenlet.
:param app: The WSGI application instance to be used
:param int port: The port the server is to listen on
:param str hostname: The hostname the server is to use
:param handler_class: The class to be used for handling WSGI requests
:param bool direct: T/F indicating if the server should be run in a greenlet
or in current thread
"""
server = WSGIServer((hostname, port), app, handler_class=handler_class)
server.init_socket()
self.port = server.address[1]
@ -38,12 +67,25 @@ class GeventServer(object):
self.ge = spawn(self._run, server, self.port)
def join(self):
    """Wait on the greenlet running the server.

    Does nothing when no greenlet is set (``self.ge`` is falsy).
    """
    if not self.ge:
        return

    self.ge.join()
# ============================================================================
class RequestURIWSGIHandler(WSGIHandler):
    """WSGIHandler subclass that exposes the handler's ``path`` as
    `REQUEST_URI` in the environ dictionary of every request
    """

    def get_environ(self):
        """Build the WSGI environ dictionary and add `REQUEST_URI` to it

        :return: The WSGI environ dictionary for the request
        :rtype: dict
        """
        env = super(RequestURIWSGIHandler, self).get_environ()
        env['REQUEST_URI'] = self.path
        return env

View File

@ -8,7 +8,7 @@ class WbException(Exception):
def __init__(self, msg=None, url=None):
"""Initialize a new WbException
:param str|None msg: The message for the error response
:param str|dict|None msg: The message for the error response
:param str|None url: The URL that caused the error
:rtype: None
"""
@ -36,10 +36,6 @@ class WbException(Exception):
def __repr__(self):
return "{0}('{1}',)".format(self.__class__.__name__, self.msg)
# Default Error Code
# def status(self):
# return '500 Internal Server Error'
# =================================================================
class AccessException(WbException):
@ -107,7 +103,7 @@ class UpstreamException(WbException):
:param int status_code: The status code for the error response
:param str url: The URL that caused the error
:param str details: The details of the error encountered
:param str|dict details: The details of the error encountered
:rtype: None
"""
super(UpstreamException, self).__init__(url=url, msg=details)
@ -135,4 +131,3 @@ class AppPageNotFound(WbException):
:rtype: int
"""
return 404