2017-02-17 18:04:07 -08:00
|
|
|
from gevent.monkey import patch_all; patch_all()
|
|
|
|
|
2019-11-11 09:51:26 -08:00
|
|
|
from werkzeug.routing import Map, Rule, RequestRedirect, Submount
|
2017-02-17 18:04:07 -08:00
|
|
|
from werkzeug.wsgi import pop_path_info
|
2017-02-27 19:07:51 -08:00
|
|
|
from six.moves.urllib.parse import urljoin
|
2017-04-26 12:12:34 -07:00
|
|
|
from six import iteritems
|
2017-06-05 16:58:47 -07:00
|
|
|
from warcio.utils import to_native_str
|
2019-03-11 16:28:09 -07:00
|
|
|
from warcio.timeutils import iso_date_to_timestamp
|
2017-09-27 13:47:02 -07:00
|
|
|
from wsgiprox.wsgiprox import WSGIProxMiddleware
|
2017-06-05 16:58:47 -07:00
|
|
|
|
2017-09-21 22:12:57 -07:00
|
|
|
from pywb.recorder.multifilewarcwriter import MultiFileWARCWriter
|
|
|
|
from pywb.recorder.recorderapp import RecorderApp
|
2021-01-27 02:06:18 +01:00
|
|
|
from pywb.recorder.filters import SkipDupePolicy, WriteDupePolicy, WriteRevisitDupePolicy
|
|
|
|
from pywb.recorder.redisindexer import WritableRedisIndexer
|
2017-09-21 22:12:57 -07:00
|
|
|
|
2017-06-05 16:58:47 -07:00
|
|
|
from pywb.utils.loaders import load_yaml_config
|
2017-05-23 19:08:29 -07:00
|
|
|
from pywb.utils.geventserver import GeventServer
|
2017-11-14 20:08:59 -08:00
|
|
|
from pywb.utils.io import StreamIter
|
2019-04-10 14:00:53 -04:00
|
|
|
from pywb.utils.wbexception import WbException, AppPageNotFound
|
2017-02-17 18:04:07 -08:00
|
|
|
|
2017-05-23 19:08:29 -07:00
|
|
|
from pywb.warcserver.warcserver import WarcServer
|
2017-02-17 18:04:07 -08:00
|
|
|
|
2017-05-23 19:08:29 -07:00
|
|
|
from pywb.rewrite.templateview import BaseInsertView
|
2017-02-17 18:04:07 -08:00
|
|
|
|
2017-05-23 19:08:29 -07:00
|
|
|
from pywb.apps.static_handler import StaticHandler
|
2019-04-10 14:00:53 -04:00
|
|
|
from pywb.apps.rewriterapp import RewriterApp
|
2017-05-23 19:08:29 -07:00
|
|
|
from pywb.apps.wbrequestresponse import WbResponse
|
2017-02-17 18:04:07 -08:00
|
|
|
|
2017-04-26 12:12:34 -07:00
|
|
|
import os
|
2019-03-11 16:28:09 -07:00
|
|
|
import re
|
|
|
|
|
2017-04-26 12:12:34 -07:00
|
|
|
import traceback
|
2017-09-06 23:25:30 -07:00
|
|
|
import requests
|
2017-10-01 09:46:54 -07:00
|
|
|
import logging
|
2017-02-17 18:04:07 -08:00
|
|
|
|
|
|
|
|
|
|
|
# ============================================================================
|
2017-04-21 15:37:21 -07:00
|
|
|
class FrontEndApp(object):
|
2018-10-03 16:27:49 -04:00
|
|
|
"""Orchestrates pywb's core Wayback Machine functionality and is comprised of 2 core sub-apps and 3 optional apps.
|
|
|
|
|
|
|
|
Sub-apps:
|
|
|
|
- WarcServer: Serves the archive content (WARC/ARC and index) as well as from the live web in record/proxy mode
|
|
|
|
- RewriterApp: Rewrites the content served by pywb (if it is to be rewritten)
|
|
|
|
- WSGIProxMiddleware (Optional): If proxy mode is enabled, performs pywb's HTTP(s) proxy functionality
|
|
|
|
- AutoIndexer (Optional): If auto-indexing is enabled for the collections it is started here
|
|
|
|
- RecorderApp (Optional): Recording functionality, available when recording mode is enabled
|
2019-09-10 14:45:05 -04:00
|
|
|
|
|
|
|
The RewriterApp is configurable and can be set via the class var `REWRITER_APP_CLS`, defaults to RewriterApp
|
2018-10-03 16:27:49 -04:00
|
|
|
"""
|
|
|
|
|
2017-10-01 09:46:54 -07:00
|
|
|
REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq'
|
|
|
|
CDX_API = 'http://localhost:%s/{coll}/index'
|
|
|
|
RECORD_SERVER = 'http://localhost:%s'
|
|
|
|
RECORD_API = 'http://localhost:%s/%s/resource/postreq?param.recorder.coll={coll}'
|
|
|
|
|
|
|
|
RECORD_ROUTE = '/record'
|
|
|
|
|
|
|
|
PROXY_CA_NAME = 'pywb HTTPS Proxy CA'
|
|
|
|
|
|
|
|
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
|
|
|
|
|
2018-02-09 17:16:46 -08:00
|
|
|
REWRITER_APP_CLS = RewriterApp
|
|
|
|
|
2019-03-11 16:28:09 -07:00
|
|
|
ALL_DIGITS = re.compile(r'^\d+$')
|
|
|
|
|
2018-02-09 17:16:46 -08:00
|
|
|
def __init__(self, config_file=None, custom_config=None):
    """Create the sub-apps, read the settings and register the routes.

    :param str|None config_file: Path to the config file, ``./config.yaml`` is used when falsy
    :param dict|None custom_config: Dictionary containing additional configuration information
    """
    self.handler = self.handle_request
    self.warcserver = WarcServer(config_file=config_file or './config.yaml',
                                 custom_config=custom_config)

    # recorder / proxy state starts out disabled, the init_* calls below may fill it in
    self.recorder = None
    self.recorder_path = None
    self.proxy_default_timestamp = None

    # all further settings are read from the config held by the warcserver
    settings = self.warcserver.config

    self.debug = settings.get('debug', False)

    # the port this server ends up with is read back further down
    self.warcserver_server = GeventServer(self.warcserver, port=0)

    # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/)
    self.proxy_prefix = None
    # the name of the collection that has proxy mode enabled
    self.proxy_coll = None
    # indicate if proxy recording
    self.proxy_record = False
    self.init_proxy(settings)

    self.init_recorder(settings.get('recorder'))

    self.init_autoindex(settings.get('autoindex'))

    # the configured value uses '/' separators, convert to the platform separator
    self.static_handler = StaticHandler(
        settings.get('static_url_path', 'pywb/static/').replace('/', os.path.sep)
    )

    self.cdx_api_endpoint = settings.get('cdx_api_endpoint', '/cdx')
    self.query_limit = settings.get('query_limit')

    # computed after init_recorder so that a record path, if set, is included
    upstream = self.get_upstream_paths(self.warcserver_server.port)

    self.rewriterapp = self.REWRITER_APP_CLS(settings.get('framed_replay', True),
                                             config=settings,
                                             paths=upstream)

    self.templates_dir = settings.get('templates_dir', 'templates')
    self.static_dir = settings.get('static_dir', 'static')

    # '{coll}' is left as a placeholder in the metadata path template
    self.metadata_cache = MetadataCache(
        os.path.join(self.warcserver.root_dir, '{coll}', 'metadata.yaml')
    )

    self._init_routes()
|
|
|
|
|
2017-10-01 09:46:54 -07:00
|
|
|
def _init_routes(self):
    """Create the URL map and register the routes of this app.

    The static, listing and collection routes are always added; the home
    route and the proxy-fetch route depend on the configuration.
    """
    self.url_map = Map()

    # routes that do not depend on the collection layout
    fixed_rules = (
        ('/static/_/<coll>/<path:filepath>', self.serve_static),
        ('/static/<path:filepath>', self.serve_static),
        ('/collinfo.json', self.serve_listing),
    )
    for pattern, endpoint in fixed_rules:
        self.url_map.add(Rule(pattern, endpoint=endpoint))

    # with a valid '$root' collection the collection routes get no prefix,
    # otherwise they go under /<coll> and '/' is routed to the home view
    has_root_coll = self.is_valid_coll('$root')
    if not has_root_coll:
        self.url_map.add(Rule('/', endpoint=self.serve_home))

    self._init_coll_routes('' if has_root_coll else '/<coll>')

    if self.proxy_prefix is None:
        return

    # Add the proxy-fetch endpoint to enable PreservationWorker to make CORS fetches worry free in proxy mode
    self.url_map.add(Rule('/proxy-fetch/<path:url>',
                          endpoint=self.proxy_fetch,
                          methods=['GET', 'HEAD', 'OPTIONS']))
|
2019-09-10 14:45:05 -04:00
|
|
|
|
|
|
|
def _init_coll_routes(self, coll_prefix):
|
|
|
|
"""Initialize and register the routes for specified collection path
|
|
|
|
|
|
|
|
:param str coll_prefix: The collection path
|
|
|
|
:rtype: None
|
|
|
|
"""
|
|
|
|
routes = self._make_coll_routes(coll_prefix)
|
2019-11-11 09:51:26 -08:00
|
|
|
|
|
|
|
# init loc routes, if any
|
|
|
|
loc_keys = list(self.rewriterapp.loc_map.keys())
|
|
|
|
if loc_keys:
|
|
|
|
routes.append(Rule('/', endpoint=self.serve_home))
|
|
|
|
|
|
|
|
submount_route = ', '.join(loc_keys)
|
|
|
|
submount_route = '/<any({0}):lang>'.format(submount_route)
|
|
|
|
|
|
|
|
self.url_map.add(Submount(submount_route, routes))
|
|
|
|
|
2019-09-10 14:45:05 -04:00
|
|
|
for route in routes:
|
|
|
|
self.url_map.add(route)
|
|
|
|
|
|
|
|
def _make_coll_routes(self, coll_prefix):
    """Build the list of standard route rules for the given collection path.

    :param str coll_prefix: The collection path
    :return: A list of route rules for the supplied collection
    :rtype: list[Rule]
    """
    # (path suffix, endpoint) pairs, in the order the rules are created
    suffix_endpoints = [
        (self.cdx_api_endpoint, self.serve_cdx),
        ('/', self.serve_coll_page),
        ('/timemap/<timemap_output>/<path:url>', self.serve_content),
        ('/<path:url>', self.serve_content),
    ]

    # the record route is only present when a recorder path has been set
    if self.recorder_path:
        suffix_endpoints.append((self.RECORD_ROUTE + '/<path:url>', self.serve_record))

    return [Rule(coll_prefix + suffix, endpoint=endpoint)
            for suffix, endpoint in suffix_endpoints]
|
2017-09-21 22:12:57 -07:00
|
|
|
|
2017-02-17 18:04:07 -08:00
|
|
|
def get_upstream_paths(self, port):
    """Build the dictionary of full URLs of the upstream apps.

    :param int port: The port used by the replay and cdx servers
    :return: A dictionary containing the upstream paths (replay, cdx-server, record [if enabled])
    :rtype: dict[str, str]
    """
    upstream = {
        'replay': self.REPLAY_API % port,
        'cdx-server': self.CDX_API % port,
    }

    # 'record' is only included when a recorder path has been set
    record_url = self.recorder_path
    if record_url:
        upstream['record'] = record_url

    return upstream
|
|
|
|
|
2017-10-01 09:46:54 -07:00
|
|
|
def init_recorder(self, recorder_config):
    """Set up the recording sub-app. A falsy recorder_config disables recording.

    :param str|dict|None recorder_config: The configuration for the recorder app,
        either the name of the source collection or a dictionary of options
    :rtype: None
    :raises Exception: if ``dedup_policy`` is set to an unrecognized value
    """
    if not recorder_config:
        self.recorder = None
        self.recorder_path = None
        return

    # a plain string is shorthand for "record from this collection, default options"
    if isinstance(recorder_config, str):
        recorder_coll = recorder_config
        recorder_config = {}
    else:
        recorder_coll = recorder_config['source_coll']

    # cache mode
    self.rec_cache_mode = recorder_config.get('cache', 'default')

    # 'none' is treated the same as no policy at all
    requested_policy = recorder_config.get('dedup_policy')
    if requested_policy == 'none':
        requested_policy = ''

    dedup_by_url = False
    if requested_policy == 'keep':
        dupe_policy = WriteDupePolicy()
    elif requested_policy == 'revisit':
        dupe_policy = WriteRevisitDupePolicy()
    elif requested_policy == 'skip':
        dupe_policy = SkipDupePolicy()
        dedup_by_url = True
    elif requested_policy:
        raise Exception('Invalid option for dedup_policy: {0}'.format(requested_policy))
    else:
        # no policy requested, keep the falsy value so no dedup index is created
        dupe_policy = requested_policy

    dedup_index = None
    if dupe_policy:
        dedup_index = WritableRedisIndexer(
            redis_url=self.warcserver.dedup_index_url,
            dupe_policy=dupe_policy,
            rel_path_template=self.warcserver.root_dir + '/{coll}/archive',
        )

    writer = MultiFileWARCWriter(
        self.warcserver.archive_paths,
        max_size=int(recorder_config.get('rollover_size', 1000000000)),
        max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)),
        filename_template=recorder_config.get('filename_template'),
        dedup_index=dedup_index,
        dedup_by_url=dedup_by_url,
    )

    # the recorder is pointed at the warcserver server created in __init__
    upstream_url = self.RECORD_SERVER % str(self.warcserver_server.port)
    self.recorder = RecorderApp(upstream_url, writer,
                                accept_colls=recorder_config.get('source_filter'))

    recorder_server = GeventServer(self.recorder, port=0)

    self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
|
2017-09-21 22:12:57 -07:00
|
|
|
|
2017-10-15 22:47:23 -07:00
|
|
|
def init_autoindex(self, auto_interval):
    """Create and start the collection auto-indexer. A falsy auto_interval makes this a no op.

    If the indexer's root path is not a directory, an error is logged and
    the process exits with status 2.

    :param str|int auto_interval: The auto-indexing interval from the configuration file or CLI argument
    """
    if not auto_interval:
        return

    # imported here, so these modules are only loaded when auto-indexing is requested
    import sys
    from pywb.manager.autoindex import AutoIndexer

    # a falsy root dir is passed on as None
    indexer = AutoIndexer(colls_dir=self.warcserver.root_dir or None,
                          interval=int(auto_interval))
    managed_dir = indexer.root_path

    if not os.path.isdir(managed_dir):
        logging.error('No managed directory "{0}" for auto-indexing'.format(managed_dir))
        sys.exit(2)

    logging.info(
        'Auto-Indexing Enabled on "{0}", checking every {1} secs'.format(managed_dir, auto_interval)
    )
    indexer.start()
|
|
|
|
|
2018-10-03 16:27:49 -04:00
|
|
|
def is_proxy_enabled(self, environ):
    """Tell whether this request is handled in proxy mode.

    :param dict environ: The WSGI environment dictionary for the request
    :return: True only if a proxy prefix is set and the environ contains
        the ``wsgiprox.proxy_host`` key
    :rtype: bool
    """
    if self.proxy_prefix is None:
        return False

    return 'wsgiprox.proxy_host' in environ
|
|
|
|
|
2017-02-27 19:07:51 -08:00
|
|
|
def serve_home(self, environ):
    """Render the home (/) view of pywb (not a collection page).

    :param dict environ: The WSGI environment dictionary for the request
    :return: The WbResponse for serving the home (/) path
    :rtype: WbResponse
    """
    index_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')

    fixed = self.warcserver.list_fixed_routes()
    dynamic = self.warcserver.list_dynamic_routes()
    listed_routes = fixed + dynamic

    # metadata is only looked up for the dynamic routes
    metadata_by_coll = self.metadata_cache.get_all(dynamic)

    html = index_view.render_to_string(environ,
                                       routes=listed_routes,
                                       all_metadata=metadata_by_coll)

    return WbResponse.text_response(html, content_type='text/html; charset="utf-8"')
|
|
|
|
|
2017-04-26 12:12:34 -07:00
|
|
|
def serve_static(self, environ, coll='', filepath=''):
    """Serve a static file belonging to a collection or one of pywb's own static assets.

    In proxy mode an OPTIONS request is answered directly, and access
    control headers are added to the static response.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection the static file is associated with
    :param str filepath: The file path (relative to the collection) for the static asset
    :return: The WbResponse for the static asset
    :rtype: WbResponse
    """
    via_proxy = self.is_proxy_enabled(environ)
    if via_proxy and environ.get('REQUEST_METHOD') == 'OPTIONS':
        return WbResponse.options_response(environ)

    # collection assets live under <root_dir>/<coll>/<static_dir>
    environ['pywb.static_dir'] = (
        os.path.join(self.warcserver.root_dir, coll, self.static_dir)
        if coll else self.static_dir
    )

    try:
        static_response = self.static_handler(environ, filepath)
        if via_proxy:
            static_response.add_access_control_headers(env=environ)
    except Exception:
        # any failure while producing the response is reported as not found
        self.raise_not_found(environ, 'static_file_not_found', filepath)
    else:
        return static_response
|
2017-02-17 18:04:07 -08:00
|
|
|
|
2019-10-27 01:39:52 +01:00
|
|
|
def get_coll_config(self, coll):
    """Build the config, including metadata, associated with a collection.

    For a collection listed among the warcserver's fixed routes the
    warcserver's own collection config is merged in, otherwise the
    collection's metadata is loaded from the metadata cache.

    :param str coll: The name of the collection to receive config info for
    :return: The collections config
    :rtype: dict
    """
    if coll in self.warcserver.list_fixed_routes():
        extras = self.warcserver.get_coll_config(coll)
    else:
        # a missing metadata entry is replaced by an empty dict
        extras = {'metadata': self.metadata_cache.load(coll) or {}}

    # extras are applied on top of the defaults, so they may override 'coll' and 'type'
    merged = {'coll': coll, 'type': 'replay'}
    merged.update(extras)
    return merged
|
JS Object Proxy Override System (#224)
* Init commit for Wombat JS Proxies off of https://github.com/ikreymer/pywb/tree/develop
Changes
- cli.py: add import os for os.chdir(self.r.directory)
- frontendapp.py: added initial support for cors requests.
- static_handler.py: add import for NotFoundException
- wbrequestresponse.py: added the initial implementation for cors requests, webrecorder needs this for recording!
- default_rewriter.py: added JSWombatProxyRewriter to default js rewriter class for internal testing
- html_rewriter.py: made JSWombatProxyRewriter to be default js rewriter class for internal testing
- regex_rewriters.py: implemented JSWombatProxyRewriter and JSWombatProxyRewriter to support wombat JS Proxy
- wombat.js: added JS Proxy support
- remove print
* wombat proxy: simplify mixin using 'first_buff'
* js local scope rewrite/proxy work:
- add DefaultHandlerWithJSProxy to enable new proxy rewrite (disabled by default)
- new proxy toggleable with 'js_local_scope_rewrite: true'
- work on integrating john's proxy work
- getAllOwnProps() to generate list of functions that need to be rebound
- remove non-proxy related changes for now, remove angular special cases (for now)
* local scope proxy work:
- add back __WB_pmw() prefix for postMessage
- don't override postMessage() in proxy obj
- MessageEvent resolve proxy to original window obj
* js obj proxy: use local_init() to load local vars from proxy obj
* wombat: js object proxy improvements:
- use same object '_WB_wombat_obj_proxy' on window and document objects
- reuse default_proxy_get() for get operation from window or document
- resolve and Window/Document object to the proxy, eg. if '_WB_wombat_obj_proxy' exists, return that
- override MessageEvent.source to return window proxy object
* obj proxy work:
- window proxy: defineProperty() override calls Reflect.defineProperty on dummy object as well as window to avoid exception
- window proxy: set() also sets on dummy object, and returns false if Reflect.set returns false (eg. altered by Reflect.defineProperty disabled writing)
- add override_prop_to_proxy() to add override to return proxy obj for attribute
- add override for Node.ownerDocument and HTMLElement.parentNode to return document proxy
server side rewrite: generalize local proxy insert, add list for local let overrides
* js obj proxy work:
- add default '__WB_pmw' to self if undefined (for service workers)
- document.origin override
- proxy obj: improved defineProperty override to work with safari
- proxy obj: catch any exception in dummy obj setter
* client-side rewriting:
- proxy obj: catch exception (such as cross-domain access) in own props init
- proxy obj: check for self reference '_WB_wombat_obj_proxy' access to avoid infinite recurse
- rewrite style: add 'cursor' attr for css url rewriting
* content rewriter: if is_ajax(), skip JS proxy obj rewriting also (html rewrite also skipped)
* client-side rewrite: rewrite 'data:text/css' as inline stylesheet when set via setAttribute() on 'href' in link
* client-side document override improvements:
- fix document.domain, document.referrer, forms add document.origin overrides to use only the document object
- init_doc_overrides() called as part of proxy init
- move non-document overrides to main init
rewrite: add rewrite for "Function('return this')" pattern to use proxy obj
* js obj proxy: now a per-collection (and even a per-request) setting 'use_js_obj_prox' (defaults to False)
live-rewrite-server: defaults to enabled js obj proxy
metadata: get_metadata() loads metadata.yaml for config settings for dynamic collections),
or collection config for static collections
warcserver: get_coll_config() returns config for static collection
tests: use custom test dir instead of default 'collections' dir
tests: add basic test for js obj proxy
update to warcio>=1.4.0
* karma tests: update to safari >10
* client-side rewrite:
- ensure wombat.js is ES5 compatible (don't use let)
- check if Proxy obj exists before attempting to init
* js proxy obj: RewriteWithProxyObj uses user-agent to determine if Proxy obj can be supported
content_rewriter: add overridable get_rewriter()
content_rewriter: fix elif -> if in should_rw_content()
tests: update js proxy obj test with different user agents (supported and unsupported)
karma: reset test to safari 9
* compatibility: remove shorthand notation from wombat.js
* js obj proxy: override MutationObserver.observe() to retrieve original object from proxy
wombat.js: cleanup, remove commented out code, label new proxy system functions, bump version to 2.40
2017-08-05 10:37:32 -07:00
|
|
|
|
2017-08-07 22:09:02 -07:00
|
|
|
def serve_coll_page(self, environ, coll='$root'):
    """Render the search page (``search.html``) of a collection.

    An invalid collection name is reported through ``raise_not_found``.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection whose search page is served
    :return: The WbResponse holding the rendered search page
    :rtype: WbResponse
    """
    if not self.is_valid_coll(coll):
        self.raise_not_found(environ, 'coll_not_found', coll)

    self.setup_paths(environ, coll)

    config = self.get_coll_config(coll)
    coll_metadata = config.get('metadata')

    search_view = BaseInsertView(self.rewriterapp.jinja_env, 'search.html')

    # a non-empty prefix is passed to the template with a trailing slash
    prefix = environ.get('SCRIPT_NAME', '')
    prefix = prefix + '/' if prefix else prefix

    html = search_view.render_to_string(environ,
                                        wb_prefix=prefix,
                                        coll=coll,
                                        coll_config=config,
                                        metadata=coll_metadata)

    return WbResponse.text_response(html, content_type='text/html; charset="utf-8"')
|
2017-04-21 15:37:21 -07:00
|
|
|
|
2017-09-06 23:25:30 -07:00
|
|
|
def serve_cdx(self, environ, coll='$root'):
    """Forward a CDX query for a collection upstream and stream back the result.

    The upstream URL is built from the ``cdx-server`` path template; the
    request's query string and, when set, ``self.query_limit`` are appended
    as query parameters.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection this CDX query is for
    :return: The WbResponse streaming the CDX results, or a
        ``400 Bad Request`` text response if the upstream request fails
    :rtype: WbResponse
    """
    # if coll == self.all_coll:
    #    coll = '*'
    cdx_url = self.rewriterapp.paths['cdx-server'].format(coll=coll)

    extra_params = []
    if environ.get('QUERY_STRING'):
        extra_params.append(environ.get('QUERY_STRING'))
    if self.query_limit:
        extra_params.append('limit=' + str(self.query_limit))

    # the first parameter is introduced with '?', every later one with '&'
    for param in extra_params:
        separator = '&' if '?' in cdx_url else '?'
        cdx_url = cdx_url + separator + param

    try:
        upstream = requests.get(cdx_url, stream=True)
        upstream_type = upstream.headers.get('Content-Type')
        return WbResponse.bin_stream(StreamIter(upstream.raw),
                                     content_type=upstream_type)
    except Exception as e:
        return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request')
|
2017-09-06 23:25:30 -07:00
|
|
|
|
2017-09-21 22:12:57 -07:00
|
|
|
def serve_record(self, environ, coll='$root', url=''):
    """Serve a URL through ``serve_content`` with recording switched on.

    Collections listed among the warcserver's fixed routes are refused
    with a plain text error response.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection to record into
    :param str url: The URL to be served and recorded
    :return: WbResponse containing the contents of the record/URL
    :rtype: WbResponse
    """
    fixed_routes = self.warcserver.list_fixed_routes()
    if coll not in fixed_routes:
        return self.serve_content(environ, coll, url, record=True)

    message = 'Error: Can Not Record Into Custom Collection "{0}"'.format(coll)
    return WbResponse.text_response(message)
|
|
|
|
|
|
|
|
def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False):
    """Serve a URL/record from a collection via the rewriter app.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be served from
    :param str url: The URL of the record to serve; only used when the
        target cannot be taken from ``REQUEST_URI``
    :param str timemap_output: When non-empty, stored as the ``output``
        entry of the collection config and stripped from the target URL
    :param bool record: When true, the collection config is marked with
        type ``record`` and the recording cache mode
    :return: WbResponse containing the contents of the record/URL
    :rtype: WbResponse
    """
    if not self.is_valid_coll(coll):
        self.raise_not_found(environ, 'coll_not_found', coll)

    self.setup_paths(environ, coll, record)

    raw_uri = environ.get('REQUEST_URI')
    prefix = environ.get('SCRIPT_NAME', '') + '/'

    if raw_uri and raw_uri.startswith(prefix):
        # use the raw request uri (minus the script prefix) as the target
        target = raw_uri[len(prefix):]
    else:
        target = to_native_str(url)
        if environ.get('QUERY_STRING'):
            target += '?' + environ.get('QUERY_STRING')

    config = self.get_coll_config(coll)

    if record:
        config['type'] = 'record'
        config['cache'] = self.rec_cache_mode

    if timemap_output:
        config['output'] = timemap_output
        # ensure that the timemap path information is not included
        timemap_segment = 'timemap/{0}/'.format(timemap_output)
        target = target.replace(timemap_segment, '')

    return self.rewriterapp.render_content(target, config, environ)
|
2017-02-17 18:04:07 -08:00
|
|
|
|
2017-09-21 22:12:57 -07:00
|
|
|
def setup_paths(self, environ, coll, record=False):
    """Store the template search path for a collection in the WSGI environment.

    Does nothing when no collection is given or the warcserver has no root
    directory. Otherwise the leading path segment(s) are popped from the
    environment for non-root collections and ``pywb.templates_dir`` is set.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be served from
    :param bool record: Whether the request is a recording request; for
        non-root collections one extra path segment is popped
    """
    if not (coll and self.warcserver.root_dir):
        return

    is_root = (coll == '$root')

    if not is_root:
        pop_path_info(environ)
        if record:
            pop_path_info(environ)

    segments = [self.warcserver.root_dir]
    if not is_root:
        segments.append(coll)
    segments.append(self.templates_dir)

    # jinja2 template paths always use '/' as separator
    environ['pywb.templates_dir'] = '/'.join(segments)
|
2017-04-26 12:12:34 -07:00
|
|
|
|
|
|
|
def serve_listing(self, environ):
    """Serve a JSON listing of the warcserver's fixed and dynamic routes.

    :param dict environ: The WSGI environment dictionary for the request
    :return: WbResponse with a JSON object holding the ``fixed`` and
        ``dynamic`` route listings
    :rtype: WbResponse
    """
    listing = {}
    listing['fixed'] = self.warcserver.list_fixed_routes()
    listing['dynamic'] = self.warcserver.list_dynamic_routes()

    return WbResponse.json_response(listing)
|
|
|
|
|
|
|
|
def is_valid_coll(self, coll):
    """Check whether a collection name is a known fixed or dynamic route.

    :param str coll: The name of the collection to check
    :return: True if the collection is valid, False otherwise
    :rtype: bool
    """
    # if coll == self.all_coll:
    #    return True

    if coll in self.warcserver.list_fixed_routes():
        return True

    return coll in self.warcserver.list_dynamic_routes()
|
2017-04-26 12:12:34 -07:00
|
|
|
|
2018-03-02 15:54:27 -08:00
|
|
|
def raise_not_found(self, environ, err_type, url):
    """Raise an AppPageNotFound error for the given error type and url.

    :param dict environ: The WSGI environment dictionary for the request
        (not used here)
    :param str err_type: The identifier for the type of error that occurred
    :param str url: The url of the archived page that was requested
    :raises AppPageNotFound: always
    """
    raise AppPageNotFound(err_type, url)
|
2017-04-26 12:12:34 -07:00
|
|
|
|
2017-02-27 19:07:51 -08:00
|
|
|
def _check_refer_redirect(self, environ):
|
2018-10-03 16:27:49 -04:00
|
|
|
"""Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header
|
|
|
|
|
|
|
|
:param dict environ: The WSGI environment dictionary for the request
|
|
|
|
:return: WbResponse HTTP 307 redirection
|
|
|
|
:rtype: WbResponse
|
|
|
|
"""
|
2017-02-27 19:07:51 -08:00
|
|
|
referer = environ.get('HTTP_REFERER')
|
|
|
|
if not referer:
|
|
|
|
return
|
|
|
|
|
|
|
|
host = environ.get('HTTP_HOST')
|
|
|
|
if host not in referer:
|
|
|
|
return
|
|
|
|
|
|
|
|
inx = referer[1:].find('http')
|
|
|
|
if not inx:
|
|
|
|
inx = referer[1:].find('///')
|
|
|
|
|
|
|
|
if inx < 0:
|
|
|
|
return
|
|
|
|
|
|
|
|
url = referer[inx + 1:]
|
|
|
|
host = referer[:inx + 1]
|
|
|
|
|
|
|
|
orig_url = environ['PATH_INFO']
|
|
|
|
if environ.get('QUERY_STRING'):
|
|
|
|
orig_url += '?' + environ['QUERY_STRING']
|
|
|
|
|
|
|
|
full_url = host + urljoin(url, orig_url)
|
|
|
|
return WbResponse.redir_response(full_url, '307 Redirect')
|
|
|
|
|
2017-02-17 18:04:07 -08:00
|
|
|
def __call__(self, environ, start_response):
|
2019-04-10 14:00:53 -04:00
|
|
|
"""Handles a request
|
|
|
|
|
|
|
|
:param dict environ: The WSGI environment dictionary for the request
|
|
|
|
:param start_response:
|
|
|
|
:return: The WbResponse for the request
|
|
|
|
:rtype: WbResponse
|
|
|
|
"""
|
2017-10-01 09:46:54 -07:00
|
|
|
return self.handler(environ, start_response)
|
|
|
|
|
|
|
|
def handle_request(self, environ, start_response):
    """Match the request against the url map, call the endpoint and return its response.

    Routing redirects, WbExceptions and any other exception are turned into
    responses; for redirects and 404s a referer-based redirect is tried first.

    :param dict environ: The WSGI environment dictionary for the request
    :param start_response: The WSGI start_response callable
    :return: The result of calling the response with the WSGI arguments
    """
    adapter = self.url_map.bind_to_environ(environ)
    try:
        endpoint, args = adapter.match()

        # store original script_name (original prefix) before modifications are made
        environ['pywb.app_prefix'] = environ.get('SCRIPT_NAME', '')
        environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')

        lang = args.pop('lang', '')
        if lang:
            pop_path_info(environ)
            environ['pywb_lang'] = lang

        response = endpoint(environ, **args)

    except RequestRedirect as rr:
        # if werkzeug throws this, likely a missing slash redirect
        # also check referrer here to avoid another redirect later
        referer_redirect = self._check_refer_redirect(environ)
        if referer_redirect:
            return referer_redirect(environ, start_response)

        response = WbResponse.redir_response(rr.new_url, '307 Redirect')

    except WbException as wbe:
        if wbe.status_code == 404:
            referer_redirect = self._check_refer_redirect(environ)
            if referer_redirect:
                return referer_redirect(environ, start_response)

        response = self.rewriterapp.handle_error(environ, wbe)

    except Exception as e:
        if self.debug:
            traceback.print_exc()

        internal_error = WbException('Internal Error: ' + str(e))
        response = self.rewriterapp._error_response(environ, internal_error)

    return response(environ, start_response)
|
2017-02-17 18:04:07 -08:00
|
|
|
|
|
|
|
@classmethod
def create_app(cls, port):
    """Create a new app instance wrapped in a GeventServer bound to 0.0.0.0.

    The app is built from ``cls``, so calling this on a subclass yields an
    instance of that subclass.

    :param int port: The port the server is to listen on
    :return: A new app instance wrapped in GeventServer
    :rtype: GeventServer
    """
    app = cls()
    return GeventServer(app, port=port, hostname='0.0.0.0')
|
|
|
|
|
2017-10-01 09:46:54 -07:00
|
|
|
def init_proxy(self, config):
    """Set up proxy mode from the ``proxy`` entry of the configuration.

    A missing or empty ``proxy`` entry makes this a no-op. Otherwise the
    proxy attributes (``proxy_record``, ``proxy_prefix``,
    ``proxy_default_timestamp``, ``proxy_coll``) are set and ``handler``
    becomes a WSGIProxMiddleware wrapping ``handle_request``.

    :param dict config: The configuration object used to configure this app;
        ``config['recorder']`` is set to ``'live'`` when proxy recording is
        enabled and no recorder is configured
    :raises Exception: if the proxy collection contains ``/``, if recording
        into a fixed collection is requested, or if the default timestamp is
        neither all digits nor a parseable ISO date
    """
    proxy_config = config.get('proxy')
    if not proxy_config:
        return

    # the entry is either just the collection name or a dict of options
    if not isinstance(proxy_config, str):
        proxy_coll = proxy_config['coll']
    else:
        proxy_coll, proxy_config = proxy_config, {}

    if '/' in proxy_coll:
        raise Exception('Proxy collection can not contain "/"')

    proxy_config.setdefault('ca_name', self.PROXY_CA_NAME)
    proxy_config.setdefault('ca_file_cache', self.PROXY_CA_PATH)

    recording = bool(proxy_config.get('recording'))
    if recording:
        logging.info('Proxy recording into collection "{0}"'.format(proxy_coll))
        if proxy_coll in self.warcserver.list_fixed_routes():
            raise Exception('Can not record into fixed collection')

        proxy_route = proxy_coll + self.RECORD_ROUTE
        if not config.get('recorder'):
            config['recorder'] = 'live'
    else:
        logging.info('Proxy enabled for collection "{0}"'.format(proxy_coll))
        proxy_route = proxy_coll

    self.proxy_record = recording

    # 'bn_' when content rewriting is enabled (the default), 'id_' otherwise
    if proxy_config.get('enable_content_rewrite', True):
        prefix_template = '/{0}/bn_/'
    else:
        prefix_template = '/{0}/id_/'
    self.proxy_prefix = prefix_template.format(proxy_route)

    default_ts = proxy_config.get('default_timestamp')
    self.proxy_default_timestamp = default_ts
    if default_ts and not self.ALL_DIGITS.match(default_ts):
        # not an all-digit timestamp: try to interpret it as an ISO date
        try:
            self.proxy_default_timestamp = iso_date_to_timestamp(default_ts)
        except Exception:
            raise Exception('Invalid Proxy Timestamp: Must Be All-Digit Timestamp or ISO Date Format')

    self.proxy_coll = proxy_coll

    self.handler = WSGIProxMiddleware(self.handle_request,
                                      self.proxy_route_request,
                                      proxy_host=proxy_config.get('host', 'pywb.proxy'),
                                      proxy_options=proxy_config)
|
2017-09-27 13:47:02 -07:00
|
|
|
|
2018-04-20 15:20:56 -07:00
|
|
|
def proxy_route_request(self, url, environ):
    """Return the full url that this proxy request will be routed to.

    The 'environ' PATH_INFO and REQUEST_URI will be modified based on the
    returned url. When a default proxy timestamp is configured it is also
    stored in the environ as ``pywb_proxy_default_timestamp``.

    Default is to use the 'proxy_prefix' to point to the proxy collection.

    :param str url: The url requested through the proxy
    :param dict environ: The WSGI environment dictionary for the request
    :return: ``proxy_prefix`` followed by the url
    :rtype: str
    """
    default_ts = self.proxy_default_timestamp
    if default_ts:
        environ['pywb_proxy_default_timestamp'] = default_ts

    return self.proxy_prefix + url
|
|
|
|
|
2018-10-03 16:27:49 -04:00
|
|
|
def proxy_fetch(self, env, url):
    """Proxy mode only endpoint that answers OPTIONS requests and serves cross-origin fetches.

    Due to normal cross-origin browser restrictions in proxy mode, the auto
    fetch worker cannot access the CSS rules of cross-origin style sheets and
    must re-fetch them in a manner that is CORS safe. This endpoint fetches
    the resource through ``serve_content`` and adds access control headers
    to the response.

    :param dict env: The WSGI environment dictionary; its ``REQUEST_URI``
        and ``PATH_INFO`` are rewritten to point at the proxy prefix
    :param str url: The URL of the resource to be fetched (recomputed from
        ``REQUEST_URI``)
    :return: WbResponse that is either the response to an OPTIONS request,
        a ``403 Forbidden`` response when proxy mode is off, or the result
        of fetching url
    :rtype: WbResponse
    """
    if not self.is_proxy_enabled(env):
        # we are not in proxy mode so just respond with forbidden
        return WbResponse.text_response('proxy mode must be enabled to use this endpoint',
                                        status='403 Forbidden')

    if env.get('REQUEST_METHOD') == 'OPTIONS':
        return WbResponse.options_response(env)

    endpoint_marker = '/proxy-fetch/'

    # ensure full URL
    url = env['REQUEST_URI'].split(endpoint_marker, 1)[-1]

    env['REQUEST_URI'] = self.proxy_prefix + url
    env['PATH_INFO'] = self.proxy_prefix + env['PATH_INFO'].split(endpoint_marker, 1)[-1]

    # make request using normal serve_content
    response = self.serve_content(env, self.proxy_coll, url, record=self.proxy_record)

    # for WR
    if isinstance(response, WbResponse):
        response.add_access_control_headers(env=env)

    return response
|
|
|
|
|
2017-02-17 18:04:07 -08:00
|
|
|
|
2017-04-26 12:12:34 -07:00
|
|
|
# ============================================================================
|
|
|
|
class MetadataCache(object):
    """Holds the collection metadata path template and caches each
    collection's metadata once it has been loaded.

    Cached metadata is reloaded if its file has been modified since it was
    cached (file mtime based). Entries are keyed by collection name and
    stored as ``(mtime, metadata)`` tuples.
    """

    def __init__(self, template_str):
        """
        :param str template_str: Path template of the metadata file; it is
            formatted with ``coll=<collection name>``
        """
        self.template_str = template_str
        self.cache = {}

    def load(self, coll):
        """Load and return the metadata associated with a collection.

        If the collection is not cached yet, its metadata file is read and
        stored. Otherwise the file's mtime is compared with the cached one:
        if it differs, the file is re-read and the cache updated, else the
        cached metadata object itself is returned.

        :param str coll: Name of a collection
        :return: The metadata for the collection, or an empty dict if the
            metadata file's mtime can not be read (e.g. the file is missing)
        :rtype: dict
        """
        path = self.template_str.format(coll=coll)
        try:
            mtime = os.path.getmtime(path)
        except Exception:
            # best effort: no readable metadata file means no metadata
            return {}

        # entries are stored under the collection name (see store_new),
        # so the lookup must use the same key
        entry = self.cache.get(coll)
        if entry is None:
            return self.store_new(coll, path, mtime)

        cached_mtime, data = entry
        if mtime == cached_mtime:
            # return only the metadata, not the (mtime, metadata) tuple
            return data

        return self.store_new(coll, path, mtime)

    def store_new(self, coll, path, mtime):
        """Load a collection's metadata file and store it in the cache.

        :param str coll: The name of the collection the metadata is for
        :param str path: The path to the collection's metadata file
        :param float mtime: The current mtime of the collection's metadata file
        :return: The collection's metadata
        :rtype: dict
        """
        obj = load_yaml_config(path)
        self.cache[coll] = (mtime, obj)
        return obj

    def get_all(self, routes):
        """Load the metadata for all routes (collections) and populate the cache.

        :param list[str] routes: List of collection names
        :return: A dictionary mapping every cached collection name to its metadata
        :rtype: dict
        """
        for route in routes:
            self.load(route)

        return {name: value[1] for name, value in self.cache.items()}
|
|
|
|
|
|
|
|
|
2017-02-17 18:04:07 -08:00
|
|
|
# ============================================================================
|
|
|
|
if __name__ == '__main__':
    # run a standalone server on port 8080 and block until it exits
    app_server = FrontEndApp.create_app(port=8080)
    app_server.join()
|