From ec0df7b9aec0cdcf476cd1795ebfca66a10082bd Mon Sep 17 00:00:00 2001 From: John Berlin Date: Wed, 3 Oct 2018 16:27:49 -0400 Subject: [PATCH] Refactor of auto-fetch worker system with support for proxy mode, fixes https://github.com/webrecorder/pywb/issues/371: (#379) - Split wombat and auto-fetch worker into two files (proxy mode and non-proxy mode) - Renamed preservationWorker to autoFetchWorker in order to better convey what it does - Root config file control over including wombat and auto-fetch worker in proxy or non-proxy mode - Added additional proxy mode + auto-fetch worker only route for fetching the auto-fetch worker code nicely for CORS - templateview: add 'tobool' formatter to more cleanly format python bools to JS 'true'/'false' - proxy options: config and command line: 'use_auto_fetch_worker' and '--proxy-with-auto-fetch' 'use_wombat' and '--proxy-with-wombat' - head_insert.html: only include wombat in proxy mode when use_wombat or use_auto_fetch_worker are set. - wombatProxyMode.js: slimmed down wombat for proxy mode only including auto-fetch support. 
- more consistent naming: rename 'preserveWorker' and 'autoArchive' to 'auto-fetch' Updated tests: - test_wbrequestresponse.py: added tests covering constructor defaults, _init_derived, options_response, json_response, encode_stream, text_stream - test_auto_colls.py: fixed broken test test_more_custom_templates, reason using ujson now not json so spacing was off - test_proxy.py: updated existing tests to reflect splitting wombat into proxy and non-proxy mode, added tests covering auto-fetch worker specific endpoints in proxy mode removed duplicate addons key in .travis.yml - test_cli.py: updated to properly test the cli with these changes added ultrajson dep to tests_require in setup.py to reflect its usage by wbrequestresponse.py Fully documented: - cli.py - frontendapp.py - templateview.py - wbrequestresponse.py Removed duplicate addons key in .travis.yml Added ultrajson dependency to tests_require in setup.py to reflect its usage by wbrequestresponse.py Fixes #371 --- .travis.yml | 3 +- pywb/apps/cli.py | 42 +- pywb/apps/frontendapp.py | 225 ++++++++- pywb/apps/rewriterapp.py | 1 + pywb/apps/test/test_wbrequestresponse.py | 98 ++++ pywb/apps/wbrequestresponse.py | 147 +++++- pywb/rewrite/templateview.py | 145 +++++- ...eservationWorker.js => autoFetchWorker.js} | 101 ++-- pywb/static/autoFetchWorkerProxyMode.js | 192 ++++++++ pywb/static/wombat.js | 465 +++++++++--------- pywb/static/wombatProxyMode.js | 376 ++++++++++++++ pywb/templates/head_insert.html | 22 +- setup.py | 1 + tests/test_auto_colls.py | 2 +- tests/test_cli.py | 4 +- tests/test_proxy.py | 147 +++++- 16 files changed, 1631 insertions(+), 340 deletions(-) rename pywb/static/{wombatPreservationWorker.js => autoFetchWorker.js} (63%) create mode 100644 pywb/static/autoFetchWorkerProxyMode.js create mode 100644 pywb/static/wombatProxyMode.js diff --git a/.travis.yml b/.travis.yml index 63e6c5cf..5d504e6b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ os: - linux addons: + sauce_connect: true 
apt: packages: # This is required to run new chrome on old trusty @@ -18,8 +19,6 @@ env: - WR_TEST=no - WR_TEST=yes -addons: - sauce_connect: true cache: directories: diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 38c18bca..0c0cd1f4 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -6,6 +6,7 @@ import logging #============================================================================= def warcserver(args=None): + """Utility function for starting pywb's WarcServer""" return WarcServerCli(args=args, default_port=8070, desc='pywb WarcServer').run() @@ -13,6 +14,7 @@ def warcserver(args=None): #============================================================================= def wayback(args=None): + """Utility function for starting pywb's Wayback Machine implementation""" return WaybackCli(args=args, default_port=8080, desc='pywb Wayback Machine Server').run() @@ -20,6 +22,7 @@ def wayback(args=None): #============================================================================= def live_rewrite_server(args=None): + """Utility function for starting pywb's Wayback Machine implementation in live mode""" return LiveCli(args=args, default_port=8090, desc='pywb Live Rewrite Proxy Server').run() @@ -27,7 +30,15 @@ def live_rewrite_server(args=None): #============================================================================= class BaseCli(object): + """Base CLI class that provides the initial arg parser setup, + calls load to receive the application to be started and starts the application.""" + def __init__(self, args=None, default_port=8080, desc=''): + """ + :param args: CLI arguments + :param int default_port: The default port that the application will use + :param str desc: The description for the application to be started + """ parser = ArgumentParser(description=desc) parser.add_argument('-p', '--port', type=int, default=default_port, help='Port to listen on (default %s)' % default_port) @@ -47,6 +58,10 @@ class BaseCli(object): help='Enable HTTP/S 
proxy on specified collection') parser.add_argument('--proxy-record', action='store_true', help='Enable proxy recording into specified collection') + parser.add_argument('--proxy-with-wombat', action='store_true', + help='Enable partial wombat support in proxy mode') + parser.add_argument('--proxy-with-auto-fetch', action='store_true', + help='Enable auto-load worker in proxy mode') self.desc = desc self.extra_config = {} @@ -57,12 +72,14 @@ class BaseCli(object): logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', level=logging.DEBUG if self.r.debug else logging.INFO) - if self.r.proxy: - self.extra_config['proxy'] = {'coll': self.r.proxy, - 'recording': self.r.proxy_record} + self.extra_config['proxy'] = { + 'coll': self.r.proxy, + 'recording': self.r.proxy_record, + 'use_wombat': self.r.proxy_with_wombat, + 'use_auto_fetch_worker': self.r.proxy_with_auto_fetch, + } self.r.live = True - self.application = self.load() if self.r.profile: @@ -70,9 +87,15 @@ class BaseCli(object): self.application = ProfilerMiddleware(self.application) def _extend_parser(self, parser): #pragma: no cover + """Method provided for subclasses to add their cli argument on top of the default cli arguments. + + :param ArgumentParser parser: The argument parser instance passed by BaseCli + """ pass def load(self): + """This method is called to load the application. 
Subclasses must return a application + that can be used by used by pywb.utils.geventserver.GeventServer.""" if self.r.live: self.extra_config['collections'] = {'live': {'index': '$live'}} @@ -84,10 +107,12 @@ class BaseCli(object): self.extra_config['recorder'] = 'live' def run(self): + """Start the application""" self.run_gevent() return self def run_gevent(self): + """Created the server that runs the application supplied a subclass""" from pywb.utils.geventserver import GeventServer, RequestURIWSGIHandler logging.info('Starting Gevent Server on ' + str(self.r.port)) ge = GeventServer(self.application, @@ -99,6 +124,8 @@ class BaseCli(object): #============================================================================= class ReplayCli(BaseCli): + """CLI class that adds the cli functionality specific to starting pywb's Wayback Machine implementation""" + def _extend_parser(self, parser): parser.add_argument('-a', '--autoindex', action='store_true', help='Enable auto-indexing') @@ -110,7 +137,6 @@ class ReplayCli(BaseCli): help_dir='Specify root archive dir (default is current working directory)' parser.add_argument('-d', '--directory', help=help_dir) - def load(self): super(ReplayCli, self).load() @@ -129,6 +155,8 @@ class ReplayCli(BaseCli): #============================================================================= class WarcServerCli(BaseCli): + """CLI class for starting a WarcServer""" + def load(self): from pywb.warcserver.warcserver import WarcServer @@ -138,6 +166,8 @@ class WarcServerCli(BaseCli): #============================================================================= class WaybackCli(ReplayCli): + """CLI class for starting the pywb's implementation of the Wayback Machine""" + def load(self): from pywb.apps.frontendapp import FrontEndApp @@ -147,6 +177,8 @@ class WaybackCli(ReplayCli): #============================================================================= class LiveCli(BaseCli): + """CLI class for starting pywb in replay server in live 
mode""" + def load(self): from pywb.apps.frontendapp import FrontEndApp diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index e740aa1a..9f7a6169 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -6,7 +6,7 @@ from werkzeug.exceptions import HTTPException, NotFound from werkzeug.wsgi import pop_path_info from six.moves.urllib.parse import urljoin from six import iteritems - +from warcio.statusandheaders import StatusAndHeaders from warcio.utils import to_native_str from wsgiprox.wsgiprox import WSGIProxMiddleware @@ -33,6 +33,16 @@ import logging # ============================================================================ class FrontEndApp(object): + """Orchestrates pywb's core Wayback Machine functionality and is comprised of 2 core sub-apps and 3 optional apps. + + Sub-apps: + - WarcServer: Serves the archive content (WARC/ARC and index) as well as from the live web in record/proxy mode + - RewriterApp: Rewrites the content served by pywb (if it is to be rewritten) + - WSGIProxMiddleware (Optional): If proxy mode is enabled, performs pywb's HTTP(s) proxy functionality + - AutoIndexer (Optional): If auto-indexing is enabled for the collections it is started here + - RecorderApp (Optional): Recording functionality, available when recording mode is enabled + """ + REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq' CDX_API = 'http://localhost:%s/{coll}/index' RECORD_SERVER = 'http://localhost:%s' @@ -45,6 +55,10 @@ class FrontEndApp(object): PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem') def __init__(self, config_file='./config.yaml', custom_config=None): + """ + :param str config_file: Path to the config file + :param dict custom_config: Dictionary containing additional configuration information + """ self.handler = self.handle_request self.warcserver = WarcServer(config_file=config_file, custom_config=custom_config) @@ -55,6 +69,8 @@ class FrontEndApp(object): self.warcserver_server = 
GeventServer(self.warcserver, port=0) + self.proxy_prefix = None # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/) + self.proxy_coll = None # the name of the collection that has proxy mode enabled self.init_proxy(config) self.init_recorder(config.get('recorder')) @@ -82,6 +98,8 @@ class FrontEndApp(object): self.metadata_cache = MetadataCache(metadata_templ) def _init_routes(self): + """Initialize the routes and based on the configuration file makes available + specific routes (proxy mode, record)""" self.url_map = Map() self.url_map.add(Rule('/static/_//', endpoint=self.serve_static)) self.url_map.add(Rule('/static/', endpoint=self.serve_static)) @@ -100,9 +118,19 @@ class FrontEndApp(object): if self.recorder_path: self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/', endpoint=self.serve_record)) + if self.proxy_prefix is not None: + # Add the proxy-fetch endpoint to enable PreservationWorker to make CORS fetches worry free in proxy mode + self.url_map.add(Rule('/proxy-fetch/', endpoint=self.proxy_fetch, + methods=['GET', 'HEAD', 'OPTIONS'])) self.url_map.add(Rule(coll_prefix + '/', endpoint=self.serve_content)) def get_upstream_paths(self, port): + """Retrieve a dictionary containing the full URLs of the upstream apps + + :param int port: The port used by the replay and cdx servers + :return: A dictionary containing the upstream paths (replay, cdx-server, record [if enabled]) + :rtype: dict[str, str] + """ base_paths = { 'replay': self.REPLAY_API % port, 'cdx-server': self.CDX_API % port, @@ -114,6 +142,7 @@ class FrontEndApp(object): return base_paths def init_recorder(self, recorder_config): + """Initialize the recording functionality of pywb. 
If recording_config is None this function is a no op""" if not recorder_config: self.recorder = None self.recorder_path = None @@ -142,6 +171,10 @@ class FrontEndApp(object): self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll) def init_autoindex(self, auto_interval): + """Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op. + + :param str|int auto_interval: The auto-indexing interval from the configuration file or CLI argument + """ if not auto_interval: return @@ -161,7 +194,16 @@ class FrontEndApp(object): logging.info(msg.format(indexer.root_path, auto_interval)) indexer.start() + def is_proxy_enabled(self, environ): + return self.proxy_prefix is not None and 'wsgiprox.proxy_host' in environ + def serve_home(self, environ): + """Serves the home (/) view of pywb (not a collections) + + :param dict environ: The WSGI environment dictionary for the request + :return: The WbResponse for serving the home (/) path + :rtype: WbResponse + """ home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html') fixed_routes = self.warcserver.list_fixed_routes() dynamic_routes = self.warcserver.list_dynamic_routes() @@ -177,19 +219,38 @@ class FrontEndApp(object): return WbResponse.text_response(content, content_type='text/html; charset="utf-8"') def serve_static(self, environ, coll='', filepath=''): + """Serve a static file associated with a specific collection or one of pywb's own static assets + + :param dict environ: The WSGI environment dictionary for the request + :param str coll: The collection the static file is associated with + :param str filepath: The file path (relative to the collection) for the static assest + :return: The WbResponse for the static asset + :rtype: WbResponse + """ + proxy_enabled = self.is_proxy_enabled(environ) + if proxy_enabled and environ.get('REQUEST_METHOD') == 'OPTIONS': + return WbResponse.options_response(environ) if coll: path = 
os.path.join(self.warcserver.root_dir, coll, self.static_dir) else: path = self.static_dir environ['pywb.static_dir'] = path - try: - return self.static_handler(environ, filepath) + response = self.static_handler(environ, filepath) + if proxy_enabled: + response.add_access_control_headers(env=environ) + return response except: self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath)) def get_metadata(self, coll): + """Retrieve the metadata associated with a collection + + :param str coll: The name of the collection to receive metadata for + :return: The collections metadata if it exists + :rtype: dict + """ #if coll == self.all_coll: # coll = '*' @@ -204,6 +265,13 @@ class FrontEndApp(object): return metadata def serve_coll_page(self, environ, coll='$root'): + """Render and serve a collections search page (search.html). + + :param dict environ: The WSGI environment dictionary for the request + :param str coll: The name of the collection to serve the collections search page for + :return: The WbResponse containing the collections search page + :rtype: WbResponse + """ if not self.is_valid_coll(coll): self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll)) @@ -225,6 +293,13 @@ class FrontEndApp(object): return WbResponse.text_response(content, content_type='text/html; charset="utf-8"') def serve_cdx(self, environ, coll='$root'): + """Make the upstream CDX query for a collection and response with the results of the query + + :param dict environ: The WSGI environment dictionary for the request + :param str coll: The name of the collection this CDX query is for + :return: The WbResponse containing the results of the CDX query + :rtype: WbResponse + """ base_url = self.rewriterapp.paths['cdx-server'] #if coll == self.all_coll: @@ -248,12 +323,31 @@ class FrontEndApp(object): return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request') def serve_record(self, environ, coll='$root', url=''): + """Serve a URL's content from a 
WARC/ARC record in replay mode or from the live web in + live, proxy, and record mode. + + :param dict environ: The WSGI environment dictionary for the request + :param str coll: The name of the collection the record is to be served from + :param str url: The URL for the corresponding record to be served if it exists + :return: WbResponse containing the contents of the record/URL + :rtype: WbResponse + """ if coll in self.warcserver.list_fixed_routes(): return WbResponse.text_response('Error: Can Not Record Into Custom Collection "{0}"'.format(coll)) return self.serve_content(environ, coll, url, record=True) def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False): + """Serve the contents of a URL/Record rewriting the contents of the response when applicable. + + :param dict environ: The WSGI environment dictionary for the request + :param str coll: The name of the collection the record is to be served from + :param str url: The URL for the corresponding record to be served if it exists + :param str timemap_output: The contents of the timemap included in the link header of the response + :param bool record: Should the content being served by recorded (save to a warc). Only valid in record mode + :return: WbResponse containing the contents of the record/URL + :rtype: WbResponse + """ if not self.is_valid_coll(coll): self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll)) @@ -282,10 +376,16 @@ class FrontEndApp(object): except UpstreamException as ue: response = self.rewriterapp.handle_error(environ, ue) raise HTTPException(response=response) - return response def setup_paths(self, environ, coll, record=False): + """Populates the WSGI environment dictionary with the path information necessary to perform a response for + content or record. 
+ + :param dict environ: The WSGI environment dictionary for the request + :param str coll: The name of the collection the record is to be served from + :param bool record: Should the content being served by recorded (save to a warc). Only valid in record mode + """ if not coll or not self.warcserver.root_dir: return @@ -305,6 +405,12 @@ class FrontEndApp(object): environ['pywb.templates_dir'] = '/'.join(paths) def serve_listing(self, environ): + """Serves the response for WARCServer fixed and dynamic listing (paths) + + :param dict environ: The WSGI environment dictionary for the request + :return: WbResponse containing the frontend apps WARCServer URL paths + :rtype: WbResponse + """ result = {'fixed': self.warcserver.list_fixed_routes(), 'dynamic': self.warcserver.list_dynamic_routes() } @@ -312,6 +418,12 @@ class FrontEndApp(object): return WbResponse.json_response(result) def is_valid_coll(self, coll): + """Determines if the collection name for a request is valid (exists) + + :param str coll: The name of the collection to check + :return: True if the collection is valid, false otherwise + :rtype: bool + """ #if coll == self.all_coll: # return True @@ -319,9 +431,21 @@ class FrontEndApp(object): coll in self.warcserver.list_dynamic_routes()) def raise_not_found(self, environ, msg): + """Utility function for raising a werkzeug.exceptions.NotFound execption with the supplied WSGI environment + and message. 
+ + :param dict environ: The WSGI environment dictionary for the request + :param str msg: The error message + """ raise NotFound(response=self.rewriterapp._error_response(environ, msg)) def _check_refer_redirect(self, environ): + """Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header + + :param dict environ: The WSGI environment dictionary for the request + :return: WbResponse HTTP 307 redirection + :rtype: WbResponse + """ referer = environ.get('HTTP_REFERER') if not referer: return @@ -353,10 +477,16 @@ class FrontEndApp(object): return self.handler(environ, start_response) def handle_request(self, environ, start_response): + """Retrieves the route handler and calls the handler returning its the response + + :param dict environ: The WSGI environment dictionary for the request + :param start_response: + :return: The WbResponse for the request + :rtype: WbResponse + """ urls = self.url_map.bind_to_environ(environ) try: endpoint, args = urls.match() - # store original script_name (original prefix) before modifications are made environ['pywb.app_prefix'] = environ.get('SCRIPT_NAME') @@ -379,13 +509,23 @@ class FrontEndApp(object): @classmethod def create_app(cls, port): + """Create a new instance of FrontEndApp that listens on port with a hostname of 0.0.0.0 + + :param int port: The port FrontEndApp is to listen on + :return: A new instance of FrontEndApp wrapped in GeventServer + :rtype: GeventServer + """ app = FrontEndApp() app_server = GeventServer(app, port=port, hostname='0.0.0.0') return app_server def init_proxy(self, config): + """Initialize and start proxy mode. If proxy configuration entry is not contained in the config + this is a no op. Causes handler to become an instance of WSGIProxMiddleware. 
+ + :param dict config: The configuration object used to configure this instance of FrontEndApp + """ proxy_config = config.get('proxy') - self.proxy_prefix = None if not proxy_config: return @@ -418,10 +558,12 @@ class FrontEndApp(object): else: self.proxy_prefix = '/{0}/id_/'.format(proxy_coll) + self.proxy_coll = proxy_coll + self.handler = WSGIProxMiddleware(self.handle_request, - self.proxy_route_request, - proxy_host=proxy_config.get('host', 'pywb.proxy'), - proxy_options=proxy_config) + self.proxy_route_request, + proxy_host=proxy_config.get('host', 'pywb.proxy'), + proxy_options=proxy_config) def proxy_route_request(self, url, environ): """ Return the full url that this proxy request will be routed to @@ -431,14 +573,65 @@ class FrontEndApp(object): """ return self.proxy_prefix + url + def proxy_fetch(self, env, url): + """Proxy mode only endpoint that handles OPTIONS requests and COR fetches for Preservation Worker. + + Due to normal cross-origin browser restrictions in proxy mode, auto fetch worker cannot access the CSS rules + of cross-origin style sheets and must re-fetch them in a manner that is CORS safe. 
This endpoint facilitates + that by fetching the stylesheets for the auto fetch worker and then responds with its contents + + :param dict env: The WSGI environment dictionary + :param str url: The URL of the resource to be fetched + :return: WbResponse that is either response to an Options request or the results of fetching url + :rtype: WbResponse + """ + if not self.is_proxy_enabled(env): + # we are not in proxy mode so just respond with forbidden + return WbResponse.text_response('proxy mode must be enabled to use this endpoint', + status='403 Forbidden') + + if env.get('REQUEST_METHOD') == 'OPTIONS': + return WbResponse.options_response(env) + + # ensure full URL + request_url = env['REQUEST_URI'] + # replace with /id_ so we do not get rewritten + url = request_url.replace('/proxy-fetch', '/id_') + # update WSGI environment object + env['REQUEST_URI'] = self.proxy_coll + url + env['PATH_INFO'] = env['PATH_INFO'].replace('/proxy-fetch', self.proxy_coll + '/id_') + # make request using normal serve_content + response = self.serve_content(env, self.proxy_coll, url) + # for WR + if isinstance(response, WbResponse): + response.add_access_control_headers(env=env) + return response + # ============================================================================ class MetadataCache(object): + """This class holds the collection medata template string and + caches the metadata for a collection once it is rendered once. + Cached metadata is updated if its corresponding file has been updated since last cache time (file mtime based)""" + def __init__(self, template_str): + """ + :param str template_str: The template string to be cached + """ self.template_str = template_str self.cache = {} def load(self, coll): + """Load and receive the metadata associated with a collection. + + If the metadata for the collection is not cached yet its metadata file is read in and stored. 
+ If the cache has seen the collection before the mtime of the metadata file is checked and if it is more recent + than the cached time, the cache is updated and returned otherwise the cached version is returned. + + :param str coll: Name of a collection + :return: The cached metadata for a collection + :rtype: dict + """ path = self.template_str.format(coll=coll) try: mtime = os.path.getmtime(path) @@ -456,11 +649,25 @@ class MetadataCache(object): return self.store_new(coll, path, mtime) def store_new(self, coll, path, mtime): + """Load a collections metadata file and store it + + :param str coll: The name of the collection the metadata is for + :param str path: The path to the collections metadata file + :param float mtime: The current mtime of the collections metadata file + :return: The collections metadata + :rtype: dict + """ obj = load_yaml_config(path) self.cache[coll] = (mtime, obj) return obj def get_all(self, routes): + """Load the metadata for all routes (collections) and populate the cache + + :param list[str] routes: List of collection names + :return: A dictionary containing each collections metadata + :rtype: dict + """ for route in routes: self.load(route) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index b467c804..f596b189 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -366,6 +366,7 @@ class RewriterApp(object): top_url, environ, framed_replay, + coll=kwargs.get('coll', ''), replay_mod=self.replay_mod, config=self.config)) diff --git a/pywb/apps/test/test_wbrequestresponse.py b/pywb/apps/test/test_wbrequestresponse.py index 44dc2468..a7dd4698 100644 --- a/pywb/apps/test/test_wbrequestresponse.py +++ b/pywb/apps/test/test_wbrequestresponse.py @@ -1,3 +1,9 @@ +import inspect +try: + import ujson as json +except ImportError: # pragma: no cover + import json + from pywb.apps.wbrequestresponse import WbResponse from warcio.statusandheaders import StatusAndHeaders @@ -40,6 +46,98 @@ def test_resp_4(): 
assert(resp == expected) +def test_wbresponse_redir_supplied_headers(): + res = WbResponse.redir_response('http://overhere.now', headers=[('A', 'B')]) + assert ('A', 'B') in res.status_headers.headers + + +def test_wbresponse_creation_defaults(): + res = WbResponse(None) + assert res.status_headers is None + assert isinstance(res.body, list) + assert len(res.body) == 0 + + +def test_wbresponse_encode_stream(): + stream = [u'\u00c3'] # Unicode Character 'LATIN CAPITAL LETTER A WITH TILDE' (U+00C3) + expected = [b'\xc3\x83'] + encoding_stream = WbResponse.encode_stream(stream) + assert inspect.isgenerator(encoding_stream) + assert list(encoding_stream) == expected + + +def test_wbresponse_text_stream(): + stream = [u'\u00c3'] # Unicode Character 'LATIN CAPITAL LETTER A WITH TILDE' (U+00C3) + expected = [b'\xc3\x83'] + res = WbResponse.text_stream(stream, content_type='text/plain') + status_headers = res.status_headers + assert status_headers.statusline == '200 OK' + assert ('Content-Type', 'text/plain; charset=utf-8') in status_headers.headers + assert inspect.isgenerator(res.body) + assert list(res.body) == expected + + res = WbResponse.text_stream(stream) + status_headers = res.status_headers + assert status_headers.statusline == '200 OK' + assert ('Content-Type', 'text/plain; charset=utf-8') in status_headers.headers + assert inspect.isgenerator(res.body) + assert list(res.body) == expected + + +def test_wbresponse_options_response(): + res = WbResponse.options_response(dict(HTTP_ORIGIN='http://example.com')) + assert ('Access-Control-Allow-Origin', 'http://example.com') in res.status_headers.headers + res = WbResponse.options_response(dict(HTTP_REFERER='http://example.com')) + assert ('Access-Control-Allow-Origin', 'http://example.com') in res.status_headers.headers + res = WbResponse.options_response(dict()) + assert ('Access-Control-Allow-Origin', '*') in res.status_headers.headers + res = WbResponse.options_response(dict(HTTP_ORIGIN=None)) + assert 
('Access-Control-Allow-Origin', '*') in res.status_headers.headers + res = WbResponse.options_response(dict(HTTP_REFERER=None)) + assert ('Access-Control-Allow-Origin', '*') in res.status_headers.headers + + +def test_wbresponse_json_response(): + body = dict(pywb=1, wr=2) + res = WbResponse.json_response(body) + status_headers = res.status_headers + assert status_headers.statusline == '200 OK' + assert ('Content-Type', 'application/json; charset=utf-8') in status_headers.headers + assert json.loads(res.body[0]) == body + + +def test_wbresponse_init_derived(): + class Derived(WbResponse): + def __init__(self, status_headers, value=None, **kwargs): + self.received_kwargs = dict() + super(Derived, self).__init__(status_headers, value=value, **kwargs) + + def _init_derived(self, params): + self.received_kwargs.update(params) + + dres = Derived(None, pywb=1, wr=2) + assert dres.received_kwargs == dict(pywb=1, wr=2) + + +def test_wbresponse_callable(): + expected_body = dict(pywb=1, wr=2) + res = WbResponse.json_response(expected_body) + env = dict(REQUEST_METHOD='GET') + expected_passed_values = dict( + status_line='200 OK', + headers=[('Content-Type', 'application/json; charset=utf-8'), ('Content-Length', '17')] + ) + passed_values = dict(status_line=None, headers=None) + + def start_response(status_line, headers): + passed_values['status_line'] = status_line + passed_values['headers'] = headers + + body = res(env, start_response) + assert json.loads(body[0]) == expected_body + assert passed_values == expected_passed_values + + if __name__ == "__main__": import doctest doctest.testmod() diff --git a/pywb/apps/wbrequestresponse.py b/pywb/apps/wbrequestresponse.py index 66aa248f..db9993b7 100644 --- a/pywb/apps/wbrequestresponse.py +++ b/pywb/apps/wbrequestresponse.py @@ -1,39 +1,76 @@ from warcio.statusandheaders import StatusAndHeaders -import json +try: + import ujson as json +except ImportError: # pragma: no cover + import json - 
-#================================================================= +# ================================================================= class WbResponse(object): - """ - Represnts a pywb wsgi response object. + """Represnts a pywb wsgi response object. Holds a status_headers object and a response iter, to be - returned to wsgi container. - """ - def __init__(self, status_headers, value=[], **kwargs): + returned to wsgi container.""" + + def __init__(self, status_headers, value=None, **kwargs): + """ + :param StatusAndHeaders status_headers: The StatusAndHeaders object for this response + :param Any value: The response body + :param Any kwargs: Additional keyword arguments to be passed to subclasses + """ + if value is None: + value = list() self.status_headers = status_headers self.body = value self._init_derived(kwargs) def _init_derived(self, params): + """Receive the kwargs used in construction of this class + + :param Any params: + :return: + :rtype: None + """ pass @staticmethod def text_stream(stream, content_type='text/plain; charset=utf-8', status='200 OK'): - def encode(stream): - for obj in stream: - yield obj.encode('utf-8') + """Utility method for constructing a streaming text response. + :param Any stream: The response body stream + :param str content_type: The content-type of the response + :param str status: The HTTP status line + :return: WbResponse that is a text stream + :rtype WbResponse: + """ if 'charset' not in content_type: content_type += '; charset=utf-8' - return WbResponse.bin_stream(encode(stream), content_type, status) + return WbResponse.bin_stream(WbResponse.encode_stream(stream), content_type, status) + + @staticmethod + def encode_stream(stream): + """Utility method to encode a stream using utf-8. 
+ + :param Any stream: The stream to be encoded using utf-8 + :return: A generator that yields the contents of the stream encoded as utf-8 + """ + for obj in stream: + yield obj.encode('utf-8') @staticmethod def bin_stream(stream, content_type, status='200 OK', - headers=None): + headers=None): + """Utility method for constructing a binary response. + + :param Any stream: The response body stream + :param str content_type: The content-type of the response + :param str status: The HTTP status line + :param list[tuple[str, str]] headers: Additional headers for this response + :return: WbResponse that is a binary stream + :rtype: WbResponse + """ def_headers = [('Content-Type', content_type)] if headers: def_headers += headers @@ -44,6 +81,14 @@ class WbResponse(object): @staticmethod def text_response(text, status='200 OK', content_type='text/plain; charset=utf-8'): + """Utility method for constructing a text response. + + :param str text: The text response body + :param str content_type: The content-type of the response + :param str status: The HTTP status line + :return: WbResponse text response + :rtype: WbResponse + """ encoded_text = text.encode('utf-8') status_headers = StatusAndHeaders(status, [('Content-Type', content_type), @@ -53,21 +98,59 @@ class WbResponse(object): @staticmethod def json_response(obj, status='200 OK', content_type='application/json; charset=utf-8'): + """Utility method for constructing a JSON response. + + :param dict obj: The dictionary to be serialized in JSON format + :param str content_type: The content-type of the response + :param str status: The HTTP status line + :return: WbResponse JSON response + :rtype: WbResponse + """ return WbResponse.text_response(json.dumps(obj), status, content_type) @staticmethod def redir_response(location, status='302 Redirect', headers=None): + """Utility method for constructing redirection response. 
+ + :param str location: The location of the resource redirecting to + :param str status: The HTTP status line + :param list[tuple[str, str]] headers: Additional headers for this response + :return: WbResponse redirection response + :rtype: WbResponse + """ redir_headers = [('Location', location), ('Content-Length', '0')] if headers: redir_headers += headers return WbResponse(StatusAndHeaders(status, redir_headers)) + @staticmethod + def options_response(env): + """Construct WbResponse for OPTIONS based on the WSGI env dictionary + + :param dict env: The WSGI environment dictionary + :return: The WBResponse for the options request + :rtype: WbResponse + """ + status_headers = StatusAndHeaders('200 Ok', [ + ('Content-Type', 'text/plain'), + ('Content-Length', '0'), + ]) + response = WbResponse(status_headers) + response.add_access_control_headers(env=env) + return response + def __call__(self, env, start_response): + """Callable definition to allow WbResponse control over how the response is sent + + :param dict env: The WSGI environment dictionary + :param function start_response: The WSGI start_response function + :return: The response body + """ start_response(self.status_headers.statusline, self.status_headers.headers) - - if env['REQUEST_METHOD'] == 'HEAD' or self.status_headers.statusline.startswith('304'): + request_method = env['REQUEST_METHOD'] + if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'): if hasattr(self.body, 'close'): self.body.close() return [] @@ -75,8 +158,42 @@ class WbResponse(object): return self.body def add_range(self, *args): + """Add HTTP range header values to this response + + :param int args: The values for the range HTTP header + :return: The same WbResponse but with the values for the range HTTP header added + :rtype: WbResponse + """ self.status_headers.add_range(*args) return self + def add_access_control_headers(self, env=None): + """Adds Access-Control* HTTP headers to 
this WbResponse's HTTP headers. + + :param dict env: The WSGI environment dictionary + :return: The same WbResponse but with the values for the Access-Control* HTTP header added + :rtype: WbResponse + """ + allowed_methods = 'GET, POST, PUT, OPTIONS, DELETE, PATCH, HEAD, TRACE, CONNECT' + allowed_origin = None + if env is not None: + acr_method = env.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD') + if acr_method is not None and acr_method not in allowed_methods: + allowed_methods = allowed_methods + ', ' + acr_method + r_method = env.get('REQUEST_METHOD') + if r_method is not None and r_method not in allowed_methods: + allowed_methods = allowed_methods + ', ' + r_method + acr_headers = env.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS') + if acr_headers is not None: + self.status_headers.add_header('Access-Control-Allow-Headers', acr_headers) + allowed_origin = env.get('HTTP_ORIGIN', env.get('HTTP_REFERER', allowed_origin)) + if allowed_origin is None: + allowed_origin = '*' + self.status_headers.replace_header('Access-Control-Allow-Origin', allowed_origin) + self.status_headers.add_header('Access-Control-Allow-Methods', allowed_methods) + self.status_headers.add_header('Access-Control-Allow-Credentials', 'true') + self.status_headers.add_header('Access-Control-Max-Age', '1800') + return self + def __repr__(self): return str(vars(self)) diff --git a/pywb/rewrite/templateview.py b/pywb/rewrite/templateview.py index c972bb87..6dd83094 100644 --- a/pywb/rewrite/templateview.py +++ b/pywb/rewrite/templateview.py @@ -14,9 +14,13 @@ from webassets.env import Resolver from pkg_resources import resource_filename -import json import os +try: + import ujson as json +except ImportError: # pragma: no cover + import json + # ============================================================================ class RelEnvironment(Environment): @@ -27,14 +31,35 @@ class RelEnvironment(Environment): # ============================================================================ class 
JinjaEnv(object): - def __init__(self, paths=['templates', '.', '/'], - packages=['pywb'], - assets_path=None, - globals=None, - overlay=None, - extensions=None, - env_template_params_key='pywb.template_params', - env_template_dir_key='pywb.templates_dir'): + """Pywb JinjaEnv class that provides utility functions used by the templates, + configured template loaders and template paths, and contains the actual Jinja + env used by each template.""" + + def __init__(self, paths=None, + packages=None, + assets_path=None, + globals=None, + overlay=None, + extensions=None, + env_template_params_key='pywb.template_params', + env_template_dir_key='pywb.templates_dir'): + """Construct a new JinjaEnv. + + :param list[str] paths: List of paths to search for templates + :param list[str] packages: List of assets package names + :param str assets_path: Path to a yaml file containing assets + :param dict[str, str] globals: Dictionary of additional globals available during template rendering + :param overlay: + :param list extensions: List of webassets extension classes + :param str env_template_params_key: The full pywb package key for the template params + :param str env_template_dir_key: The full pywb package key for the template directory + """ + + if paths is None: + paths = ['templates', '.', '/'] + + if packages is None: + packages = ['pywb'] self._init_filters() @@ -72,6 +97,13 @@ class JinjaEnv(object): jinja_env.assets_environment = assets_env def _make_loaders(self, paths, packages): + """Initialize the template loaders based on the supplied paths and packages. 
+ + :param list[str] paths: List of paths to search for templates + :param list[str] packages: List of assets package names + :return: A list of loaders to be used for loading the template assets + :rtype: list[FileSystemLoader|PackageLoader] + """ loaders = [] # add loaders for paths for path in paths: @@ -84,6 +116,15 @@ class JinjaEnv(object): return loaders def template_filter(self, param=None): + """Returns a decorator that adds the wrapped function to dictionary of template filters. + + The wrapped function is keyed by either the supplied param (if supplied) + or by the wrapped function's name. + + :param param: Optional name to use instead of the name of the function to be wrapped + :return: A decorator to wrap a template filter function + :rtype: callable + """ def deco(func): name = param or func.__name__ self.filters[name] = func @@ -92,10 +133,18 @@ class JinjaEnv(object): return deco def _init_filters(self): + """Initialize the default pywb provided Jinja filters available during template rendering""" self.filters = {} @self.template_filter() def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): + """Formats the supplied timestamp using format_ + + :param str value: The timestamp to be formatted + :param str format_: The format string + :return: The correctly formatted timestamp as determined by format_ + :rtype: str + """ if format_ == '%s': return timestamp_to_sec(value) else: @@ -104,22 +153,58 @@ class JinjaEnv(object): @self.template_filter('urlsplit') def get_urlsplit(url): + """Splits the supplied URL + + :param str url: The url to be split + :return: The split url + :rtype: urllib.parse.SplitResult + """ split = urlsplit(url) return split @self.template_filter() def tojson(obj): + """Converts the supplied object/array/any to a JSON string if it can be JSONified + + :param any obj: The value to be converted to a JSON string + :return: The JSON string representation of the supplied value + :rtype: str + """ return json.dumps(obj) +
@self.template_filter() + def tobool(bool_val): + """Converts a python boolean to a JS "true" or "false" string + :param any bool_val: A value to be evaluated as a boolean + :return: The string "true" or "false" to be inserted into JS + """ + + return 'true' if bool_val else 'false' + # ============================================================================ class BaseInsertView(object): + """Base class of all template views used by Pywb""" + def __init__(self, jenv, insert_file, banner_view=None): + """Create a new BaseInsertView. + + :param JinjaEnv jenv: The instance of pywb.rewrite.templateview.JinjaEnv to be used + :param str insert_file: The name of the template file + :param BaseInsertView banner_view: The banner_view property of pywb.apps.RewriterApp + """ self.jenv = jenv self.insert_file = insert_file self.banner_view = banner_view def render_to_string(self, env, **kwargs): + """Render this template. + + :param dict env: The WSGI environment associated with the request causing this template to be rendered + :param any kwargs: The keyword arguments to be supplied to the Jinja template render method + :return: The rendered template + :rtype: str + """ template = None template_path = env.get(self.jenv.env_template_dir_key) @@ -149,6 +234,9 @@ class BaseInsertView(object): # ============================================================================ class HeadInsertView(BaseInsertView): + """The template view class associated with rendering the HTML inserted + into the head of the pages replayed (WB Insert).""" + def create_insert_func(self, wb_url, wb_prefix, host_prefix, @@ -158,19 +246,32 @@ class HeadInsertView(BaseInsertView): coll='', include_ts=True, **kwargs): + """Create the function used to render the header insert template for the current request. + :param rewrite.wburl.WbUrl wb_url: The WbUrl for the request this template is being rendered for + :param str wb_prefix: The URL prefix pywb is serving the content using (e.g.
http://localhost:8080/live/) + :param str host_prefix: The host URL prefix pywb is running on (e.g. http://localhost:8080) + :param str top_url: The full URL for this request (e.g. http://localhost:8080/live/http://example.com) + :param dict env: The WSGI environment dictionary for this request + :param bool is_framed: Is pywb or a specific collection running in framed mode + :param str coll: The name of the collection this request is associated with + :param bool include_ts: Should a timestamp be included in the rendered template + :param kwargs: Additional keyword arguments to be supplied to the Jinja template render method + :return: A function to be used to render the header insert for the request this template is being rendered for + :rtype: callable + """ params = kwargs params['host_prefix'] = host_prefix params['wb_prefix'] = wb_prefix params['wb_url'] = wb_url params['top_url'] = top_url params['coll'] = coll - params['is_framed'] = 'true' if is_framed else 'false' + params['is_framed'] = is_framed def make_head_insert(rule, cdx): params['wombat_ts'] = cdx['timestamp'] if include_ts else '' params['wombat_sec'] = timestamp_to_sec(cdx['timestamp']) - params['is_live'] = 'true' if cdx.get('is_live') else 'false' + params['is_live'] = cdx.get('is_live') if self.banner_view: banner_html = self.banner_view.render_to_string(env, cdx=cdx, **params) @@ -183,6 +284,8 @@ class HeadInsertView(BaseInsertView): # ============================================================================ class TopFrameView(BaseInsertView): + """The template view class associated with rendering the replay iframe""" + def get_top_frame(self, wb_url, wb_prefix, host_prefix, @@ -191,6 +294,18 @@ class TopFrameView(BaseInsertView): replay_mod, coll='', extra_params=None): + """ + :param rewrite.wburl.WbUrl wb_url: The WbUrl for the request this template is being rendered for + :param str wb_prefix: The URL prefix pywb is serving the content using (e.g.
http://localhost:8080/live/) + :param str host_prefix: The host URL prefix pywb is running on (e.g. http://localhost:8080) + :param dict env: The WSGI environment dictionary for the request this template is being rendered for + :param str frame_mod: The modifier to be used for framing (e.g. if_) + :param str replay_mod: The modifier to be used in the URL of the page being replayed (e.g. mp_) + :param str coll: The name of the collection this template is being rendered for + :param dict extra_params: Additional parameters to be supplied to the Jinja template render method + :return: The frame insert string + :rtype: str + """ embed_url = wb_url.to_str(mod=replay_mod) @@ -227,7 +342,15 @@ class TopFrameView(BaseInsertView): # ============================================================================ class PkgResResolver(Resolver): + """Class for resolving pywb package resources when installed via pypi or setup.py""" + def get_pkg_path(self, item): + """Get the package path for the supplied item + + :param str item: A resource's full package path + :return: The netloc and path from the item's package path + :rtype: tuple[str, str] + """ if not isinstance(item, str): return None diff --git a/pywb/static/wombatPreservationWorker.js b/pywb/static/autoFetchWorker.js similarity index 63% rename from pywb/static/wombatPreservationWorker.js rename to pywb/static/autoFetchWorker.js index 02858a37..b5d46ba9 100644 --- a/pywb/static/wombatPreservationWorker.js +++ b/pywb/static/autoFetchWorker.js @@ -3,8 +3,8 @@ var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi; var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi; var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; -// the preserver instance for this worker -var preserver = null; +// the autofetcher instance for this worker +var autofetcher = null; function noop() {} @@ -41,31 +41,25 @@ self.onmessage = function (event) { var data = event.data; switch (data.type) { case 'values': -
preserver.preserveMediaSrcset(data); + autofetcher.autofetchMediaSrcset(data); break; } }; -function pMap(p) { - // mapping function to ensure each fetch promises catch has a no op cb - return p.catch(noop); -} - -function Preserver(prefix, mod) { - if (!(this instanceof Preserver)) { - return new Preserver(prefix, mod); +function AutoFetcher(init) { + if (!(this instanceof AutoFetcher)) { + return new AutoFetcher(init); } - this.prefix = prefix; - this.mod = mod; - this.prefixMod = prefix + mod; + this.proxyMode = init.proxyMode; + this.prefix = init.prefix; + this.mod = init.mod; + this.prefixMod = init.prefix + init.mod; // relative url, WorkerLocation is set by owning document - this.relative = prefix.split(location.origin)[1]; + this.relative = init.prefix.split(location.origin)[1]; // schemeless url this.schemeless = '/' + this.relative; // local cache of URLs fetched, to reduce server load this.seen = {}; - // counter used to know when to clear seen (count > 2500) - this.seenCount = 0; // array of promises returned by fetch(URL) this.fetches = []; // array of URL to be fetched @@ -76,7 +70,7 @@ function Preserver(prefix, mod) { this.fetchDone = this.fetchDone.bind(this); } -Preserver.prototype.fixupURL = function (url) { +AutoFetcher.prototype.fixupURL = function (url) { // attempt to fix up the url and do our best to ensure we can get dat 200 OK! 
if (url.indexOf(this.prefixMod) === 0) { return url; @@ -93,57 +87,54 @@ Preserver.prototype.fixupURL = function (url) { return url; }; -Preserver.prototype.safeFetch = function (url) { +AutoFetcher.prototype.safeFetch = function (url) { var fixedURL = this.fixupURL(url); // check to see if we have seen this url before in order - // to lessen the load against the server content is preserved from + // to lessen the load against the server content is fetched from if (this.seen[url] != null) return; this.seen[url] = true; if (this.queuing) { // we are currently waiting for a batch of fetches to complete return this.queue.push(fixedURL); } - // queue this urls fetch - this.fetches.push(fetch(fixedURL)); + // fetch this url + this.fetches.push(fetch(url)); }; -Preserver.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { +AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { // Same function as style_replacer in wombat.rewrite_style, n2 is our URL this.safeFetch(n2); return n1 + n2 + n3; }; -Preserver.prototype.fetchDone = function () { - // clear our fetches array in place - // https://www.ecma-international.org/ecma-262/9.0/index.html#sec-properties-of-array-instances-length - this.fetches.length = 0; +AutoFetcher.prototype.fetchDone = function () { // indicate we no longer need to Q this.queuing = false; if (this.queue.length > 0) { // we have a Q of some length drain it this.drainQ(); - } else if (this.seenCount > 2500) { - // we seen 2500 URLs so lets free some memory as at this point - // we will probably see some more. GC it! 
- this.seen = {}; - this.seenCount = 0; } }; -Preserver.prototype.fetchAll = function () { +AutoFetcher.prototype.fetchAll = function () { // if we are queuing or have no fetches this is a no op if (this.queuing) return; if (this.fetches.length === 0) return; // we are about to fetch queue anything that comes our way this.queuing = true; - // initiate fetches by turning the initial fetch promises - // into rejctionless promises and "await" all - Promise.all(this.fetches.map(pMap)) + /// initiate fetches by turning the initial fetch promises + // into rejctionless promises and "await" all clearing + // our fetches array in place + var runningFetchers = []; + while (this.fetches.length > 0) { + runningFetchers.push(this.fetches.shift().catch(noop)) + } + Promise.all(runningFetchers) .then(this.fetchDone) .catch(this.fetchDone); }; -Preserver.prototype.drainQ = function () { +AutoFetcher.prototype.drainQ = function () { // clear our Q in place and fill our fetches array while (this.queue.length > 0) { this.fetches.push(fetch(this.queue.shift())); @@ -152,17 +143,18 @@ Preserver.prototype.drainQ = function () { this.fetchAll(); }; -Preserver.prototype.extractMedia = function (mediaRules) { +AutoFetcher.prototype.extractMedia = function (mediaRules) { // this is a broken down rewrite_style - if (mediaRules == null) return; - for (var i = 0; i < mediaRules.length; i++) { - var rule = mediaRules[i]; - rule.replace(STYLE_REGEX, this.urlExtractor); - rule.replace(IMPORT_REGEX, this.urlExtractor); + if (mediaRules == null || mediaRules.values === null) return; + var rules = mediaRules.values; + for (var i = 0; i < rules.length; i++) { + var rule = rules[i]; + rule.replace(STYLE_REGEX, this.urlExtractor) + .replace(IMPORT_REGEX, this.urlExtractor); } }; -Preserver.prototype.extractSrcset = function (srcsets) { +AutoFetcher.prototype.extractSrcset = function (srcsets) { if (srcsets == null || srcsets.values == null) return; var srcsetValues = srcsets.values; // was srcsets 
from rewrite_srcset and if so no need to split @@ -175,19 +167,21 @@ Preserver.prototype.extractSrcset = function (srcsets) { this.safeFetch(srcset.split(' ')[0]); } else { // was from extract from local doc so we need to duplicate work - var values = srcset.split(srcsetSplit).filter(Boolean); + var values = srcset.split(srcsetSplit); for (var j = 0; j < values.length; j++) { - var value = values[j].trim(); - if (value.length > 0) { - this.safeFetch(value.split(' ')[0]); + if (Boolean(values[j])) { + var value = values[j].trim(); + if (value.length > 0) { + this.safeFetch(value.split(' ')[0]); + } } } } } }; -Preserver.prototype.preserveMediaSrcset = function (data) { - // we got a message and now we preserve! +AutoFetcher.prototype.autofetchMediaSrcset = function (data) { + // we got a message and now we autofetch! // these calls turn into no ops if they have no work this.extractMedia(data.media); this.extractSrcset(data.srcset); @@ -197,9 +191,12 @@ Preserver.prototype.preserveMediaSrcset = function (data) { // initialize ourselves from the query params :) try { var loc = new self.URL(location); - preserver = new Preserver(loc.searchParams.get('prefix'), loc.searchParams.get('mod')); + autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init'))); } catch (e) { // likely we are in an older version of safari var search = decodeURIComponent(location.search.split('?')[1]).split('&'); - preserver = new Preserver(search[0].substr(search[0].indexOf('=') + 1), search[1].substr(search[1].indexOf('=') + 1)); + var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1)); + init.prefix = decodeURIComponent(init.prefix); + init.baseURI = decodeURIComponent(init.baseURI); + autofetcher = new AutoFetcher(init); } diff --git a/pywb/static/autoFetchWorkerProxyMode.js b/pywb/static/autoFetchWorkerProxyMode.js new file mode 100644 index 00000000..95832840 --- /dev/null +++ b/pywb/static/autoFetchWorkerProxyMode.js @@ -0,0 +1,192 @@ +'use strict'; +// thanks 
wombat +var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi; +var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi; +var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; +// the autofetcher instance for this worker +var autofetcher = null; + +function noop() {} + +if (typeof self.Promise === 'undefined') { + // not kewl we must polyfill Promise + self.Promise = function (executor) { + executor(noop, noop); + }; + self.Promise.prototype.then = function (cb) { + if (cb) cb(); + return this; + }; + self.Promise.prototype.catch = function () { + return this; + }; + self.Promise.all = function (values) { + return new Promise(noop); + }; +} + + +if (typeof self.fetch === 'undefined') { + // not kewl we must polyfill fetch. + self.fetch = function (url) { + return new Promise(function (resolve) { + var xhr = new XMLHttpRequest(); + xhr.open('GET', url); + xhr.send(); + resolve(); + }); + }; +} + +self.onmessage = function (event) { + var data = event.data; + switch (data.type) { + case 'values': + autofetcher.autofetchMediaSrcset(data); + break; + } +}; + +function AutoFetcher() { + if (!(this instanceof AutoFetcher)) { + return new AutoFetcher(); + } + // local cache of URLs fetched, to reduce server load + this.seen = {}; + // array of promises returned by fetch(URL) + this.fetches = []; + // array of URL to be fetched + this.queue = []; + // should we queue a URL or not + this.queuing = false; + // a URL to resolve relative URLs found in the cssText of CSSMedia rules. 
+ this.currentResolver = null; + this.urlExtractor = this.urlExtractor.bind(this); + this.fetchDone = this.fetchDone.bind(this); +} + +AutoFetcher.prototype.safeFetch = function (url) { + // ensure we do not request data urls + if (url.indexOf('data:') === 0) return; + // check to see if we have seen this url before in order + // to lessen the load against the server content is autofetched from + if (this.seen[url] != null) return; + this.seen[url] = true; + if (this.queuing) { + // we are currently waiting for a batch of fetches to complete + return this.queue.push(url); + } + // fetch this url + this.fetches.push(fetch(url)); +}; + +AutoFetcher.prototype.safeResolve = function (url, resolver) { + // Guard against the exception thrown by the URL constructor if the URL or resolver is bad + // if resolver is undefined/null then this function passes url through + var resolvedURL = url; + if (resolver) { + try { + resolvedURL = (new URL(url, resolver)).href + } catch (e) { + resolvedURL = url; + } + } + return resolvedURL; +}; + + +AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) { + // Same function as style_replacer in wombat.rewrite_style, n2 is our URL + // this.currentResolver is set to the URL which the browser would normally + // resolve relative urls with (URL of the stylesheet) in an exceptionless manner + // (resolvedURL falls back to the unresolved URL if an error occurred) + var resolvedURL = this.safeResolve(n2, this.currentResolver); + if (resolvedURL) { + this.safeFetch(resolvedURL); + } + return n1 + n2 + n3; +}; + +AutoFetcher.prototype.fetchDone = function () { + // indicate we no longer need to Q + this.queuing = false; + if (this.queue.length > 0) { + // we have a Q of some length drain it + this.drainQ(); + } +}; + +AutoFetcher.prototype.fetchAll = function () { + // if we are queuing or have no fetches this is a no op + if (this.queuing) return; + if (this.fetches.length === 0) return; + // we are about to fetch queue anything
that comes our way + this.queuing = true; + // initiate fetches by turning the initial fetch promises + // into rejectionless promises and "await" all clearing + // our fetches array in place + var runningFetchers = []; + while (this.fetches.length > 0) { + runningFetchers.push(this.fetches.shift().catch(noop)) + } + Promise.all(runningFetchers) + .then(this.fetchDone) + .catch(this.fetchDone); +}; + +AutoFetcher.prototype.drainQ = function () { + // clear our Q in place and fill our fetches array + while (this.queue.length > 0) { + this.fetches.push(fetch(this.queue.shift())); + } + // fetch all the things + this.fetchAll(); +}; + +AutoFetcher.prototype.extractMedia = function (mediaRules) { + // this is a broken down rewrite_style + if (mediaRules == null) return; + for (var i = 0; i < mediaRules.length; i++) { + // set currentResolver to the value of this stylesheets URL, done to ensure we do not have to + // create functions on each loop iteration because we potentially create a new `URL` object + // twice per iteration + this.currentResolver = mediaRules[i].resolve; + mediaRules[i].cssText + .replace(STYLE_REGEX, this.urlExtractor) + .replace(IMPORT_REGEX, this.urlExtractor); + } +}; + +AutoFetcher.prototype.extractSrcset = function (srcsets) { + // preservation worker in proxy mode sends us the value of the srcset attribute of an element + // and a URL to correctly resolve relative URLS.
Thus we must recreate rewrite_srcset logic here + if (srcsets == null) return; + var length = srcsets.length; + var extractedSrcSet, srcsetValue, ssSplit, j; + for (var i = 0; i < length; i++) { + extractedSrcSet = srcsets[i]; + ssSplit = extractedSrcSet.srcset.split(srcsetSplit); + for (j = 0; j < ssSplit.length; j++) { + if (Boolean(ssSplit[j])) { + srcsetValue = ssSplit[j].trim(); + if (srcsetValue.length > 0) { + // resolve the URL in an exceptionless manner (resolvedURL will be undefined if an error occurred) + var resolvedURL = this.safeResolve(srcsetValue.split(' ')[0], extractedSrcSet.resolve); + if (resolvedURL) { + this.safeFetch(resolvedURL); + } + } + } + } + } +}; + +AutoFetcher.prototype.autofetchMediaSrcset = function (data) { + // we got a message and now we autofetch! + // these calls turn into no ops if they have no work + this.extractMedia(data.media); + this.extractSrcset(data.srcset); + this.fetchAll(); +}; + +autofetcher = new AutoFetcher(); diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index b116ec33..0d11050c 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -78,9 +78,9 @@ var _WBWombat = function($wbwindow, wbinfo) { var wb_setAttribute = $wbwindow.Element.prototype.setAttribute; var wb_getAttribute = $wbwindow.Element.prototype.getAttribute; var wb_funToString = Function.prototype.toString; - var WBPreserWorker; + var WBAutoFetchWorker; var wbSheetMediaQChecker; - var wbUsePresWorker = $wbwindow.Worker != null && wbinfo.is_live; + var wbUseAAWorker = $wbwindow.Worker != null && wbinfo.is_live; var wb_info; @@ -131,6 +131,11 @@ var _WBWombat = function($wbwindow, wbinfo) { 'TRACK': {'src': 'oe_'}, }; + // pulled up rewrite_style and rewrite_srcset regex's as they are considered globals (uppercase) + var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi; + var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi; + var SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/; + 
function rwModForElement(elem, attrName) { // this function was created to help add in retrial of element attribute rewrite modifiers if (!elem) { @@ -1329,85 +1334,91 @@ var _WBWombat = function($wbwindow, wbinfo) { } //============================================ - function initPreserveWorker() { - if (!wbUsePresWorker) { + function initAutoFetchWorker() { + if (!wbUseAAWorker) { return; } - var Preserver = (function(Worker) { - function PWorker(prefix, mod) { - if (!(this instanceof PWorker)) { - return new PWorker(prefix, mod); - } - if ($wbwindow === $wbwindow.__WB_replay_top) { - // we are top and can will own this worker - // setup URL for the kewl case + var isTop = $wbwindow === $wbwindow.__WB_replay_top; + + function AutoFetchWorker(prefix, mod) { + if (!(this instanceof AutoFetchWorker)) { + return new AutoFetchWorker(prefix, mod); + } + this.checkIntervalCB = this.checkIntervalCB.bind(this); + if (isTop) { + // we are top and can will own this worker + // setup URL for the kewl case + // Normal replay and preservation mode pworker setup, its all one origin so YAY! 
var workerURL = wbinfo.static_prefix + - 'wombatPreservationWorker.js?prefix=' + - encodeURIComponent(prefix) + '&mod=' + - encodeURIComponent(mod); - this.worker = new Worker(workerURL); - } else { - this.worker = null; + 'autoFetchWorker.js?init='+ + encodeURIComponent(JSON.stringify({ 'mod': mod, 'prefix': prefix })); + this.worker = new $wbwindow.Worker(workerURL); + } else { + // add only the portions of the worker interface we use since we are not top and if in proxy mode start check polling + this.worker = { + "postMessage": function (msg) { + if (!msg.wb_type) { + msg = { 'wb_type': 'aaworker', 'msg': msg }; + } + $wbwindow.__WB_replay_top.__orig_postMessage(msg, '*'); + }, + "terminate": function () {} + }; + } + } + + AutoFetchWorker.prototype.checkIntervalCB = function () { + this.extractFromLocalDoc(); + }; + + AutoFetchWorker.prototype.deferredSheetExtraction = function (sheet) { + var rules = sheet.cssRules || sheet.rules; + // if no rules this a no op + if (!rules || rules.length === 0) return; + var self = this; + function extract() { + // loop through each rule of the stylesheet + var media = []; + for (var j = 0; j < rules.length; ++j) { + var rule = rules[j]; + if (rule.type === CSSRule.MEDIA_RULE) { + // we are a media rule so get its text + media.push(rule.cssText); + } + } + if (media.length > 0) { + // we have some media rules to preserve + self.preserveMedia(media); } } + // defer things until next time the Promise.resolve Qs are cleared + $wbwindow.Promise.resolve().then(extract); + }; - PWorker.prototype.deferredSheetExtraction = function(sheet) { - var rules = sheet.cssRules || sheet.rules; - // if no rules this a no op - if (!rules || rules.length === 0) return; - function extract() { - // loop through each rule of the stylesheet - var media = []; - for (var j = 0; j < rules.length; ++j) { - var rule = rules[j]; - if (rule instanceof CSSMediaRule) { - // we are a media rule so get its text - media.push(rule.cssText); - } - } - if 
(media.length > 0) { - // we have some media rules to preserve - WBPreserWorker.preserveMedia(media); - } - } - // defer things until next time the Promise.resolve Qs are cleared - $wbwindow.Promise.resolve().then(extract); - }; + AutoFetchWorker.prototype.terminate = function () { + // terminate the worker, a no op when not replay top + this.worker.terminate(); + }; - PWorker.prototype.terminate = function() { - // terminate the worker, a no op when not replay top - if ($wbwindow === $wbwindow.__WB_replay_top) { - this.worker.terminate(); - } - }; + AutoFetchWorker.prototype.postMessage = function (msg) { + this.worker.postMessage(msg); + }; - PWorker.prototype.postMessage = function(msg) { - if ($wbwindow === $wbwindow.__WB_replay_top) { - // we are actually replay top so send directly to worker - this.worker.postMessage(msg); - } else { - // send message to replay top - $wbwindow.__WB_replay_top.__orig_postMessage({ - 'wb_type': 'pworker', 'msg': msg, - }, '*'); - } - }; + AutoFetchWorker.prototype.preserveSrcset = function (srcset) { + // send values from rewrite_srcset to the worker + this.postMessage({ + 'type': 'values', + 'srcset': {'values': srcset, 'presplit': true}, + }); + }; - PWorker.prototype.preserveSrcset = function(srcset) { - // send values from rewrite_srcset to the worker - this.postMessage({ - 'type': 'values', - 'srcset': {'values': srcset, 'presplit': true}, - }); - }; + AutoFetchWorker.prototype.preserveMedia = function (media) { + // send CSSMediaRule values to the worker + this.postMessage({'type': 'values', 'media': media}) + }; - PWorker.prototype.preserveMedia = function(media) { - // send CSSMediaRule values to the worker - this.postMessage({'type': 'values', 'media': media}) - }; - - PWorker.prototype.extractFromLocalDoc = function() { + AutoFetchWorker.prototype.extractFromLocalDoc = function () { // get the values to be preserved from the documents stylesheets // and all elements with a srcset var media = []; @@ -1415,20 +1426,19 
@@ var _WBWombat = function($wbwindow, wbinfo) { var sheets = $wbwindow.document.styleSheets; var i = 0; for (; i < sheets.length; ++i) { - var sheet = sheets[i]; - var rules = sheet.cssRules; + var rules = sheets[i].cssRules; for (var j = 0; j < rules.length; ++j) { var rule = rules[j]; - if (rule instanceof CSSMediaRule) { + if (rule.type === CSSRule.MEDIA_RULE) { media.push(rule.cssText); } } } - var srcsetElems = $wbwindow.document.querySelectorAll('*[srcset]'); + var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]'); for (i = 0; i < srcsetElems.length; i++) { var srcsetElem = srcsetElems[i]; if (wb_getAttribute) { - srcset.push(wb_getAttribute.call(srcsetElem,'srcset')); + srcset.push(wb_getAttribute.call(srcsetElem, 'srcset')); } else { srcset.push(srcsetElem.getAttribute('srcset')); } @@ -1440,18 +1450,15 @@ var _WBWombat = function($wbwindow, wbinfo) { }); }; - return PWorker; - })($wbwindow.Worker); + WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod); - WBPreserWorker = new Preserver(wb_abs_prefix, wbinfo.mod); - - wbSheetMediaQChecker = function checkStyle () { + wbSheetMediaQChecker = function checkStyle() { // used only for link[rel='stylesheet'] so we remove our listener this.removeEventListener('load', wbSheetMediaQChecker); // check no op condition if (this.sheet == null) return; // defer extraction to be nice :) - WBPreserWorker.deferredSheetExtraction(this.sheet); + WBAutoFetchWorker.deferredSheetExtraction(this.sheet); }; } @@ -1612,10 +1619,6 @@ var _WBWombat = function($wbwindow, wbinfo) { //============================================ function rewrite_style(value) { - var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi; - - var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi; - function style_replacer(match, n1, n2, n3, offset, string) { return n1 + rewrite_url(n2) + n3; } @@ -1645,14 +1648,14 @@ var _WBWombat = function($wbwindow, wbinfo) { } // Filter removes non-truthy values like 
null, undefined, and "" - var values = value.split(/\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/).filter(Boolean); + var values = value.split(SRCSET_REGEX).filter(Boolean); for (var i = 0; i < values.length; i++) { values[i] = rewrite_url(values[i].trim()); } - if (wbUsePresWorker) { + if (wbUseAAWorker) { // send post split values to preservation worker - WBPreserWorker.preserveSrcset(values); + WBAutoFetchWorker.preserveSrcset(values); } return values.join(", "); } @@ -1756,16 +1759,16 @@ var _WBWombat = function($wbwindow, wbinfo) { if (elem.textContent !== new_content) { elem.textContent = new_content; changed = true; - if (wbUsePresWorker && elem.sheet != null) { + if (wbUseAAWorker && elem.sheet != null) { // we have a stylesheet so lets be nice to UI thread // and defer extraction - WBPreserWorker.deferredSheetExtraction(elem.sheet); + WBAutoFetchWorker.deferredSheetExtraction(elem.sheet); } } break; case 'LINK': changed = rewrite_attr(elem, 'href'); - if (wbUsePresWorker && elem.rel === 'stylesheet') { + if (wbUseAAWorker && elem.rel === 'stylesheet') { // we can only check link[rel='stylesheet'] when it loads elem.addEventListener('load', wbSheetMediaQChecker); } @@ -2194,9 +2197,9 @@ var _WBWombat = function($wbwindow, wbinfo) { } } orig_setter.call(this, res); - if (wbUsePresWorker && this.tagName === 'STYLE' && this.sheet != null) { + if (wbUseAAWorker && this.tagName === 'STYLE' && this.sheet != null) { // got preserve all the things - WBPreserWorker.deferredSheetExtraction(this.sheet); + WBAutoFetchWorker.deferredSheetExtraction(this.sheet); } }; @@ -3602,140 +3605,138 @@ var _WBWombat = function($wbwindow, wbinfo) { init_wombat_loc($wbwindow); // archival mode: init url-rewriting intercepts - if (!wb_is_proxy) { - init_wombat_top($wbwindow); + init_wombat_top($wbwindow); - // updated wb_unrewrite_rx for imgur.com - var wb_origin = $wbwindow.__WB_replay_top.location.origin; - var wb_host = $wbwindow.__WB_replay_top.location.host; - var 
wb_proto = $wbwindow.__WB_replay_top.location.protocol; - if (wb_replay_prefix && wb_replay_prefix.indexOf(wb_origin) == 0) { - wb_rel_prefix = wb_replay_prefix.substring(wb_origin.length); - } else { - wb_rel_prefix = wb_replay_prefix; - } - - // make the protocol and host optional now - var rx = "((" + wb_proto + ")?\/\/" + wb_host + ")?" + wb_rel_prefix + "[^/]+/"; - wb_unrewrite_rx = new RegExp(rx, "g"); - - // History - init_history_overrides(); - - // Doc Title - init_doc_title_override(); - - // postMessage - // OPT skip - if (!wb_opts.skip_postmessage) { - init_postmessage_override($wbwindow); - init_messageevent_override($wbwindow); - } - - initMouseEventOverride($wbwindow); - - init_hash_change(); - - // write - init_write_override(); - - // eval - //init_eval_override(); - - // Ajax - init_ajax_rewrite(); - - // Fetch - init_fetch_rewrite(); - init_request_override(); - - // Audio - init_audio_override(); - - // FontFace - initFontFaceOverride($wbwindow); - - // Worker override (experimental) - initPreserveWorker(); - init_web_worker_override(); - init_service_worker_override(); - initSharedWorkerOverride(); - - - // innerHTML can be overriden on prototype! 
- override_html_assign($wbwindow.HTMLElement, "innerHTML", true); - override_html_assign($wbwindow.HTMLElement, "outerHTML", true); - override_html_assign($wbwindow.HTMLIFrameElement, "srcdoc", true); - override_html_assign($wbwindow.HTMLStyleElement, "textContent"); - - // Document.URL override - override_prop_extract($wbwindow.Document.prototype, "URL"); - override_prop_extract($wbwindow.Document.prototype, "documentURI"); - - // Node.baseURI override - override_prop_extract($wbwindow.Node.prototype, "baseURI"); - - // Attr nodeValue and value - override_attr_props(); - - // init insertAdjacentHTML() override - init_insertAdjacentHTML_override(); - initInsertAdjacentElementOverride(); - - - // iframe.contentWindow and iframe.contentDocument overrides to - // ensure wombat is inited on the iframe $wbwindow! - override_iframe_content_access("contentWindow"); - override_iframe_content_access("contentDocument"); - - // override funcs to convert first arg proxy->obj - override_func_first_arg_proxy_to_obj($wbwindow.MutationObserver, "observe"); - override_func_first_arg_proxy_to_obj($wbwindow.Node, "compareDocumentPosition"); - override_func_first_arg_proxy_to_obj($wbwindow.Node, "contains"); - override_func_first_arg_proxy_to_obj($wbwindow.Document, "createTreeWalker"); - - override_func_this_proxy_to_obj($wbwindow, "getComputedStyle", $wbwindow); - //override_func_this_proxy_to_obj($wbwindow.EventTarget, "addEventListener"); - //override_func_this_proxy_to_obj($wbwindow.EventTarget, "removeEventListener"); - - override_apply_func($wbwindow); - initTimeoutIntervalOverrides($wbwindow, "setTimeout"); - initTimeoutIntervalOverrides($wbwindow, "setInterval"); - - override_frames_access($wbwindow); - - // setAttribute - if (!wb_opts.skip_setAttribute) { - init_setAttribute_override(); - init_getAttribute_override(); - } - init_svg_image_overrides(); - - // override href and src attrs - init_attr_overrides(); - - // Cookies - init_cookies_override(); - - // ensure namespace 
urls are NOT rewritten - init_createElementNS_fix(); - - // Image - //init_image_override(); - - // DOM - // OPT skip - if (!wb_opts.skip_dom) { - init_dom_override(); - } - - // registerProtocolHandler override - init_registerPH_override(); - - //sendBeacon override - init_beacon_override(); + // updated wb_unrewrite_rx for imgur.com + var wb_origin = $wbwindow.__WB_replay_top.location.origin; + var wb_host = $wbwindow.__WB_replay_top.location.host; + var wb_proto = $wbwindow.__WB_replay_top.location.protocol; + if (wb_replay_prefix && wb_replay_prefix.indexOf(wb_origin) == 0) { + wb_rel_prefix = wb_replay_prefix.substring(wb_origin.length); + } else { + wb_rel_prefix = wb_replay_prefix; } + // make the protocol and host optional now + var rx = "((" + wb_proto + ")?\/\/" + wb_host + ")?" + wb_rel_prefix + "[^/]+/"; + wb_unrewrite_rx = new RegExp(rx, "g"); + + // History + init_history_overrides(); + + // Doc Title + init_doc_title_override(); + + // postMessage + // OPT skip + if (!wb_opts.skip_postmessage) { + init_postmessage_override($wbwindow); + init_messageevent_override($wbwindow); + } + + initMouseEventOverride($wbwindow); + + init_hash_change(); + + // write + init_write_override(); + + // eval + //init_eval_override(); + + // Ajax + init_ajax_rewrite(); + + // Fetch + init_fetch_rewrite(); + init_request_override(); + + // Audio + init_audio_override(); + + // FontFace + initFontFaceOverride($wbwindow); + + // Worker override (experimental) + initAutoFetchWorker(); + init_web_worker_override(); + init_service_worker_override(); + initSharedWorkerOverride(); + + + // innerHTML can be overriden on prototype! 
+ override_html_assign($wbwindow.HTMLElement, "innerHTML", true); + override_html_assign($wbwindow.HTMLElement, "outerHTML", true); + override_html_assign($wbwindow.HTMLIFrameElement, "srcdoc", true); + override_html_assign($wbwindow.HTMLStyleElement, "textContent"); + + // Document.URL override + override_prop_extract($wbwindow.Document.prototype, "URL"); + override_prop_extract($wbwindow.Document.prototype, "documentURI"); + + // Node.baseURI override + override_prop_extract($wbwindow.Node.prototype, "baseURI"); + + // Attr nodeValue and value + override_attr_props(); + + // init insertAdjacentHTML() override + init_insertAdjacentHTML_override(); + initInsertAdjacentElementOverride(); + + + // iframe.contentWindow and iframe.contentDocument overrides to + // ensure wombat is inited on the iframe $wbwindow! + override_iframe_content_access("contentWindow"); + override_iframe_content_access("contentDocument"); + + // override funcs to convert first arg proxy->obj + override_func_first_arg_proxy_to_obj($wbwindow.MutationObserver, "observe"); + override_func_first_arg_proxy_to_obj($wbwindow.Node, "compareDocumentPosition"); + override_func_first_arg_proxy_to_obj($wbwindow.Node, "contains"); + override_func_first_arg_proxy_to_obj($wbwindow.Document, "createTreeWalker"); + + + override_func_this_proxy_to_obj($wbwindow, "getComputedStyle", $wbwindow); + //override_func_this_proxy_to_obj($wbwindow.EventTarget, "addEventListener"); + //override_func_this_proxy_to_obj($wbwindow.EventTarget, "removeEventListener"); + + override_apply_func($wbwindow); + initTimeoutIntervalOverrides($wbwindow, "setTimeout"); + initTimeoutIntervalOverrides($wbwindow, "setInterval"); + + override_frames_access($wbwindow); + + // setAttribute + if (!wb_opts.skip_setAttribute) { + init_setAttribute_override(); + init_getAttribute_override(); + } + init_svg_image_overrides(); + + // override href and src attrs + init_attr_overrides(); + + // Cookies + init_cookies_override(); + + // ensure 
namespace urls are NOT rewritten + init_createElementNS_fix(); + + // Image + //init_image_override(); + + // DOM + // OPT skip + if (!wb_opts.skip_dom) { + init_dom_override(); + } + + // registerProtocolHandler override + init_registerPH_override(); + + //sendBeacon override + init_beacon_override(); // other overrides // proxy mode: only using these overrides @@ -3765,13 +3766,13 @@ var _WBWombat = function($wbwindow, wbinfo) { init_document_obj_proxy($wbwindow.document); // expose functions - var obj = {} + var obj = {}; obj.extract_orig = extract_orig; obj.rewrite_url = rewrite_url; obj.watch_elem = watch_elem; obj.init_new_window_wombat = init_new_window_wombat; obj.init_paths = init_paths; - obj.local_init = function(name) { + obj.local_init = function (name) { var res = $wbwindow._WB_wombat_obj_proxy[name]; if (name === "document" && res && !res._WB_wombat_obj_proxy) { return init_document_obj_proxy(res) || res; @@ -3812,8 +3813,8 @@ var _WBWombat = function($wbwindow, wbinfo) { return; } - if ($wbwindow.document.readyState === "complete" && wbUsePresWorker) { - WBPreserWorker.extractFromLocalDoc(); + if ($wbwindow.document.readyState === "complete" && wbUseAAWorker) { + WBAutoFetchWorker.extractFromLocalDoc(); } if ($wbwindow != $wbwindow.__WB_replay_top) { @@ -3925,10 +3926,10 @@ var _WBWombat = function($wbwindow, wbinfo) { // Fix .parent only if not embeddable, otherwise leave for accessing embedding window if (!wb_opts.embedded && (replay_top == $wbwindow)) { - if (wbUsePresWorker) { + if (wbUseAAWorker) { $wbwindow.addEventListener("message", function(event) { - if (event.data && event.data.wb_type === 'pworker') { - WBPreserWorker.postMessage(event.data.msg); + if (event.data && event.data.wb_type === 'aaworker') { + WBAutoFetchWorker.postMessage(event.data.msg); } }, false); } @@ -3982,8 +3983,6 @@ var _WBWombat = function($wbwindow, wbinfo) { } - - // Utility functions used by rewriting rules function watch_elem(elem, func) { diff --git 
a/pywb/static/wombatProxyMode.js b/pywb/static/wombatProxyMode.js new file mode 100644 index 00000000..e5dc6086 --- /dev/null +++ b/pywb/static/wombatProxyMode.js @@ -0,0 +1,376 @@ +/* +Copyright(c) 2013-2018 Rhizome and Ilya Kreymer. Released under the GNU General Public License. + +This file is part of pywb, https://github.com/webrecorder/pywb + + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with pywb. If not, see . + */ + +//============================================ +// Wombat JS-Rewriting Library v2.53 +//============================================ + +// Wombat lite for proxy-mode +var _WBWombat = function ($wbwindow, wbinfo) { + // Globals + var wb_info = wbinfo; + wb_info.top_host = wb_info.top_host || "*"; + wbinfo.wombat_opts = wbinfo.wombat_opts || {}; + var wbAutoFetchWorkerPrefix = (wb_info.auto_fetch_worker_prefix || wb_info.static_prefix) + 'autoFetchWorkerProxyMode.js'; + var WBAutoFetchWorker; + + function init_seeded_random(seed) { + // Adapted from: + // http://indiegamr.com/generate-repeatable-random-numbers-in-js/ + + $wbwindow.Math.seed = parseInt(seed); + + function seeded_random() { + $wbwindow.Math.seed = ($wbwindow.Math.seed * 9301 + 49297) % 233280; + var rnd = $wbwindow.Math.seed / 233280; + + return rnd; + } + + $wbwindow.Math.random = seeded_random; + } + + function init_crypto_random() { + if (!$wbwindow.crypto || !$wbwindow.Crypto) { + return; + } + + var orig_getrandom = $wbwindow.Crypto.prototype.getRandomValues; + + var 
new_getrandom = function (array) { + for (var i = 0; i < array.length; i++) { + array[i] = parseInt($wbwindow.Math.random() * 4294967296); + } + return array; + }; + + $wbwindow.Crypto.prototype.getRandomValues = new_getrandom; + $wbwindow.crypto.getRandomValues = new_getrandom; + } + + //============================================ + function init_fixed_ratio() { + // otherwise, just set it + $wbwindow.devicePixelRatio = 1; + + // prevent changing, if possible + if (Object.defineProperty) { + try { + // fixed pix ratio + Object.defineProperty($wbwindow, "devicePixelRatio", {value: 1, writable: false}); + } catch (e) {} + } + } + + //======================================== + function init_date_override(timestamp) { + timestamp = parseInt(timestamp) * 1000; + //var timezone = new Date().getTimezoneOffset() * 60 * 1000; + // Already UTC! + var timezone = 0; + var start_now = $wbwindow.Date.now(); + var timediff = start_now - (timestamp - timezone); + + if ($wbwindow.__wb_Date_now) { + return; + } + + var orig_date = $wbwindow.Date; + + var orig_utc = $wbwindow.Date.UTC; + var orig_parse = $wbwindow.Date.parse; + var orig_now = $wbwindow.Date.now; + + $wbwindow.__wb_Date_now = orig_now; + + $wbwindow.Date = function (Date) { + return function (A, B, C, D, E, F, G) { + // Apply doesn't work for constructors and Date doesn't + // seem to like undefined args, so must explicitly + // call constructor for each possible args 0..7 + if (A === undefined) { + return new Date(orig_now() - timediff); + } else if (B === undefined) { + return new Date(A); + } else if (C === undefined) { + return new Date(A, B); + } else if (D === undefined) { + return new Date(A, B, C); + } else if (E === undefined) { + return new Date(A, B, C, D); + } else if (F === undefined) { + return new Date(A, B, C, D, E); + } else if (G === undefined) { + return new Date(A, B, C, D, E, F); + } else { + return new Date(A, B, C, D, E, F, G); + } + } + }($wbwindow.Date); + + $wbwindow.Date.prototype = 
orig_date.prototype; + + $wbwindow.Date.now = function () { + return orig_now() - timediff; + }; + + $wbwindow.Date.UTC = orig_utc; + $wbwindow.Date.parse = orig_parse; + + $wbwindow.Date.__WB_timediff = timediff; + + Object.defineProperty($wbwindow.Date.prototype, "constructor", {value: $wbwindow.Date}); + } + + //============================================ + function init_disable_notifications() { + if (window.Notification) { + window.Notification.requestPermission = function (callback) { + if (callback) { + callback("denied"); + } + + return Promise.resolve("denied"); + }; + } + + if (window.geolocation) { + var disabled = function (success, error, options) { + if (error) { + error({"code": 2, "message": "not available"}); + } + }; + + window.geolocation.getCurrentPosition = disabled; + window.geolocation.watchPosition = disabled; + } + } + + function initAutoFetchWorker() { + if (!$wbwindow.Worker) { + return; + } + + var isTop = $wbwindow.self === $wbwindow.top; + + function AutoFetchWorker() { + if (!(this instanceof AutoFetchWorker)) { + return new AutoFetchWorker(); + } + this.checkIntervalTime = 15000; + this.checkIntervalCB = this.checkIntervalCB.bind(this); + if (isTop) { + // Cannot directly load our worker from the proxy origin into the current origin + // however we fetch it from proxy origin and can blob it into the current origin :) + var self = this; + fetch(wbAutoFetchWorkerPrefix) + .then(function (res) { + return res.text().then(function (text) { + var blob = new Blob([text], {"type": "text/javascript"}); + self.worker = new $wbwindow.Worker(URL.createObjectURL(blob)); + // use our origins reference to the document in order for us to parse stylesheets :/ + self.styleTag = document.createElement('style'); + self.styleTag.id = '$wrStyleParser$'; + document.documentElement.appendChild(self.styleTag); + self.startCheckingInterval(); + }); + }); + } else { + // add only the portions of the worker interface we use since we are not top and if in proxy 
mode start check polling + this.worker = { + "postMessage": function (msg) { + if (!msg.wb_type) { + msg = {'wb_type': 'aaworker', 'msg': msg}; + } + $wbwindow.top.postMessage(msg, '*'); + }, + "terminate": function () {} + }; + this.startCheckingInterval(); + } + } + + AutoFetchWorker.prototype.startCheckingInterval = function () { + // if document ready state is complete do first extraction and start check polling + // otherwise wait for document ready state to complete to extract and start check polling + var self = this; + if ($wbwindow.document.readyState === "complete") { + this.extractFromLocalDoc(); + setInterval(this.checkIntervalCB, this.checkIntervalTime); + } else { + var i = setInterval(function () { + if ($wbwindow.document.readyState === "complete") { + self.extractFromLocalDoc(); + clearInterval(i); + setInterval(self.checkIntervalCB, self.checkIntervalTime); + } + }, 1000); + } + }; + + AutoFetchWorker.prototype.checkIntervalCB = function () { + this.extractFromLocalDoc(); + }; + + AutoFetchWorker.prototype.terminate = function () { + // terminate the worker, a no op when not replay top + this.worker.terminate(); + }; + + AutoFetchWorker.prototype.postMessage = function (msg) { + this.worker.postMessage(msg); + }; + + AutoFetchWorker.prototype.extractMediaRules = function (rules, href) { + // We are in proxy mode and must include a URL to resolve relative URLs in media rules + if (!rules) return []; + var rvlen = rules.length; + var text = []; + var rule; + for (var i = 0; i < rvlen; ++i) { + rule = rules[i]; + if (rule.type === CSSRule.MEDIA_RULE) { + text.push({"cssText": rule.cssText, "resolve": href}); + } + } + return text; + }; + + AutoFetchWorker.prototype.corsCSSFetch = function (href) { + // because this JS in proxy mode operates as it would on the live web + // the rules of CORS apply and we cannot rely on URLs being rewritten correctly + // fetch the cross origin css file and then parse it using a style tag to get the rules + var url = 
location.protocol + '//' + wb_info.proxy_magic + '/proxy-fetch/' + href; + var aaw = this; + return fetch(url).then(function (res) { + return res.text().then(function (text) { + aaw.styleTag.textContent = text; + var sheet = aaw.styleTag.sheet || {}; + return aaw.extractMediaRules(sheet.cssRules || sheet.rules, href); + }); + }).catch(function (error) { + return []; + }); + }; + + AutoFetchWorker.prototype.shouldSkipSheet = function (sheet) { + // we skip extracting rules from sheets if they are from our parsing style or come from pywb + if (sheet.id === '$wrStyleParser$') return true; + return !!(sheet.href && sheet.href.indexOf(wb_info.proxy_magic) !== -1); + }; + + AutoFetchWorker.prototype.extractFromLocalDoc = function () { + var i = 0; + var media = []; + var deferredMediaURLS = []; + var srcset = []; + var sheet; + var resolve; + // We must use the window reference passed to us to access this origins stylesheets + var styleSheets = $wbwindow.document.styleSheets; + for (; i < styleSheets.length; ++i) { + sheet = styleSheets[i]; + // if the sheet belongs to our parser node we must skip it + if (!this.shouldSkipSheet(sheet)) { + try { + // if no error is thrown due to cross origin sheet the urls then just add + // the resolved URLS if any to the media urls array + if (sheet.cssRules != null) { + resolve = sheet.href || $wbwindow.document.baseURI; + media = media.concat(this.extractMediaRules(sheet.cssRules, resolve)); + } else if (sheet.href != null) { + // depending on the browser cross origin stylesheets will have their + // cssRules property null but href non-null + deferredMediaURLS.push(this.corsCSSFetch(sheet.href)); + } + } catch (error) { + // the stylesheet is cross origin and we must re-fetch via PYWB to get the contents for checking + deferredMediaURLS.push(this.corsCSSFetch(sheet.href)); + } + } + } + // We must use the window reference passed to us to access this origins elements with srcset attr + // like cssRule handling we must include a URL to 
resolve relative URLs by + var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]'); + var ssElem, resolveAgainst; + for (i = 0; i < srcsetElems.length; i++) { + ssElem = srcsetElems[i]; + resolveAgainst = ssElem.src != null && ssElem.src !== ' ' ? ssElem.src : $wbwindow.document.baseURI; + srcset.push({'srcset': ssElem.srcset, 'resolve': resolveAgainst}); + } + + // send what we have extracted, if anything, to the worker for processing + if (media.length > 0 || srcset.length > 0) { + this.postMessage({'type': 'values', 'media': media, 'srcset': srcset}); + } + + if (deferredMediaURLS.length > 0) { + // wait for all our deferred fetching and extraction of cross origin + // stylesheets to complete and then send those values, if any, to the worker + var aaw = this; + Promise.all(deferredMediaURLS).then(function (values) { + var results = []; + while (values.length > 0) { + results = results.concat(values.shift()); + } + if (results.length > 0) { + aaw.postMessage({'type': 'values', 'media': results}); + } + }); + } + }; + + WBAutoFetchWorker = new AutoFetchWorker(); + + if (isTop) { + $wbwindow.addEventListener("message", function (event) { + if (event.data && event.data.wb_type === 'aaworker') { + WBAutoFetchWorker.postMessage(event.data.msg); + } + }, false); + } + } + + if (wbinfo.use_auto_fetch_worker && wbinfo.is_live) { + initAutoFetchWorker(); + } + + if (wbinfo.use_wombat) { + // proxy mode overrides + // Random + init_seeded_random(wbinfo.wombat_sec); + + // Crypto Random + init_crypto_random(); + + // set fixed pixel ratio + init_fixed_ratio(); + + // Date + init_date_override(wbinfo.wombat_sec); + + // disable notifications + init_disable_notifications(); + } + + return {}; +}; + +window._WBWombat = _WBWombat; + diff --git a/pywb/templates/head_insert.html b/pywb/templates/head_insert.html index 3eb7fe45..f6b42e3c 100644 --- a/pywb/templates/head_insert.html +++ b/pywb/templates/head_insert.html @@ -1,9 +1,9 @@ - -{% if not 
wb_url.is_banner_only %} - +{% if env.pywb_proxy_magic %} +{% set whichWombat = 'wombatProxyMode.js' %} +{% else %} +{% set whichWombat = 'wombat.js' %} +{% endif %} +{% if not wb_url.is_banner_only or (env.pywb_proxy_magic and (config.proxy.use_auto_fetch_worker or config.proxy.use_wombat)) %} +