1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Refactor of auto-fetch worker system with support for proxy mode, fixes https://github.com/webrecorder/pywb/issues/371: (#379)

- Split wombat and auto-fetch worker into two files (proxy mode and non-proxy mode)
- Renamed preservationWorker to autoFetchWorker in order to better convey what it does
- Root config file control over including wombat and auto-fetch worker in proxy or non-proxy mode
- Added an additional route, available only in proxy mode with the auto-fetch worker enabled, for fetching the auto-fetch worker code in a CORS-friendly way
- templateview: add 'tobool' formatter to more cleanly format python bools to JS 'true'/'false'
- proxy options: config and command line: 
  'use_auto_fetch_worker' and '--proxy-with-auto-fetch'
  'use_wombat' and '--proxy-with-wombat'
- head_insert.html: only include wombat in proxy mode when use_wombat or use_auto_fetch_worker are set.
- wombatProxyMode.js: slimmed down wombat for proxy mode only including auto-fetch support.
- more consistent naming: rename 'preserveWorker' and 'autoArchive' to 'auto-fetch'

Updated tests:
- test_wbrequestresponse.py: added tests covering constructor defaults, _init_derived, options_response, json_response, encode_stream, text_stream
- test_auto_colls.py: fixed broken test test_more_custom_templates, reason using ujson now not json so spacing was off
- test_proxy.py: updated existing tests to reflect splitting wombat into proxy and non-proxy mode, added tests covering auto-fetch worker specific endpoints in proxy mode
removed duplicate addons key in .travis.yml
- test_cli.py: updated to properly test the cli with these changes
added ultrajson dep to tests_require in setup.py to reflect its usage by wbrequestresponse.py

Fully documented:
- cli.py
- frontendapp.py
- templateview.py
- wbrequestresponse.py

Removed duplicate addons key in .travis.yml
Added ultrajson dependency to tests_require in setup.py to reflect its usage by wbrequestresponse.py

Fixes #371
This commit is contained in:
John Berlin 2018-10-03 16:27:49 -04:00 committed by Ilya Kreymer
parent 71c3eb77de
commit ec0df7b9ae
16 changed files with 1631 additions and 340 deletions

View File

@ -9,6 +9,7 @@ os:
- linux
addons:
sauce_connect: true
apt:
packages:
# This is required to run new chrome on old trusty
@ -18,8 +19,6 @@ env:
- WR_TEST=no
- WR_TEST=yes
addons:
sauce_connect: true
cache:
directories:

View File

@ -6,6 +6,7 @@ import logging
#=============================================================================
def warcserver(args=None):
    """Entry point that starts pywb's WarcServer.

    :param args: CLI arguments forwarded to WarcServerCli
    :return: Whatever WarcServerCli.run() returns
    """
    cli = WarcServerCli(args=args, default_port=8070, desc='pywb WarcServer')
    return cli.run()
@ -13,6 +14,7 @@ def warcserver(args=None):
#=============================================================================
def wayback(args=None):
    """Entry point that starts pywb's Wayback Machine implementation.

    :param args: CLI arguments forwarded to WaybackCli
    :return: Whatever WaybackCli.run() returns
    """
    cli = WaybackCli(args=args, default_port=8080, desc='pywb Wayback Machine Server')
    return cli.run()
@ -20,6 +22,7 @@ def wayback(args=None):
#=============================================================================
def live_rewrite_server(args=None):
    """Entry point that starts pywb's Wayback Machine implementation in live mode.

    :param args: CLI arguments forwarded to LiveCli
    :return: Whatever LiveCli.run() returns
    """
    cli = LiveCli(args=args, default_port=8090, desc='pywb Live Rewrite Proxy Server')
    return cli.run()
@ -27,7 +30,15 @@ def live_rewrite_server(args=None):
#=============================================================================
class BaseCli(object):
"""Base CLI class that provides the initial arg parser setup,
calls load to receive the application to be started and starts the application."""
def __init__(self, args=None, default_port=8080, desc=''):
"""
:param args: CLI arguments
:param int default_port: The default port that the application will use
:param str desc: The description for the application to be started
"""
parser = ArgumentParser(description=desc)
parser.add_argument('-p', '--port', type=int, default=default_port,
help='Port to listen on (default %s)' % default_port)
@ -47,6 +58,10 @@ class BaseCli(object):
help='Enable HTTP/S proxy on specified collection')
parser.add_argument('--proxy-record', action='store_true',
help='Enable proxy recording into specified collection')
parser.add_argument('--proxy-with-wombat', action='store_true',
help='Enable partial wombat support in proxy mode')
parser.add_argument('--proxy-with-auto-fetch', action='store_true',
help='Enable auto-load worker in proxy mode')
self.desc = desc
self.extra_config = {}
@ -57,12 +72,14 @@ class BaseCli(object):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG if self.r.debug else logging.INFO)
if self.r.proxy:
self.extra_config['proxy'] = {'coll': self.r.proxy,
'recording': self.r.proxy_record}
self.extra_config['proxy'] = {
'coll': self.r.proxy,
'recording': self.r.proxy_record,
'use_wombat': self.r.proxy_with_wombat,
'use_auto_fetch_worker': self.r.proxy_with_auto_fetch,
}
self.r.live = True
self.application = self.load()
if self.r.profile:
@ -70,9 +87,15 @@ class BaseCli(object):
self.application = ProfilerMiddleware(self.application)
def _extend_parser(self, parser): #pragma: no cover
"""Hook for subclasses to add their own CLI arguments on top of the default CLI arguments.

The base implementation does nothing.

:param ArgumentParser parser: The argument parser instance passed by BaseCli
"""
pass
def load(self):
"""This method is called to load the application. Subclasses must return a application
that can be used by used by pywb.utils.geventserver.GeventServer."""
if self.r.live:
self.extra_config['collections'] = {'live':
{'index': '$live'}}
@ -84,10 +107,12 @@ class BaseCli(object):
self.extra_config['recorder'] = 'live'
def run(self):
    """Start the application via run_gevent().

    :return: This CLI instance
    """
    start_server = self.run_gevent
    start_server()
    return self
def run_gevent(self):
"""Created the server that runs the application supplied a subclass"""
from pywb.utils.geventserver import GeventServer, RequestURIWSGIHandler
logging.info('Starting Gevent Server on ' + str(self.r.port))
ge = GeventServer(self.application,
@ -99,6 +124,8 @@ class BaseCli(object):
#=============================================================================
class ReplayCli(BaseCli):
"""CLI class that adds the cli functionality specific to starting pywb's Wayback Machine implementation"""
def _extend_parser(self, parser):
parser.add_argument('-a', '--autoindex', action='store_true',
help='Enable auto-indexing')
@ -110,7 +137,6 @@ class ReplayCli(BaseCli):
help_dir='Specify root archive dir (default is current working directory)'
parser.add_argument('-d', '--directory', help=help_dir)
def load(self):
super(ReplayCli, self).load()
@ -129,6 +155,8 @@ class ReplayCli(BaseCli):
#=============================================================================
class WarcServerCli(BaseCli):
"""CLI class for starting a WarcServer"""
def load(self):
from pywb.warcserver.warcserver import WarcServer
@ -138,6 +166,8 @@ class WarcServerCli(BaseCli):
#=============================================================================
class WaybackCli(ReplayCli):
"""CLI class for starting the pywb's implementation of the Wayback Machine"""
def load(self):
from pywb.apps.frontendapp import FrontEndApp
@ -147,6 +177,8 @@ class WaybackCli(ReplayCli):
#=============================================================================
class LiveCli(BaseCli):
"""CLI class for starting pywb in replay server in live mode"""
def load(self):
from pywb.apps.frontendapp import FrontEndApp

View File

@ -6,7 +6,7 @@ from werkzeug.exceptions import HTTPException, NotFound
from werkzeug.wsgi import pop_path_info
from six.moves.urllib.parse import urljoin
from six import iteritems
from warcio.statusandheaders import StatusAndHeaders
from warcio.utils import to_native_str
from wsgiprox.wsgiprox import WSGIProxMiddleware
@ -33,6 +33,16 @@ import logging
# ============================================================================
class FrontEndApp(object):
"""Orchestrates pywb's core Wayback Machine functionality and is comprised of 2 core sub-apps and 3 optional apps.
Sub-apps:
- WarcServer: Serves the archive content (WARC/ARC and index) as well as from the live web in record/proxy mode
- RewriterApp: Rewrites the content served by pywb (if it is to be rewritten)
- WSGIProxMiddleware (Optional): If proxy mode is enabled, performs pywb's HTTP(s) proxy functionality
- AutoIndexer (Optional): If auto-indexing is enabled for the collections it is started here
- RecorderApp (Optional): Recording functionality, available when recording mode is enabled
"""
REPLAY_API = 'http://localhost:%s/{coll}/resource/postreq'
CDX_API = 'http://localhost:%s/{coll}/index'
RECORD_SERVER = 'http://localhost:%s'
@ -45,6 +55,10 @@ class FrontEndApp(object):
PROXY_CA_PATH = os.path.join('proxy-certs', 'pywb-ca.pem')
def __init__(self, config_file='./config.yaml', custom_config=None):
"""
:param str config_file: Path to the config file
:param dict custom_config: Dictionary containing additional configuration information
"""
self.handler = self.handle_request
self.warcserver = WarcServer(config_file=config_file,
custom_config=custom_config)
@ -55,6 +69,8 @@ class FrontEndApp(object):
self.warcserver_server = GeventServer(self.warcserver, port=0)
self.proxy_prefix = None # the URL prefix to be used for the collection with proxy mode (e.g. /coll/id_/)
self.proxy_coll = None # the name of the collection that has proxy mode enabled
self.init_proxy(config)
self.init_recorder(config.get('recorder'))
@ -82,6 +98,8 @@ class FrontEndApp(object):
self.metadata_cache = MetadataCache(metadata_templ)
def _init_routes(self):
"""Initialize the routes and based on the configuration file makes available
specific routes (proxy mode, record)"""
self.url_map = Map()
self.url_map.add(Rule('/static/_/<coll>/<path:filepath>', endpoint=self.serve_static))
self.url_map.add(Rule('/static/<path:filepath>', endpoint=self.serve_static))
@ -100,9 +118,19 @@ class FrontEndApp(object):
if self.recorder_path:
self.url_map.add(Rule(coll_prefix + self.RECORD_ROUTE + '/<path:url>', endpoint=self.serve_record))
if self.proxy_prefix is not None:
# Add the proxy-fetch endpoint to enable PreservationWorker to make CORS fetches worry free in proxy mode
self.url_map.add(Rule('/proxy-fetch/<path:url>', endpoint=self.proxy_fetch,
methods=['GET', 'HEAD', 'OPTIONS']))
self.url_map.add(Rule(coll_prefix + '/<path:url>', endpoint=self.serve_content))
def get_upstream_paths(self, port):
"""Retrieve a dictionary containing the full URLs of the upstream apps
:param int port: The port used by the replay and cdx servers
:return: A dictionary containing the upstream paths (replay, cdx-server, record [if enabled])
:rtype: dict[str, str]
"""
base_paths = {
'replay': self.REPLAY_API % port,
'cdx-server': self.CDX_API % port,
@ -114,6 +142,7 @@ class FrontEndApp(object):
return base_paths
def init_recorder(self, recorder_config):
"""Initialize the recording functionality of pywb. If recording_config is None this function is a no op"""
if not recorder_config:
self.recorder = None
self.recorder_path = None
@ -142,6 +171,10 @@ class FrontEndApp(object):
self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll)
def init_autoindex(self, auto_interval):
"""Initialize and start the auto-indexing of the collections. If auto_interval is None this is a no op.
:param str|int auto_interval: The auto-indexing interval from the configuration file or CLI argument
"""
if not auto_interval:
return
@ -161,7 +194,16 @@ class FrontEndApp(object):
logging.info(msg.format(indexer.root_path, auto_interval))
indexer.start()
def is_proxy_enabled(self, environ):
    """Tell whether a request should be treated as a proxy mode request.

    :param dict environ: The WSGI environment dictionary for the request
    :return: True if a proxy prefix is configured and the environ carries
        the 'wsgiprox.proxy_host' key, False otherwise
    :rtype: bool
    """
    if self.proxy_prefix is None:
        return False
    return 'wsgiprox.proxy_host' in environ
def serve_home(self, environ):
"""Serves the home (/) view of pywb (not a collections)
:param dict environ: The WSGI environment dictionary for the request
:return: The WbResponse for serving the home (/) path
:rtype: WbResponse
"""
home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html')
fixed_routes = self.warcserver.list_fixed_routes()
dynamic_routes = self.warcserver.list_dynamic_routes()
@ -177,19 +219,38 @@ class FrontEndApp(object):
return WbResponse.text_response(content, content_type='text/html; charset="utf-8"')
def serve_static(self, environ, coll='', filepath=''):
    """Serve a static file associated with a specific collection or one of pywb's own static assets.

    In proxy mode, OPTIONS requests are answered directly and access control
    headers are added to the static response.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The collection the static file is associated with
    :param str filepath: The file path (relative to the collection) for the static asset
    :return: The WbResponse for the static asset
    :rtype: WbResponse
    """
    proxy_enabled = self.is_proxy_enabled(environ)
    if proxy_enabled and environ.get('REQUEST_METHOD') == 'OPTIONS':
        return WbResponse.options_response(environ)

    if coll:
        path = os.path.join(self.warcserver.root_dir, coll, self.static_dir)
    else:
        path = self.static_dir

    environ['pywb.static_dir'] = path
    try:
        # no early return here: the response must pass through the proxy-mode
        # header step below before it is handed back
        response = self.static_handler(environ, filepath)
        if proxy_enabled:
            response.add_access_control_headers(env=environ)
        return response
    except Exception:
        # any failure while producing the asset is reported as a 404;
        # BaseException (e.g. KeyboardInterrupt) is deliberately not trapped
        self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath))
def get_metadata(self, coll):
"""Retrieve the metadata associated with a collection
:param str coll: The name of the collection to receive metadata for
:return: The collections metadata if it exists
:rtype: dict
"""
#if coll == self.all_coll:
# coll = '*'
@ -204,6 +265,13 @@ class FrontEndApp(object):
return metadata
def serve_coll_page(self, environ, coll='$root'):
"""Render and serve a collections search page (search.html).
:param dict environ: The WSGI environment dictionary for the request
:param str coll: The name of the collection to serve the collections search page for
:return: The WbResponse containing the collections search page
:rtype: WbResponse
"""
if not self.is_valid_coll(coll):
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
@ -225,6 +293,13 @@ class FrontEndApp(object):
return WbResponse.text_response(content, content_type='text/html; charset="utf-8"')
def serve_cdx(self, environ, coll='$root'):
"""Make the upstream CDX query for a collection and response with the results of the query
:param dict environ: The WSGI environment dictionary for the request
:param str coll: The name of the collection this CDX query is for
:return: The WbResponse containing the results of the CDX query
:rtype: WbResponse
"""
base_url = self.rewriterapp.paths['cdx-server']
#if coll == self.all_coll:
@ -248,12 +323,31 @@ class FrontEndApp(object):
return WbResponse.text_response('Error: ' + str(e), status='400 Bad Request')
def serve_record(self, environ, coll='$root', url=''):
    """Serve a URL through serve_content with recording turned on.

    Collections listed among the WarcServer's fixed routes cannot be recorded
    into; for those a text error response is returned instead.

    :param dict environ: The WSGI environment dictionary for the request
    :param str coll: The name of the collection the record is to be served from
    :param str url: The URL for the corresponding record to be served if it exists
    :return: WbResponse containing the contents of the record/URL
    :rtype: WbResponse
    """
    fixed_routes = self.warcserver.list_fixed_routes()
    if coll not in fixed_routes:
        return self.serve_content(environ, coll, url, record=True)

    message = 'Error: Can Not Record Into Custom Collection "{0}"'.format(coll)
    return WbResponse.text_response(message)
def serve_content(self, environ, coll='$root', url='', timemap_output='', record=False):
"""Serve the contents of a URL/Record rewriting the contents of the response when applicable.
:param dict environ: The WSGI environment dictionary for the request
:param str coll: The name of the collection the record is to be served from
:param str url: The URL for the corresponding record to be served if it exists
:param str timemap_output: The contents of the timemap included in the link header of the response
:param bool record: Should the content being served by recorded (save to a warc). Only valid in record mode
:return: WbResponse containing the contents of the record/URL
:rtype: WbResponse
"""
if not self.is_valid_coll(coll):
self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll))
@ -282,10 +376,16 @@ class FrontEndApp(object):
except UpstreamException as ue:
response = self.rewriterapp.handle_error(environ, ue)
raise HTTPException(response=response)
return response
def setup_paths(self, environ, coll, record=False):
"""Populates the WSGI environment dictionary with the path information necessary to perform a response for
content or record.
:param dict environ: The WSGI environment dictionary for the request
:param str coll: The name of the collection the record is to be served from
:param bool record: Should the content being served by recorded (save to a warc). Only valid in record mode
"""
if not coll or not self.warcserver.root_dir:
return
@ -305,6 +405,12 @@ class FrontEndApp(object):
environ['pywb.templates_dir'] = '/'.join(paths)
def serve_listing(self, environ):
"""Serves the response for WARCServer fixed and dynamic listing (paths)
:param dict environ: The WSGI environment dictionary for the request
:return: WbResponse containing the frontend apps WARCServer URL paths
:rtype: WbResponse
"""
result = {'fixed': self.warcserver.list_fixed_routes(),
'dynamic': self.warcserver.list_dynamic_routes()
}
@ -312,6 +418,12 @@ class FrontEndApp(object):
return WbResponse.json_response(result)
def is_valid_coll(self, coll):
"""Determines if the collection name for a request is valid (exists)
:param str coll: The name of the collection to check
:return: True if the collection is valid, false otherwise
:rtype: bool
"""
#if coll == self.all_coll:
# return True
@ -319,9 +431,21 @@ class FrontEndApp(object):
coll in self.warcserver.list_dynamic_routes())
def raise_not_found(self, environ, msg):
    """Raise a werkzeug.exceptions.NotFound exception whose response is the
    rewriter app's error response for the supplied WSGI environment and message.

    :param dict environ: The WSGI environment dictionary for the request
    :param str msg: The error message
    :raises NotFound: always
    """
    error_response = self.rewriterapp._error_response(environ, msg)
    raise NotFound(response=error_response)
def _check_refer_redirect(self, environ):
"""Returns a WbResponse for a HTTP 307 redirection if the HTTP referer header is the same as the HTTP host header
:param dict environ: The WSGI environment dictionary for the request
:return: WbResponse HTTP 307 redirection
:rtype: WbResponse
"""
referer = environ.get('HTTP_REFERER')
if not referer:
return
@ -353,10 +477,16 @@ class FrontEndApp(object):
return self.handler(environ, start_response)
def handle_request(self, environ, start_response):
"""Retrieves the route handler and calls the handler returning its the response
:param dict environ: The WSGI environment dictionary for the request
:param start_response:
:return: The WbResponse for the request
:rtype: WbResponse
"""
urls = self.url_map.bind_to_environ(environ)
try:
endpoint, args = urls.match()
# store original script_name (original prefix) before modifications are made
environ['pywb.app_prefix'] = environ.get('SCRIPT_NAME')
@ -379,13 +509,23 @@ class FrontEndApp(object):
@classmethod
def create_app(cls, port):
    """Build a FrontEndApp with default arguments and wrap it in a GeventServer
    bound to hostname 0.0.0.0.

    :param int port: The port FrontEndApp is to listen on
    :return: A new instance of FrontEndApp wrapped in GeventServer
    :rtype: GeventServer
    """
    return GeventServer(FrontEndApp(), port=port, hostname='0.0.0.0')
def init_proxy(self, config):
"""Initialize and start proxy mode. If proxy configuration entry is not contained in the config
this is a no op. Causes handler to become an instance of WSGIProxMiddleware.
:param dict config: The configuration object used to configure this instance of FrontEndApp
"""
proxy_config = config.get('proxy')
self.proxy_prefix = None
if not proxy_config:
return
@ -418,10 +558,12 @@ class FrontEndApp(object):
else:
self.proxy_prefix = '/{0}/id_/'.format(proxy_coll)
self.proxy_coll = proxy_coll
self.handler = WSGIProxMiddleware(self.handle_request,
self.proxy_route_request,
proxy_host=proxy_config.get('host', 'pywb.proxy'),
proxy_options=proxy_config)
self.proxy_route_request,
proxy_host=proxy_config.get('host', 'pywb.proxy'),
proxy_options=proxy_config)
def proxy_route_request(self, url, environ):
""" Return the full url that this proxy request will be routed to
@ -431,14 +573,65 @@ class FrontEndApp(object):
"""
return self.proxy_prefix + url
def proxy_fetch(self, env, url):
    """Proxy mode only endpoint that handles OPTIONS requests and CORS fetches for the auto-fetch worker.

    Due to normal cross-origin browser restrictions in proxy mode, the auto-fetch worker cannot access the
    CSS rules of cross-origin style sheets and must re-fetch them in a manner that is CORS safe. This endpoint
    facilitates that by fetching the stylesheets for the auto-fetch worker and then responding with their contents.

    :param dict env: The WSGI environment dictionary
    :param str url: The URL of the resource to be fetched (recomputed from REQUEST_URI below)
    :return: WbResponse that is either the response to an OPTIONS request or the result of fetching url
    :rtype: WbResponse
    """
    if not self.is_proxy_enabled(env):
        # we are not in proxy mode so just respond with forbidden
        return WbResponse.text_response('proxy mode must be enabled to use this endpoint',
                                        status='403 Forbidden')

    if env.get('REQUEST_METHOD') == 'OPTIONS':
        return WbResponse.options_response(env)

    # ensure full URL
    request_url = env['REQUEST_URI']
    # replace with /id_ so we do not get rewritten; only the first occurrence
    # (the route prefix) is swapped so that a '/proxy-fetch' segment inside the
    # fetched URL itself is left untouched
    url = request_url.replace('/proxy-fetch', '/id_', 1)
    # update WSGI environment object
    env['REQUEST_URI'] = self.proxy_coll + url
    env['PATH_INFO'] = env['PATH_INFO'].replace('/proxy-fetch', self.proxy_coll + '/id_', 1)
    # make request using normal serve_content
    response = self.serve_content(env, self.proxy_coll, url)
    # for WR
    if isinstance(response, WbResponse):
        response.add_access_control_headers(env=env)
    return response
# ============================================================================
class MetadataCache(object):
"""This class holds the collection medata template string and
caches the metadata for a collection once it is rendered once.
Cached metadata is updated if its corresponding file has been updated since last cache time (file mtime based)"""
def __init__(self, template_str):
"""
:param str template_str: The template string to be cached; load() formats it with ``coll=<collection name>``
    to obtain the path of a collection's metadata file
"""
self.template_str = template_str
# collection name -> (mtime, metadata) tuple, as written by store_new()
self.cache = {}
def load(self, coll):
"""Load and receive the metadata associated with a collection.
If the metadata for the collection is not cached yet its metadata file is read in and stored.
If the cache has seen the collection before the mtime of the metadata file is checked and if it is more recent
than the cached time, the cache is updated and returned otherwise the cached version is returned.
:param str coll: Name of a collection
:return: The cached metadata for a collection
:rtype: dict
"""
path = self.template_str.format(coll=coll)
try:
mtime = os.path.getmtime(path)
@ -456,11 +649,25 @@ class MetadataCache(object):
return self.store_new(coll, path, mtime)
def store_new(self, coll, path, mtime):
    """Read a collection's metadata file and cache it together with its mtime.

    :param str coll: The name of the collection the metadata is for
    :param str path: The path to the collection's metadata file
    :param float mtime: The current mtime of the collection's metadata file
    :return: The collection's metadata
    :rtype: dict
    """
    metadata = load_yaml_config(path)
    cache_entry = (mtime, metadata)
    self.cache[coll] = cache_entry
    return metadata
def get_all(self, routes):
"""Load the metadata for all routes (collections) and populate the cache
:param list[str] routes: List of collection names
:return: A dictionary containing each collections metadata
:rtype: dict
"""
for route in routes:
self.load(route)

View File

@ -366,6 +366,7 @@ class RewriterApp(object):
top_url,
environ,
framed_replay,
coll=kwargs.get('coll', ''),
replay_mod=self.replay_mod,
config=self.config))

View File

@ -1,3 +1,9 @@
import inspect
try:
import ujson as json
except ImportError: # pragma: no cover
import json
from pywb.apps.wbrequestresponse import WbResponse
from warcio.statusandheaders import StatusAndHeaders
@ -40,6 +46,98 @@ def test_resp_4():
assert(resp == expected)
def test_wbresponse_redir_supplied_headers():
    """Headers supplied to redir_response show up in the response headers."""
    extra_header = ('A', 'B')
    response = WbResponse.redir_response('http://overhere.now', headers=[extra_header])
    assert extra_header in response.status_headers.headers
def test_wbresponse_creation_defaults():
    """A WbResponse built with only status_headers=None gets an empty list body."""
    response = WbResponse(None)
    assert response.status_headers is None
    body = response.body
    assert isinstance(body, list)
    assert len(body) == 0
def test_wbresponse_encode_stream():
    """encode_stream returns a generator yielding the utf-8 encoding of each item."""
    # Unicode Character 'LATIN CAPITAL LETTER A WITH TILDE' (U+00C3)
    text_chunks = [u'\u00c3']
    encoded = WbResponse.encode_stream(text_chunks)
    assert inspect.isgenerator(encoded)
    assert list(encoded) == [b'\xc3\x83']
def test_wbresponse_text_stream():
    """text_stream yields utf-8 bytes and reports a charset, with and without an explicit content type."""
    # Unicode Character 'LATIN CAPITAL LETTER A WITH TILDE' (U+00C3)
    stream = [u'\u00c3']
    expected = [b'\xc3\x83']

    # first with a charset-less content type, then with the default one
    for kwargs in ({'content_type': 'text/plain'}, {}):
        res = WbResponse.text_stream(stream, **kwargs)
        status_headers = res.status_headers
        assert status_headers.statusline == '200 OK'
        assert ('Content-Type', 'text/plain; charset=utf-8') in status_headers.headers
        assert inspect.isgenerator(res.body)
        assert list(res.body) == expected
def test_wbresponse_options_response():
    """options_response picks the allowed origin from Origin, then Referer, else '*'."""
    cases = [
        (dict(HTTP_ORIGIN='http://example.com'), 'http://example.com'),
        (dict(HTTP_REFERER='http://example.com'), 'http://example.com'),
        (dict(), '*'),
        (dict(HTTP_ORIGIN=None), '*'),
        (dict(HTTP_REFERER=None), '*'),
    ]
    for env, allowed_origin in cases:
        res = WbResponse.options_response(env)
        assert ('Access-Control-Allow-Origin', allowed_origin) in res.status_headers.headers
def test_wbresponse_json_response():
    """json_response returns a 200 JSON response whose body round-trips to the input."""
    payload = dict(pywb=1, wr=2)
    response = WbResponse.json_response(payload)
    headers = response.status_headers
    assert headers.statusline == '200 OK'
    assert ('Content-Type', 'application/json; charset=utf-8') in headers.headers
    assert json.loads(response.body[0]) == payload
def test_wbresponse_init_derived():
    """Extra constructor keyword arguments are handed to a subclass's _init_derived."""
    class Derived(WbResponse):
        def __init__(self, status_headers, value=None, **kwargs):
            # must exist before the base constructor calls _init_derived
            self.received_kwargs = {}
            super(Derived, self).__init__(status_headers, value=value, **kwargs)

        def _init_derived(self, params):
            self.received_kwargs.update(params)

    derived = Derived(None, pywb=1, wr=2)
    assert derived.received_kwargs == {'pywb': 1, 'wr': 2}
def test_wbresponse_callable():
    """Calling a WbResponse passes status and headers to start_response and returns the body."""
    expected_body = dict(pywb=1, wr=2)
    res = WbResponse.json_response(expected_body)
    env = dict(REQUEST_METHOD='GET')
    expected_passed_values = {
        'status_line': '200 OK',
        'headers': [('Content-Type', 'application/json; charset=utf-8'), ('Content-Length', '17')],
    }
    passed_values = {'status_line': None, 'headers': None}

    def start_response(status_line, headers):
        # record what the response handed to the WSGI callback
        passed_values.update(status_line=status_line, headers=headers)

    body = res(env, start_response)
    assert json.loads(body[0]) == expected_body
    assert passed_values == expected_passed_values
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -1,39 +1,76 @@
from warcio.statusandheaders import StatusAndHeaders
import json
try:
import ujson as json
except ImportError: # pragma: no cover
import json
#=================================================================
# =================================================================
class WbResponse(object):
"""
Represnts a pywb wsgi response object.
"""Represnts a pywb wsgi response object.
Holds a status_headers object and a response iter, to be
returned to wsgi container.
"""
def __init__(self, status_headers, value=[], **kwargs):
returned to wsgi container."""
def __init__(self, status_headers, value=None, **kwargs):
"""
:param StatusAndHeaders status_headers: The StatusAndHeaders object for this response
:param Any value: The response body
:param Any kwargs: Additional keyword arguments to be passed to subclasses
"""
if value is None:
value = list()
self.status_headers = status_headers
self.body = value
self._init_derived(kwargs)
def _init_derived(self, params):
"""Hook that receives the extra keyword arguments used in construction of this class.

The base implementation ignores them.

:param Any params: The keyword arguments passed on by __init__
:return:
:rtype: None
"""
pass
@staticmethod
def text_stream(stream, content_type='text/plain; charset=utf-8', status='200 OK'):
    """Utility method for constructing a streaming text response.

    The stream is encoded as utf-8 via encode_stream, and ``; charset=utf-8``
    is appended to the content type when it names no charset.

    :param Any stream: The response body stream
    :param str content_type: The content-type of the response
    :param str status: The HTTP status line
    :return: WbResponse that is a text stream
    :rtype: WbResponse
    """
    if 'charset' not in content_type:
        content_type += '; charset=utf-8'

    return WbResponse.bin_stream(WbResponse.encode_stream(stream), content_type, status)
@staticmethod
def encode_stream(stream):
"""Utility method to encode a stream using utf-8.
:param Any stream: The stream to be encoded using utf-8
:return: A generator that yields the contents of the stream encoded as utf-8
"""
for obj in stream:
yield obj.encode('utf-8')
@staticmethod
def bin_stream(stream, content_type, status='200 OK',
headers=None):
headers=None):
"""Utility method for constructing a binary response.
:param Any stream: The response body stream
:param str content_type: The content-type of the response
:param str status: The HTTP status line
:param list[tuple[str, str]] headers: Additional headers for this response
:return: WbResponse that is a binary stream
:rtype: WbResponse
"""
def_headers = [('Content-Type', content_type)]
if headers:
def_headers += headers
@ -44,6 +81,14 @@ class WbResponse(object):
@staticmethod
def text_response(text, status='200 OK', content_type='text/plain; charset=utf-8'):
"""Utility method for constructing a text response.
:param str text: The text response body
:param str content_type: The content-type of the response
:param str status: The HTTP status line
:return: WbResponse text response
:rtype: WbResponse
"""
encoded_text = text.encode('utf-8')
status_headers = StatusAndHeaders(status,
[('Content-Type', content_type),
@ -53,21 +98,59 @@ class WbResponse(object):
@staticmethod
def json_response(obj, status='200 OK', content_type='application/json; charset=utf-8'):
    """Utility method for constructing a JSON response.

    :param dict obj: The dictionary to be serialized in JSON format
    :param str status: The HTTP status line
    :param str content_type: The content-type of the response
    :return: WbResponse JSON response
    :rtype: WbResponse
    """
    serialized = json.dumps(obj)
    return WbResponse.text_response(serialized, status, content_type)
@staticmethod
def redir_response(location, status='302 Redirect', headers=None):
    """Utility method for constructing a redirection response.

    :param str location: The location of the resource being redirected to
    :param str status: The HTTP status line
    :param list[tuple[str, str]] headers: Additional headers for this response
    :return: WbResponse redirection response
    :rtype: WbResponse
    """
    redir_headers = [('Location', location), ('Content-Length', '0')]
    if headers:
        redir_headers.extend(headers)

    status_headers = StatusAndHeaders(status, redir_headers)
    return WbResponse(status_headers)
@staticmethod
def options_response(env):
    """Construct the WbResponse for an OPTIONS request based on the WSGI env dictionary.

    The response has an empty text/plain body and carries the access control
    headers derived from env.

    :param dict env: The WSGI environment dictionary
    :return: The WbResponse for the options request
    :rtype: WbResponse
    """
    base_headers = [('Content-Type', 'text/plain'), ('Content-Length', '0')]
    response = WbResponse(StatusAndHeaders('200 Ok', base_headers))
    response.add_access_control_headers(env=env)
    return response
def __call__(self, env, start_response):
"""Callable definition to allow WbResponse control over how the response is sent
:param dict env: The WSGI environment dictionary
:param function start_response: The WSGI start_response function
:return: The response body
"""
start_response(self.status_headers.statusline,
self.status_headers.headers)
if env['REQUEST_METHOD'] == 'HEAD' or self.status_headers.statusline.startswith('304'):
request_method = env['REQUEST_METHOD']
if request_method == 'HEAD' or request_method == 'OPTIONS' or self.status_headers.statusline.startswith('304'):
if hasattr(self.body, 'close'):
self.body.close()
return []
@ -75,8 +158,42 @@ class WbResponse(object):
return self.body
def add_range(self, *args):
    """Forward HTTP range values to this response's status headers.

    :param int args: The values passed through to ``status_headers.add_range``
    :return: This same WbResponse, to allow call chaining
    :rtype: WbResponse
    """
    status_headers = self.status_headers
    status_headers.add_range(*args)
    return self
def add_access_control_headers(self, env=None):
    """Add the Access-Control-* (CORS) HTTP headers to this response.

    A default set of methods is always allowed. When ``env`` is supplied, the
    value of the ``Access-Control-Request-Method`` request header and the request
    method itself are appended to that set if not already present, any
    ``Access-Control-Request-Headers`` value is echoed back as
    ``Access-Control-Allow-Headers``, and the allowed origin is taken from the
    ``Origin`` header, falling back to ``Referer`` and finally to ``*``.

    :param dict env: The WSGI environment dictionary
    :return: This same WbResponse with the Access-Control-* headers added
    :rtype: WbResponse
    """
    allowed_methods = ['GET', 'POST', 'PUT', 'OPTIONS', 'DELETE', 'PATCH',
                       'HEAD', 'TRACE', 'CONNECT']
    allowed_origin = None

    if env is not None:
        # Compare whole method tokens. A substring test against the joined
        # header value would treat e.g. 'LOCK' as already allowed once
        # 'UNLOCK' had been appended, and silently drop it.
        for env_key in ('HTTP_ACCESS_CONTROL_REQUEST_METHOD', 'REQUEST_METHOD'):
            method = env.get(env_key)
            if method and method not in allowed_methods:
                allowed_methods.append(method)

        acr_headers = env.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS')
        if acr_headers is not None:
            self.status_headers.add_header('Access-Control-Allow-Headers', acr_headers)

        # NOTE(review): a Referer is a full URL rather than an origin; confirm
        # whether it should be reduced to scheme://host[:port] before use.
        allowed_origin = env.get('HTTP_ORIGIN', env.get('HTTP_REFERER', allowed_origin))

    if allowed_origin is None:
        allowed_origin = '*'

    self.status_headers.replace_header('Access-Control-Allow-Origin', allowed_origin)
    self.status_headers.add_header('Access-Control-Allow-Methods', ', '.join(allowed_methods))
    self.status_headers.add_header('Access-Control-Allow-Credentials', 'true')
    self.status_headers.add_header('Access-Control-Max-Age', '1800')
    return self
def __repr__(self):
    """Return the string form of this instance's attribute dictionary."""
    attributes = self.__dict__
    return str(attributes)

View File

@ -14,9 +14,13 @@ from webassets.env import Resolver
from pkg_resources import resource_filename
import json
import os
try:
import ujson as json
except ImportError: # pragma: no cover
import json
# ============================================================================
class RelEnvironment(Environment):
@ -27,14 +31,35 @@ class RelEnvironment(Environment):
# ============================================================================
class JinjaEnv(object):
def __init__(self, paths=['templates', '.', '/'],
packages=['pywb'],
assets_path=None,
globals=None,
overlay=None,
extensions=None,
env_template_params_key='pywb.template_params',
env_template_dir_key='pywb.templates_dir'):
"""Pywb JinjaEnv class that provides utility functions used by the templates,
configured template loaders and template paths, and contains the actual Jinja
env used by each template."""
def __init__(self, paths=None,
packages=None,
assets_path=None,
globals=None,
overlay=None,
extensions=None,
env_template_params_key='pywb.template_params',
env_template_dir_key='pywb.templates_dir'):
"""Construct a new JinjaEnv.
:param list[str] paths: List of paths to search for templates
:param list[str] packages: List of assets package names
:param str assets_path: Path to a yaml file containing assets
:param dict[str, str] globals: Dictionary of additional globals available during template rendering
:param overlay:
:param list extensions: List of webassets extension classes
:param str env_template_params_key: The full pywb package key for the template params
:param str env_template_dir_key: The full pywb package key for the template directory
"""
if paths is None:
paths = ['templates', '.', '/']
if packages is None:
packages = ['pywb']
self._init_filters()
@ -72,6 +97,13 @@ class JinjaEnv(object):
jinja_env.assets_environment = assets_env
def _make_loaders(self, paths, packages):
"""Initialize the template loaders based on the supplied paths and packages.
:param list[str] paths: List of paths to search for templates
:param list[str] packages: List of assets package names
:return: A list of loaders to be used for loading the template assets
:rtype: list[FileSystemLoader|PackageLoader]
"""
loaders = []
# add loaders for paths
for path in paths:
@ -84,6 +116,15 @@ class JinjaEnv(object):
return loaders
def template_filter(self, param=None):
"""Returns a decorator that adds the wrapped function to dictionary of template filters.
The wrapped function is keyed by either the supplied param (if supplied)
or by the wrapped functions name.
:param param: Optional name to use instead of the name of the function to be wrapped
:return: A decorator to wrap a template filter function
:rtype: callable
"""
def deco(func):
name = param or func.__name__
self.filters[name] = func
@ -92,10 +133,18 @@ class JinjaEnv(object):
return deco
def _init_filters(self):
"""Initialize the default pywb provided Jninja filters available during template rendering"""
self.filters = {}
@self.template_filter()
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
"""Formats the supplied timestamp using format_
:param str value: The timestamp to be formatted
:param str format_: The format string
:return: The correctly formatted timestamp as determined by format_
:rtype: str
"""
if format_ == '%s':
return timestamp_to_sec(value)
else:
@ -104,22 +153,58 @@ class JinjaEnv(object):
@self.template_filter('urlsplit')
def get_urlsplit(url):
"""Splits the supplied URL
:param str url: The url to be split
:return: The split url
:rtype: urllib.parse.SplitResult
"""
split = urlsplit(url)
return split
@self.template_filter()
def tojson(obj):
"""Converts the supplied object/array/any to a JSON string if it can be JSONified
:param any obj: The value to be converted to a JSON string
:return: The JSON string representation of the supplied value
:rtype: str
"""
return json.dumps(obj)
@self.template_filter()
def tobool(bool_val):
"""Converts a python boolean to a JS "true" or "false" string
:param any bool_val: A value to be evaluated as a boolean
:return: The string "true" or "false" to be inserted into JS
"""
return 'true' if bool_val else 'false'
# ============================================================================
class BaseInsertView(object):
"""Base class of all template views used by Pywb"""
def __init__(self, jenv, insert_file, banner_view=None):
    """Store the Jinja environment, template name and optional banner view.

    :param JinjaEnv jenv: The instance of pywb.rewrite.templateview.JinjaEnv to be used
    :param str insert_file: The name of the template file
    :param BaseInsertView banner_view: The banner_view property of pywb.apps.RewriterApp
    """
    self.banner_view = banner_view
    self.insert_file = insert_file
    self.jenv = jenv
def render_to_string(self, env, **kwargs):
"""Render this template.
:param dict env: The WSGI environment associated with the request causing this template to be rendered
:param any kwargs: The keyword arguments to be supplied to the Jinja template render method
:return: The rendered template
:rtype: str
"""
template = None
template_path = env.get(self.jenv.env_template_dir_key)
@ -149,6 +234,9 @@ class BaseInsertView(object):
# ============================================================================
class HeadInsertView(BaseInsertView):
"""The template view class associated with rendering the HTML inserted
into the head of the pages replayed (WB Insert)."""
def create_insert_func(self, wb_url,
wb_prefix,
host_prefix,
@ -158,19 +246,32 @@ class HeadInsertView(BaseInsertView):
coll='',
include_ts=True,
**kwargs):
"""Create the function used to render the header insert template for the current request.
:param rewrite.wburl.WbUrl wb_url: The WbUrl for the request this template is being rendered for
:param str wb_prefix: The URL prefix pywb is serving the content using (e.g. http://localhost:8080/live/)
:param str host_prefix: The host URL prefix pywb is running on (e.g. http://localhost:8080)
:param str top_url: The full URL for this request (e.g. http://localhost:8080/live/http://example.com)
:param dict env: The WSGI environment dictionary for this request
:param bool is_framed: Is pywb or a specific collection running in framed mode
:param str coll: The name of the collection this request is associated with
:param bool include_ts: Should a timestamp be included in the rendered template
:param kwargs: Additional keyword arguments to be supplied to the Jinja template render method
:return: A function to be used to render the header insert for the request this template is being rendered for
:rtype: callable
"""
params = kwargs
params['host_prefix'] = host_prefix
params['wb_prefix'] = wb_prefix
params['wb_url'] = wb_url
params['top_url'] = top_url
params['coll'] = coll
params['is_framed'] = 'true' if is_framed else 'false'
params['is_framed'] = is_framed
def make_head_insert(rule, cdx):
params['wombat_ts'] = cdx['timestamp'] if include_ts else ''
params['wombat_sec'] = timestamp_to_sec(cdx['timestamp'])
params['is_live'] = 'true' if cdx.get('is_live') else 'false'
params['is_live'] = cdx.get('is_live')
if self.banner_view:
banner_html = self.banner_view.render_to_string(env, cdx=cdx, **params)
@ -183,6 +284,8 @@ class HeadInsertView(BaseInsertView):
# ============================================================================
class TopFrameView(BaseInsertView):
"""The template view class associated with rendering the replay iframe"""
def get_top_frame(self, wb_url,
wb_prefix,
host_prefix,
@ -191,6 +294,18 @@ class TopFrameView(BaseInsertView):
replay_mod,
coll='',
extra_params=None):
"""
:param rewrite.wburl.WbUrl wb_url: The WbUrl for the request this template is being rendered for
:param str wb_prefix: The URL prefix pywb is serving the content using (e.g. http://localhost:8080/live/)
:param str host_prefix: The host URL prefix pywb is running on (e.g. http://localhost:8080)
:param dict env: The WSGI environment dictionary for the request this template is being rendered for
:param str frame_mod: The modifier to be used for framing (e.g. if_)
:param str replay_mod: The modifier to be used in the URL of the page being replayed (e.g. mp_)
:param str coll: The name of the collection this template is being rendered for
:param dict extra_params: Additional parameters to be supplied to the Jinja template render method
:return: The frame insert string
:rtype: str
"""
embed_url = wb_url.to_str(mod=replay_mod)
@ -227,7 +342,15 @@ class TopFrameView(BaseInsertView):
# ============================================================================
class PkgResResolver(Resolver):
"""Class for resolving pywb package resources when install via pypi or setup.py"""
def get_pkg_path(self, item):
"""Get the package path for the
:param str item: A resources full package path
:return: The netloc and path from the items package path
:rtype: tuple[str, str]
"""
if not isinstance(item, str):
return None

View File

@ -3,8 +3,8 @@
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
// the preserver instance for this worker
var preserver = null;
// the autofetcher instance for this worker
var autofetcher = null;
function noop() {}
@ -41,31 +41,25 @@ self.onmessage = function (event) {
var data = event.data;
switch (data.type) {
case 'values':
preserver.preserveMediaSrcset(data);
autofetcher.autofetchMediaSrcset(data);
break;
}
};
function pMap(p) {
// mapping function to ensure each fetch promises catch has a no op cb
return p.catch(noop);
}
function Preserver(prefix, mod) {
if (!(this instanceof Preserver)) {
return new Preserver(prefix, mod);
function AutoFetcher(init) {
if (!(this instanceof AutoFetcher)) {
return new AutoFetcher(init);
}
this.prefix = prefix;
this.mod = mod;
this.prefixMod = prefix + mod;
this.proxyMode = init.proxyMode;
this.prefix = init.prefix;
this.mod = init.mod;
this.prefixMod = init.prefix + init.mod;
// relative url, WorkerLocation is set by owning document
this.relative = prefix.split(location.origin)[1];
this.relative = init.prefix.split(location.origin)[1];
// schemeless url
this.schemeless = '/' + this.relative;
// local cache of URLs fetched, to reduce server load
this.seen = {};
// counter used to know when to clear seen (count > 2500)
this.seenCount = 0;
// array of promises returned by fetch(URL)
this.fetches = [];
// array of URL to be fetched
@ -76,7 +70,7 @@ function Preserver(prefix, mod) {
this.fetchDone = this.fetchDone.bind(this);
}
Preserver.prototype.fixupURL = function (url) {
AutoFetcher.prototype.fixupURL = function (url) {
// attempt to fix up the url and do our best to ensure we can get dat 200 OK!
if (url.indexOf(this.prefixMod) === 0) {
return url;
@ -93,57 +87,54 @@ Preserver.prototype.fixupURL = function (url) {
return url;
};
Preserver.prototype.safeFetch = function (url) {
AutoFetcher.prototype.safeFetch = function (url) {
var fixedURL = this.fixupURL(url);
// check to see if we have seen this url before in order
// to lessen the load against the server content is preserved from
// to lessen the load against the server content is fetched from
if (this.seen[url] != null) return;
this.seen[url] = true;
if (this.queuing) {
// we are currently waiting for a batch of fetches to complete
return this.queue.push(fixedURL);
}
// queue this urls fetch
this.fetches.push(fetch(fixedURL));
// fetch this url
this.fetches.push(fetch(url));
};
Preserver.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
// Same function as style_replacer in wombat.rewrite_style, n2 is our URL
this.safeFetch(n2);
return n1 + n2 + n3;
};
Preserver.prototype.fetchDone = function () {
// clear our fetches array in place
// https://www.ecma-international.org/ecma-262/9.0/index.html#sec-properties-of-array-instances-length
this.fetches.length = 0;
AutoFetcher.prototype.fetchDone = function () {
// indicate we no longer need to Q
this.queuing = false;
if (this.queue.length > 0) {
// we have a Q of some length drain it
this.drainQ();
} else if (this.seenCount > 2500) {
// we seen 2500 URLs so lets free some memory as at this point
// we will probably see some more. GC it!
this.seen = {};
this.seenCount = 0;
}
};
Preserver.prototype.fetchAll = function () {
AutoFetcher.prototype.fetchAll = function () {
// if we are queuing or have no fetches this is a no op
if (this.queuing) return;
if (this.fetches.length === 0) return;
// we are about to fetch queue anything that comes our way
this.queuing = true;
// initiate fetches by turning the initial fetch promises
// into rejctionless promises and "await" all
Promise.all(this.fetches.map(pMap))
// initiate fetches by turning the initial fetch promises
// into rejectionless promises and "await" all, clearing
// our fetches array in place
var runningFetchers = [];
while (this.fetches.length > 0) {
runningFetchers.push(this.fetches.shift().catch(noop))
}
Promise.all(runningFetchers)
.then(this.fetchDone)
.catch(this.fetchDone);
};
Preserver.prototype.drainQ = function () {
AutoFetcher.prototype.drainQ = function () {
// clear our Q in place and fill our fetches array
while (this.queue.length > 0) {
this.fetches.push(fetch(this.queue.shift()));
@ -152,17 +143,18 @@ Preserver.prototype.drainQ = function () {
this.fetchAll();
};
Preserver.prototype.extractMedia = function (mediaRules) {
AutoFetcher.prototype.extractMedia = function (mediaRules) {
// this is a broken down rewrite_style
if (mediaRules == null) return;
for (var i = 0; i < mediaRules.length; i++) {
var rule = mediaRules[i];
rule.replace(STYLE_REGEX, this.urlExtractor);
rule.replace(IMPORT_REGEX, this.urlExtractor);
if (mediaRules == null || mediaRules.values === null) return;
var rules = mediaRules.values;
for (var i = 0; i < rules.length; i++) {
var rule = rules[i];
rule.replace(STYLE_REGEX, this.urlExtractor)
.replace(IMPORT_REGEX, this.urlExtractor);
}
};
Preserver.prototype.extractSrcset = function (srcsets) {
AutoFetcher.prototype.extractSrcset = function (srcsets) {
if (srcsets == null || srcsets.values == null) return;
var srcsetValues = srcsets.values;
// was srcsets from rewrite_srcset and if so no need to split
@ -175,19 +167,21 @@ Preserver.prototype.extractSrcset = function (srcsets) {
this.safeFetch(srcset.split(' ')[0]);
} else {
// was from extract from local doc so we need to duplicate work
var values = srcset.split(srcsetSplit).filter(Boolean);
var values = srcset.split(srcsetSplit);
for (var j = 0; j < values.length; j++) {
var value = values[j].trim();
if (value.length > 0) {
this.safeFetch(value.split(' ')[0]);
if (Boolean(values[j])) {
var value = values[j].trim();
if (value.length > 0) {
this.safeFetch(value.split(' ')[0]);
}
}
}
}
}
};
Preserver.prototype.preserveMediaSrcset = function (data) {
// we got a message and now we preserve!
AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
// we got a message and now we autofetch!
// these calls turn into no ops if they have no work
this.extractMedia(data.media);
this.extractSrcset(data.srcset);
@ -197,9 +191,12 @@ Preserver.prototype.preserveMediaSrcset = function (data) {
// initialize ourselves from the query params :)
try {
var loc = new self.URL(location);
preserver = new Preserver(loc.searchParams.get('prefix'), loc.searchParams.get('mod'));
autofetcher = new AutoFetcher(JSON.parse(loc.searchParams.get('init')));
} catch (e) {
// likely we are in an older version of safari
var search = decodeURIComponent(location.search.split('?')[1]).split('&');
preserver = new Preserver(search[0].substr(search[0].indexOf('=') + 1), search[1].substr(search[1].indexOf('=') + 1));
var init = JSON.parse(search[0].substr(search[0].indexOf('=') + 1));
init.prefix = decodeURIComponent(init.prefix);
init.baseURI = decodeURIComponent(init.baseURI);
autofetcher = new AutoFetcher(init);
}

View File

@ -0,0 +1,192 @@
'use strict';
// thanks wombat
// Matches CSS `url(...)` references; the capture groups are (prefix)(URL)(suffix).
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
// Matches CSS `@import` statements; the capture groups are (prefix)(URL)(suffix).
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
// Separator pattern used to split a srcset attribute value into its candidates.
var srcsetSplit = /\s*(\S*\s+[\d.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
// the autofetcher instance for this worker
var autofetcher = null;
// No-op callback, used to swallow fetch rejections and as a placeholder in the polyfills.
function noop() {}
// Minimal stand-in for Promise, installed only when the worker scope has none.
// It does not track state: it exists so the promise-chaining code below can run.
if (typeof self.Promise === 'undefined') {
  // not kewl we must polyfill Promise
  self.Promise = function (executor) {
    // the executor runs synchronously; its resolve/reject arguments do nothing
    executor(noop, noop);
  };
  self.Promise.prototype.then = function (cb) {
    // the callback (if any) is invoked immediately, with no value
    if (cb) cb();
    return this;
  };
  self.Promise.prototype.catch = function () {
    // the rejection handler is never invoked
    return this;
  };
  self.Promise.all = function (values) {
    // `values` is ignored; a fresh stand-in promise is returned
    return new Promise(noop);
  };
}
// Minimal stand-in for fetch, installed only when the worker scope has none.
if (typeof self.fetch === 'undefined') {
  // not kewl we must polyfill fetch.
  self.fetch = function (url) {
    return new Promise(function (resolve) {
      // fire-and-forget GET: the promise resolves right after the request is
      // sent, without waiting for (or exposing) the response
      var xhr = new XMLHttpRequest();
      xhr.open('GET', url);
      xhr.send();
      resolve();
    });
  };
}
// Message entry point: only messages whose type is 'values' are acted on,
// every other message type is ignored.
self.onmessage = function (event) {
  var message = event.data;
  if (message.type === 'values') {
    autofetcher.autofetchMediaSrcset(message);
  }
};
// Constructor for the object that de-duplicates, batches and issues fetches.
function AutoFetcher() {
  // tolerate being called without `new`
  if (!(this instanceof AutoFetcher)) return new AutoFetcher();

  // URLs already requested, kept to reduce load on the server
  this.seen = {};
  // promises returned by fetch(URL) that have not yet been awaited
  this.fetches = [];
  // URLs waiting for the in-flight batch to finish
  this.queue = [];
  // true while a batch is in flight, in which case new URLs are queued
  this.queuing = false;
  // URL used to resolve relative URLs found in the cssText of CSS media rules
  this.currentResolver = null;

  // these two are handed out as callbacks, so pin their `this`
  this.urlExtractor = this.urlExtractor.bind(this);
  this.fetchDone = this.fetchDone.bind(this);
}
// Request `url` at most once: fetch it right away, or queue it when a batch
// of fetches is currently in flight. data: URLs are never requested.
// Returns the new queue length when the URL was queued, otherwise undefined.
AutoFetcher.prototype.safeFetch = function (url) {
  // ensure we do not request data urls
  if (url.indexOf('data:') === 0) return;
  // check to see if we have seen this url before in order to lessen the load
  // against the server content is fetched from. `seen` is a plain object, so
  // test own keys only: a URL such as "constructor" or "toString" would
  // otherwise match an inherited property and never be fetched
  if (Object.prototype.hasOwnProperty.call(this.seen, url)) return;
  this.seen[url] = true;
  if (this.queuing) {
    // we are currently waiting for a batch of fetches to complete
    return this.queue.push(url);
  }
  // fetch this url
  this.fetches.push(fetch(url));
};
// Resolve `url` against `resolver` without ever throwing.
// With no resolver, or when the URL constructor rejects the pair, the
// original `url` is returned unchanged.
AutoFetcher.prototype.safeResolve = function (url, resolver) {
  if (!resolver) return url;
  try {
    return new URL(url, resolver).href;
  } catch (e) {
    return url;
  }
};
// String.prototype.replace callback used with STYLE_REGEX / IMPORT_REGEX
// (same shape as style_replacer in wombat.rewrite_style); n2 is the URL.
// The URL is resolved against this.currentResolver, which safeResolve returns
// unchanged when resolution is not possible, and then requested. The matched
// text is returned unmodified.
AutoFetcher.prototype.urlExtractor = function (match, n1, n2, n3, offset, string) {
  var target = this.safeResolve(n2, this.currentResolver);
  if (target) this.safeFetch(target);
  return n1 + n2 + n3;
};
// Called once a batch of fetches has settled: stop queuing and, when URLs
// piled up in the meantime, start the next batch.
AutoFetcher.prototype.fetchDone = function () {
  this.queuing = false;
  if (this.queue.length === 0) return;
  this.drainQ();
};
// Await every pending fetch as one batch. While the batch is in flight
// safeFetch queues new URLs instead of fetching them.
AutoFetcher.prototype.fetchAll = function () {
  // nothing to do when a batch is already running or nothing is pending
  if (this.queuing || this.fetches.length === 0) return;
  this.queuing = true;
  // empty the fetches array in place, taking ownership of its promises
  var batch = this.fetches.splice(0, this.fetches.length);
  for (var i = 0; i < batch.length; i++) {
    // a single failed fetch must not reject the whole batch
    batch[i] = batch[i].catch(noop);
  }
  Promise.all(batch)
    .then(this.fetchDone)
    .catch(this.fetchDone);
};
// Move every queued URL into the pending fetches (emptying the queue in
// place) and start a new batch.
AutoFetcher.prototype.drainQ = function () {
  var queued = this.queue.splice(0, this.queue.length);
  for (var i = 0; i < queued.length; i++) {
    this.fetches.push(fetch(queued[i]));
  }
  this.fetchAll();
};
// Request every URL referenced by the supplied media rules (a broken down
// rewrite_style). Each entry provides `cssText` and the `resolve` URL used
// for its relative URLs.
AutoFetcher.prototype.extractMedia = function (mediaRules) {
  if (mediaRules == null) return;
  var total = mediaRules.length;
  for (var idx = 0; idx < total; idx++) {
    var mediaRule = mediaRules[idx];
    // urlExtractor reads the resolver from `this`, which avoids creating a
    // new callback per rule
    this.currentResolver = mediaRule.resolve;
    // replace() is used only to run urlExtractor on every match; the
    // resulting text is discarded
    mediaRule.cssText
      .replace(STYLE_REGEX, this.urlExtractor)
      .replace(IMPORT_REGEX, this.urlExtractor);
  }
};
// Request every image candidate URL found in the supplied srcset values.
// In proxy mode each entry carries the raw value of an element's srcset
// attribute (`srcset`) and the URL to resolve relative URLs against
// (`resolve`), so the rewrite_srcset splitting logic is recreated here.
AutoFetcher.prototype.extractSrcset = function (srcsets) {
  if (srcsets == null) return;
  var length = srcsets.length;
  for (var i = 0; i < length; i++) {
    var entry = srcsets[i];
    // skip entries that carry no srcset text rather than throwing on them
    if (entry == null || typeof entry.srcset !== 'string') continue;
    var candidates = entry.srcset.split(srcsetSplit);
    for (var j = 0; j < candidates.length; j++) {
      // split() yields undefined/empty items for unmatched capture groups
      if (!candidates[j]) continue;
      var candidate = candidates[j].trim();
      if (candidate.length === 0) continue;
      // the URL is separated from its descriptor by any whitespace (space,
      // tab or newline), so split on whitespace rather than a literal space
      var candidateURL = candidate.split(/\s+/)[0];
      // resolve without throwing; the raw URL is used when resolution fails
      var resolvedURL = this.safeResolve(candidateURL, entry.resolve);
      if (resolvedURL) {
        this.safeFetch(resolvedURL);
      }
    }
  }
};
// Handle one 'values' message: collect the URLs referenced by data.media and
// data.srcset, then await whatever fetches were started as a single batch.
AutoFetcher.prototype.autofetchMediaSrcset = function (data) {
  // we got a message and now we autofetch!
  // these calls turn into no ops if they have no work
  this.extractMedia(data.media);
  this.extractSrcset(data.srcset);
  // must run last: it batches the fetches started by the two calls above
  this.fetchAll();
};
// create the single AutoFetcher instance used by self.onmessage
autofetcher = new AutoFetcher();

View File

@ -78,9 +78,9 @@ var _WBWombat = function($wbwindow, wbinfo) {
var wb_setAttribute = $wbwindow.Element.prototype.setAttribute;
var wb_getAttribute = $wbwindow.Element.prototype.getAttribute;
var wb_funToString = Function.prototype.toString;
var WBPreserWorker;
var WBAutoFetchWorker;
var wbSheetMediaQChecker;
var wbUsePresWorker = $wbwindow.Worker != null && wbinfo.is_live;
var wbUseAAWorker = $wbwindow.Worker != null && wbinfo.is_live;
var wb_info;
@ -131,6 +131,11 @@ var _WBWombat = function($wbwindow, wbinfo) {
'TRACK': {'src': 'oe_'},
};
// pulled up rewrite_style and rewrite_srcset regex's as they are considered globals (uppercase)
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
var SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
function rwModForElement(elem, attrName) {
// this function was created to help add in retrial of element attribute rewrite modifiers
if (!elem) {
@ -1329,85 +1334,91 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
//============================================
function initPreserveWorker() {
if (!wbUsePresWorker) {
function initAutoFetchWorker() {
if (!wbUseAAWorker) {
return;
}
var Preserver = (function(Worker) {
function PWorker(prefix, mod) {
if (!(this instanceof PWorker)) {
return new PWorker(prefix, mod);
}
if ($wbwindow === $wbwindow.__WB_replay_top) {
// we are top and can will own this worker
// setup URL for the kewl case
var isTop = $wbwindow === $wbwindow.__WB_replay_top;
function AutoFetchWorker(prefix, mod) {
if (!(this instanceof AutoFetchWorker)) {
return new AutoFetchWorker(prefix, mod);
}
this.checkIntervalCB = this.checkIntervalCB.bind(this);
if (isTop) {
// we are top and will own this worker
// setup URL for the kewl case
// Normal replay and preservation mode pworker setup, its all one origin so YAY!
var workerURL = wbinfo.static_prefix +
'wombatPreservationWorker.js?prefix=' +
encodeURIComponent(prefix) + '&mod=' +
encodeURIComponent(mod);
this.worker = new Worker(workerURL);
} else {
this.worker = null;
'autoFetchWorker.js?init='+
encodeURIComponent(JSON.stringify({ 'mod': mod, 'prefix': prefix }));
this.worker = new $wbwindow.Worker(workerURL);
} else {
// add only the portions of the worker interface we use since we are not top and if in proxy mode start check polling
this.worker = {
"postMessage": function (msg) {
if (!msg.wb_type) {
msg = { 'wb_type': 'aaworker', 'msg': msg };
}
$wbwindow.__WB_replay_top.__orig_postMessage(msg, '*');
},
"terminate": function () {}
};
}
}
AutoFetchWorker.prototype.checkIntervalCB = function () {
this.extractFromLocalDoc();
};
AutoFetchWorker.prototype.deferredSheetExtraction = function (sheet) {
var rules = sheet.cssRules || sheet.rules;
// if no rules this a no op
if (!rules || rules.length === 0) return;
var self = this;
function extract() {
// loop through each rule of the stylesheet
var media = [];
for (var j = 0; j < rules.length; ++j) {
var rule = rules[j];
if (rule.type === CSSRule.MEDIA_RULE) {
// we are a media rule so get its text
media.push(rule.cssText);
}
}
if (media.length > 0) {
// we have some media rules to preserve
self.preserveMedia(media);
}
}
// defer things until next time the Promise.resolve Qs are cleared
$wbwindow.Promise.resolve().then(extract);
};
PWorker.prototype.deferredSheetExtraction = function(sheet) {
var rules = sheet.cssRules || sheet.rules;
// if no rules this a no op
if (!rules || rules.length === 0) return;
function extract() {
// loop through each rule of the stylesheet
var media = [];
for (var j = 0; j < rules.length; ++j) {
var rule = rules[j];
if (rule instanceof CSSMediaRule) {
// we are a media rule so get its text
media.push(rule.cssText);
}
}
if (media.length > 0) {
// we have some media rules to preserve
WBPreserWorker.preserveMedia(media);
}
}
// defer things until next time the Promise.resolve Qs are cleared
$wbwindow.Promise.resolve().then(extract);
};
AutoFetchWorker.prototype.terminate = function () {
// terminate the worker, a no op when not replay top
this.worker.terminate();
};
PWorker.prototype.terminate = function() {
// terminate the worker, a no op when not replay top
if ($wbwindow === $wbwindow.__WB_replay_top) {
this.worker.terminate();
}
};
AutoFetchWorker.prototype.postMessage = function (msg) {
this.worker.postMessage(msg);
};
PWorker.prototype.postMessage = function(msg) {
if ($wbwindow === $wbwindow.__WB_replay_top) {
// we are actually replay top so send directly to worker
this.worker.postMessage(msg);
} else {
// send message to replay top
$wbwindow.__WB_replay_top.__orig_postMessage({
'wb_type': 'pworker', 'msg': msg,
}, '*');
}
};
AutoFetchWorker.prototype.preserveSrcset = function (srcset) {
// send values from rewrite_srcset to the worker
this.postMessage({
'type': 'values',
'srcset': {'values': srcset, 'presplit': true},
});
};
PWorker.prototype.preserveSrcset = function(srcset) {
// send values from rewrite_srcset to the worker
this.postMessage({
'type': 'values',
'srcset': {'values': srcset, 'presplit': true},
});
};
AutoFetchWorker.prototype.preserveMedia = function (media) {
// send CSSMediaRule values to the worker
this.postMessage({'type': 'values', 'media': media})
};
PWorker.prototype.preserveMedia = function(media) {
// send CSSMediaRule values to the worker
this.postMessage({'type': 'values', 'media': media})
};
PWorker.prototype.extractFromLocalDoc = function() {
AutoFetchWorker.prototype.extractFromLocalDoc = function () {
// get the values to be preserved from the documents stylesheets
// and all elements with a srcset
var media = [];
@ -1415,20 +1426,19 @@ var _WBWombat = function($wbwindow, wbinfo) {
var sheets = $wbwindow.document.styleSheets;
var i = 0;
for (; i < sheets.length; ++i) {
var sheet = sheets[i];
var rules = sheet.cssRules;
var rules = sheets[i].cssRules;
for (var j = 0; j < rules.length; ++j) {
var rule = rules[j];
if (rule instanceof CSSMediaRule) {
if (rule.type === CSSRule.MEDIA_RULE) {
media.push(rule.cssText);
}
}
}
var srcsetElems = $wbwindow.document.querySelectorAll('*[srcset]');
var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
for (i = 0; i < srcsetElems.length; i++) {
var srcsetElem = srcsetElems[i];
if (wb_getAttribute) {
srcset.push(wb_getAttribute.call(srcsetElem,'srcset'));
srcset.push(wb_getAttribute.call(srcsetElem, 'srcset'));
} else {
srcset.push(srcsetElem.getAttribute('srcset'));
}
@ -1440,18 +1450,15 @@ var _WBWombat = function($wbwindow, wbinfo) {
});
};
return PWorker;
})($wbwindow.Worker);
WBAutoFetchWorker = new AutoFetchWorker(wb_abs_prefix, wbinfo.mod);
WBPreserWorker = new Preserver(wb_abs_prefix, wbinfo.mod);
wbSheetMediaQChecker = function checkStyle () {
wbSheetMediaQChecker = function checkStyle() {
// used only for link[rel='stylesheet'] so we remove our listener
this.removeEventListener('load', wbSheetMediaQChecker);
// check no op condition
if (this.sheet == null) return;
// defer extraction to be nice :)
WBPreserWorker.deferredSheetExtraction(this.sheet);
WBAutoFetchWorker.deferredSheetExtraction(this.sheet);
};
}
@ -1612,10 +1619,6 @@ var _WBWombat = function($wbwindow, wbinfo) {
//============================================
function rewrite_style(value)
{
var STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
var IMPORT_REGEX = /(@import\s+[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
function style_replacer(match, n1, n2, n3, offset, string) {
return n1 + rewrite_url(n2) + n3;
}
@ -1645,14 +1648,14 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
// Filter removes non-truthy values like null, undefined, and ""
var values = value.split(/\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/).filter(Boolean);
var values = value.split(SRCSET_REGEX).filter(Boolean);
for (var i = 0; i < values.length; i++) {
values[i] = rewrite_url(values[i].trim());
}
if (wbUsePresWorker) {
if (wbUseAAWorker) {
// send post split values to preservation worker
WBPreserWorker.preserveSrcset(values);
WBAutoFetchWorker.preserveSrcset(values);
}
return values.join(", ");
}
@ -1756,16 +1759,16 @@ var _WBWombat = function($wbwindow, wbinfo) {
if (elem.textContent !== new_content) {
elem.textContent = new_content;
changed = true;
if (wbUsePresWorker && elem.sheet != null) {
if (wbUseAAWorker && elem.sheet != null) {
// we have a stylesheet so lets be nice to UI thread
// and defer extraction
WBPreserWorker.deferredSheetExtraction(elem.sheet);
WBAutoFetchWorker.deferredSheetExtraction(elem.sheet);
}
}
break;
case 'LINK':
changed = rewrite_attr(elem, 'href');
if (wbUsePresWorker && elem.rel === 'stylesheet') {
if (wbUseAAWorker && elem.rel === 'stylesheet') {
// we can only check link[rel='stylesheet'] when it loads
elem.addEventListener('load', wbSheetMediaQChecker);
}
@ -2194,9 +2197,9 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
}
orig_setter.call(this, res);
if (wbUsePresWorker && this.tagName === 'STYLE' && this.sheet != null) {
if (wbUseAAWorker && this.tagName === 'STYLE' && this.sheet != null) {
// gotta preserve all the things
WBPreserWorker.deferredSheetExtraction(this.sheet);
WBAutoFetchWorker.deferredSheetExtraction(this.sheet);
}
};
@ -3602,140 +3605,138 @@ var _WBWombat = function($wbwindow, wbinfo) {
init_wombat_loc($wbwindow);
// archival mode: init url-rewriting intercepts
if (!wb_is_proxy) {
init_wombat_top($wbwindow);
init_wombat_top($wbwindow);
// updated wb_unrewrite_rx for imgur.com
var wb_origin = $wbwindow.__WB_replay_top.location.origin;
var wb_host = $wbwindow.__WB_replay_top.location.host;
var wb_proto = $wbwindow.__WB_replay_top.location.protocol;
if (wb_replay_prefix && wb_replay_prefix.indexOf(wb_origin) == 0) {
wb_rel_prefix = wb_replay_prefix.substring(wb_origin.length);
} else {
wb_rel_prefix = wb_replay_prefix;
}
// make the protocol and host optional now
var rx = "((" + wb_proto + ")?\/\/" + wb_host + ")?" + wb_rel_prefix + "[^/]+/";
wb_unrewrite_rx = new RegExp(rx, "g");
// History
init_history_overrides();
// Doc Title
init_doc_title_override();
// postMessage
// OPT skip
if (!wb_opts.skip_postmessage) {
init_postmessage_override($wbwindow);
init_messageevent_override($wbwindow);
}
initMouseEventOverride($wbwindow);
init_hash_change();
// write
init_write_override();
// eval
//init_eval_override();
// Ajax
init_ajax_rewrite();
// Fetch
init_fetch_rewrite();
init_request_override();
// Audio
init_audio_override();
// FontFace
initFontFaceOverride($wbwindow);
// Worker override (experimental)
initPreserveWorker();
init_web_worker_override();
init_service_worker_override();
initSharedWorkerOverride();
// innerHTML can be overriden on prototype!
override_html_assign($wbwindow.HTMLElement, "innerHTML", true);
override_html_assign($wbwindow.HTMLElement, "outerHTML", true);
override_html_assign($wbwindow.HTMLIFrameElement, "srcdoc", true);
override_html_assign($wbwindow.HTMLStyleElement, "textContent");
// Document.URL override
override_prop_extract($wbwindow.Document.prototype, "URL");
override_prop_extract($wbwindow.Document.prototype, "documentURI");
// Node.baseURI override
override_prop_extract($wbwindow.Node.prototype, "baseURI");
// Attr nodeValue and value
override_attr_props();
// init insertAdjacentHTML() override
init_insertAdjacentHTML_override();
initInsertAdjacentElementOverride();
// iframe.contentWindow and iframe.contentDocument overrides to
// ensure wombat is inited on the iframe $wbwindow!
override_iframe_content_access("contentWindow");
override_iframe_content_access("contentDocument");
// override funcs to convert first arg proxy->obj
override_func_first_arg_proxy_to_obj($wbwindow.MutationObserver, "observe");
override_func_first_arg_proxy_to_obj($wbwindow.Node, "compareDocumentPosition");
override_func_first_arg_proxy_to_obj($wbwindow.Node, "contains");
override_func_first_arg_proxy_to_obj($wbwindow.Document, "createTreeWalker");
override_func_this_proxy_to_obj($wbwindow, "getComputedStyle", $wbwindow);
//override_func_this_proxy_to_obj($wbwindow.EventTarget, "addEventListener");
//override_func_this_proxy_to_obj($wbwindow.EventTarget, "removeEventListener");
override_apply_func($wbwindow);
initTimeoutIntervalOverrides($wbwindow, "setTimeout");
initTimeoutIntervalOverrides($wbwindow, "setInterval");
override_frames_access($wbwindow);
// setAttribute
if (!wb_opts.skip_setAttribute) {
init_setAttribute_override();
init_getAttribute_override();
}
init_svg_image_overrides();
// override href and src attrs
init_attr_overrides();
// Cookies
init_cookies_override();
// ensure namespace urls are NOT rewritten
init_createElementNS_fix();
// Image
//init_image_override();
// DOM
// OPT skip
if (!wb_opts.skip_dom) {
init_dom_override();
}
// registerProtocolHandler override
init_registerPH_override();
//sendBeacon override
init_beacon_override();
// updated wb_unrewrite_rx for imgur.com
var wb_origin = $wbwindow.__WB_replay_top.location.origin;
var wb_host = $wbwindow.__WB_replay_top.location.host;
var wb_proto = $wbwindow.__WB_replay_top.location.protocol;
if (wb_replay_prefix && wb_replay_prefix.indexOf(wb_origin) == 0) {
wb_rel_prefix = wb_replay_prefix.substring(wb_origin.length);
} else {
wb_rel_prefix = wb_replay_prefix;
}
// make the protocol and host optional now
var rx = "((" + wb_proto + ")?\/\/" + wb_host + ")?" + wb_rel_prefix + "[^/]+/";
wb_unrewrite_rx = new RegExp(rx, "g");
// History
init_history_overrides();
// Doc Title
init_doc_title_override();
// postMessage
// OPT skip
if (!wb_opts.skip_postmessage) {
init_postmessage_override($wbwindow);
init_messageevent_override($wbwindow);
}
initMouseEventOverride($wbwindow);
init_hash_change();
// write
init_write_override();
// eval
//init_eval_override();
// Ajax
init_ajax_rewrite();
// Fetch
init_fetch_rewrite();
init_request_override();
// Audio
init_audio_override();
// FontFace
initFontFaceOverride($wbwindow);
// Worker override (experimental)
initAutoFetchWorker();
init_web_worker_override();
init_service_worker_override();
initSharedWorkerOverride();
// innerHTML can be overriden on prototype!
override_html_assign($wbwindow.HTMLElement, "innerHTML", true);
override_html_assign($wbwindow.HTMLElement, "outerHTML", true);
override_html_assign($wbwindow.HTMLIFrameElement, "srcdoc", true);
override_html_assign($wbwindow.HTMLStyleElement, "textContent");
// Document.URL override
override_prop_extract($wbwindow.Document.prototype, "URL");
override_prop_extract($wbwindow.Document.prototype, "documentURI");
// Node.baseURI override
override_prop_extract($wbwindow.Node.prototype, "baseURI");
// Attr nodeValue and value
override_attr_props();
// init insertAdjacentHTML() override
init_insertAdjacentHTML_override();
initInsertAdjacentElementOverride();
// iframe.contentWindow and iframe.contentDocument overrides to
// ensure wombat is inited on the iframe $wbwindow!
override_iframe_content_access("contentWindow");
override_iframe_content_access("contentDocument");
// override funcs to convert first arg proxy->obj
override_func_first_arg_proxy_to_obj($wbwindow.MutationObserver, "observe");
override_func_first_arg_proxy_to_obj($wbwindow.Node, "compareDocumentPosition");
override_func_first_arg_proxy_to_obj($wbwindow.Node, "contains");
override_func_first_arg_proxy_to_obj($wbwindow.Document, "createTreeWalker");
override_func_this_proxy_to_obj($wbwindow, "getComputedStyle", $wbwindow);
//override_func_this_proxy_to_obj($wbwindow.EventTarget, "addEventListener");
//override_func_this_proxy_to_obj($wbwindow.EventTarget, "removeEventListener");
override_apply_func($wbwindow);
initTimeoutIntervalOverrides($wbwindow, "setTimeout");
initTimeoutIntervalOverrides($wbwindow, "setInterval");
override_frames_access($wbwindow);
// setAttribute
if (!wb_opts.skip_setAttribute) {
init_setAttribute_override();
init_getAttribute_override();
}
init_svg_image_overrides();
// override href and src attrs
init_attr_overrides();
// Cookies
init_cookies_override();
// ensure namespace urls are NOT rewritten
init_createElementNS_fix();
// Image
//init_image_override();
// DOM
// OPT skip
if (!wb_opts.skip_dom) {
init_dom_override();
}
// registerProtocolHandler override
init_registerPH_override();
//sendBeacon override
init_beacon_override();
// other overrides
// proxy mode: only using these overrides
@ -3765,13 +3766,13 @@ var _WBWombat = function($wbwindow, wbinfo) {
init_document_obj_proxy($wbwindow.document);
// expose functions
var obj = {}
var obj = {};
obj.extract_orig = extract_orig;
obj.rewrite_url = rewrite_url;
obj.watch_elem = watch_elem;
obj.init_new_window_wombat = init_new_window_wombat;
obj.init_paths = init_paths;
obj.local_init = function(name) {
obj.local_init = function (name) {
var res = $wbwindow._WB_wombat_obj_proxy[name];
if (name === "document" && res && !res._WB_wombat_obj_proxy) {
return init_document_obj_proxy(res) || res;
@ -3812,8 +3813,8 @@ var _WBWombat = function($wbwindow, wbinfo) {
return;
}
if ($wbwindow.document.readyState === "complete" && wbUsePresWorker) {
WBPreserWorker.extractFromLocalDoc();
if ($wbwindow.document.readyState === "complete" && wbUseAAWorker) {
WBAutoFetchWorker.extractFromLocalDoc();
}
if ($wbwindow != $wbwindow.__WB_replay_top) {
@ -3925,10 +3926,10 @@ var _WBWombat = function($wbwindow, wbinfo) {
// Fix .parent only if not embeddable, otherwise leave for accessing embedding window
if (!wb_opts.embedded && (replay_top == $wbwindow)) {
if (wbUsePresWorker) {
if (wbUseAAWorker) {
$wbwindow.addEventListener("message", function(event) {
if (event.data && event.data.wb_type === 'pworker') {
WBPreserWorker.postMessage(event.data.msg);
if (event.data && event.data.wb_type === 'aaworker') {
WBAutoFetchWorker.postMessage(event.data.msg);
}
}, false);
}
@ -3982,8 +3983,6 @@ var _WBWombat = function($wbwindow, wbinfo) {
}
// Utility functions used by rewriting rules
function watch_elem(elem, func)
{

View File

@ -0,0 +1,376 @@
/*
Copyright(c) 2013-2018 Rhizome and Ilya Kreymer. Released under the GNU General Public License.
This file is part of pywb, https://github.com/webrecorder/pywb
pywb is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pywb is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/
//============================================
// Wombat JS-Rewriting Library v2.53
//============================================
// Wombat lite for proxy-mode
var _WBWombat = function ($wbwindow, wbinfo) {
// Globals
var wb_info = wbinfo;
wb_info.top_host = wb_info.top_host || "*";
wbinfo.wombat_opts = wbinfo.wombat_opts || {};
var wbAutoFetchWorkerPrefix = (wb_info.auto_fetch_worker_prefix || wb_info.static_prefix) + 'autoFetchWorkerProxyMode.js';
var WBAutoFetchWorker;
/**
 * Make Math.random on the wrapped window deterministic.
 *
 * Installs a small linear congruential generator whose state is kept on
 * `$wbwindow.Math.seed`, so the same seed always yields the same sequence.
 * Adapted from:
 * http://indiegamr.com/generate-repeatable-random-numbers-in-js/
 *
 * @param {string|number} seed - initial generator state (run through parseInt)
 */
function init_seeded_random(seed) {
    $wbwindow.Math.seed = parseInt(seed);

    $wbwindow.Math.random = function seeded_random() {
        // advance the generator state, then scale it into [0, 1)
        var state = ($wbwindow.Math.seed * 9301 + 49297) % 233280;
        $wbwindow.Math.seed = state;
        return state / 233280;
    };
}
/**
 * Route crypto.getRandomValues through the wrapped window's Math.random.
 *
 * Every slot of the supplied array is filled with Math.random() scaled to
 * the unsigned 32 bit range. Does nothing when the window exposes neither
 * `crypto` nor `Crypto`.
 */
function init_crypto_random() {
    var hasCrypto = $wbwindow.crypto && $wbwindow.Crypto;
    if (!hasCrypto) {
        return;
    }

    var new_getrandom = function (array) {
        // fill front to back so values are drawn in a stable order
        for (var idx = 0; idx < array.length; idx++) {
            array[idx] = parseInt($wbwindow.Math.random() * 4294967296);
        }
        return array;
    };

    // patch both the prototype and the instance
    $wbwindow.Crypto.prototype.getRandomValues = new_getrandom;
    $wbwindow.crypto.getRandomValues = new_getrandom;
}
//============================================
/**
 * Pin devicePixelRatio of the wrapped window to 1.
 *
 * The value is first assigned directly, then (where Object.defineProperty
 * exists) redefined as non-writable so page scripts cannot change it.
 * A failure to redefine the property is ignored.
 */
function init_fixed_ratio() {
    // plain assignment always happens, even if locking the value fails below
    $wbwindow.devicePixelRatio = 1;

    if (!Object.defineProperty) {
        return;
    }

    try {
        // fixed pix ratio, read-only from now on
        Object.defineProperty($wbwindow, "devicePixelRatio", {value: 1, writable: false});
    } catch (e) {
        // best effort only: the plain assignment above still applies
    }
}
//========================================
/**
 * Shift the wrapped window's clock so that "now" is the given timestamp.
 *
 * Replaces `$wbwindow.Date` with a wrapper whose zero-argument form and
 * whose `now()` both return the real time minus a fixed offset, computed
 * once as (current time - timestamp). All other forms are forwarded to the
 * original Date. The original `Date.now` is stored on
 * `$wbwindow.__wb_Date_now`; if that is already set the function returns
 * without doing anything.
 *
 * @param {string|number} timestamp - target time in seconds (already UTC)
 */
function init_date_override(timestamp) {
    // seconds -> milliseconds
    timestamp = parseInt(timestamp) * 1000;

    // the timestamp is already UTC, so no timezone correction is applied
    var timezone = 0;
    var start_now = $wbwindow.Date.now();
    var timediff = start_now - (timestamp - timezone);

    // already overridden for this window
    if ($wbwindow.__wb_Date_now) {
        return;
    }

    var orig_date = $wbwindow.Date;
    var orig_utc = orig_date.UTC;
    var orig_parse = orig_date.parse;
    var orig_now = orig_date.now;

    $wbwindow.__wb_Date_now = orig_now;

    $wbwindow.Date = (function (NativeDate) {
        return function (A, B, C, D, E, F, G) {
            // apply() cannot be used with constructors and Date mishandles
            // explicit undefined args, so dispatch on the first undefined
            // argument and call the matching arity directly
            if (A === undefined) return new NativeDate(orig_now() - timediff);
            if (B === undefined) return new NativeDate(A);
            if (C === undefined) return new NativeDate(A, B);
            if (D === undefined) return new NativeDate(A, B, C);
            if (E === undefined) return new NativeDate(A, B, C, D);
            if (F === undefined) return new NativeDate(A, B, C, D, E);
            if (G === undefined) return new NativeDate(A, B, C, D, E, F);
            return new NativeDate(A, B, C, D, E, F, G);
        };
    })($wbwindow.Date);

    // share the original prototype so instanceof and methods keep working
    $wbwindow.Date.prototype = orig_date.prototype;

    $wbwindow.Date.now = function () {
        return orig_now() - timediff;
    };
    $wbwindow.Date.UTC = orig_utc;
    $wbwindow.Date.parse = orig_parse;
    $wbwindow.Date.__WB_timediff = timediff;

    Object.defineProperty($wbwindow.Date.prototype, "constructor", {value: $wbwindow.Date});
}
//============================================
/**
 * Disable the Notification permission prompt and the Geolocation API on the
 * wrapped window.
 *
 * - `Notification.requestPermission` always reports "denied", both through
 *   the optional callback and through the returned promise.
 * - `getCurrentPosition` / `watchPosition` immediately invoke the error
 *   callback (when one is supplied) with code 2, "not available".
 *
 * The Geolocation API is exposed as `navigator.geolocation`; the previous
 * code only looked at `window.geolocation`, which browsers do not define,
 * so the geolocation override was never installed. The legacy lookup is
 * kept in case an embedding environment provides it. The wrapped window
 * (`$wbwindow`) is used instead of the global `window`, consistent with the
 * other overrides in this file.
 */
function init_disable_notifications() {
    if ($wbwindow.Notification) {
        $wbwindow.Notification.requestPermission = function (callback) {
            if (callback) {
                callback("denied");
            }
            return Promise.resolve("denied");
        };
    }

    var disabled = function (success, error, options) {
        if (error) {
            error({"code": 2, "message": "not available"});
        }
    };

    // replace the position lookups on a geolocation object, if it exists
    function disableGeolocation(geo) {
        if (!geo) {
            return;
        }
        geo.getCurrentPosition = disabled;
        geo.watchPosition = disabled;
    }

    disableGeolocation($wbwindow.geolocation);

    if ($wbwindow.navigator) {
        disableGeolocation($wbwindow.navigator.geolocation);
    }
}
/**
 * Set up the proxy-mode auto-fetch worker for the wrapped window and store
 * the controller in `WBAutoFetchWorker`.
 *
 * In the top frame the worker script is fetched from
 * `wbAutoFetchWorkerPrefix`, turned into a Blob URL and started as a Worker.
 * In any other frame no Worker is created; extracted values are forwarded
 * to the top window via postMessage and relayed to the worker there.
 *
 * Once the document is complete, and every `checkIntervalTime` ms after
 * that, the controller collects the CSS media rules and img srcset values
 * of the document and posts them to the worker.
 *
 * Does nothing when the window has no Worker support.
 */
function initAutoFetchWorker() {
    if (!$wbwindow.Worker) {
        return;
    }

    var isTop = $wbwindow.self === $wbwindow.top;
    // id of the <style> element used to parse fetched cross origin css
    var STYLE_PARSER_ID = '$wrStyleParser$';

    function AutoFetchWorker() {
        if (!(this instanceof AutoFetchWorker)) {
            return new AutoFetchWorker();
        }

        this.checkIntervalTime = 15000;
        this.checkIntervalCB = this.checkIntervalCB.bind(this);
        this.styleTag = null;
        this.worker = null;
        // messages that arrive (e.g. from child frames) while the worker
        // script is still being fetched are held here and sent once it runs
        this.pendingMessages = [];
        // set when the worker script could not be loaded at all
        this.workerUnavailable = false;

        if (isTop) {
            // Cannot directly load our worker from the proxy origin into the current origin
            // however we fetch it from proxy origin and can blob it into the current origin :)
            var self = this;
            fetch(wbAutoFetchWorkerPrefix)
                .then(function (res) {
                    if (!res.ok) {
                        throw new Error('could not load the auto-fetch worker: ' + res.status);
                    }
                    return res.text();
                })
                .then(function (text) {
                    var blob = new Blob([text], {"type": "text/javascript"});
                    self.worker = new $wbwindow.Worker(URL.createObjectURL(blob));
                    self.ensureStyleTag();
                    self.flushPendingMessages();
                    self.startCheckingInterval();
                })
                .catch(function (error) {
                    // without a worker there is nothing to deliver to, so
                    // stop queueing instead of growing the queue forever
                    if (!self.worker) {
                        self.workerUnavailable = true;
                        self.pendingMessages = [];
                    }
                });
        } else {
            // add only the portions of the worker interface we use since we are not top and if in proxy mode start check polling
            this.worker = {
                "postMessage": function (msg) {
                    if (!msg.wb_type) {
                        msg = {'wb_type': 'aaworker', 'msg': msg};
                    }
                    $wbwindow.top.postMessage(msg, '*');
                },
                "terminate": function () {}
            };
            this.startCheckingInterval();
        }
    }

    // Return the <style> element used for parsing fetched css, creating and
    // attaching it on first use. It is needed in every frame that has to
    // parse a cross origin sheet, not only in the top frame.
    AutoFetchWorker.prototype.ensureStyleTag = function () {
        if (!this.styleTag) {
            var doc = $wbwindow.document;
            this.styleTag = doc.createElement('style');
            this.styleTag.id = STYLE_PARSER_ID;
            doc.documentElement.appendChild(this.styleTag);
        }
        return this.styleTag;
    };

    // Deliver everything queued while the worker was loading, oldest first.
    AutoFetchWorker.prototype.flushPendingMessages = function () {
        var queued = this.pendingMessages;
        this.pendingMessages = [];
        for (var i = 0; i < queued.length; i++) {
            this.worker.postMessage(queued[i]);
        }
    };

    AutoFetchWorker.prototype.startCheckingInterval = function () {
        // if document ready state is complete do first extraction and start check polling
        // otherwise wait for document ready state to complete to extract and start check polling
        var self = this;
        if ($wbwindow.document.readyState === "complete") {
            this.extractFromLocalDoc();
            setInterval(this.checkIntervalCB, this.checkIntervalTime);
        } else {
            var i = setInterval(function () {
                if ($wbwindow.document.readyState === "complete") {
                    self.extractFromLocalDoc();
                    clearInterval(i);
                    setInterval(self.checkIntervalCB, self.checkIntervalTime);
                }
            }, 1000);
        }
    };

    AutoFetchWorker.prototype.checkIntervalCB = function () {
        this.extractFromLocalDoc();
    };

    AutoFetchWorker.prototype.terminate = function () {
        // terminate the worker, a no op when not replay top
        // or when the worker has not been created (yet)
        this.pendingMessages = [];
        if (this.worker) {
            this.worker.terminate();
        }
    };

    AutoFetchWorker.prototype.postMessage = function (msg) {
        if (this.worker) {
            this.worker.postMessage(msg);
        } else if (!this.workerUnavailable) {
            // worker script still loading: keep the message for later
            this.pendingMessages.push(msg);
        }
    };

    AutoFetchWorker.prototype.extractMediaRules = function (rules, href) {
        // We are in proxy mode and must include a URL to resolve relative URLs in media rules
        if (!rules) return [];
        var rvlen = rules.length;
        var text = [];
        var rule;
        for (var i = 0; i < rvlen; ++i) {
            rule = rules[i];
            if (rule.type === CSSRule.MEDIA_RULE) {
                text.push({"cssText": rule.cssText, "resolve": href});
            }
        }
        return text;
    };

    AutoFetchWorker.prototype.corsCSSFetch = function (href) {
        // because this JS in proxy mode operates as it would on the live web
        // the rules of CORS apply and we cannot rely on URLs being rewritten correctly
        // fetch the cross origin css file and then parse it using a style tag to get the rules
        var url = $wbwindow.location.protocol + '//' + wb_info.proxy_magic + '/proxy-fetch/' + href;
        var aaw = this;
        return fetch(url).then(function (res) {
            return res.text();
        }).then(function (text) {
            var styleTag = aaw.ensureStyleTag();
            styleTag.textContent = text;
            var sheet = styleTag.sheet || {};
            return aaw.extractMediaRules(sheet.cssRules || sheet.rules, href);
        }).catch(function (error) {
            // best effort: an unreachable or unparsable sheet yields no rules
            return [];
        });
    };

    AutoFetchWorker.prototype.shouldSkipSheet = function (sheet) {
        // we skip extracting rules from sheets if they are from our parsing style or come from pywb
        // the id lives on the owning <style> element, not on the sheet itself
        var owner = sheet.ownerNode;
        if (sheet.id === STYLE_PARSER_ID || (owner && owner.id === STYLE_PARSER_ID)) return true;
        return !!(sheet.href && sheet.href.indexOf(wb_info.proxy_magic) !== -1);
    };

    AutoFetchWorker.prototype.extractFromLocalDoc = function () {
        var i = 0;
        var media = [];
        var deferredMediaURLS = [];
        var srcset = [];
        var sheet;
        var resolve;
        // We must use the window reference passed to us to access this origins stylesheets
        var styleSheets = $wbwindow.document.styleSheets;
        for (; i < styleSheets.length; ++i) {
            sheet = styleSheets[i];
            // if the sheet belongs to our parser node we must skip it
            if (!this.shouldSkipSheet(sheet)) {
                try {
                    // if no error is thrown due to cross origin sheet the urls then just add
                    // the resolved URLS if any to the media urls array
                    if (sheet.cssRules != null) {
                        resolve = sheet.href || $wbwindow.document.baseURI;
                        media = media.concat(this.extractMediaRules(sheet.cssRules, resolve));
                    } else if (sheet.href != null) {
                        // depending on the browser cross origin stylesheets will have their
                        // cssRules property null but href non-null
                        deferredMediaURLS.push(this.corsCSSFetch(sheet.href));
                    }
                } catch (error) {
                    // the stylesheet is cross origin and we must re-fetch via PYWB to get the contents for checking
                    // (only possible when the sheet actually has a URL)
                    if (sheet.href) {
                        deferredMediaURLS.push(this.corsCSSFetch(sheet.href));
                    }
                }
            }
        }
        // We must use the window reference passed to us to access this origins elements with srcset attr
        // like cssRule handling we must include a URL to resolve relative URLs by
        var srcsetElems = $wbwindow.document.querySelectorAll('img[srcset]');
        var ssElem, ssSrc, resolveAgainst;
        for (i = 0; i < srcsetElems.length; i++) {
            ssElem = srcsetElems[i];
            ssSrc = ssElem.src;
            // an image without a usable src resolves against the document instead
            resolveAgainst = (typeof ssSrc === 'string' && ssSrc.trim() !== '') ? ssSrc : $wbwindow.document.baseURI;
            srcset.push({'srcset': ssElem.srcset, 'resolve': resolveAgainst});
        }

        // send what we have extracted, if anything, to the worker for processing
        if (media.length > 0 || srcset.length > 0) {
            this.postMessage({'type': 'values', 'media': media, 'srcset': srcset});
        }

        if (deferredMediaURLS.length > 0) {
            // wait for all our deferred fetching and extraction of cross origin
            // stylesheets to complete and then send those values, if any, to the worker
            var aaw = this;
            Promise.all(deferredMediaURLS).then(function (values) {
                var results = [];
                while (values.length > 0) {
                    results = results.concat(values.shift());
                }
                if (results.length > 0) {
                    aaw.postMessage({'type': 'values', 'media': results});
                }
            });
        }
    };

    WBAutoFetchWorker = new AutoFetchWorker();

    if (isTop) {
        // relay values extracted by child frames to the worker
        $wbwindow.addEventListener("message", function (event) {
            if (event.data && event.data.wb_type === 'aaworker') {
                WBAutoFetchWorker.postMessage(event.data.msg);
            }
        }, false);
    }
}
// Initialization for proxy mode, run once per _WBWombat() call.
// The auto-fetch worker is started only when it is enabled
// (wbinfo.use_auto_fetch_worker) and wbinfo.is_live is set.
if (wbinfo.use_auto_fetch_worker && wbinfo.is_live) {
initAutoFetchWorker();
}
// The overrides below are installed only when wbinfo.use_wombat is set.
if (wbinfo.use_wombat) {
// proxy mode overrides
// Random: seeded with wbinfo.wombat_sec
init_seeded_random(wbinfo.wombat_sec);
// Crypto Random: draws from the (now seeded) Math.random
init_crypto_random();
// set fixed pixel ratio
init_fixed_ratio();
// Date: shifted so that "now" is wbinfo.wombat_sec
init_date_override(wbinfo.wombat_sec);
// disable notifications
init_disable_notifications();
}
return {};
};
window._WBWombat = _WBWombat;

View File

@ -1,9 +1,9 @@
<!-- WB Insert -->
<script>
{% set urlsplit = cdx.url | urlsplit %}
wbinfo = {}
wbinfo = {};
wbinfo.top_url = "{{ top_url }}";
{% if is_framed == 'true' %}
{% if is_framed %}
// Fast Top-Frame Redirect
if (window == window.top && wbinfo.top_url) {
var loc = window.location.href.replace(window.location.hash, "");
@ -19,15 +19,23 @@
wbinfo.request_ts = "{{ wb_url.timestamp }}";
wbinfo.prefix = decodeURI("{{ wb_prefix }}");
wbinfo.mod = "{{ replay_mod }}";
wbinfo.is_framed = {{ is_framed }};
wbinfo.is_live = {{ is_live }};
wbinfo.is_framed = {{ is_framed | tobool }};
wbinfo.is_live = {{ is_live | tobool }};
wbinfo.coll = "{{ coll }}";
wbinfo.proxy_magic = "{{ env.pywb_proxy_magic }}";
wbinfo.static_prefix = "{{ static_prefix }}/";
{% if env.pywb_proxy_magic %}
wbinfo.use_auto_fetch_worker = {{ config.proxy.use_auto_fetch_worker | tobool }};
wbinfo.use_wombat = {{ config.proxy.use_wombat | tobool }} || wbinfo.use_auto_fetch_worker;
{% endif %}
</script>
{% if not wb_url.is_banner_only %}
<script src='{{ static_prefix }}/wombat.js'> </script>
{% if env.pywb_proxy_magic %}
{% set whichWombat = 'wombatProxyMode.js' %}
{% else %}
{% set whichWombat = 'wombat.js' %}
{% endif %}
{% if not wb_url.is_banner_only or (env.pywb_proxy_magic and (config.proxy.use_auto_fetch_worker or config.proxy.use_wombat)) %}
<script src='{{ static_prefix }}/{{ whichWombat }}'> </script>
<script>
wbinfo.wombat_ts = "{{ wombat_ts }}";
wbinfo.wombat_sec = "{{ wombat_sec }}";

View File

@ -113,6 +113,7 @@ setup(
'urllib3',
'werkzeug',
'httpbin==0.5.0',
'ujson'
],
cmdclass={'test': PyTest},
test_suite='',

View File

@ -312,7 +312,7 @@ class TestManagedColls(CollsDirMixin, BaseConfigTest):
assert resp.status_int == 200
assert resp.content_type == 'text/html'
assert 'overriden search page: ' in resp.text
assert '"some": "value"' in resp.text
assert '"some":"value"' in resp.text
def test_more_custom_templates_replay(self, fmod):
resp = self.get('/test/20140103030321{0}/http://example.com/?example=1', fmod)

View File

@ -19,7 +19,9 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
exp = {'ca_file_cache': os.path.join('proxy-certs', 'pywb-ca.pem'),
'ca_name': 'pywb HTTPS Proxy CA',
'coll': 'test',
'recording': False}
'recording': False,
'use_wombat': False,
'use_auto_fetch_worker': False}
assert res.extra_config['proxy'] == exp
def test_proxy_cli_rec(self):

View File

@ -66,8 +66,9 @@ class TestProxy(BaseTestProxy):
# wb insert
assert 'WB Insert' in res.text
# no wombat.js
# no wombat.js and wombatProxyMode.js
assert 'wombat.js' not in res.text
assert 'wombatProxyMode.js' not in res.text
# no redirect check
assert 'window == window.top' not in res.text
@ -85,8 +86,9 @@ class TestProxy(BaseTestProxy):
assert 'WB Insert' in res.text
assert 'Example Domain' in res.text
# no wombat.js
# no wombat.js and wombatProxyMode.js
assert 'wombat.js' not in res.text
assert 'wombatProxyMode.js' not in res.text
# banner
assert 'default_banner.js' in res.text
@ -167,8 +169,9 @@ class TestProxyNoBanner(BaseTestProxy):
# no banner
assert 'default_banner.js' not in res.text
# no wombat.js
# no wombat.js and wombatProxyMode.js
assert 'wombat.js' not in res.text
assert 'wombatProxyMode.js' not in res.text
# no redirect check
assert 'window == window.top' not in res.text
@ -197,8 +200,9 @@ class TestProxyNoHeadInsert(BaseTestProxy):
# no banner
assert 'default_banner.js' not in res.text
# no wombat.js
# no wombat.js and wombatProxyMode.js
assert 'wombat.js' not in res.text
assert 'wombatProxyMode.js' not in res.text
# no redirect check
assert 'window == window.top' not in res.text
@ -207,3 +211,138 @@ class TestProxyNoHeadInsert(BaseTestProxy):
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
# ============================================================================
class TestProxyIncludeBothWombatAutoFetchWorker(BaseTestProxy):
    """Proxy mode with both ``use_wombat`` and ``use_auto_fetch_worker`` enabled."""

    @classmethod
    def setup_class(cls):
        proxy_opts = {'use_wombat': True, 'use_auto_fetch_worker': True}
        super(TestProxyIncludeBothWombatAutoFetchWorker, cls).setup_class(
            extra_opts=proxy_opts
        )

    def test_include_both_wombat_auto_fetch_worker(self, scheme):
        """The head insert loads wombatProxyMode.js and turns both flags on."""
        url = '{0}://example.com/'.format(scheme)
        res = requests.get(url, proxies=self.proxies, verify=self.root_ca_file)
        page = res.text

        # content
        assert 'Example Domain' in page

        # yes head insert
        assert 'WB Insert' in page

        # no wombat.js, yes wombatProxyMode.js
        assert 'wombat.js' not in page
        assert 'wombatProxyMode.js' in page

        # both flags are rendered as enabled
        assert 'wbinfo.use_wombat = true || wbinfo.use_auto_fetch_worker;' in page
        assert 'wbinfo.use_auto_fetch_worker = true;' in page
# ============================================================================
class TestProxyIncludeWombatNotAutoFetchWorker(BaseTestProxy):
    """Proxy mode with ``use_wombat`` on and ``use_auto_fetch_worker`` off."""

    @classmethod
    def setup_class(cls):
        # the proxy option is 'use_auto_fetch_worker'; the former key
        # 'use_auto_fetch' is not read by the head insert template, so the
        # option was only "off" by accident
        super(TestProxyIncludeWombatNotAutoFetchWorker, cls).setup_class(
            extra_opts={'use_wombat': True, 'use_auto_fetch_worker': False}
        )

    def test_include_wombat_not_auto_fetch_worker(self, scheme):
        """wombatProxyMode.js is loaded while the auto-fetch worker stays off."""
        res = requests.get('{0}://example.com/'.format(scheme),
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        # content
        assert 'Example Domain' in res.text

        # yes head insert
        assert 'WB Insert' in res.text

        # no wombat.js, yes wombatProxyMode.js
        assert 'wombat.js' not in res.text
        assert 'wombatProxyMode.js' in res.text

        assert 'wbinfo.use_wombat = true || wbinfo.use_auto_fetch_worker;' in res.text
        assert 'wbinfo.use_auto_fetch_worker = false;' in res.text
# ============================================================================
class TestProxyIncludeAutoFetchWorkerNotWombat(BaseTestProxy):
    """Proxy mode with ``use_auto_fetch_worker`` on and ``use_wombat`` off."""

    @classmethod
    def setup_class(cls):
        # the proxy option is 'use_auto_fetch_worker'; with the former key
        # 'use_auto_fetch' the worker was never enabled and this test only
        # exercised the "nothing enabled" case
        super(TestProxyIncludeAutoFetchWorkerNotWombat, cls).setup_class(
            extra_opts={'use_wombat': False, 'use_auto_fetch_worker': True}
        )

    def test_include_auto_fetch_worker_not_wombat(self, scheme):
        """Enabling only the auto-fetch worker still loads wombatProxyMode.js."""
        res = requests.get('{0}://example.com/'.format(scheme),
                           proxies=self.proxies,
                           verify=self.root_ca_file)

        # content
        assert 'Example Domain' in res.text

        # yes head insert
        assert 'WB Insert' in res.text

        # no wombat.js, yes wombatProxyMode.js:
        # the auto fetch worker requires wombat, so the head insert includes
        # the proxy mode wombat whenever the worker is enabled
        assert 'wombat.js' not in res.text
        assert 'wombatProxyMode.js' in res.text

        assert 'wbinfo.use_auto_fetch_worker = true;' in res.text
        assert 'wbinfo.use_wombat = false || wbinfo.use_auto_fetch_worker;' in res.text
# ============================================================================
class TestProxyAutoFetchWorkerEndPoints(BaseTestProxy):
    """Proxy mode endpoints used by the auto-fetch worker.

    Covers the ``/proxy-fetch/<url>`` endpoint and the static route serving
    ``autoFetchWorkerProxyMode.js``, including their CORS headers.
    """

    @classmethod
    def setup_class(cls):
        # the proxy option is 'use_auto_fetch_worker', not 'use_auto_fetch'
        super(TestProxyAutoFetchWorkerEndPoints, cls).setup_class(
            extra_opts={'use_wombat': True, 'use_auto_fetch_worker': True}
        )

    def test_proxy_fetch_options_request(self, scheme):
        """OPTIONS on proxy-fetch echoes the request Origin."""
        expected_origin = '{0}://example.com'.format(scheme)
        res = requests.options('{0}://pywb.proxy/proxy-fetch/{1}'.format(scheme, expected_origin),
                               headers=dict(Origin=expected_origin),
                               proxies=self.proxies, verify=self.root_ca_file)

        assert res.ok
        assert res.headers.get('Access-Control-Allow-Origin') == expected_origin

    def test_proxy_fetch(self, scheme):
        """GET on proxy-fetch returns the page, with or without an Origin header."""
        expected_origin = '{0}://example.com'.format(scheme)
        fetch_url = '{0}://pywb.proxy/proxy-fetch/{1}'.format(scheme, expected_origin)

        res = requests.get(fetch_url,
                           headers=dict(Origin=expected_origin),
                           proxies=self.proxies, verify=self.root_ca_file)

        assert res.ok
        assert 'Example Domain' in res.text

        res = requests.get(fetch_url,
                           proxies=self.proxies, verify=self.root_ca_file)

        assert res.ok
        assert 'Example Domain' in res.text

    def test_proxy_worker_options_request(self, scheme):
        """OPTIONS on the worker script echoes the request Origin."""
        expected_origin = '{0}://example.com'.format(scheme)
        res = requests.options('{0}://pywb.proxy/static/autoFetchWorkerProxyMode.js'.format(scheme),
                               headers=dict(Origin=expected_origin),
                               proxies=self.proxies, verify=self.root_ca_file)

        assert res.ok
        assert res.headers.get('Access-Control-Allow-Origin') == expected_origin

    def test_proxy_worker_fetch(self, scheme):
        """GET on the worker script returns the JS with matching CORS headers."""
        origin = '{0}://example.com'.format(scheme)
        url = '{0}://pywb.proxy/static/autoFetchWorkerProxyMode.js'.format(scheme)

        # with an Origin header the origin is echoed back
        res = requests.get(url,
                           headers=dict(Origin=origin),
                           proxies=self.proxies, verify=self.root_ca_file)

        assert res.ok
        assert res.headers.get('Content-Type') == 'application/javascript'
        assert res.headers.get('Access-Control-Allow-Origin') == origin
        assert 'AutoFetcher.prototype.safeResolve' in res.text

        # without an Origin header any origin is allowed
        res = requests.get(url, proxies=self.proxies, verify=self.root_ca_file)

        assert res.ok
        assert res.headers.get('Content-Type') == 'application/javascript'
        assert res.headers.get('Access-Control-Allow-Origin') == '*'
        assert 'AutoFetcher.prototype.safeResolve' in res.text