diff --git a/config.yaml b/config.yaml index 94b02788..89e01314 100644 --- a/config.yaml +++ b/config.yaml @@ -3,6 +3,7 @@ # # Settings for each collection + collections: # : # collection will be accessed via / diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 9ff4a2db..f7094c61 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -57,7 +57,9 @@ class BaseCli(object): def load(self): if self.r.live: - self.extra_config = {'collections': {'live': '$live'}} + self.extra_config = {'collections': + {'live': {'index': '$live', + 'use_js_obj_proxy': True}}} def run(self): self.run_gevent() @@ -80,6 +82,7 @@ class ReplayCli(BaseCli): def load(self): super(ReplayCli, self).load() + import os if self.r.directory: #pragma: no cover os.chdir(self.r.directory) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index d2edf55c..f8ef6dff 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -88,13 +88,25 @@ class FrontEndApp(object): except: self.raise_not_found(environ, 'Static File Not Found: {0}'.format(filepath)) + def get_metadata(self, coll): + metadata = {'coll': coll} + + if coll in self.warcserver.list_fixed_routes(): + metadata.update(self.warcserver.get_coll_config(coll)) + metadata['type'] = 'replay-fixed' + else: + metadata.update(self.metadata_cache.load(coll)) + metadata['type'] = 'replay-dyn' + + return metadata + def serve_coll_page(self, environ, coll): if not self.is_valid_coll(coll): self.raise_not_found(environ, 'No handler for "/{0}"'.format(coll)) self.setup_paths(environ, coll) - metadata = self.metadata_cache.load(coll) + metadata = self.get_metadata(coll) view = BaseInsertView(self.rewriterapp.jinja_env, 'search.html') @@ -115,15 +127,10 @@ class FrontEndApp(object): if environ.get('QUERY_STRING'): wb_url_str += '?' + environ.get('QUERY_STRING') - kwargs = {'coll': coll} - - if coll in self.warcserver.list_fixed_routes(): - kwargs['type'] = 'replay-fixed' - else: - kwargs['type'] = 'replay-dyn' + metadata = self.get_metadata(coll) try: - response = self.rewriterapp.render_content(wb_url_str, kwargs, environ) + response = self.rewriterapp.render_content(wb_url_str, metadata, environ) except UpstreamException as ue: response = self.rewriterapp.handle_error(environ, ue) raise HTTPException(response=response) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 5ae3160b..1162b046 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -3,10 +3,7 @@ import requests from werkzeug.http import HTTP_STATUS_CODES from six.moves.urllib.parse import urlencode, urlsplit, urlunsplit -#from pywb.rewrite.rewrite_amf import RewriteAMFMixin -#from pywb.rewrite.rewrite_dash import RewriteDASHMixin -#from pywb.rewrite.rewrite_content import RewriteContent -from pywb.rewrite.default_rewriter import DefaultRewriter +from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter, SchemeOnlyUrlRewriter @@ -66,10 +63,8 @@ class RewriterApp(object): self.frame_mod = None self.replay_mod = '' - #frame_type = 'inverse' if framed_replay else False - - #self.content_rewriter = Rewriter(is_framed_replay=frame_type) - self.content_rw = DefaultRewriter(replay_mod=self.replay_mod) + self.default_rw = DefaultRewriter(replay_mod=self.replay_mod) + self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod) if not jinja_env: jinja_env = JinjaEnv(globals={'static_path': 'static'}) @@ -148,8 +143,12 @@ class RewriterApp(object): urlkey = canonicalize(wb_url.url) - inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, - self.content_rw) + if kwargs.get('use_js_obj_proxy'): + content_rw = self.js_proxy_rw + else: + content_rw = self.default_rw + + inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw) inputreq.include_post_query(wb_url.url) @@ -267,15 +266,8 @@ class RewriterApp(object): cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, cookie_key) - #result = self.content_rewriter.rewrite_content(urlrewriter, - # record.http_headers, - # record.raw_stream, - # head_insert_func, - # urlkey, - # cdx, - # cookie_rewriter, - # environ) - result = self.content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx) + urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT') + result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx) status_headers, gen, is_rw = result diff --git a/pywb/apps/static_handler.py b/pywb/apps/static_handler.py index 26fdf976..2ade7aec 100644 --- a/pywb/apps/static_handler.py +++ b/pywb/apps/static_handler.py @@ -4,6 +4,7 @@ import os from pywb.utils.loaders import LocalFileLoader from pywb.apps.wbrequestresponse import WbResponse +from pywb.utils.wbexception import NotFoundException #================================================================= diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index d2ea744f..451c639f 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -37,7 +37,10 @@ directory structure expected by pywb COLL_RX = re.compile('^[\w][-\w]*$') - def __init__(self, coll_name, colls_dir='collections', must_exist=True): + COLLS_DIR = 'collections' + + def __init__(self, coll_name, colls_dir=None, must_exist=True): + colls_dir = colls_dir or self.COLLS_DIR self.default_config = load_yaml_config(DEFAULT_CONFIG) if coll_name and not self.COLL_RX.match(coll_name): diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 55149c41..50646daf 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -20,14 +20,16 @@ class BaseContentRewriter(object): def __init__(self, rules_file, replay_mod=''): self.rules = [] + self.all_rewriters = [] self.load_rules(rules_file) self.replay_mod = replay_mod - #for rw in self.known_rewriters: - # self.all_rewriters[rw.name] = rw def add_rewriter(self, rw): self.all_rewriters[rw.name] = rw + def get_rewriter(self, rw_type, rwinfo=None): + return self.all_rewriters.get(rw_type) + def load_rules(self, filename): config = load_yaml_config(filename) for rule in config.get('rules'): @@ -68,7 +70,7 @@ class BaseContentRewriter(object): text_type = 'js-proxy' rw_type = rule.get(text_type, text_type) - rw_class = self.all_rewriters.get(rw_type) + rw_class = self.get_rewriter(rw_type, rwinfo) return rw_type, rw_class @@ -98,7 +100,7 @@ class BaseContentRewriter(object): # if no js rewriter, then do banner insert only if not js_rewriter: - rw_class = self.all_rewriters.get('html-banner-only') + rw_class = self.get_rewriter('html-banner-only', rwinfo) rw = rw_class(rwinfo.url_rewriter, js_rewriter=js_rewriter, @@ -146,7 +148,7 @@ class BaseContentRewriter(object): return charset def rewrite_headers(self, rwinfo): - header_rw_class = self.all_rewriters.get('header') + header_rw_class = self.get_rewriter('header', rwinfo) return header_rw_class(rwinfo)() def __call__(self, record, url_rewriter, cookie_rewriter, @@ -268,7 +270,7 @@ class RewriteInfo(object): self.url_rewriter = url_rewriter if not cookie_rewriter: - cookie_rw_class = content_rewriter.all_rewriters.get('cookie') + cookie_rw_class = content_rewriter.get_rewriter('cookie', self) if cookie_rw_class: cookie_rewriter = cookie_rw_class(url_rewriter) @@ -328,20 +330,20 @@ class RewriteInfo(object): return buff def should_rw_content(self): + if not self.text_type: + return False + if self.url_rewriter.wburl.mod == 'id_': return False - if self.text_type == 'html': - if self.url_rewriter.rewrite_opts.get('is_ajax'): + if self.url_rewriter.rewrite_opts.get('is_ajax'): + if self.text_type in ('html', 'js'): return False - elif self.text_type == 'plain': + if self.text_type == 'plain': if self.url_rewriter.wburl.mod not in ('js_', 'cs_'): return False - elif not self.text_type: - return False - elif self.text_type == 'css' or self.text_type == 'xml': if self.url_rewriter.wburl.mod == 'bn_': return False diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py index 6cd323f0..fc7bdeee 100644 --- a/pywb/rewrite/default_rewriter.py +++ b/pywb/rewrite/default_rewriter.py @@ -4,8 +4,7 @@ from pywb.rewrite.html_rewriter import HTMLRewriter from pywb.rewrite.html_insert_rewriter import HTMLInsertOnlyRewriter from pywb.rewrite.regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter -from pywb.rewrite.regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter -from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter +from pywb.rewrite.regex_rewriters import JSLocationOnlyRewriter, JSNoneRewriter, JSWombatProxyRewriter from pywb.rewrite.header_rewriter import PrefixHeaderRewriter from pywb.rewrite.cookie_rewriter import HostScopeCookieRewriter @@ -16,10 +15,13 @@ from pywb.rewrite.rewrite_dash import RewriteDASH from pywb.rewrite.rewrite_hls import RewriteHLS from pywb.rewrite.rewrite_amf import RewriteAMF +import copy +from werkzeug.useragents import UserAgent + # ============================================================================ class DefaultRewriter(BaseContentRewriter): - all_rewriters = { + DEFAULT_REWRITERS = { 'header': PrefixHeaderRewriter, 'cookie': HostScopeCookieRewriter, @@ -68,10 +70,10 @@ class DefaultRewriter(BaseContentRewriter): # AMF 'application/x-amf': 'amf', - # XML - 'text/xml': 'xml', - 'application/xml': 'xml', - 'application/rss+xml': 'xml', + # XML -- don't rewrite xml + #'text/xml': 'xml', + #'application/xml': 'xml', + #'application/rss+xml': 'xml', # PLAIN 'text/plain': 'plain', @@ -80,9 +82,48 @@ class DefaultRewriter(BaseContentRewriter): def __init__(self, rules_file=None, replay_mod=''): rules_file = rules_file or 'pkg://pywb/rules.yaml' super(DefaultRewriter, self).__init__(rules_file, replay_mod) + self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS) def init_js_regex(self, regexs): return RegexRewriter.parse_rules_from_config(regexs) def get_rewrite_types(self): return self.rewrite_types + + +# ============================================================================ +class RewriterWithJSProxy(DefaultRewriter): + def __init__(self, *args, **kwargs): + super(RewriterWithJSProxy, self).__init__(*args, **kwargs) + + def get_rewriter(self, rw_type, rwinfo=None): + if rw_type == 'js' and rwinfo: + # check if UA allows this + if self.ua_allows_obj_proxy(rwinfo.url_rewriter.rewrite_opts): + return JSWombatProxyRewriter + + # otherwise, return default rewriter + return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo) + + def ua_allows_obj_proxy(self, opts): + ua = opts.get('ua') + if not ua: + ua_string = opts.get('ua_string') + if ua_string: + ua = UserAgent(ua_string) + + if ua is None: + return True + + supported = { + 'chrome': '49.0', + 'firefox': '44.0', + 'safari': '10.0', + 'opera': '36.0', + 'edge': '12.0', + 'msie': None, + } + + min_vers = supported.get(ua.browser) + + return (min_vers and ua.version >= min_vers) diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index c182d14b..9f1fa100 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -1,9 +1,8 @@ import re - from pywb.rewrite.content_rewriter import StreamingRewriter -#================================================================= +# ================================================================= def load_function(string): import importlib @@ -12,10 +11,10 @@ def load_function(string): return getattr(mod, string[1]) -#================================================================= +# ================================================================= class RegexRewriter(StreamingRewriter): - #@staticmethod - #def comment_out(string): + # @staticmethod + # def comment_out(string): # return '/*' + string + '*/' @staticmethod @@ -34,17 +33,17 @@ class RegexRewriter(StreamingRewriter): def archival_rewrite(rewriter): return lambda string: rewriter.rewrite(string) - #@staticmethod - #def replacer(other): + # @staticmethod + # def replacer(other): # return lambda m, string: other HTTPX_MATCH_STR = r'https?:\\?/\\?/[A-Za-z0-9:_@.-]+' - #DEFAULT_OP = add_prefix + # DEFAULT_OP = add_prefix def __init__(self, rewriter, rules): super(RegexRewriter, self).__init__(rewriter) - #rules = self.create_rules(http_prefix) + # rules = self.create_rules(http_prefix) # Build regexstr, concatenating regex list regex_str = '|'.join(['(' + rx + ')' for rx, op, count in rules]) @@ -79,7 +78,7 @@ class RegexRewriter(StreamingRewriter): return m.group(0) # Custom func - #if not hasattr(op, '__call__'): + # if not hasattr(op, '__call__'): # op = RegexRewriter.DEFAULT_OP(op) result = op(m.group(i)) @@ -109,19 +108,20 @@ class RegexRewriter(StreamingRewriter): return result return list(map(parse_rule, config)) + return run_parse_rules -#================================================================= +# ================================================================= class JSLinkRewriterMixin(object): """ JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ - #JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])' - #JS_HTTPX = r'(?<=["\';])(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.\-/\\?&#]+(?=["\';&\\])' + # JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])' + # JS_HTTPX = r'(?<=["\';])(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.\-/\\?&#]+(?=["\';&\\])' - #JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-][^"\s\';&\\]*(?=["\';&\\])' + # JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-][^"\s\';&\\]*(?=["\';&\\])' JS_HTTPX = r'(?:(?<=["\';])https?:|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@%.\\-]+/' def __init__(self, rewriter, rules=[]): @@ -131,7 +131,7 @@ class JSLinkRewriterMixin(object): super(JSLinkRewriterMixin, self).__init__(rewriter, rules) -#================================================================= +# ================================================================= class JSLocationRewriterMixin(object): """ JS Rewriter mixin which rewrites location and domain to the @@ -140,46 +140,96 @@ class JSLocationRewriterMixin(object): def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ - (r'(? 0) { - var new_style = rewrite_style(style); - if (new_style != style) { - r.target.style.cssText = new_style; - } - } - } - } - }); - - m.observe($wbwindow.document.documentElement, { - childList: false, - attributes: true, - subtree: true, - //attributeOldValue: true, - attributeFilter: ["style"]}); - } -*/ - //============================================ -/* function init_href_src_obs($wbwindow) - { - if (!$wbwindow.MutationObserver) { - return; - } - - var m = new MutationObserver(function(records, observer) - { - for (var i = 0; i < records.length; i++) { - var r = records[i]; - if (r.type == "attributes") { - //var curr = wb_getAttribute(r.target, r.attributeName); - var curr = r.target.getAttribute(r.attributeName); - var new_url = rewrite_url(curr); - if (curr != new_url) { - wb_setAttribute.call(r.target, r.attributeName, new_url); - } - } - } - }); - - m.observe($wbwindow.document.documentElement, { - childList: false, - attributes: true, - subtree: true, - //attributeOldValue: true, - attributeFilter: ["src", "href"]}); - - } - - - //============================================ - function init_iframe_insert_obs(root) - { - if (!$wbwindow.MutationObserver) { - return; - } - - var m = new MutationObserver(function(records, observer) - { - for (var i = 0; i < records.length; i++) { - var r = records[i]; - if (r.type == "childList") { - for (var j = 0; j < r.addedNodes.length; j++) { - if (r.addedNodes[j].tagName == "IFRAME") { - init_iframe_wombat(r.addedNodes[j]); - } - } - } - } - }); - - m.observe(root, { - childList: true, - subtree: true, - }); - } -*/ //============================================ function rewrite_attr(elem, name, abs_url_only) { if (!elem || !elem.getAttribute) { @@ -1515,6 +1445,27 @@ var _WBWombat = function($wbwindow, wbinfo) { return orig_setter; } + //============================================ + function rewrite_inline_style(orig) { + var decoded; + + try { + decoded = decodeURIComponent(orig); + } catch (e) { + decoded = orig; + } + + if (decoded != orig) { + val = rewrite_style(decoded); + var parts = val.split(",", 2); + val = parts[0] + "," + encodeURIComponent(parts[1]); + } else { + val = rewrite_style(orig); + } + + return val; + } + //============================================ function override_attr(obj, attr, mod, default_to_setget) { var orig_getter = get_orig_getter(obj, attr); @@ -1524,21 +1475,7 @@ var _WBWombat = function($wbwindow, wbinfo) { var val; if (mod == "cs_" && orig.indexOf("data:text/css") == 0) { - var decoded; - - try { - decoded = decodeURIComponent(orig); - } catch (e) { - decoded = orig; - } - - if (decoded != orig) { - val = rewrite_style(decoded); - var parts = val.split(",", 2); - val = parts[0] + "," + encodeURIComponent(parts[1]); - } else { - val = rewrite_style(orig); - } + val = rewrite_inline_style(orig); } else { val = rewrite_url(orig, false, mod); } @@ -1598,7 +1535,7 @@ var _WBWombat = function($wbwindow, wbinfo) { //============================================ - function init_attr_overrides($wbwindow) { + function init_attr_overrides() { override_attr($wbwindow.HTMLLinkElement.prototype, "href", "cs_"); override_attr($wbwindow.CSSStyleSheet.prototype, "href", "cs_"); override_attr($wbwindow.HTMLImageElement.prototype, "src", "im_"); @@ -1631,6 +1568,8 @@ var _WBWombat = function($wbwindow, wbinfo) { override_style_attr(style_proto, "background", "background"); override_style_attr(style_proto, "backgroundImage", "background-image"); + override_style_attr(style_proto, "cursor", "cursor"); + override_style_attr(style_proto, "listStyle", "list-style"); override_style_attr(style_proto, "listStyleImage", "list-style-image"); @@ -1730,7 +1669,9 @@ var _WBWombat = function($wbwindow, wbinfo) { var getter = function() { init_iframe_wombat(this); - return orig_getter.call(this); + var res = orig_getter.call(this); + res = (res && res._WB_wombat_obj_proxy) || res; + return res; }; def_prop(obj, prop, orig_setter, getter); @@ -1882,6 +1823,9 @@ var _WBWombat = function($wbwindow, wbinfo) { replace_dom_func("appendChild"); replace_dom_func("insertBefore"); replace_dom_func("replaceChild"); + + override_prop_to_proxy($wbwindow.Node.prototype, "ownerDocument"); + override_prop_to_proxy($wbwindow.HTMLHtmlElement.prototype, "parentNode"); } @@ -1911,7 +1855,9 @@ var _WBWombat = function($wbwindow, wbinfo) { function receive_hash_change(event) { - if (!event.data || event.source != $wbwindow.__WB_top_frame) { + var source = event.source.__WBProxyRealObj__ || event.source; + + if (!event.data || source != $wbwindow.__WB_top_frame) { return; } @@ -2026,6 +1972,8 @@ var _WBWombat = function($wbwindow, wbinfo) { source = win.__WB_win_id[event.data.src_id]; } + source = source.__WBProxyRealObj__ || source; + ne = new MessageEvent("message", {"bubbles": event.bubbles, "cancelable": event.cancelable, @@ -2073,7 +2021,7 @@ var _WBWombat = function($wbwindow, wbinfo) { return _orig_addEventListener.call(this, type, listener, useCapture); } } - + $wbwindow.addEventListener = addEventListener_rewritten; // REMOVE @@ -2123,9 +2071,23 @@ var _WBWombat = function($wbwindow, wbinfo) { addMEOverride("eventPhase"); addMEOverride("path"); + override_prop_to_proxy($wbwindow.MessageEvent.prototype, "source"); + $wbwindow.MessageEvent.prototype.__extended = true; } + //============================================ + function init_mo_from_proxy() { + var orig_observe = $wbwindow.MutationObserver.prototype.observe; + + function observe_deproxy(target, options) { + target = target && target.__WBProxyRealObj__ || target; + return orig_observe.call(this, target, options); + } + + $wbwindow.MutationObserver.prototype.observe = observe_deproxy; + } + //============================================ function init_open_override() { @@ -2140,7 +2102,7 @@ var _WBWombat = function($wbwindow, wbinfo) { var res = orig.call(this, strUrl, strWindowName, strWindowFeatures); init_new_window_wombat(res, strUrl); return res; - } + }; $wbwindow.open = open_rewritten; @@ -2158,7 +2120,7 @@ var _WBWombat = function($wbwindow, wbinfo) { } //============================================ - function init_cookies_override($wbwindow) + function init_cookies_override() { var cookie_path_regex = /\bPath=\'?\"?([^;'"\s]+)/i; var cookie_domain_regex = /\bDomain=([^;'"\s]+)/i; @@ -2342,6 +2304,7 @@ var _WBWombat = function($wbwindow, wbinfo) { init_new_window_wombat(win, src); } + //============================================ function init_new_window_wombat(win, src) { if (!win || win._wb_wombat) { return; @@ -2366,76 +2329,46 @@ var _WBWombat = function($wbwindow, wbinfo) { //============================================ - function init_doc_overrides($wbwindow) { + function init_doc_overrides($document) { if (!Object.defineProperty) { return; } - if ($wbwindow.document._wb_override) { - return; - } + // referrer + override_prop_extract($document, "referrer"); - var orig_referrer = extract_orig($wbwindow.document.referrer); - - var domain_info; - - if ($wbwindow.wbinfo) { - domain_info = $wbwindow.wbinfo; - } else { - domain_info = wbinfo; - } - - domain_info.domain = domain_info.wombat_host; + // origin + def_prop($document, "origin", undefined, function() { return this._WB_wombat_location.origin; }); + // domain var domain_setter = function(val) { - if (ends_with(domain_info.wombat_host, val)) { - domain_info.domain = val; + if (ends_with(this._WB_wombat_location.hostname, val)) { + this.__wb_domain = val; } } var domain_getter = function() { - return domain_info.domain; + return this.__wb_domain || this._WB_wombat_location.hostname; } - // changing domain disallowed, but set as no-op to avoid errors - def_prop($wbwindow.document, "domain", domain_setter, domain_getter); + def_prop($document, "domain", domain_setter, domain_getter); - def_prop($wbwindow.document, "referrer", undefined, function() { return orig_referrer; }); - - - // Cookies - init_cookies_override($wbwindow); - - // Init mutation observer (for style only) - //init_mutation_obs($wbwindow); - - // override href and src attrs - init_attr_overrides($wbwindow); - - - init_form_overrides($wbwindow); - - - // Attr observers - //if (!wb_opts.skip_attr_observers) { - // init_href_src_obs($wbwindow); - //} - - $wbwindow.document._wb_override = true; + // override form action + init_form_overrides($document); } //============================================ // Necessary since HTMLFormElement.prototype.action is not consistently // overridable - function init_form_overrides($wbwindow) { + function init_form_overrides($document) { var do_init_forms = function() { - for (var i = 0; i < $wbwindow.document.forms.length; i++) { - var new_action = rewrite_url($wbwindow.document.forms[i].action); - if (new_action != $wbwindow.document.forms[i].action) { - $wbwindow.document.forms[i].action = new_action; + for (var i = 0; i < $document.forms.length; i++) { + var new_action = rewrite_url($document.forms[i].action); + if (new_action != $document.forms[i].action) { + $document.forms[i].action = new_action; } - override_attr($wbwindow.document.forms[i], "action", "", true); + override_attr($document.forms[i], "action", "", true); } } @@ -2546,6 +2479,185 @@ var _WBWombat = function($wbwindow, wbinfo) { init_bad_prefixes(wb_replay_prefix); } + + //============================================ + // New Proxy Obj Override Functions + // Original Concept by John Berlin (https://github.com/N0taN3rd) + //============================================ + function getAllOwnProps(obj) { + var ownProps = []; + + var props = Object.getOwnPropertyNames(obj); + + for (var i = 0; i < props.length; i++) { + var prop = props[i]; + + try { + if (obj[prop] && !obj[prop].prototype) { + ownProps.push(prop); + } + } catch (e) {} + } + + obj = Object.getPrototypeOf(obj); + + while (obj) { + props = Object.getOwnPropertyNames(obj); + for (var i = 0; i < props.length; i++) { + ownProps.push(props[i]); + } + obj = Object.getPrototypeOf(obj); + } + + return ownProps; + } + + //============================================ + function default_proxy_get(obj, prop, ownProps) { + if (prop == '__WBProxyRealObj__') { + return obj; + } else if (prop == 'location') { + return obj._WB_wombat_location; + } else if (prop == "_WB_wombat_obj_proxy") { + return obj._WB_wombat_obj_proxy; + } + + var retVal = obj[prop]; + + var type = (typeof retVal); + + if (type === "function" && ownProps.indexOf(prop) != -1) { + return retVal.bind(obj); + } else if (type === "object" && retVal && retVal._WB_wombat_obj_proxy) { + return retVal._WB_wombat_obj_proxy; + } + + return retVal; + } + + //============================================ + function init_window_obj_proxy($wbwindow) { + if (!$wbwindow.Proxy) { + return undefined; + } + + var ownProps = getAllOwnProps($wbwindow); + + $wbwindow._WB_wombat_obj_proxy = new $wbwindow.Proxy({}, { + get: function(target, prop) { + if (prop == 'top') { + return $wbwindow.WB_wombat_top._WB_wombat_obj_proxy; + } + + return default_proxy_get($wbwindow, prop, ownProps); + }, + + set: function(target, prop, value) { + if (prop === 'location') { + $wbwindow.WB_wombat_location = value; + return true; + } else if (prop === 'postMessage' || prop === 'document') { + return true; + } else { + try { + if (!Reflect.set(target, prop, value)) { + return false; + } + } catch(e) {} + + return Reflect.set($wbwindow, prop, value); + } + }, + has: function(target, prop) { + return prop in $wbwindow; + }, + ownKeys: function(target) { + return Object.getOwnPropertyNames($wbwindow).concat(Object.getOwnPropertySymbols($wbwindow)); + }, + getOwnPropertyDescriptor: function(target, key) { + // console.log(key); + // hack for some JS libraries that do a for in + // since we are proxying an empty object need to add configurable = true + // Proxies know we are an empty object and if window says not configurable + // throws an error + var descriptor = Object.getOwnPropertyDescriptor($wbwindow, key); + if (descriptor && !descriptor.configurable) { + descriptor.configurable = true; + } + return descriptor; + }, + getPrototypeOf: function(target) { + return Object.getPrototypeOf($wbwindow); + }, + setPrototypeOf: function(target, newProto) { + return false; + }, + isExtensible: function(target) { + return Object.isExtensible($wbwindow); + }, + preventExtensions: function(target) { + Object.preventExtensions($wbwindow); + return true; + }, + deleteProperty: function(target, prop) { + var propDescriptor = Object.getOwnPropertyDescriptor($wbwindow, prop); + if (propDescriptor === undefined) { + return true; + } + if (propDescriptor.configurable === false) { + return false; + } + delete $wbwindow[prop]; + return true; + }, + defineProperty: function(target, prop, desc) { + desc = desc || {}; + if (!desc.value && !desc.get) { + desc.value = $wbwindow[prop]; + } + + var res = Reflect.defineProperty($wbwindow, prop, desc); + + return Reflect.defineProperty(target, prop, desc); + } + }); + + return $wbwindow._WB_wombat_obj_proxy; + } + + //============================================ + function init_document_obj_proxy($document) { + init_doc_overrides($document); + + if (!$wbwindow.Proxy) { + return undefined; + } + + var ownProps = getAllOwnProps($document); + + $document._WB_wombat_obj_proxy = new $wbwindow.Proxy($document, { + get: function(target, prop) { + return default_proxy_get($document, prop, ownProps); + }, + + set: function(target, prop, value) { + if (prop === 'location') { + $document.WB_wombat_location = value; + return true; + } else { + target[prop] = value; + return true; + } + }, + }); + + return $document._WB_wombat_obj_proxy; + } + + // End Proxy Obj Override System + + + //============================================ function wombat_init(wbinfo) { init_paths(wbinfo); @@ -2572,8 +2684,6 @@ var _WBWombat = function($wbwindow, wbinfo) { //$wbwindow.document.WB_wombat_domain = wbinfo.wombat_host; //$wbwindow.document.WB_wombat_referrer = extract_orig($wbwindow.document.referrer); - init_doc_overrides($wbwindow, wb_opts); - // History override_history_func("pushState"); override_history_func("replaceState"); @@ -2638,6 +2748,12 @@ var _WBWombat = function($wbwindow, wbinfo) { init_getAttribute_override(); } + // override href and src attrs + init_attr_overrides(); + + // Cookies + init_cookies_override(); + // createElement attr override if (!wb_opts.skip_createElement) { init_createElement_override(); @@ -2680,6 +2796,12 @@ var _WBWombat = function($wbwindow, wbinfo) { // disable notifications init_disable_notifications(); + // add window and document obj proxies, if available + init_window_obj_proxy($wbwindow); + init_document_obj_proxy($wbwindow.document); + + init_mo_from_proxy(); + // expose functions var obj = {} obj.extract_orig = extract_orig; @@ -2687,6 +2809,14 @@ var _WBWombat = function($wbwindow, wbinfo) { obj.watch_elem = watch_elem; obj.init_new_window_wombat = init_new_window_wombat; obj.init_paths = init_paths; + obj.local_init = function(name) { + var res = $wbwindow._WB_wombat_obj_proxy[name]; + if (name === "document" && res && !res._WB_wombat_obj_proxy) { + return init_document_obj_proxy(res) || res; + } + return res; + } + return obj; } @@ -2842,3 +2972,4 @@ var _WBWombat = function($wbwindow, wbinfo) { }; window._WBWombat = _WBWombat; + diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 28386c9e..c4a0dcce 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -108,6 +108,16 @@ class WarcServer(BaseWarcServer): def list_fixed_routes(self): return list(self.fixed_routes.keys()) + def get_coll_config(self, name): + colls = self.config.get('collections', None) + if not colls: + return {} + + res = colls.get(name, {}) + if not isinstance(res, dict): + res = {'index': res} + return res + def list_dynamic_routes(self): if not self.root_dir: return [] diff --git a/requirements.txt b/requirements.txt index db2e6230..c32c542d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ six -warcio>=1.3.4 +warcio>=1.4.0 chardet requests redis diff --git a/tests/config_test.yaml b/tests/config_test.yaml index a52fac1d..2fb8f04b 100644 --- a/tests/config_test.yaml +++ b/tests/config_test.yaml @@ -2,9 +2,15 @@ debug: true +collections_root: _test_colls + collections: pywb: ./sample_archive/cdx/ + with-js-proxy: + index: ./sample_archive/cdx/ + use_js_obj_proxy: true + # live collection live: $live diff --git a/tests/test_auto_colls.py b/tests/test_auto_colls.py index 8708c1c2..3354d864 100644 --- a/tests/test_auto_colls.py +++ b/tests/test_auto_colls.py @@ -19,7 +19,7 @@ from mock import patch from pywb import get_test_dir from pywb.warcserver.test.testutils import TempDirTests, BaseTestClass -from pywb.manager.manager import main +from pywb.manager.manager import main, CollectionsManager import pywb.manager.autoindex @@ -32,6 +32,9 @@ from pywb.apps.frontendapp import FrontEndApp #============================================================================= ARCHIVE_DIR = 'archive' INDEX_DIR = 'indexes' +COLLECTIONS = '_test_colls' + +CollectionsManager.COLLS_DIR = COLLECTIONS INDEX_FILE = 'index.cdxj' AUTOINDEX_FILE = 'autoindex.cdxj' @@ -76,7 +79,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): with raises(SystemExit): wayback(['-a', '-p', '0']) - colls = os.path.join(self.root_dir, 'collections') + colls = os.path.join(self.root_dir, COLLECTIONS) os.mkdir(colls) pywb.manager.autoindex.keep_running = False @@ -87,7 +90,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): """ main(['init', 'test']) - colls = os.path.join(self.root_dir, 'collections') + colls = os.path.join(self.root_dir, COLLECTIONS) assert os.path.isdir(colls) test = os.path.join(colls, 'test') @@ -128,7 +131,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): main(['add', 'test', warc1, warc2]) # Spurrious file in collections - with open(os.path.join(self.root_dir, 'collections', 'blah'), 'w+b') as fh: + with open(os.path.join(self.root_dir, COLLECTIONS, 'blah'), 'w+b') as fh: fh.write(b'foo\n') with raises(IOError): @@ -147,7 +150,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): main(['init', 'nested']) - nested_root = os.path.join(self.root_dir, 'collections', 'nested', ARCHIVE_DIR) + nested_root = os.path.join(self.root_dir, COLLECTIONS, 'nested', ARCHIVE_DIR) nested_a = os.path.join(nested_root, 'A') nested_b = os.path.join(nested_root, 'B', 'sub') @@ -166,7 +169,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): os.path.join(nested_b, 'example.warc.gz') ]) - nested_cdx = os.path.join(self.root_dir, 'collections', 'nested', INDEX_DIR, INDEX_FILE) + nested_cdx = os.path.join(self.root_dir, COLLECTIONS, 'nested', INDEX_DIR, INDEX_FILE) with open(nested_cdx) as fh: nested_cdx_index = fh.read() @@ -190,7 +193,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): to ensure equality of indexes """ # ensure merged index is same as full reindex - coll_dir = os.path.join(self.root_dir, 'collections', 'test', INDEX_DIR) + coll_dir = os.path.join(self.root_dir, COLLECTIONS, 'test', INDEX_DIR) orig = os.path.join(coll_dir, INDEX_FILE) bak = os.path.join(coll_dir, 'index.bak') @@ -210,7 +213,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): def test_add_static(self): """ Test adding static file to collection, check access """ - a_static = os.path.join(self.root_dir, 'collections', 'test', 'static', 'abc.js') + a_static = os.path.join(self.root_dir, COLLECTIONS, 'test', 'static', 'abc.js') with open(a_static, 'w+b') as fh: fh.write(b'/* Some JS File */') @@ -281,7 +284,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): def test_custom_template_search(self): """ Test manually added custom search template search.html """ - a_static = os.path.join(self.root_dir, 'collections', 'test', 'templates', 'search.html') + a_static = os.path.join(self.root_dir, COLLECTIONS, 'test', 'templates', 'search.html') with open(a_static, 'w+b') as fh: fh.write(b'pywb custom search page') @@ -299,7 +302,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): Template is relative to collection-specific dir Add custom metadata and test its presence in custom search page """ - custom_search = os.path.join(self.root_dir, 'collections', 'test', + custom_search = os.path.join(self.root_dir, COLLECTIONS, 'test', 'templates', 'search.html') # add metadata @@ -314,7 +317,8 @@ class TestManagedColls(TempDirTests, BaseTestClass): resp.charset = 'utf-8' assert resp.status_int == 200 assert resp.content_type == 'text/html' - assert 'overriden search page: {"some": "value"}' in resp.text + assert 'overriden search page: ' in resp.text + assert '"some": "value"' in resp.text resp = self.testapp.get('/test/20140103030321/http://example.com?example=1') assert resp.status_int == 200 @@ -328,7 +332,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): # Add collection template main(['template', 'foo', '--add', 'query_html']) - assert os.path.isfile(os.path.join(self.root_dir, 'collections', 'foo', 'templates', 'query.html')) + assert os.path.isfile(os.path.join(self.root_dir, COLLECTIONS, 'foo', 'templates', 'query.html')) # overwrite -- force main(['template', 'foo', '--add', 'query_html', '-f']) @@ -389,7 +393,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): def test_no_templates(self): """ Test removing templates dir, using default template again """ - shutil.rmtree(os.path.join(self.root_dir, 'collections', 'foo', 'templates')) + shutil.rmtree(os.path.join(self.root_dir, COLLECTIONS, 'foo', 'templates')) self._create_app() @@ -462,7 +466,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): def test_auto_index(self): main(['init', 'auto']) - auto_dir = os.path.join(self.root_dir, 'collections', 'auto') + auto_dir = os.path.join(self.root_dir, COLLECTIONS, 'auto') archive_dir = os.path.join(auto_dir, ARCHIVE_DIR) archive_sub_dir = os.path.join(archive_dir, 'sub') @@ -545,7 +549,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): def test_err_wrong_warcs(self): warc1 = self._get_sample_warc('example.warc.gz') - invalid_warc = os.path.join(self.root_dir, 'collections', 'test', ARCHIVE_DIR, 'invalid.warc.gz') + invalid_warc = os.path.join(self.root_dir, COLLECTIONS, 'test', ARCHIVE_DIR, 'invalid.warc.gz') # Empty warc list, argparse calls exit with raises(SystemExit): @@ -572,7 +576,7 @@ class TestManagedColls(TempDirTests, BaseTestClass): """ Test various errors with missing warcs dir, missing cdx dir, non dir cdx file, and missing collections root """ - colls = os.path.join(self.root_dir, 'collections') + colls = os.path.join(self.root_dir, COLLECTIONS) # No Statics -- ignorable shutil.rmtree(os.path.join(colls, 'foo', 'static')) diff --git a/tests/test_integration.py b/tests/test_integration.py index 3342daf7..10214419 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -254,11 +254,25 @@ class TestWbIntegration(BaseConfigTest): assert resp.content_length == 0 assert resp.content_type == 'application/x-javascript' - #def test_redirect_exact(self): - # resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/') - # assert resp.status_int == 302 + def test_replay_js_obj_proxy(self, fmod): + # test js proxy obj with jquery -- no user agent + resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod) - # assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org') + assert resp.status_int == 200 + assert resp.content_length != 0 + assert resp.content_type == 'application/x-javascript' + + # test with Chrome user agent + resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, + headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) + assert 'let window = _____WB$wombat$assign$function_____(' in resp.text + + def test_replay_js_ie11_no_obj_proxy(self, fmod): + # IE11 user-agent, no proxy + resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, + headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}) + + assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text def test_replay_non_exact(self, fmod): # non-exact mode, don't redirect to exact capture @@ -448,7 +462,7 @@ class TestWbIntegration(BaseConfigTest): resp = self.testapp.get('/collinfo.json') assert resp.content_type == 'application/json' value = resp.json - assert len(value['fixed']) == 4 + assert len(value['fixed']) == 5 assert len(value['dynamic']) == 0 #def test_invalid_config(self):