diff --git a/CHANGES.rst b/CHANGES.rst index 7d7e6d7c..19993af1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,13 +1,26 @@ +pywb 0.7.6 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* new not found Jinja2 template: Add per-collection-overridable ``not_found.html`` template, specified via ``not_found_html`` option. For missing resources, the ``not_found_html`` template is now used instead of the generic ``error_html``. + +* client-side rewrite: improved wombat rewrite of postMessage events, unrewrite target on receive, improved Vine replay + +* packaging: allow adding multiple packages for Jinja2 template resolving + pywb 0.7.5 changelist ~~~~~~~~~~~~~~~~~~~~~ * Cross platform fixes to support Windows -- all tests pass on Linux, OS X and Windows now. Improved cross-platform support includes: + - read all files as binary to avoid line ending issues - - properly convert url <-> file - - avoid platform dependent apis + - properly convert between platform dependent file paths and urls + - add .gitattributes to ensure line endings on *.warc*, *.arc*, *.cdx* files are unaltered + - avoid platform dependent apis (e.g. %s for strftime) * Change any unhandled exceptions to result in a 500 error, instead of 400. +* Setup: switch to ``zip_safe=True`` to allow for embedding pywb egg in one-file app with `pyinstaller <http://www.pyinstaller.org/>`_ + * More compresensive client side ``src`` attribute rewriting (via wombat.js), additional server-side HTML tag rewriting. @@ -39,6 +52,7 @@ pywb 0.7.1 changelist * Use `youtube-dl `_ to find actual video streams from page urls, record video info. * New, improved wombat 2.1 -- improved rewriting of dynamic content, including: + - setAttribute override - Date override sets date to replay timestamp - Image() object override diff --git a/README.rst b/README.rst index 062dda5a..d0ec127d 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -PyWb 0.7.5 +PyWb 0.7.6 ========== .. 
image:: https://travis-ci.org/ikreymer/pywb.png?branch=master diff --git a/pywb/rules.yaml b/pywb/rules.yaml index cb87a843..325bbd9d 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -126,7 +126,15 @@ rules: - videoFileId - signature + # vine + - url_prefix: 'co,vine,cdn,' + rewrite: + js_rewrite_location: urls + js_regexs: + - match: 'window.location' + replace: 'WB_wombat_location' + # youtube rules #================================================================= diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index dc9b2205..f0582b43 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb */ //============================================ -// Wombat JS-Rewriting Library v2.1 +// Wombat JS-Rewriting Library v2.2 //============================================ _WBWombat = (function() { @@ -68,6 +68,16 @@ _WBWombat = (function() { return undefined; } + //============================================ + function equals_any(string, arr) { + for (var i = 0; i < arr.length; i++) { + if (string === arr[i]) { + return arr[i]; + } + } + return undefined; + } + //============================================ function ends_with(str, suffix) { if (str.indexOf(suffix, str.length - suffix.length) !== -1) { @@ -107,6 +117,8 @@ _WBWombat = (function() { var SRC_TAGS = ["IMG", "SCRIPT", "VIDEO", "AUDIO", "SOURCE", "EMBED", "INPUT"]; + var REWRITE_ATTRS = ["src", "href", "poster"]; + //============================================ function rewrite_url_(url) { // If undefined, just return it @@ -158,12 +170,19 @@ _WBWombat = (function() { return wb_replay_date_prefix + wb_orig_host + url; } - // If full url starting with http://, add prefix - + // If full url starting with http://, https:// or // + // add rewrite prefix var prefix = starts_with(url, VALID_PREFIXES); if (prefix) { + // if already rewriting url, must still check scheme if (starts_with(url, prefix + window.location.host + 
'/')) { + var curr_scheme = window.location.protocol + '//'; + + // replace scheme to ensure using the correct server scheme + if (starts_with(url, wb_orig_scheme) && (wb_orig_scheme != curr_scheme)) { + url = curr_scheme + url.substring(wb_orig_scheme.length); + } return url; } return wb_replay_date_prefix + url; @@ -254,7 +273,11 @@ _WBWombat = (function() { }); return true; } catch (e) { - console.log(e); + var info = "Can't redefine prop " + prop; + if (obj && obj.tagName) { + info += " on " + obj.tagName; + } + console.log(info); obj[prop] = value; return false; } @@ -480,13 +503,6 @@ _WBWombat = (function() { async = true; } - // extra check for correct scheme here.. maybe move to rewrite_url.. - var curr_scheme = window.location.protocol + '//'; - - if (starts_with(url, wb_orig_scheme) && (wb_orig_scheme != curr_scheme)) { - url = curr_scheme + url.substring(wb_orig_scheme.length); - } - return orig.call(this, method, url, async, user, password); } @@ -507,7 +523,7 @@ _WBWombat = (function() { Element.prototype.setAttribute = function(name, value) { if (name) { var lowername = name.toLowerCase(); - if (lowername == "src" || lowername == "href") { + if (equals_any(lowername, REWRITE_ATTRS)) { if (!this._no_rewrite) { value = rewrite_url(value); } @@ -518,6 +534,21 @@ _WBWombat = (function() { }; } + //============================================ + function init_createElementNS_fix() + { + if (!document.createElementNS) { + return; + } + + document._orig_createElementNS = document.createElementNS; + document.createElementNS = function(namespaceURI, qualifiedName) + { + namespaceURI = extract_orig(namespaceURI); + return document._orig_createElementNS(namespaceURI, qualifiedName); + } + } + //============================================ function init_image_override() { window.__Image = window.Image; @@ -612,8 +643,6 @@ _WBWombat = (function() { //============================================ function rewrite_style(value) { - //console.log("style rewrite: " + 
value); - STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/g; function style_replacer(match, n1, n2, n3, offset, string) { @@ -694,7 +723,7 @@ _WBWombat = (function() { } override_attr(created, "src"); - } else if (created.tagName && starts_with(created.tagName, SRC_TAGS)) { + } else if (created.tagName && equals_any(created.tagName, SRC_TAGS)) { override_attr(created, "src"); } @@ -710,22 +739,28 @@ _WBWombat = (function() { //============================================ function init_postmessage_override() { - if (!Window.prototype.postMessage) { + if (!window.postMessage) { return; } - var orig = Window.prototype.postMessage; + var orig = window.postMessage; var postmessage_rewritten = function(message, targetOrigin, transfer) { + message = {"origin": targetOrigin, "message": message}; + if (targetOrigin && targetOrigin != "*") { targetOrigin = window.location.origin; } + return orig.call(this, message, targetOrigin, transfer); } window.postMessage = postmessage_rewritten; - window.Window.prototype.postMessage = postmessage_rewritten; + + if (Window.prototype.postMessage) { + window.Window.prototype.postMessage = postmessage_rewritten; + } for (var i = 0; i < window.frames.length; i++) { try { @@ -734,6 +769,30 @@ _WBWombat = (function() { console.log(e); } } + + + window._orig_addEventListener = window.addEventListener; + + window.addEventListener = function(type, listener, useCapture) { + if (type == "message") { + var orig_listener = listener; + listener = function(event) { + + var ne = new MessageEvent("message", + {"bubbles": event.bubbles, + "cancelable": event.cancelable, + "data": event.data.message, + "origin": event.data.origin, + "lastEventId": event.lastEventId, + "source": event.source, + "ports": event.ports}); + + return orig_listener(ne); + } + } + + return window._orig_addEventListener(type, listener, useCapture); + } } //============================================ @@ -882,6 +941,7 @@ _WBWombat = (function() { 
document.WB_wombat_domain = orig_host; document.WB_wombat_referrer = extract_orig(document.referrer); + // History copy_history_func(window.history, 'pushState'); copy_history_func(window.history, 'replaceState'); @@ -902,6 +962,9 @@ _WBWombat = (function() { // setAttribute init_setAttribute_override(); + // ensure namespace urls are NOT rewritten + init_createElementNS_fix(); + // Image init_image_override(); diff --git a/pywb/ui/error.html b/pywb/ui/error.html index b122fc38..b3a8c478 100644 --- a/pywb/ui/error.html +++ b/pywb/ui/error.html @@ -9,10 +9,3 @@

{% endif %} - -{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %} -

-Try Different Collection -

-{% endif %} - diff --git a/pywb/ui/not_found.html b/pywb/ui/not_found.html new file mode 100644 index 00000000..39faa3b3 --- /dev/null +++ b/pywb/ui/not_found.html @@ -0,0 +1,10 @@ +

Url Not Found

+ +The url {{ url }} could not be found in this collection. + +{% if env.pywb_proxy_magic and url %} +

+Try Different Collection +

+{% endif %} + diff --git a/pywb/ui/query.html b/pywb/ui/query.html index 2d1f5c86..3e54534b 100644 --- a/pywb/ui/query.html +++ b/pywb/ui/query.html @@ -24,7 +24,8 @@ function ts_to_date(ts, is_gmt) -

pywb Sample Calendar Results

+

pywb Query Results

+ {% if cdx_lines | length > 0 %} {{ cdx_lines | length }} captures of {{ url }} @@ -47,5 +48,8 @@ function ts_to_date(ts, is_gmt)

* Unique captures are bold. Other captures are duplicates of a previous capture.

+ {% else %} + No captures found for {{ url }} + {% endif %} diff --git a/pywb/webapp/handlers.py b/pywb/webapp/handlers.py index 54ef92e4..ed5a5af4 100644 --- a/pywb/webapp/handlers.py +++ b/pywb/webapp/handlers.py @@ -49,12 +49,9 @@ class SearchPageWbUrlHandler(WbUrlHandler): self.banner_html = None def render_search_page(self, wbrequest, **kwargs): - if self.search_view: - return self.search_view.render_response(wbrequest=wbrequest, - prefix=wbrequest.wb_prefix, - **kwargs) - else: - return WbResponse.text_response('No Lookup Url Specified') + return self.search_view.render_response(wbrequest=wbrequest, + prefix=wbrequest.wb_prefix, + **kwargs) def __call__(self, wbrequest): # root search page @@ -110,6 +107,9 @@ class WBHandler(SearchPageWbUrlHandler): super(WBHandler, self).__init__(config) self.index_reader = query_handler + self.not_found_view = (J2TemplateView. + create_template(config.get('not_found_html'), + 'Not Found Error')) cookie_maker = config.get('cookie_maker') record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) @@ -152,12 +152,19 @@ class WBHandler(SearchPageWbUrlHandler): cdx_callback) def handle_not_found(self, wbrequest, nfe): - if (not self.fallback_handler or - wbrequest.wb_url.is_query() or - wbrequest.wb_url.is_identity): - raise + # check fallback: only for replay queries and not for identity + if (self.fallback_handler and + not wbrequest.wb_url.is_query() and + not wbrequest.wb_url.is_identity): + return self.fallback_handler(wbrequest) - return self.fallback_handler(wbrequest) + # if capture query, just return capture page + if wbrequest.wb_url.is_query(): + return self.index_reader.make_cdx_response(wbrequest, [], 'html') + else: + return self.not_found_view.render_response(status='404 Not Found', + env=wbrequest.env, + url=wbrequest.wb_url.url) def __str__(self): return 'Web Archive Replay Handler' diff --git a/pywb/webapp/pywb_init.py b/pywb/webapp/pywb_init.py index bcd329b3..d31a91c7 100644 --- 
a/pywb/webapp/pywb_init.py +++ b/pywb/webapp/pywb_init.py @@ -34,6 +34,7 @@ DEFAULTS = { 'search_html': 'ui/search.html', 'home_html': 'ui/index.html', 'error_html': 'ui/error.html', + 'not_found_html': 'ui/not_found.html', 'proxy_select_html': 'ui/proxy_select.html', 'proxy_cert_download_html': 'ui/proxy_cert_download.html', diff --git a/pywb/webapp/views.py b/pywb/webapp/views.py index 23c528e2..9f2dd1e7 100644 --- a/pywb/webapp/views.py +++ b/pywb/webapp/views.py @@ -61,7 +61,7 @@ def is_wb_handler(obj): #================================================================= class J2TemplateView(object): env_globals = {'static_path': 'static/default', - 'package': 'pywb'} + 'packages': ['pywb']} def __init__(self, filename): template_dir, template_file = path.split(filename) @@ -84,8 +84,11 @@ class J2TemplateView(object): # add relative and absolute path loaders for banner support loaders.append(FileSystemLoader('.')) loaders.append(FileSystemLoader('/')) - loaders.append(PackageLoader(self.env_globals['package'], - template_dir)) + + # add loaders for all specified packages + for package in self.env_globals['packages']: + loaders.append(PackageLoader(package, + template_dir)) return loaders def render_to_string(self, **kwargs): diff --git a/setup.py b/setup.py index e53c340e..2efb448b 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ class PyTest(TestCommand): setup( name='pywb', - version='0.7.5', + version='0.7.6', url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com', diff --git a/tests/test_config.yaml b/tests/test_config.yaml index ad010789..1d034671 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -86,6 +86,10 @@ home_html: ui/index.html # if omitted, a text response is returned error_html: ui/error.html + +# template for 404 not found error, may be customized per collection +not_found_html: ui/not_found.html + # ==== Other Paths ==== # Rewrite urls with absolute paths instead of relative diff 
--git a/tests/test_integration.py b/tests/test_integration.py index 3322613b..17161ae3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -77,6 +77,13 @@ class TestWb: # 17 Captures + header assert len(resp.html.find_all('tr')) == 18 + def test_calendar_not_found(self): + # query with no results + resp = self.testapp.get('/pywb/*/http://not-exist.example.com') + self._assert_basic_html(resp) + assert 'No captures found' in resp.body, resp.body + assert len(resp.html.find_all('tr')) == 0 + def test_cdx_query(self): resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/') self._assert_basic_text(resp) @@ -374,6 +381,11 @@ class TestWb: assert resp.status_int == 403 assert 'Excluded' in resp.body + def test_replay_not_found(self): + resp = self.testapp.head('/pywb/http://not-exist.example.com', status=404) + assert resp.content_type == 'text/html' + assert resp.status_int == 404 + def test_static_content(self): resp = self.testapp.get('/static/test/route/wb.css') assert resp.status_int == 200