From fb85570974f1cad5a039443ed4040f1ebc28a9d1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 28 Oct 2014 10:36:48 -0700 Subject: [PATCH 1/3] video: add video rewriting use vidrw client side and youtube-dl on the server add vi_ modifier: -on record, gets video_info from youtube-dl, sends to proxy, if any, via PUTMETA to create metadata record -on playback, fetches special metadata record with video info and returns to client as json -vidrw script: fetches video info, if any, and attempts to replace iframe and embed tags (so far) which are videos wombat: export extract_url function, fix spaces and use object instance semantics --- pywb/rewrite/url_rewriter.py | 15 ++- pywb/rules.yaml | 17 ++- pywb/static/vidrw.js | 110 ++++++++++++++++ pywb/static/wombat.js | 197 ++++++++++++++-------------- pywb/ui/head_insert.html | 3 +- pywb/webapp/live_rewrite_handler.py | 42 +++++- pywb/webapp/query_handler.py | 10 ++ tests/test_integration.py | 4 +- 8 files changed, 291 insertions(+), 107 deletions(-) create mode 100644 pywb/static/vidrw.js diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index 61a48e50..a162f67e 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -138,11 +138,7 @@ class HttpsUrlRewriter(UrlRewriter): HTTPS = 'https://' def rewrite(self, url, mod=None): - if url.startswith(self.HTTPS): - result = self.HTTP + url[len(self.HTTPS):] - return result - else: - return url + return self.remove_https(url) def get_new_url(self, **kwargs): return kwargs.get('url') @@ -155,3 +151,12 @@ class HttpsUrlRewriter(UrlRewriter): def deprefix_url(self): return self.wburl.url + + @staticmethod + def remove_https(url): + rw = HttpsUrlRewriter + if url.startswith(rw.HTTPS): + result = rw.HTTP + url[len(rw.HTTPS):] + return result + else: + return url diff --git a/pywb/rules.yaml b/pywb/rules.yaml index b601b2df..e5d79bbe 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -114,7 +114,22 @@ rules: - ownerId - videoFileId - 
signature - + + + # youtube rules + #================================================================= + + - url_prefix: 'com,youtube)/get_video_info' + + fuzzy_lookup: + - video_id + - html5 + + + - url_prefix: 'com,googlevideo,' + + fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(range=[^&]+)' + # testing rules -- not for valid domain #================================================================= diff --git a/pywb/static/vidrw.js b/pywb/static/vidrw.js new file mode 100644 index 00000000..14746481 --- /dev/null +++ b/pywb/static/vidrw.js @@ -0,0 +1,110 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. + +This file is part of pywb, https://github.com/ikreymer/pywb + + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with pywb. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +// VidRw 1.0 -- video rewriting + +__wbvidrw = (function() { + + var already_checked = false; + + function check_videos() { + if (already_checked) { + return; + } + + var iframes = document.getElementsByTagName("iframe"); + + for (var i = 0; i < iframes.length; i++) { + already_checked = true; + check_replacement(iframes[i], iframes[i].getAttribute("src")); + } + + var embeds = document.getElementsByTagName("embed"); + + for (var i = 0; i < embeds.length; i++) { + already_checked = true; + check_replacement(embeds[i], embeds[i].getAttribute("src")); + } + } + + function check_replacement(elem, src) { + if (!src) { + return; + } + + src = _wb_wombat.extract_orig(src); + + var xhr = new XMLHttpRequest(); + xhr._no_rewrite = true; + xhr.open('GET', wbinfo.prefix + 'vi_/' + src, true); + xhr.onload = function() { + if (xhr.status == 200) { + do_replace_video(elem, JSON.parse(xhr.responseText)); + } + }; + xhr.send(); + } + + function do_replace_video(elem, video_info) { + // TODO: select based on size? 
+ var video_url = video_info.url; + video_url = wbinfo.prefix + video_url; + + console.log("REPLACING: " + video_url); + var width = elem.getAttribute("width"); + var height = elem.getAttribute("height"); + + console.log(video_info.ext); + + // Try HTML5 Video + var htmlvideo = document.createElement("video"); + + htmlvideo.setAttribute("src", video_url); + htmlvideo.setAttribute("width", width); + htmlvideo.setAttribute("height", height); + htmlvideo.setAttribute("controls", "1"); + htmlvideo.style.backgroundColor = "#000"; + + if (video_info.thumbnail) { + var thumbnail = wbinfo.prefix + video_info.thumbnail; + htmlvideo.setAttribute("thumbnail", thumbnail); + } + + htmlvideo.addEventListener("error", function() { + console.log("html5 video error"); + }); + + htmlvideo.addEventListener("loadstart", function() { + console.log("html5 video success"); + }); + + console.log(elem.tagName); + + if (elem.tagName.toLowerCase() == "iframe") { + elem.parentNode.replaceChild(htmlvideo, elem); + } else if (elem.tagName.toLowerCase() == "embed") { + if (elem.parentNode && elem.parentElement.tagName.toLowerCase() == "object") { + elem = elem.parentNode; + } + elem.parentNode.replaceChild(htmlvideo, elem); + } + } + + document.addEventListener("DOMContentLoaded", check_videos); +})(); diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 0e6d8327..4ad51067 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -20,7 +20,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb //============================================ // Wombat JS-Rewriting Library v2.0 //============================================ -WB_wombat_init = (function() { +var _WBWombat = (function() { // Globals var wb_replay_prefix; @@ -64,7 +64,7 @@ WB_wombat_init = (function() { } else if (string.indexOf(arr_or_prefix) == 0) { return arr_or_prefix; } - + return undefined; } @@ -89,31 +89,31 @@ WB_wombat_init = (function() { } return rewritten; } - + 
//============================================ var HTTP_PREFIX = "http://"; var HTTPS_PREFIX = "https://"; var REL_PREFIX = "//"; - + var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX]; var IGNORE_PREFIXES = ["#", "about:", "data:", "mailto:", "javascript:"]; - + var BAD_PREFIXES; - + function init_bad_prefixes(prefix) { BAD_PREFIXES = ["http:" + prefix, "https:" + prefix, "http:/" + prefix, "https:/" + prefix]; } - + //============================================ function rewrite_url_(url) { // If undefined, just return it if (!url) { return url; } - + var urltype_ = (typeof url); - + // If object, use toString if (urltype_ == "object") { url = url.toString(); @@ -129,7 +129,7 @@ WB_wombat_init = (function() { return url; } } - + // just in case wombat reference made it into url! url = url.replace("WB_wombat_", ""); @@ -166,10 +166,10 @@ WB_wombat_init = (function() { } return wb_replay_date_prefix + url; } - + // Check for common bad prefixes and remove them prefix = starts_with(url, BAD_PREFIXES); - + if (prefix) { url = extract_orig(url); return wb_replay_date_prefix + url; @@ -189,16 +189,16 @@ WB_wombat_init = (function() { if (!href) { return ""; } - + // proxy mode: no extraction needed if (!wb_replay_prefix) { return href; } - + href = href.toString(); var index = href.indexOf("/http", 1); - + // extract original url from wburl if (index > 0) { href = href.substr(index + 1); @@ -207,12 +207,12 @@ WB_wombat_init = (function() { if (index >= 0) { href = href.substr(index + wb_replay_prefix.length); } - if ((href.length > 4) && - (href.charAt(2) == "_") && + if ((href.length > 4) && + (href.charAt(2) == "_") && (href.charAt(3) == "/")) { href = href.substr(4); } - + if (!starts_with(href, "http")) { href = HTTP_PREFIX + href; } @@ -225,18 +225,18 @@ WB_wombat_init = (function() { return href; } - + //============================================ // Define custom property function def_prop(obj, prop, value, set_func, get_func) { var key = "_" + prop; 
obj[key] = value; - + try { Object.defineProperty(obj, prop, { configurable: false, enumerable: true, - set: function(newval) { + set: function(newval) { var result = set_func.call(obj, newval); if (result != undefined) { obj[key] = result; @@ -256,12 +256,12 @@ WB_wombat_init = (function() { obj[prop] = value; return false; } - } - + } + //============================================ //Define WombatLocation - - function WombatLocation(loc) { + + function WombatLocation(loc) { this._orig_loc = loc; this._orig_href = loc.href; @@ -273,53 +273,53 @@ WB_wombat_init = (function() { return this._orig_loc.assign(rewrite_url(url)); } this.reload = loc.reload; - + // Adapted from: // https://gist.github.com/jlong/2428561 var parser = document.createElement('a'); var href = extract_orig(this._orig_href); parser.href = href; - + this._autooverride = false; - + var _set_hash = function(hash) { this._orig_loc.hash = hash; return this._orig_loc.hash; } - + var _get_hash = function() { return this._orig_loc.hash; } - + var _get_url_with_hash = function(url) { return url + this._orig_loc.hash; } - + href = parser.href; var hash = parser.hash; - + if (hash) { var hidx = href.lastIndexOf("#"); if (hidx > 0) { href = href.substring(0, hidx); } } - + if (Object.defineProperty) { var res1 = def_prop(this, "href", href, this.assign, _get_url_with_hash); - + var res2 = def_prop(this, "hash", parser.hash, _set_hash, _get_hash); - + this._autooverride = res1 && res2; } else { this.href = href; this.hash = parser.hash; } - + this.host = parser.host; this.hostname = parser.hostname; @@ -335,17 +335,17 @@ WB_wombat_init = (function() { this.toString = function() { return this.href; } - + // Copy any remaining properties for (prop in loc) { if (this.hasOwnProperty(prop)) { continue; } - + if ((typeof loc[prop]) != "function") { this[prop] = loc[prop]; } - } + } } //============================================ @@ -460,7 +460,7 @@ WB_wombat_init = (function() { 
//============================================ function init_ajax_rewrite() { - if (!window.XMLHttpRequest || + if (!window.XMLHttpRequest || !window.XMLHttpRequest.prototype || !window.XMLHttpRequest.prototype.open) { return; @@ -469,7 +469,9 @@ WB_wombat_init = (function() { var orig = window.XMLHttpRequest.prototype.open; function open_rewritten(method, url, async, user, password) { - url = rewrite_url(url); + if (!this._no_rewrite) { + url = rewrite_url(url); + } // defaults to true if (async != false) { @@ -534,7 +536,7 @@ WB_wombat_init = (function() { rewrite_attr(elem, "src", rewrite_url); rewrite_attr(elem, "href", rewrite_url); rewrite_attr(elem, "style", rewrite_style); - + if (elem && elem.getAttribute && elem.getAttribute("crossorigin")) { elem.removeAttribute("crossorigin"); } @@ -545,7 +547,7 @@ WB_wombat_init = (function() { if (!Node || !Node.prototype) { return; } - + function override_attr(obj, attr) { var setter = function(orig) { var val = rewrite_url(orig); @@ -553,15 +555,15 @@ WB_wombat_init = (function() { this.setAttribute(attr, val); return val; } - + var getter = function(val) { var res = this.getAttribute(attr); return res; } - + var curr_src = obj.getAttribute(attr); - - def_prop(obj, attr, curr_src, setter, getter); + + def_prop(obj, attr, curr_src, setter, getter); } function replace_dom_func(funcname) { @@ -569,7 +571,7 @@ WB_wombat_init = (function() { Node.prototype[funcname] = function() { var child = arguments[0]; - + rewrite_elem(child); var desc; @@ -587,19 +589,19 @@ WB_wombat_init = (function() { } var created = orig.apply(this, arguments); - - if (created.tagName == "IFRAME") { + + if (created.tagName == "IFRAME") { if (created.contentWindow) { created.contentWindow.window.WB_wombat_location = created.contentWindow.window.location; } - + override_attr(created, "src"); } - + // } else if (created.tagName == "A") { // override_attr(created, "href"); // } - + return created; } } @@ -608,29 +610,29 @@ WB_wombat_init = 
(function() { replace_dom_func("insertBefore"); replace_dom_func("replaceChild"); } - + var postmessage_rewritten; - + //============================================ function init_postmessage_override() - { + { if (!Window.prototype.postMessage) { return; } - + var orig = Window.prototype.postMessage; - + postmessage_rewritten = function(message, targetOrigin, transfer) { if (targetOrigin && targetOrigin != "*") { targetOrigin = window.location.origin; } - + return orig.call(this, message, targetOrigin, transfer); } - + window.postMessage = postmessage_rewritten; window.Window.prototype.postMessage = postmessage_rewritten; - + for (var i = 0; i < window.frames.length; i++) { try { window.frames[i].postMessage = postmessage_rewritten; @@ -639,24 +641,24 @@ WB_wombat_init = (function() { } } } - + //============================================ function init_open_override() - { + { if (!Window.prototype.open) { return; } - + var orig = Window.prototype.open; - + var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) { strUrl = rewrite_url(strUrl); return orig.call(this, strUrl, strWindowName, strWindowFeatures); } - + window.open = open_rewritten; window.Window.prototype.open = open_rewritten; - + for (var i = 0; i < window.frames.length; i++) { try { window.frames[i].open = open_rewritten; @@ -665,41 +667,41 @@ WB_wombat_init = (function() { } } } - + function init_cookies_override() { var cookie_path_regex = /\bPath=\'?\"?([^;'"\s]+)/i; - + var get_cookie = function() { return document.cookie; } - + var set_cookie = function(value) { var matched = value.match(cookie_path_regex); - + // if has cookie path, rewrite and replace if (matched) { var rewritten = rewrite_url(matched[1]); value = value.replace(matched[1], rewritten); } - + document.cookie = value; } - + def_prop(document, "WB_wombat_cookie", document.cookie, set_cookie, get_cookie); } - + //============================================ function init_write_override() { document.write = 
function(string) { var doc = new DOMParser().parseFromString(string, "text/html"); - + if (doc) { var children = doc.body.children; - + for (var i = 0; i < children.length; i++) { document.body.appendChild(children[i]); } @@ -710,52 +712,52 @@ WB_wombat_init = (function() { //============================================ function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp, mod) { wb_replay_prefix = replay_prefix; - + if (wb_replay_prefix) { wb_replay_date_prefix = replay_prefix + capture_date + mod + "/"; - + if (capture_date.length > 0) { wb_capture_date_part = "/" + capture_date + "/"; } else { wb_capture_date_part = ""; } - + wb_orig_scheme = orig_scheme + '://'; - + wb_orig_host = wb_orig_scheme + orig_host; - + init_bad_prefixes(replay_prefix); } // Location var wombat_location = new WombatLocation(window.self.location); - + if (wombat_location._autooverride) { - + var setter = function(val) { - if (typeof(val) == "string") { + if (typeof(val) == "string") { if (starts_with(val, "about:")) { return undefined; } this._WB_wombat_location.href = val; } } - + def_prop(window, "WB_wombat_location", wombat_location, setter); def_prop(document, "WB_wombat_location", wombat_location, setter); } else { window.WB_wombat_location = wombat_location; document.WB_wombat_location = wombat_location; - + // Check quickly after page load - setTimeout(check_all_locations, 500); - + setTimeout(check_all_locations, 500); + // Check periodically every few seconds setInterval(check_all_locations, 500); } - + var is_framed = (window.top.wbinfo && window.top.wbinfo.is_frame); - + function find_next_top(win) { while ((win.parent != win) && (win.parent != win.top)) { win = win.parent; @@ -766,9 +768,9 @@ WB_wombat_init = (function() { if (window.self.location != window.top.location) { if (is_framed) { window.top.WB_wombat_location = window.WB_wombat_location; - + window.WB_wombat_top = find_next_top(window.self); - + } else { window.top.WB_wombat_location 
= new WombatLocation(window.top.location); window.WB_wombat_top = window.top; @@ -788,20 +790,20 @@ WB_wombat_init = (function() { // History copy_history_func(window.history, 'pushState'); copy_history_func(window.history, 'replaceState'); - + // open init_open_override(); // postMessage init_postmessage_override(); - + // write init_write_override(); - + // Ajax init_ajax_rewrite(); init_worker_override(); - + // Cookies init_cookies_override(); @@ -810,6 +812,9 @@ WB_wombat_init = (function() { // Random init_seeded_random(timestamp); + + // expose functions + this.extract_orig = extract_orig; } return wombat_init; diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 4e53a5d0..36563167 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -3,7 +3,7 @@ + {% include banner_html ignore missing %} diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index abaf80c2..e38d10e9 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -4,12 +4,17 @@ from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.wburl import WbUrl +from pywb.rewrite.url_rewriter import HttpsUrlRewriter from handlers import StaticHandler, SearchPageWbUrlHandler from views import HeadInsertView from pywb.utils.wbexception import WbException +import json +import requests +from youtube_dl import YoutubeDL + #================================================================= class LiveResourceException(WbException): @@ -25,14 +30,16 @@ class RewriteHandler(SearchPageWbUrlHandler): def __init__(self, config): super(RewriteHandler, self).__init__(config) - default_proxy = config.get('proxyhostport') + self.default_proxy = config.get('proxyhostport') self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode, - default_proxy=default_proxy) + default_proxy=self.default_proxy) self.head_insert_view = 
HeadInsertView.init_from_config(config) self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE) + self.ydl = None + def handle_request(self, wbrequest): try: return self.render_content(wbrequest) @@ -50,6 +57,9 @@ class RewriteHandler(SearchPageWbUrlHandler): return {} def render_content(self, wbrequest): + if wbrequest.wb_url.mod == 'vi_': + return self.get_video_info(wbrequest) + head_insert_func = self.head_insert_view.create_insert_func(wbrequest) req_headers = self._live_request_headers(wbrequest) @@ -76,6 +86,34 @@ class RewriteHandler(SearchPageWbUrlHandler): return WbResponse(status_headers, gen) + + def get_video_info(self, wbrequest): + if not self.ydl: + self.ydl = YoutubeDL(dict(simulate=True, + youtube_include_dash_manifest=False)) + + self.ydl.add_default_info_extractors() + + info = self.ydl.extract_info(wbrequest.wb_url.url) + content_type = 'application/vnd.youtube-dl_formats+json' + metadata = json.dumps(info) + + if self.default_proxy: + proxies = {'http': self.default_proxy} + + headers = {'Content-Type': content_type} + + url = HttpsUrlRewriter.remove_https(wbrequest.wb_url.url) + + response = requests.request(method='PUTMETA', + url=url, + data=metadata, + headers=headers, + proxies=proxies, + verify=False) + + return WbResponse.text_response(metadata, content_type=content_type) + def __str__(self): return 'Live Web Rewrite Handler' diff --git a/pywb/webapp/query_handler.py b/pywb/webapp/query_handler.py index 6a404c17..052243d5 100644 --- a/pywb/webapp/query_handler.py +++ b/pywb/webapp/query_handler.py @@ -68,6 +68,14 @@ class QueryHandler(object): params['url'] = wb_url.url params['output'] = output + params['filter'].append('!mimetype:-') + + # get metadata + if wb_url.mod == 'vi_': + # matching metadata explicitly with special scheme + params['url'] = wb_url.url.replace('http:/', 'metadata:/') + params['filter'].append('~original:metadata://') + cdx_iter = self.load_cdx(wbrequest, params) return cdx_iter, output @@ -132,6 +140,7 @@ 
class QueryHandler(object): 'limit': limit, 'fl': ('urlkey,original,timestamp,' + 'endtimestamp,groupcount,uniqcount'), + 'filter':[], }, wburl.REPLAY: @@ -147,6 +156,7 @@ class QueryHandler(object): # Not appropriate as default # Should be an option to configure status code filtering in general # 'filter': ['statuscode:[23]..|-'], + 'filter': [], 'limit': '1', 'resolveRevisits': True, } diff --git a/tests/test_integration.py b/tests/test_integration.py index 3375329f..208ceb9c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -98,7 +98,7 @@ class TestWb: assert '"20140127171238"' in resp.body assert 'wb.js' in resp.body - assert 'WB_wombat_init' in resp.body + assert 'new _WBWombat' in resp.body, resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body def test_replay_non_frame_content(self): @@ -149,7 +149,7 @@ class TestWb: assert 'wb.js' in resp.body # no wombat present - assert 'WB_wombat_init' not in resp.body + assert '_WBWombat' not in resp.body # url not rewritten #assert '"http://www.iana.org/domains/example"' in resp.body From d7b1bc8151f1caa355ef87a5014a0dd3e1b8c61b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 28 Oct 2014 11:11:15 -0700 Subject: [PATCH 2/3] setup: add youtube_dl as dependency rewrite: add call to _live_rewrite_headers to get correct headers for proxy on video info --- pywb/webapp/live_rewrite_handler.py | 3 ++- setup.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index e38d10e9..078d544b 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -101,7 +101,8 @@ class RewriteHandler(SearchPageWbUrlHandler): if self.default_proxy: proxies = {'http': self.default_proxy} - headers = {'Content-Type': content_type} + headers = self._live_request_headers(wbrequest) + headers['Content-Type'] = content_type url = 
HttpsUrlRewriter.remove_https(wbrequest.wb_url.url) diff --git a/setup.py b/setup.py index 12225bee..54e3c021 100755 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ setup( 'jinja2', 'surt', 'pyyaml', + 'youtube_dl' ], tests_require=[ 'pytest', From 3d4526f994ed0cc4852140a58a97ce1d6a286cb0 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 28 Oct 2014 11:46:16 -0700 Subject: [PATCH 3/3] wombat: add wombat class to window --- pywb/static/wombat.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 4ad51067..29ba40d6 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -20,7 +20,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb //============================================ // Wombat JS-Rewriting Library v2.0 //============================================ -var _WBWombat = (function() { +window._WBWombat = (function() { // Globals var wb_replay_prefix;