diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 19c807d8..643daced 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -148,12 +148,12 @@ rules: - url_prefix: 'com,youtube,c' - fuzzy_lookup: 'com,youtube,c.*/videogoodput.*(id=[^&]+)' + fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)' - url_prefix: 'com,googlevideo,' fuzzy_lookup: - match: 'com,googlevideo.*/videoplayback.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)' + match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)' filter: - '~urlkey:{0}' - '!mimetype:text/plain' diff --git a/pywb/static/vidrw.js b/pywb/static/vidrw.js index 80adb165..64a9978e 100644 --- a/pywb/static/vidrw.js +++ b/pywb/static/vidrw.js @@ -18,13 +18,35 @@ This file is part of pywb, https://github.com/ikreymer/pywb */ // VidRw 1.0 -- video rewriting +// +// + +var _pywbvid = "default"; + +var _pywb_yt_err = undefined; + +if (window.location.hash) { + var m = window.location.hash.match(/_pywbvid=([\w]+)/); + if (m) { + _pywbvid = m[1]; + } + + if (_pywbvid == "html" || _pywbvid == "flash") { + var YT_W_E_RX = /^(https?:\/\/.*youtube.com)\/(watch|embed).*$/; + + if (wbinfo.url.match(YT_W_E_RX)) { + // special case: prevent yt player from being inited + Object.defineProperty(window, 'yt', {writeable: false}); + Object.defineProperty(window, 'ytplayer', {writeable: false}); + } + } +} + __wbvidrw = (function() { var found_embeds = false; - var vid_type = "default"; - var FLASH_PLAYER = wbinfo.static_prefix + "/flowplayer/flowplayer-3.2.18.swf"; function check_videos() { @@ -32,14 +54,6 @@ __wbvidrw = (function() { return; } - // extract_typ - if (window.location.hash) { - var m = window.location.hash.match(/_pywbvid=([\w]+)/); - if (m) { - vid_type = m[1]; - } - } - function handle_all_embeds() { var embeds = document.getElementsByTagName("embed"); @@ -61,10 +75,9 @@ __wbvidrw = (function() { found_embeds = true; - handle_yt_videos(vid_type); - - //window.setInterval(handle_all_embeds, 1000); + handle_yt_videos(_pywbvid); + //window.setInterval(handle_all_embeds, 2000); //_wb_wombat.add_tag_handler("embed", handle_all_embeds); //_wb_wombat.add_tag_handler("object", handle_all_objects); } @@ -100,8 +113,8 @@ __wbvidrw = (function() { return false; } - for (var j = 0; j < objects[i].children.length; j++) { - var child = objects[i].children[j]; + for (var j = 0; j < elem.children.length; j++) { + var child = elem.children[j]; if (child.tagName == "EMBED") { return false; @@ -125,7 +138,7 @@ __wbvidrw = (function() { elem._vidrw = true; - check_replacement(elem, src); + check_replacement(elem, obj_url); return true; } @@ -136,41 +149,85 @@ __wbvidrw = (function() { var YT_V_RX = /^(https?:\/\/.*youtube.com)\/v\/([^&?]+)(.*)$/; var VIMEO_RX = /^https?:\/\/.*vimeo.*clip_id=([^&]+)/; - - function handle_yt_videos(vid_type) + function remove_yt() { - function do_yt_video_replace() + // yt special case + if (window.yt && window.yt.player && window.yt.player.getPlayerByElement) { + //yt.player.Application.create("player-api", ytplayer.config).dispose(); + + var elem = window.yt.player.getPlayerByElement("player-api"); + + if (!elem) { + elem = window.yt.player.getPlayerByElement("player"); + } + + if (elem) { + elem.destroy(); + } + + delete window.yt; + if (window.ytplayer) { + delete window.ytplayer; + } + } + // end yt special case + } + + function handle_yt_videos(_pywbvid) + { + function do_yt_video_replace(elem) { - console.log("REPLACING YT: " + wbinfo.url); - ytvideo[0].autoplay = false; - ytvideo[0].preload = "none"; + remove_yt(); - var elem = ytvideo[0]; - // get ancestor 'div' - if (elem.parentElement) { - elem = elem.parentElement; + while (elem.hasChildNodes()) { + elem.removeChild(elem.lastChild); } - if (elem.parentElement) { - elem = elem.parentElement; - } - console.log(elem); - // Experimental - - check_replacement(elem, wbinfo.url); + //add placeholder child to remove + var placeholder = document.createElement("div"); + elem.appendChild(placeholder); + check_replacement(placeholder, wbinfo.url); } // special case: yt if (wbinfo.url.match(YT_W_E_RX)) { - var ytvideo = document.getElementsByTagName("video"); + //var ytvideo = document.getElementsByTagName("video"); + var player_div = document.getElementById("player-api"); + if (!player_div) { + player_div = document.getElementById("player"); + } + + //if (ytvideo.length == 1 && ytvideo[0].getAttribute("data-youtube-id") != "") { + if (player_div) { + if (_pywbvid == "html" || _pywbvid == "flash") { + do_yt_video_replace(player_div); + } else if (!wbinfo.is_live) { + var player = window.yt.player.getPlayerByElement(player_div); + + if (player) { + _pywb_yt_err = function() { + do_yt_video_replace(player_div); + } + + player.addEventListener("onError", "_pywb_yt_err"); + } - if (ytvideo.length == 1 && ytvideo[0].getAttribute("data-youtube-id") != "") { - if (vid_type == "html") { - do_yt_video_replace(); - } else { setTimeout(function() { - if (!ytvideo || !ytvideo.length || ytvideo[0].readyState == 0) { - do_yt_video_replace(); + if (!window.yt || !window.yt.player) { + do_yt_video_replace(player_div); + return; + } + + var state = -1; + + if (player && player.getPlayerState) { + state = player.getPlayerState(); + } + + // if no player or player is still buffering (is this ok), then replace + if (state < 0 || state == 3) { + do_yt_video_replace(player_div); + return; } }, 4000); } @@ -197,7 +254,7 @@ __wbvidrw = (function() { src = src.replace(VIMEO_RX, "http://player.vimeo.com/video/$1"); - if (vid_type == "orig") { + if (_pywbvid == "orig") { var repl_src = src.replace(YT_V_RX, "$1/embed/$2?$3&controls=0"); if (repl_src != src) { do_replace_iframe(elem, repl_src); @@ -275,12 +332,6 @@ __wbvidrw = (function() { } else { elem.parentNode.replaceChild(replacement, elem); } - - if (window.yt) { - yt.player.Application.create("player-api", ytplayer.config).dispose(); - delete window.yt; - delete window.ytplayer; - } } @@ -315,7 +366,7 @@ __wbvidrw = (function() { if (type == "audio") { htmlelem = document.createElement("audio"); } - if (vid_type != "flash") { + if (_pywbvid != "flash") { replacement = init_html_player(htmlelem, type, width, height, info, thumb_url); } } @@ -402,7 +453,6 @@ __wbvidrw = (function() { return; } - //console.log("html5 " + type +" error"); var replacement = document.createElement("div"); var vidId = "_wb_vid" + Date.now(); diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py index 6a3bcab4..928637f6 100644 --- a/pywb/webapp/live_rewrite_handler.py +++ b/pywb/webapp/live_rewrite_handler.py @@ -67,7 +67,7 @@ class RewriteHandler(SearchPageWbUrlHandler): def render_content(self, wbrequest): if wbrequest.wb_url.mod == 'vi_': - return self.get_video_info(wbrequest) + return self._get_video_info(wbrequest) head_insert_func = self.head_insert_view.create_insert_func(wbrequest) req_headers = self._live_request_headers(wbrequest) @@ -79,6 +79,7 @@ class RewriteHandler(SearchPageWbUrlHandler): ignore_proxies = False use_206 = False url = None + rangeres = None readd_range = False cache_key = None @@ -100,7 +101,7 @@ class RewriteHandler(SearchPageWbUrlHandler): ignore_proxies = True # sets cache_key only if not already cached - cache_key = self._check_url_cache(url) + cache_key = self._get_cache_key('r:', url) result = self.rewriter.fetch_request(wbrequest.wb_url.url, wbrequest.urlrewriter, @@ -124,6 +125,18 @@ class RewriteHandler(SearchPageWbUrlHandler): if cache_key: self._add_proxy_ping(cache_key, url, wbrequest, wbresponse) + if rangeres: + referrer = wbrequest.env.get('REL_REFERER') + + # also ping video info + if referrer: + try: + resp = self._get_video_info(wbrequest, + info_url=referrer, + video_url=url) + except: + print('Error getting video info') + return wbresponse def _make_response(self, wbrequest, status_headers, gen, is_rewritten): @@ -138,22 +151,26 @@ class RewriteHandler(SearchPageWbUrlHandler): return WbResponse(status_headers, gen) - def _check_url_cache(self, url): + def _get_cache_key(self, prefix, url): if not self._cache: self._cache = create_cache() - hash_ = hashlib.md5() - hash_.update(url) - key = hash_.hexdigest() + key = self.create_cache_key(prefix, url) if key in self._cache: return None return key - def _add_proxy_ping(self, key, url, wbrequest, wbresponse): - referrer = wbrequest.env.get('REL_REFERER') + @staticmethod + def create_cache_key(prefix, url): + hash_ = hashlib.md5() + hash_.update(url) + key = hash_.hexdigest() + key = prefix + key + return key + def _add_proxy_ping(self, key, url, wbrequest, wbresponse): def do_ping(): headers = self._live_request_headers(wbrequest) headers['Connection'] = 'close' @@ -175,12 +192,6 @@ class RewriteHandler(SearchPageWbUrlHandler): del self._cache[key] raise - # also ping video info - if referrer: - resp = self.get_video_info(wbrequest, - info_url=referrer, - video_url=url) - def wrap_buff_gen(gen): for x in gen: yield x @@ -194,7 +205,7 @@ class RewriteHandler(SearchPageWbUrlHandler): wbresponse.body = wrap_buff_gen(wbresponse.body) return wbresponse - def get_video_info(self, wbrequest, info_url=None, video_url=None): + def _get_video_info(self, wbrequest, info_url=None, video_url=None): if not self.youtubedl: self.youtubedl = YoutubeDLWrapper() @@ -204,12 +215,18 @@ class RewriteHandler(SearchPageWbUrlHandler): if not info_url: info_url = wbrequest.wb_url.url + cache_key = None + if self.proxies: + cache_key = self._get_cache_key('v:', video_url) + info = self.youtubedl.extract_info(video_url) + #if info and info.formats and len(info.formats) == 1: + content_type = self.YT_DL_TYPE metadata = json.dumps(info) - if self.proxies: + if (self.proxies and cache_key): headers = self._live_request_headers(wbrequest) headers['Content-Type'] = content_type @@ -222,6 +239,8 @@ class RewriteHandler(SearchPageWbUrlHandler): proxies=self.proxies, verify=False) + self._cache[cache_key] = '1' + return WbResponse.text_response(metadata, content_type=content_type) def __str__(self): diff --git a/pywb/webapp/rangecache.py b/pywb/webapp/rangecache.py index 550117cd..84b585b1 100644 --- a/pywb/webapp/rangecache.py +++ b/pywb/webapp/rangecache.py @@ -6,6 +6,7 @@ from tempfile import NamedTemporaryFile, mkdtemp import yaml import os +from shutil import rmtree import atexit @@ -19,9 +20,8 @@ class RangeCache(object): def cleanup(self): if self.temp_dir: # pragma: no cover - import shutil print('Removing: ' + self.temp_dir) - shutil.rmtree(self.temp_dir, True) + rmtree(self.temp_dir, True) self.temp_dir = None def handle_range(self, wbrequest, digest, wbresponse_func, diff --git a/tests/test_live_proxy.py b/tests/test_live_proxy.py index 580593d1..700dc7e2 100644 --- a/tests/test_live_proxy.py +++ b/tests/test_live_proxy.py @@ -9,9 +9,10 @@ from pywb.framework.wsgi_wrappers import init_app import webtest import shutil +import pywb.webapp.live_rewrite_handler + #================================================================= -#ThreadingMixIn.deamon_threads = True #class ProxyServer(ThreadingMixIn, HTTPServer): class ProxyServer(HTTPServer): @@ -49,6 +50,7 @@ class ProxyRequest(BaseHTTPRequestHandler): class TestProxyLiveRewriter: def setup(self): self.requestlog = [] + self.cache = {} def make_httpd(app): proxyserv = ProxyServer(('', 0), ProxyRequest) @@ -63,7 +65,11 @@ class TestProxyLiveRewriter: config=dict(framed_replay=True, proxyhostport=self.server.proxy_dict)) - print(self.server.proxy_dict) + def create_cache(): + return self.cache + + pywb.webapp.live_rewrite_handler.create_cache = create_cache + self.testapp = webtest.TestApp(self.app) def teardown(self): @@ -83,6 +89,8 @@ class TestProxyLiveRewriter: assert resp.body.startswith('GET http://example.com/ HTTP/1.1') assert 'referer: http://other.example.com' in resp.body + assert len(self.cache) == 0 + def test_echo_proxy_start_unbounded_remove_range(self): headers = [('Range', 'bytes=0-')] resp = self.testapp.get('/rewrite/http://example.com/', headers=headers) @@ -101,6 +109,8 @@ class TestProxyLiveRewriter: assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1') assert 'range: ' not in self.requestlog[0] + assert len(self.cache) == 0 + def test_echo_proxy_bounded_noproxy_range(self): headers = [('Range', 'bytes=10-1000')] resp = self.testapp.get('/rewrite/http://example.com/foobar', headers=headers) @@ -124,6 +134,10 @@ class TestProxyLiveRewriter: # no range request assert 'range: ' not in self.requestlog[0] + # r: key cached + assert len(self.cache) == 1 + assert RewriteHandler.create_cache_key('r:', 'http://example.com/foobar') in self.cache + # Second Request # clear log self.requestlog.pop() @@ -140,6 +154,7 @@ class TestProxyLiveRewriter: # already pinged proxy, no additional requests set to proxy assert len(self.requestlog) == 0 + assert len(self.cache) == 1 def test_echo_proxy_video_info(self): resp = self.testapp.get('/rewrite/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M') @@ -149,6 +164,14 @@ class TestProxyLiveRewriter: assert len(self.requestlog) == 1 assert self.requestlog[0].startswith('PUTMETA http://www.youtube.com/watch?v=DjFZyFWSt1M HTTP/1.1') + # second request, not sent to proxy + resp = self.testapp.get('/rewrite/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M') + assert len(self.requestlog) == 1 + + # v: video info cache + assert len(self.cache) == 1 + assert RewriteHandler.create_cache_key('v:', 'https://www.youtube.com/watch?v=DjFZyFWSt1M') in self.cache + def test_echo_proxy_video_with_referrer(self): headers = [('Range', 'bytes=1000-2000'), ('Referer', 'http://localhost:80/rewrite/https://example.com/')] resp = self.testapp.get('/rewrite/http://www.youtube.com/watch?v=DjFZyFWSt1M', headers=headers) @@ -159,12 +182,18 @@ class TestProxyLiveRewriter: # proxy receives two requests assert len(self.requestlog) == 2 - # first, non-ranged request for page - assert self.requestlog[0].startswith('GET http://www.youtube.com/watch?v=DjFZyFWSt1M HTTP/1.1') - assert 'range' not in self.requestlog[0] + # first, a video info request recording the page + assert self.requestlog[0].startswith('PUTMETA http://example.com/ HTTP/1.1') + + # second, non-ranged request for page + assert self.requestlog[1].startswith('GET http://www.youtube.com/watch?v=DjFZyFWSt1M HTTP/1.1') + assert 'range' not in self.requestlog[1] + + # both video info and range cached + assert len(self.cache) == 2 + assert RewriteHandler.create_cache_key('v:', 'http://www.youtube.com/watch?v=DjFZyFWSt1M') in self.cache + assert RewriteHandler.create_cache_key('r:', 'http://www.youtube.com/watch?v=DjFZyFWSt1M') in self.cache - # also a video info request recording the page - assert self.requestlog[1].startswith('PUTMETA http://example.com/ HTTP/1.1') def test_echo_proxy_error(self): headers = [('Range', 'bytes=1000-2000'), ('Referer', 'http://localhost:80/rewrite/https://example.com/')] @@ -177,3 +206,6 @@ class TestProxyLiveRewriter: # no proxy requests as we're forcing exception assert len(self.requestlog) == 0 + + assert len(self.cache) == 0 +