1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

video work: improved yt handling:

- disable yt using yt api, for forced html/flash, disable on load
- use yt error event to detect error
- better fallback on recorded video
use separate cache for range and video info tracking
fix yt rules query to account for & and ?
This commit is contained in:
Ilya Kreymer 2014-12-26 13:02:47 -08:00
parent ca17410056
commit 4c08a6a064
5 changed files with 177 additions and 76 deletions

View File

@ -148,12 +148,12 @@ rules:
- url_prefix: 'com,youtube,c'
fuzzy_lookup: 'com,youtube,c.*/videogoodput.*(id=[^&]+)'
fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)'
- url_prefix: 'com,googlevideo,'
fuzzy_lookup:
match: 'com,googlevideo.*/videoplayback.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)'
match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)'
filter:
- '~urlkey:{0}'
- '!mimetype:text/plain'

View File

@ -18,13 +18,35 @@ This file is part of pywb, https://github.com/ikreymer/pywb
*/
// VidRw 1.0 -- video rewriting
//
//
// Requested video replay mode, read from a `_pywbvid=<mode>` token in the URL
// hash; stays "default" when the hash has no such token.
var _pywbvid = "default";

// Global slot for the YouTube error callback; it is assigned later and handed
// to the player by name, so it must remain a global `var`.
var _pywb_yt_err = undefined;

if (window.location.hash) {
    var m = window.location.hash.match(/_pywbvid=([\w]+)/);
    if (m) {
        _pywbvid = m[1];
    }

    if (_pywbvid === "html" || _pywbvid === "flash") {
        var YT_W_E_RX = /^(https?:\/\/.*youtube.com)\/(watch|embed).*$/;

        if (wbinfo.url.match(YT_W_E_RX)) {
            // special case: prevent yt player from being inited
            // The descriptor key is `writable`; a misspelled key is silently
            // ignored, which would leave an already-defined property writable.
            Object.defineProperty(window, 'yt', {writable: false});
            Object.defineProperty(window, 'ytplayer', {writable: false});
        }
    }
}
__wbvidrw = (function() {
var found_embeds = false;
var vid_type = "default";
var FLASH_PLAYER = wbinfo.static_prefix + "/flowplayer/flowplayer-3.2.18.swf";
function check_videos() {
@ -32,14 +54,6 @@ __wbvidrw = (function() {
return;
}
// extract_typ
if (window.location.hash) {
var m = window.location.hash.match(/_pywbvid=([\w]+)/);
if (m) {
vid_type = m[1];
}
}
function handle_all_embeds() {
var embeds = document.getElementsByTagName("embed");
@ -61,10 +75,9 @@ __wbvidrw = (function() {
found_embeds = true;
handle_yt_videos(vid_type);
//window.setInterval(handle_all_embeds, 1000);
handle_yt_videos(_pywbvid);
//window.setInterval(handle_all_embeds, 2000);
//_wb_wombat.add_tag_handler("embed", handle_all_embeds);
//_wb_wombat.add_tag_handler("object", handle_all_objects);
}
@ -100,8 +113,8 @@ __wbvidrw = (function() {
return false;
}
for (var j = 0; j < objects[i].children.length; j++) {
var child = objects[i].children[j];
for (var j = 0; j < elem.children.length; j++) {
var child = elem.children[j];
if (child.tagName == "EMBED") {
return false;
@ -125,7 +138,7 @@ __wbvidrw = (function() {
elem._vidrw = true;
check_replacement(elem, src);
check_replacement(elem, obj_url);
return true;
}
@ -136,41 +149,85 @@ __wbvidrw = (function() {
var YT_V_RX = /^(https?:\/\/.*youtube.com)\/v\/([^&?]+)(.*)$/;
var VIMEO_RX = /^https?:\/\/.*vimeo.*clip_id=([^&]+)/;
function handle_yt_videos(vid_type)
function remove_yt()
{
function do_yt_video_replace()
// yt special case
if (window.yt && window.yt.player && window.yt.player.getPlayerByElement) {
//yt.player.Application.create("player-api", ytplayer.config).dispose();
var elem = window.yt.player.getPlayerByElement("player-api");
if (!elem) {
elem = window.yt.player.getPlayerByElement("player");
}
if (elem) {
elem.destroy();
}
delete window.yt;
if (window.ytplayer) {
delete window.ytplayer;
}
}
// end yt special case
}
function handle_yt_videos(_pywbvid)
{
function do_yt_video_replace(elem)
{
console.log("REPLACING YT: " + wbinfo.url);
ytvideo[0].autoplay = false;
ytvideo[0].preload = "none";
remove_yt();
var elem = ytvideo[0];
// get ancestor 'div'
if (elem.parentElement) {
elem = elem.parentElement;
while (elem.hasChildNodes()) {
elem.removeChild(elem.lastChild);
}
if (elem.parentElement) {
elem = elem.parentElement;
}
console.log(elem);
// Experimental
check_replacement(elem, wbinfo.url);
//add placeholder child to remove
var placeholder = document.createElement("div");
elem.appendChild(placeholder);
check_replacement(placeholder, wbinfo.url);
}
// special case: yt
if (wbinfo.url.match(YT_W_E_RX)) {
var ytvideo = document.getElementsByTagName("video");
//var ytvideo = document.getElementsByTagName("video");
var player_div = document.getElementById("player-api");
if (!player_div) {
player_div = document.getElementById("player");
}
//if (ytvideo.length == 1 && ytvideo[0].getAttribute("data-youtube-id") != "") {
if (player_div) {
if (_pywbvid == "html" || _pywbvid == "flash") {
do_yt_video_replace(player_div);
} else if (!wbinfo.is_live) {
var player = window.yt.player.getPlayerByElement(player_div);
if (player) {
_pywb_yt_err = function() {
do_yt_video_replace(player_div);
}
player.addEventListener("onError", "_pywb_yt_err");
}
if (ytvideo.length == 1 && ytvideo[0].getAttribute("data-youtube-id") != "") {
if (vid_type == "html") {
do_yt_video_replace();
} else {
setTimeout(function() {
if (!ytvideo || !ytvideo.length || ytvideo[0].readyState == 0) {
do_yt_video_replace();
if (!window.yt || !window.yt.player) {
do_yt_video_replace(player_div);
return;
}
var state = -1;
if (player && player.getPlayerState) {
state = player.getPlayerState();
}
// if no player or player is still buffering (is this ok), then replace
if (state < 0 || state == 3) {
do_yt_video_replace(player_div);
return;
}
}, 4000);
}
@ -197,7 +254,7 @@ __wbvidrw = (function() {
src = src.replace(VIMEO_RX, "http://player.vimeo.com/video/$1");
if (vid_type == "orig") {
if (_pywbvid == "orig") {
var repl_src = src.replace(YT_V_RX, "$1/embed/$2?$3&controls=0");
if (repl_src != src) {
do_replace_iframe(elem, repl_src);
@ -275,12 +332,6 @@ __wbvidrw = (function() {
} else {
elem.parentNode.replaceChild(replacement, elem);
}
if (window.yt) {
yt.player.Application.create("player-api", ytplayer.config).dispose();
delete window.yt;
delete window.ytplayer;
}
}
@ -315,7 +366,7 @@ __wbvidrw = (function() {
if (type == "audio") {
htmlelem = document.createElement("audio");
}
if (vid_type != "flash") {
if (_pywbvid != "flash") {
replacement = init_html_player(htmlelem, type, width, height, info, thumb_url);
}
}
@ -402,7 +453,6 @@ __wbvidrw = (function() {
return;
}
//console.log("html5 " + type +" error");
var replacement = document.createElement("div");
var vidId = "_wb_vid" + Date.now();

View File

@ -67,7 +67,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
def render_content(self, wbrequest):
if wbrequest.wb_url.mod == 'vi_':
return self.get_video_info(wbrequest)
return self._get_video_info(wbrequest)
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
req_headers = self._live_request_headers(wbrequest)
@ -79,6 +79,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
ignore_proxies = False
use_206 = False
url = None
rangeres = None
readd_range = False
cache_key = None
@ -100,7 +101,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
ignore_proxies = True
# sets cache_key only if not already cached
cache_key = self._check_url_cache(url)
cache_key = self._get_cache_key('r:', url)
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
wbrequest.urlrewriter,
@ -124,6 +125,18 @@ class RewriteHandler(SearchPageWbUrlHandler):
if cache_key:
self._add_proxy_ping(cache_key, url, wbrequest, wbresponse)
if rangeres:
referrer = wbrequest.env.get('REL_REFERER')
# also ping video info
if referrer:
try:
resp = self._get_video_info(wbrequest,
info_url=referrer,
video_url=url)
except:
print('Error getting video info')
return wbresponse
def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
@ -138,22 +151,26 @@ class RewriteHandler(SearchPageWbUrlHandler):
return WbResponse(status_headers, gen)
def _check_url_cache(self, url):
def _get_cache_key(self, prefix, url):
if not self._cache:
self._cache = create_cache()
hash_ = hashlib.md5()
hash_.update(url)
key = hash_.hexdigest()
key = self.create_cache_key(prefix, url)
if key in self._cache:
return None
return key
def _add_proxy_ping(self, key, url, wbrequest, wbresponse):
referrer = wbrequest.env.get('REL_REFERER')
@staticmethod
def create_cache_key(prefix, url):
hash_ = hashlib.md5()
hash_.update(url)
key = hash_.hexdigest()
key = prefix + key
return key
def _add_proxy_ping(self, key, url, wbrequest, wbresponse):
def do_ping():
headers = self._live_request_headers(wbrequest)
headers['Connection'] = 'close'
@ -175,12 +192,6 @@ class RewriteHandler(SearchPageWbUrlHandler):
del self._cache[key]
raise
# also ping video info
if referrer:
resp = self.get_video_info(wbrequest,
info_url=referrer,
video_url=url)
def wrap_buff_gen(gen):
for x in gen:
yield x
@ -194,7 +205,7 @@ class RewriteHandler(SearchPageWbUrlHandler):
wbresponse.body = wrap_buff_gen(wbresponse.body)
return wbresponse
def get_video_info(self, wbrequest, info_url=None, video_url=None):
def _get_video_info(self, wbrequest, info_url=None, video_url=None):
if not self.youtubedl:
self.youtubedl = YoutubeDLWrapper()
@ -204,12 +215,18 @@ class RewriteHandler(SearchPageWbUrlHandler):
if not info_url:
info_url = wbrequest.wb_url.url
cache_key = None
if self.proxies:
cache_key = self._get_cache_key('v:', video_url)
info = self.youtubedl.extract_info(video_url)
#if info and info.formats and len(info.formats) == 1:
content_type = self.YT_DL_TYPE
metadata = json.dumps(info)
if self.proxies:
if (self.proxies and cache_key):
headers = self._live_request_headers(wbrequest)
headers['Content-Type'] = content_type
@ -222,6 +239,8 @@ class RewriteHandler(SearchPageWbUrlHandler):
proxies=self.proxies,
verify=False)
self._cache[cache_key] = '1'
return WbResponse.text_response(metadata, content_type=content_type)
def __str__(self):

View File

@ -6,6 +6,7 @@ from tempfile import NamedTemporaryFile, mkdtemp
import yaml
import os
from shutil import rmtree
import atexit
@ -19,9 +20,8 @@ class RangeCache(object):
def cleanup(self):
if self.temp_dir: # pragma: no cover
import shutil
print('Removing: ' + self.temp_dir)
shutil.rmtree(self.temp_dir, True)
rmtree(self.temp_dir, True)
self.temp_dir = None
def handle_range(self, wbrequest, digest, wbresponse_func,

View File

@ -9,9 +9,10 @@ from pywb.framework.wsgi_wrappers import init_app
import webtest
import shutil
import pywb.webapp.live_rewrite_handler
#=================================================================
#ThreadingMixIn.deamon_threads = True
#class ProxyServer(ThreadingMixIn, HTTPServer):
class ProxyServer(HTTPServer):
@ -49,6 +50,7 @@ class ProxyRequest(BaseHTTPRequestHandler):
class TestProxyLiveRewriter:
def setup(self):
self.requestlog = []
self.cache = {}
def make_httpd(app):
proxyserv = ProxyServer(('', 0), ProxyRequest)
@ -63,7 +65,11 @@ class TestProxyLiveRewriter:
config=dict(framed_replay=True,
proxyhostport=self.server.proxy_dict))
print(self.server.proxy_dict)
def create_cache():
return self.cache
pywb.webapp.live_rewrite_handler.create_cache = create_cache
self.testapp = webtest.TestApp(self.app)
def teardown(self):
@ -83,6 +89,8 @@ class TestProxyLiveRewriter:
assert resp.body.startswith('GET http://example.com/ HTTP/1.1')
assert 'referer: http://other.example.com' in resp.body
assert len(self.cache) == 0
def test_echo_proxy_start_unbounded_remove_range(self):
headers = [('Range', 'bytes=0-')]
resp = self.testapp.get('/rewrite/http://example.com/', headers=headers)
@ -101,6 +109,8 @@ class TestProxyLiveRewriter:
assert self.requestlog[0].startswith('GET http://example.com/ HTTP/1.1')
assert 'range: ' not in self.requestlog[0]
assert len(self.cache) == 0
def test_echo_proxy_bounded_noproxy_range(self):
headers = [('Range', 'bytes=10-1000')]
resp = self.testapp.get('/rewrite/http://example.com/foobar', headers=headers)
@ -124,6 +134,10 @@ class TestProxyLiveRewriter:
# no range request
assert 'range: ' not in self.requestlog[0]
# r: key cached
assert len(self.cache) == 1
assert RewriteHandler.create_cache_key('r:', 'http://example.com/foobar') in self.cache
# Second Request
# clear log
self.requestlog.pop()
@ -140,6 +154,7 @@ class TestProxyLiveRewriter:
# already pinged proxy, no additional requests sent to proxy
assert len(self.requestlog) == 0
assert len(self.cache) == 1
def test_echo_proxy_video_info(self):
resp = self.testapp.get('/rewrite/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M')
@ -149,6 +164,14 @@ class TestProxyLiveRewriter:
assert len(self.requestlog) == 1
assert self.requestlog[0].startswith('PUTMETA http://www.youtube.com/watch?v=DjFZyFWSt1M HTTP/1.1')
# second request, not sent to proxy
resp = self.testapp.get('/rewrite/vi_/https://www.youtube.com/watch?v=DjFZyFWSt1M')
assert len(self.requestlog) == 1
# v: video info cache
assert len(self.cache) == 1
assert RewriteHandler.create_cache_key('v:', 'https://www.youtube.com/watch?v=DjFZyFWSt1M') in self.cache
def test_echo_proxy_video_with_referrer(self):
headers = [('Range', 'bytes=1000-2000'), ('Referer', 'http://localhost:80/rewrite/https://example.com/')]
resp = self.testapp.get('/rewrite/http://www.youtube.com/watch?v=DjFZyFWSt1M', headers=headers)
@ -159,12 +182,18 @@ class TestProxyLiveRewriter:
# proxy receives two requests
assert len(self.requestlog) == 2
# first, non-ranged request for page
assert self.requestlog[0].startswith('GET http://www.youtube.com/watch?v=DjFZyFWSt1M HTTP/1.1')
assert 'range' not in self.requestlog[0]
# first, a video info request recording the page
assert self.requestlog[0].startswith('PUTMETA http://example.com/ HTTP/1.1')
# second, non-ranged request for page
assert self.requestlog[1].startswith('GET http://www.youtube.com/watch?v=DjFZyFWSt1M HTTP/1.1')
assert 'range' not in self.requestlog[1]
# both video info and range cached
assert len(self.cache) == 2
assert RewriteHandler.create_cache_key('v:', 'http://www.youtube.com/watch?v=DjFZyFWSt1M') in self.cache
assert RewriteHandler.create_cache_key('r:', 'http://www.youtube.com/watch?v=DjFZyFWSt1M') in self.cache
# also a video info request recording the page
assert self.requestlog[1].startswith('PUTMETA http://example.com/ HTTP/1.1')
def test_echo_proxy_error(self):
headers = [('Range', 'bytes=1000-2000'), ('Referer', 'http://localhost:80/rewrite/https://example.com/')]
@ -177,3 +206,6 @@ class TestProxyLiveRewriter:
# no proxy requests as we're forcing exception
assert len(self.requestlog) == 0
assert len(self.cache) == 0