1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00

video: add video rewriting, using vidrw on the client side and youtube-dl on the server

add vi_ modifier:
-on record, gets video_info from youtube-dl, sends to proxy,
if any, via PUTMETA to create metadata record
-on playback, fetches special metadata record with video info and
returns to client as json
-vidrw script: fetches video info, if any, and attempts to replace
iframe and embed tags (so far) which are videos
wombat: export the extract_orig function, fix spacing, and use object-instance
semantics
This commit is contained in:
Ilya Kreymer 2014-10-28 10:36:48 -07:00
parent 61ce53a0e0
commit fb85570974
8 changed files with 291 additions and 107 deletions

View File

@ -138,11 +138,7 @@ class HttpsUrlRewriter(UrlRewriter):
HTTPS = 'https://' HTTPS = 'https://'
def rewrite(self, url, mod=None): def rewrite(self, url, mod=None):
if url.startswith(self.HTTPS): return self.remove_https(url)
result = self.HTTP + url[len(self.HTTPS):]
return result
else:
return url
def get_new_url(self, **kwargs): def get_new_url(self, **kwargs):
return kwargs.get('url') return kwargs.get('url')
@ -155,3 +151,12 @@ class HttpsUrlRewriter(UrlRewriter):
def deprefix_url(self): def deprefix_url(self):
return self.wburl.url return self.wburl.url
@staticmethod
def remove_https(url):
    """Swap a leading HTTPS prefix on ``url`` for the class's HTTP prefix.

    :param url: url string to convert
    :return: ``url`` with ``HttpsUrlRewriter.HTTPS`` at the start replaced
             by ``HttpsUrlRewriter.HTTP``; any url that does not start with
             the HTTPS prefix is returned unchanged
    """
    cls = HttpsUrlRewriter

    # guard clause: nothing to strip unless the url begins with the prefix
    if not url.startswith(cls.HTTPS):
        return url

    remainder = url[len(cls.HTTPS):]
    return cls.HTTP + remainder

View File

@ -116,6 +116,21 @@ rules:
- signature - signature
# youtube rules
#=================================================================
- url_prefix: 'com,youtube)/get_video_info'
fuzzy_lookup:
- video_id
- html5
- url_prefix: 'com,googlevideo,'
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(range=[^&]+)'
# testing rules -- not for valid domain # testing rules -- not for valid domain
#================================================================= #=================================================================
# this rule block is a non-existent prefix merely for testing # this rule block is a non-existent prefix merely for testing

110
pywb/static/vidrw.js Normal file
View File

@ -0,0 +1,110 @@
/*
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
This file is part of pywb, https://github.com/ikreymer/pywb
pywb is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pywb is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/
// VidRw 1.0 -- video rewriting
/**
 * Client-side video rewriting.
 *
 * Once the DOM is ready, every <iframe> and <embed> with a `src` is looked
 * up on the server through the `vi_` modifier. When the server answers 200
 * with usable video info, the element is replaced by an HTML5 <video>.
 *
 * Depends on two page-level globals defined outside this file:
 *   - `wbinfo.prefix`            -- prefix prepended to rewritten urls
 *   - `_wb_wombat.extract_orig`  -- maps a rewritten url to its original
 *
 * Declared with `var` so that loading the script does not rely on an
 * implicit global assignment (which throws in strict mode).
 */
var __wbvidrw = (function() {

    // Set as soon as at least one candidate element has been examined, so
    // that a repeated call to check_videos() does not issue duplicate lookups.
    var already_checked = false;

    /**
     * Issue a video-info lookup for every element in a collection.
     */
    function check_all(elems) {
        for (var i = 0; i < elems.length; i++) {
            already_checked = true;
            check_replacement(elems[i], elems[i].getAttribute("src"));
        }
    }

    /**
     * Entry point: examine all iframes and embeds currently in the document.
     */
    function check_videos() {
        if (already_checked) {
            return;
        }

        check_all(document.getElementsByTagName("iframe"));
        check_all(document.getElementsByTagName("embed"));
    }

    /**
     * Ask the server for video info about `src` and, if any comes back,
     * replace `elem` with a <video>.
     *
     * @param elem  the iframe/embed element that may be replaced
     * @param src   the element's (possibly rewritten) src attribute
     */
    function check_replacement(elem, src) {
        if (!src) {
            return;
        }

        src = _wb_wombat.extract_orig(src);

        var xhr = new XMLHttpRequest();
        // Must be set before open(): wombat's patched open() skips url
        // rewriting when this flag is present, and the url built below is
        // already fully prefixed.
        xhr._no_rewrite = true;
        xhr.open('GET', wbinfo.prefix + 'vi_/' + src, true);
        xhr.onload = function() {
            if (xhr.status != 200) {
                return;
            }

            var video_info;
            try {
                video_info = JSON.parse(xhr.responseText);
            } catch (e) {
                // a malformed body should not surface as an uncaught error
                console.log("vidrw: invalid video info for " + src);
                return;
            }

            // the body may be JSON `null` or lack a direct url; in either
            // case there is nothing to point a <video> at
            if (!video_info || !video_info.url) {
                return;
            }

            do_replace_video(elem, video_info);
        };
        xhr.send();
    }

    /**
     * Copy an attribute only when the source element actually has it.
     * setAttribute() stringifies its value, so passing a missing (null)
     * attribute through would write the literal text "null".
     */
    function copy_attr(from, to, name) {
        var value = from.getAttribute(name);
        if (value !== null) {
            to.setAttribute(name, value);
        }
    }

    /**
     * Build a <video> from `video_info` and put it in place of `elem`.
     *
     * @param elem        the iframe/embed element to replace
     * @param video_info  parsed video info; `url` is required, `thumbnail`
     *                    and `ext` are optional
     */
    function do_replace_video(elem, video_info) {
        // TODO: select based on size?
        var video_url = wbinfo.prefix + video_info.url;

        console.log("REPLACING: " + video_url);
        console.log(video_info.ext);

        // Try HTML5 Video
        var htmlvideo = document.createElement("video");

        htmlvideo.setAttribute("src", video_url);
        copy_attr(elem, htmlvideo, "width");
        copy_attr(elem, htmlvideo, "height");
        htmlvideo.setAttribute("controls", "1");
        htmlvideo.style.backgroundColor = "#000";

        if (video_info.thumbnail) {
            // `poster` is the <video> attribute for a preview image
            htmlvideo.setAttribute("poster", wbinfo.prefix + video_info.thumbnail);
        }

        htmlvideo.addEventListener("error", function() {
            console.log("html5 video error");
        });

        htmlvideo.addEventListener("loadstart", function() {
            console.log("html5 video success");
        });

        console.log(elem.tagName);

        var tag = elem.tagName.toLowerCase();
        var target = elem;

        if (tag == "embed") {
            // an <embed> nested in an <object> is replaced together with
            // its wrapper
            var wrapper = elem.parentElement;
            if (wrapper && wrapper.tagName.toLowerCase() == "object") {
                target = wrapper;
            }
        } else if (tag != "iframe") {
            return;
        }

        // the lookup is asynchronous, so the element may have been removed
        // from the document in the meantime
        if (target.parentNode) {
            target.parentNode.replaceChild(htmlvideo, target);
        }
    }

    document.addEventListener("DOMContentLoaded", check_videos);
})();

View File

@ -20,7 +20,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb
//============================================ //============================================
// Wombat JS-Rewriting Library v2.0 // Wombat JS-Rewriting Library v2.0
//============================================ //============================================
WB_wombat_init = (function() { var _WBWombat = (function() {
// Globals // Globals
var wb_replay_prefix; var wb_replay_prefix;
@ -469,7 +469,9 @@ WB_wombat_init = (function() {
var orig = window.XMLHttpRequest.prototype.open; var orig = window.XMLHttpRequest.prototype.open;
function open_rewritten(method, url, async, user, password) { function open_rewritten(method, url, async, user, password) {
if (!this._no_rewrite) {
url = rewrite_url(url); url = rewrite_url(url);
}
// defaults to true // defaults to true
if (async != false) { if (async != false) {
@ -810,6 +812,9 @@ WB_wombat_init = (function() {
// Random // Random
init_seeded_random(timestamp); init_seeded_random(timestamp);
// expose functions
this.extract_orig = extract_orig;
} }
return wombat_init; return wombat_init;

View File

@ -3,7 +3,7 @@
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script> <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
<script> <script>
{% set urlsplit = cdx.original | urlsplit %} {% set urlsplit = cdx.original | urlsplit %}
WB_wombat_init("{{ wbrequest.wb_prefix}}", var _wb_wombat = new _WBWombat("{{ wbrequest.wb_prefix}}",
"{{ cdx['timestamp'] if include_ts else ''}}", "{{ cdx['timestamp'] if include_ts else ''}}",
"{{ urlsplit.scheme }}", "{{ urlsplit.scheme }}",
"{{ urlsplit.netloc }}", "{{ urlsplit.netloc }}",
@ -24,6 +24,7 @@
</script> </script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script> <script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/vidrw.js'> </script>
{% include banner_html ignore missing %} {% include banner_html ignore missing %}

View File

@ -4,12 +4,17 @@ from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_live import LiveRewriter from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from handlers import StaticHandler, SearchPageWbUrlHandler from handlers import StaticHandler, SearchPageWbUrlHandler
from views import HeadInsertView from views import HeadInsertView
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
import json
import requests
from youtube_dl import YoutubeDL
#================================================================= #=================================================================
class LiveResourceException(WbException): class LiveResourceException(WbException):
@ -25,14 +30,16 @@ class RewriteHandler(SearchPageWbUrlHandler):
def __init__(self, config): def __init__(self, config):
super(RewriteHandler, self).__init__(config) super(RewriteHandler, self).__init__(config)
default_proxy = config.get('proxyhostport') self.default_proxy = config.get('proxyhostport')
self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode, self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
default_proxy=default_proxy) default_proxy=self.default_proxy)
self.head_insert_view = HeadInsertView.init_from_config(config) self.head_insert_view = HeadInsertView.init_from_config(config)
self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE) self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)
self.ydl = None
def handle_request(self, wbrequest): def handle_request(self, wbrequest):
try: try:
return self.render_content(wbrequest) return self.render_content(wbrequest)
@ -50,6 +57,9 @@ class RewriteHandler(SearchPageWbUrlHandler):
return {} return {}
def render_content(self, wbrequest): def render_content(self, wbrequest):
if wbrequest.wb_url.mod == 'vi_':
return self.get_video_info(wbrequest)
head_insert_func = self.head_insert_view.create_insert_func(wbrequest) head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
req_headers = self._live_request_headers(wbrequest) req_headers = self._live_request_headers(wbrequest)
@ -76,6 +86,34 @@ class RewriteHandler(SearchPageWbUrlHandler):
return WbResponse(status_headers, gen) return WbResponse(status_headers, gen)
def get_video_info(self, wbrequest):
    """Handle a ``vi_`` request: extract video info and return it as json.

    The ``YoutubeDL`` instance is created on first use and kept on
    ``self.ydl``. The extracted info is serialized with ``json.dumps``;
    when ``self.default_proxy`` is set, the same json is also sent through
    that proxy with a ``PUTMETA`` request before responding.

    :param wbrequest: request whose ``wb_url.url`` is the page to inspect
    :return: text ``WbResponse`` carrying the json-encoded info
    """
    if not self.ydl:
        ydl_options = {'simulate': True,
                       'youtube_include_dash_manifest': False}
        self.ydl = YoutubeDL(ydl_options)
        self.ydl.add_default_info_extractors()

    page_url = wbrequest.wb_url.url
    video_info = self.ydl.extract_info(page_url)

    mime = 'application/vnd.youtube-dl_formats+json'
    payload = json.dumps(video_info)

    if self.default_proxy:
        # the request goes out with the https:// prefix removed and only
        # an 'http' proxy configured
        requests.request(method='PUTMETA',
                         url=HttpsUrlRewriter.remove_https(page_url),
                         data=payload,
                         headers={'Content-Type': mime},
                         proxies={'http': self.default_proxy},
                         verify=False)

    return WbResponse.text_response(payload, content_type=mime)
def __str__(self): def __str__(self):
return 'Live Web Rewrite Handler' return 'Live Web Rewrite Handler'

View File

@ -68,6 +68,14 @@ class QueryHandler(object):
params['url'] = wb_url.url params['url'] = wb_url.url
params['output'] = output params['output'] = output
params['filter'].append('!mimetype:-')
# get metadata
if wb_url.mod == 'vi_':
# matching metadata explicitly with special scheme
params['url'] = wb_url.url.replace('http:/', 'metadata:/')
params['filter'].append('~original:metadata://')
cdx_iter = self.load_cdx(wbrequest, params) cdx_iter = self.load_cdx(wbrequest, params)
return cdx_iter, output return cdx_iter, output
@ -132,6 +140,7 @@ class QueryHandler(object):
'limit': limit, 'limit': limit,
'fl': ('urlkey,original,timestamp,' + 'fl': ('urlkey,original,timestamp,' +
'endtimestamp,groupcount,uniqcount'), 'endtimestamp,groupcount,uniqcount'),
'filter':[],
}, },
wburl.REPLAY: wburl.REPLAY:
@ -147,6 +156,7 @@ class QueryHandler(object):
# Not appropriate as default # Not appropriate as default
# Should be an option to configure status code filtering in general # Should be an option to configure status code filtering in general
# 'filter': ['statuscode:[23]..|-'], # 'filter': ['statuscode:[23]..|-'],
'filter': [],
'limit': '1', 'limit': '1',
'resolveRevisits': True, 'resolveRevisits': True,
} }

View File

@ -98,7 +98,7 @@ class TestWb:
assert '"20140127171238"' in resp.body assert '"20140127171238"' in resp.body
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
assert 'WB_wombat_init' in resp.body assert 'new _WBWombat' in resp.body, resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
def test_replay_non_frame_content(self): def test_replay_non_frame_content(self):
@ -149,7 +149,7 @@ class TestWb:
assert 'wb.js' in resp.body assert 'wb.js' in resp.body
# no wombat present # no wombat present
assert 'WB_wombat_init' not in resp.body assert '_WBWombat' not in resp.body
# url not rewritten # url not rewritten
#assert '"http://www.iana.org/domains/example"' in resp.body #assert '"http://www.iana.org/domains/example"' in resp.body