mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 15:09:54 +01:00
video: add video rewriting, using vidrw on the client side and youtube-dl on the server
add vi_ modifier: - on record, gets video_info from youtube-dl and sends it to the proxy, if any, via PUTMETA to create a metadata record - on playback, fetches the special metadata record with the video info and returns it to the client as JSON - vidrw script: fetches video info, if any, and attempts to replace iframe and embed tags (so far) which are videos. wombat: export extract_orig function, fix spaces and use object instance semantics
This commit is contained in:
parent
61ce53a0e0
commit
fb85570974
@ -138,11 +138,7 @@ class HttpsUrlRewriter(UrlRewriter):
|
||||
HTTPS = 'https://'
|
||||
|
||||
def rewrite(self, url, mod=None):
    """Return ``url`` with a leading ``https://`` downgraded to ``http://``.

    The conversion is delegated to ``remove_https`` so the prefix
    handling lives in one place.

    :param url: the url to rewrite
    :param mod: accepted for interface compatibility; not used here
    :return: the converted url, or ``url`` unchanged when it does not
             start with the https prefix
    """
    return self.remove_https(url)
|
||||
|
||||
def get_new_url(self, **kwargs):
|
||||
return kwargs.get('url')
|
||||
@ -155,3 +151,12 @@ class HttpsUrlRewriter(UrlRewriter):
|
||||
|
||||
def deprefix_url(self):
|
||||
return self.wburl.url
|
||||
|
||||
@staticmethod
def remove_https(url):
    """Swap a leading https scheme prefix for the http one.

    Uses the ``HTTPS`` / ``HTTP`` prefixes defined on
    ``HttpsUrlRewriter``. Urls that do not start with the https
    prefix are returned untouched.

    :param url: url string to convert
    :return: converted url, or the original ``url`` when no https
             prefix is present
    """
    cls = HttpsUrlRewriter

    # guard clause: nothing to strip
    if not url.startswith(cls.HTTPS):
        return url

    remainder = url[len(cls.HTTPS):]
    return cls.HTTP + remainder
|
||||
|
@ -116,6 +116,21 @@ rules:
|
||||
- signature
|
||||
|
||||
|
||||
# youtube rules
|
||||
#=================================================================
|
||||
|
||||
- url_prefix: 'com,youtube)/get_video_info'
|
||||
|
||||
fuzzy_lookup:
|
||||
- video_id
|
||||
- html5
|
||||
|
||||
|
||||
- url_prefix: 'com,googlevideo,'
|
||||
|
||||
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(range=[^&]+)'
|
||||
|
||||
|
||||
# testing rules -- not for valid domain
|
||||
#=================================================================
|
||||
# this rule block is a non-existent prefix merely for testing
|
||||
|
110
pywb/static/vidrw.js
Normal file
110
pywb/static/vidrw.js
Normal file
@ -0,0 +1,110 @@
|
||||
/*
|
||||
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
|
||||
|
||||
This file is part of pywb, https://github.com/ikreymer/pywb
|
||||
|
||||
pywb is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
pywb is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with pywb. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// VidRw 1.0 -- video rewriting
|
||||
|
||||
__wbvidrw = (function() {
|
||||
|
||||
var already_checked = false;
|
||||
|
||||
// Scan the document for <iframe> and <embed> elements and request a
// video replacement check for each one (iframes first, then embeds).
// 'already_checked' is set as soon as at least one candidate element
// is seen, after which further calls return immediately.
function check_videos() {
    if (already_checked) {
        return;
    }

    var tag_names = ["iframe", "embed"];

    for (var t = 0; t < tag_names.length; t++) {
        var candidates = document.getElementsByTagName(tag_names[t]);

        for (var n = 0; n < candidates.length; n++) {
            var candidate = candidates[n];

            already_checked = true;
            check_replacement(candidate, candidate.getAttribute("src"));
        }
    }
}
|
||||
|
||||
// Ask the server for video info about 'src' and, if any is returned,
// replace 'elem' with a playable video.
//
// elem: the element (iframe or embed) that may be replaced
// src:  the element's src attribute; nothing is done when it is empty
//
// The request goes to <wbinfo.prefix>vi_/<original url>. A non-200
// status or a body that is not valid JSON leaves the page untouched.
function check_replacement(elem, src) {
    if (!src) {
        return;
    }

    // recover the original url from the src attribute via wombat
    src = _wb_wombat.extract_orig(src);

    var xhr = new XMLHttpRequest();

    // the url is already fully prefixed below, so tell the wombat
    // XMLHttpRequest.open override not to rewrite it again
    xhr._no_rewrite = true;
    xhr.open('GET', wbinfo.prefix + 'vi_/' + src, true);

    xhr.onload = function() {
        if (xhr.status != 200) {
            return;
        }

        var video_info;

        // a malformed body must not throw out of the event handler
        try {
            video_info = JSON.parse(xhr.responseText);
        } catch (e) {
            console.log("vidrw: invalid video info for " + src);
            return;
        }

        if (video_info) {
            do_replace_video(elem, video_info);
        }
    };

    xhr.send();
}
|
||||
|
||||
// Replace 'elem' with an HTML5 <video> element built from 'video_info'.
//
// elem:       the <iframe> or <embed> element to replace; an <embed>
//             whose parent is an <object> is replaced together with
//             that <object>. Other element types are left alone.
// video_info: parsed JSON from the vi_ request; the fields read here
//             are 'url', 'ext' (logged only) and the optional
//             'thumbnail' -- presumably youtube-dl output, see server.
//
// Both the video url and the thumbnail url are prefixed with
// wbinfo.prefix.
function do_replace_video(elem, video_info) {
    // without a url there is nothing to play
    if (!video_info || !video_info.url) {
        return;
    }

    // TODO: select based on size?
    var video_url = wbinfo.prefix + video_info.url;

    console.log("REPLACING: " + video_url);
    console.log(video_info.ext);

    // Try HTML5 Video
    var htmlvideo = document.createElement("video");

    htmlvideo.setAttribute("src", video_url);

    // copy the dimensions only when the original element declares them:
    // setAttribute(name, null) would store the literal string "null"
    var width = elem.getAttribute("width");
    var height = elem.getAttribute("height");

    if (width) {
        htmlvideo.setAttribute("width", width);
    }

    if (height) {
        htmlvideo.setAttribute("height", height);
    }

    htmlvideo.setAttribute("controls", "1");
    htmlvideo.style.backgroundColor = "#000";

    if (video_info.thumbnail) {
        // 'poster' is the <video> attribute for the preview image
        htmlvideo.setAttribute("poster", wbinfo.prefix + video_info.thumbnail);
    }

    htmlvideo.addEventListener("error", function() {
        console.log("html5 video error");
    });

    htmlvideo.addEventListener("loadstart", function() {
        console.log("html5 video success");
    });

    console.log(elem.tagName);

    var tag = elem.tagName.toLowerCase();
    var target = elem;

    if (tag == "embed") {
        // an <embed> wrapped in an <object> is replaced as a unit
        var wrapper = elem.parentElement;

        if (wrapper && wrapper.tagName.toLowerCase() == "object") {
            target = wrapper;
        }
    } else if (tag != "iframe") {
        return;
    }

    // the element may have been detached while the request was pending
    if (target.parentNode) {
        target.parentNode.replaceChild(htmlvideo, target);
    }
}
|
||||
|
||||
document.addEventListener("DOMContentLoaded", check_videos);
|
||||
})();
|
@ -20,7 +20,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb
|
||||
//============================================
|
||||
// Wombat JS-Rewriting Library v2.0
|
||||
//============================================
|
||||
WB_wombat_init = (function() {
|
||||
var _WBWombat = (function() {
|
||||
|
||||
// Globals
|
||||
var wb_replay_prefix;
|
||||
@ -469,7 +469,9 @@ WB_wombat_init = (function() {
|
||||
var orig = window.XMLHttpRequest.prototype.open;
|
||||
|
||||
function open_rewritten(method, url, async, user, password) {
|
||||
if (!this._no_rewrite) {
|
||||
url = rewrite_url(url);
|
||||
}
|
||||
|
||||
// defaults to true
|
||||
if (async != false) {
|
||||
@ -810,6 +812,9 @@ WB_wombat_init = (function() {
|
||||
|
||||
// Random
|
||||
init_seeded_random(timestamp);
|
||||
|
||||
// expose functions
|
||||
this.extract_orig = extract_orig;
|
||||
}
|
||||
|
||||
return wombat_init;
|
||||
|
@ -3,7 +3,7 @@
|
||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
|
||||
<script>
|
||||
{% set urlsplit = cdx.original | urlsplit %}
|
||||
WB_wombat_init("{{ wbrequest.wb_prefix}}",
|
||||
var _wb_wombat = new _WBWombat("{{ wbrequest.wb_prefix}}",
|
||||
"{{ cdx['timestamp'] if include_ts else ''}}",
|
||||
"{{ urlsplit.scheme }}",
|
||||
"{{ urlsplit.netloc }}",
|
||||
@ -24,6 +24,7 @@
|
||||
</script>
|
||||
|
||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
|
||||
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/vidrw.js'> </script>
|
||||
|
||||
{% include banner_html ignore missing %}
|
||||
|
||||
|
@ -4,12 +4,17 @@ from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
|
||||
from pywb.rewrite.rewrite_live import LiveRewriter
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import HttpsUrlRewriter
|
||||
|
||||
from handlers import StaticHandler, SearchPageWbUrlHandler
|
||||
from views import HeadInsertView
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
import json
|
||||
import requests
|
||||
from youtube_dl import YoutubeDL
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LiveResourceException(WbException):
|
||||
@ -25,14 +30,16 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
def __init__(self, config):
    """Set up the live rewrite handler from ``config``.

    :param config: mapping of handler settings; the keys read here
                   are ``proxyhostport`` and ``live-cookie``
    """
    super(RewriteHandler, self).__init__(config)

    # kept on the instance (not just a local) because get_video_info
    # also needs the proxy to send its PUTMETA request
    self.default_proxy = config.get('proxyhostport')
    self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
                                 default_proxy=self.default_proxy)

    self.head_insert_view = HeadInsertView.init_from_config(config)

    self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)

    # YoutubeDL instance, created on first use by get_video_info
    self.ydl = None
|
||||
|
||||
def handle_request(self, wbrequest):
|
||||
try:
|
||||
return self.render_content(wbrequest)
|
||||
@ -50,6 +57,9 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
return {}
|
||||
|
||||
def render_content(self, wbrequest):
|
||||
if wbrequest.wb_url.mod == 'vi_':
|
||||
return self.get_video_info(wbrequest)
|
||||
|
||||
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
|
||||
req_headers = self._live_request_headers(wbrequest)
|
||||
|
||||
@ -76,6 +86,34 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
|
||||
return WbResponse(status_headers, gen)
|
||||
|
||||
|
||||
def get_video_info(self, wbrequest):
    """Return youtube-dl info for the requested url as a JSON response.

    The ``YoutubeDL`` instance is created on first use and kept on
    ``self.ydl``. When a default proxy is configured, the same JSON
    is also sent through that proxy with a ``PUTMETA`` request to the
    http form of the url.

    :param wbrequest: the current request; only ``wb_url.url`` is read
    :return: a text ``WbResponse`` carrying the JSON metadata
    """
    if not self.ydl:
        ydl_options = dict(simulate=True,
                           youtube_include_dash_manifest=False)
        self.ydl = YoutubeDL(ydl_options)
        self.ydl.add_default_info_extractors()

    video_url = wbrequest.wb_url.url
    content_type = 'application/vnd.youtube-dl_formats+json'
    metadata = json.dumps(self.ydl.extract_info(video_url))

    if self.default_proxy:
        # hand the metadata to the proxy; the reply is not inspected
        requests.request(method='PUTMETA',
                         url=HttpsUrlRewriter.remove_https(video_url),
                         data=metadata,
                         headers={'Content-Type': content_type},
                         proxies={'http': self.default_proxy},
                         verify=False)

    return WbResponse.text_response(metadata, content_type=content_type)
|
||||
|
||||
def __str__(self):
|
||||
return 'Live Web Rewrite Handler'
|
||||
|
||||
|
@ -68,6 +68,14 @@ class QueryHandler(object):
|
||||
params['url'] = wb_url.url
|
||||
params['output'] = output
|
||||
|
||||
params['filter'].append('!mimetype:-')
|
||||
|
||||
# get metadata
|
||||
if wb_url.mod == 'vi_':
|
||||
# matching metadata explicitly with special scheme
|
||||
params['url'] = wb_url.url.replace('http:/', 'metadata:/')
|
||||
params['filter'].append('~original:metadata://')
|
||||
|
||||
cdx_iter = self.load_cdx(wbrequest, params)
|
||||
return cdx_iter, output
|
||||
|
||||
@ -132,6 +140,7 @@ class QueryHandler(object):
|
||||
'limit': limit,
|
||||
'fl': ('urlkey,original,timestamp,' +
|
||||
'endtimestamp,groupcount,uniqcount'),
|
||||
'filter':[],
|
||||
},
|
||||
|
||||
wburl.REPLAY:
|
||||
@ -147,6 +156,7 @@ class QueryHandler(object):
|
||||
# Not appropriate as default
|
||||
# Should be an option to configure status code filtering in general
|
||||
# 'filter': ['statuscode:[23]..|-'],
|
||||
'filter': [],
|
||||
'limit': '1',
|
||||
'resolveRevisits': True,
|
||||
}
|
||||
|
@ -98,7 +98,7 @@ class TestWb:
|
||||
|
||||
assert '"20140127171238"' in resp.body
|
||||
assert 'wb.js' in resp.body
|
||||
assert 'WB_wombat_init' in resp.body
|
||||
assert 'new _WBWombat' in resp.body, resp.body
|
||||
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
|
||||
|
||||
def test_replay_non_frame_content(self):
|
||||
@ -149,7 +149,7 @@ class TestWb:
|
||||
assert 'wb.js' in resp.body
|
||||
|
||||
# no wombat present
|
||||
assert 'WB_wombat_init' not in resp.body
|
||||
assert '_WBWombat' not in resp.body
|
||||
|
||||
# url not rewritten
|
||||
#assert '"http://www.iana.org/domains/example"' in resp.body
|
||||
|
Loading…
x
Reference in New Issue
Block a user