diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index 61a48e50..a162f67e 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -138,11 +138,7 @@ class HttpsUrlRewriter(UrlRewriter):
HTTPS = 'https://'
def rewrite(self, url, mod=None):
- if url.startswith(self.HTTPS):
- result = self.HTTP + url[len(self.HTTPS):]
- return result
- else:
- return url
+ return self.remove_https(url)
def get_new_url(self, **kwargs):
return kwargs.get('url')
@@ -155,3 +151,12 @@ class HttpsUrlRewriter(UrlRewriter):
def deprefix_url(self):
return self.wburl.url
+
+    @staticmethod
+    def remove_https(url):
+        """Return url with a leading HTTPS prefix replaced by the HTTP prefix."""
+        cls = HttpsUrlRewriter
+        if not url.startswith(cls.HTTPS):
+            return url
+        # keep everything after the scheme and re-prefix it
+        return cls.HTTP + url[len(cls.HTTPS):]
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index b601b2df..e5d79bbe 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -114,7 +114,22 @@ rules:
- ownerId
- videoFileId
- signature
-
+
+
+ # youtube rules
+ #=================================================================
+
+ - url_prefix: 'com,youtube)/get_video_info'
+
+ fuzzy_lookup:
+ - video_id
+ - html5
+
+
+ - url_prefix: 'com,googlevideo,'
+
+ fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(range=[^&]+)'
+
# testing rules -- not for valid domain
#=================================================================
diff --git a/pywb/static/vidrw.js b/pywb/static/vidrw.js
new file mode 100644
index 00000000..14746481
--- /dev/null
+++ b/pywb/static/vidrw.js
@@ -0,0 +1,142 @@
+/*
+Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
+
+This file is part of pywb, https://github.com/ikreymer/pywb
+
+ pywb is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ pywb is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with pywb. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// VidRw 1.0 -- video rewriting
+
+// Replaces iframe/embed players with an HTML5 <video> element whenever the
+// vi_ endpoint returns video info for the embed's source url.
+var __wbvidrw = (function() {
+
+    var already_checked = false;
+
+    // Scan the document for iframe and embed elements and check each src.
+    function check_videos() {
+        if (already_checked) {
+            return;
+        }
+
+        var tags = ["iframe", "embed"];
+
+        for (var t = 0; t < tags.length; t++) {
+            var elems = document.getElementsByTagName(tags[t]);
+
+            for (var i = 0; i < elems.length; i++) {
+                // counts as checked only once a candidate element is seen
+                already_checked = true;
+                check_replacement(elems[i], elems[i].getAttribute("src"));
+            }
+        }
+    }
+
+    // Request video info for src via the vi_ modifier; replace elem on success.
+    function check_replacement(elem, src) {
+        if (!src) {
+            return;
+        }
+
+        src = _wb_wombat.extract_orig(src);
+
+        var xhr = new XMLHttpRequest();
+        // wombat's XMLHttpRequest.open override skips rewriting when set
+        xhr._no_rewrite = true;
+        xhr.open('GET', wbinfo.prefix + 'vi_/' + src, true);
+        xhr.onload = function() {
+            if (xhr.status != 200) {
+                return;
+            }
+
+            var video_info;
+
+            try {
+                video_info = JSON.parse(xhr.responseText);
+            } catch (e) {
+                console.log("invalid video info for: " + src);
+                return;
+            }
+
+            // nothing to play without a video url
+            if (video_info && video_info.url) {
+                do_replace_video(elem, video_info);
+            }
+        };
+        xhr.send();
+    }
+
+    // Swap elem (or its enclosing <object>) for a <video> of video_info.url.
+    function do_replace_video(elem, video_info) {
+        // TODO: select based on size?
+        var video_url = wbinfo.prefix + video_info.url;
+
+        console.log("REPLACING: " + video_url);
+        console.log(video_info.ext);
+
+        // Try HTML5 Video
+        var htmlvideo = document.createElement("video");
+
+        htmlvideo.setAttribute("src", video_url);
+        htmlvideo.setAttribute("controls", "1");
+        htmlvideo.style.backgroundColor = "#000";
+
+        // a missing attribute reads as null, which setAttribute would
+        // store as the string "null", so copy only dimensions that exist
+        var dims = ["width", "height"];
+
+        for (var i = 0; i < dims.length; i++) {
+            var value = elem.getAttribute(dims[i]);
+            if (value) {
+                htmlvideo.setAttribute(dims[i], value);
+            }
+        }
+
+        if (video_info.thumbnail) {
+            // "poster" is the <video> attribute for a preview image
+            htmlvideo.setAttribute("poster", wbinfo.prefix + video_info.thumbnail);
+        }
+
+        htmlvideo.addEventListener("error", function() {
+            console.log("html5 video error");
+        });
+
+        htmlvideo.addEventListener("loadstart", function() {
+            console.log("html5 video success");
+        });
+
+        console.log(elem.tagName);
+
+        var tag = elem.tagName.toLowerCase();
+
+        if (tag != "iframe" && tag != "embed") {
+            return;
+        }
+
+        // an <embed> nested in an <object> is replaced together with it
+        var parent = elem.parentElement;
+
+        if (tag == "embed" && parent && parent.tagName.toLowerCase() == "object") {
+            elem = parent;
+        }
+
+        // the element may have been detached while the request was in flight
+        if (elem.parentNode) {
+            elem.parentNode.replaceChild(htmlvideo, elem);
+        }
+    }
+
+    document.addEventListener("DOMContentLoaded", check_videos);
+})();
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
index 0e6d8327..4ad51067 100644
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@@ -20,7 +20,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb
//============================================
// Wombat JS-Rewriting Library v2.0
//============================================
-WB_wombat_init = (function() {
+var _WBWombat = (function() {
// Globals
var wb_replay_prefix;
@@ -64,7 +64,7 @@ WB_wombat_init = (function() {
} else if (string.indexOf(arr_or_prefix) == 0) {
return arr_or_prefix;
}
-
+
return undefined;
}
@@ -89,31 +89,31 @@ WB_wombat_init = (function() {
}
return rewritten;
}
-
+
//============================================
var HTTP_PREFIX = "http://";
var HTTPS_PREFIX = "https://";
var REL_PREFIX = "//";
-
+
var VALID_PREFIXES = [HTTP_PREFIX, HTTPS_PREFIX, REL_PREFIX];
var IGNORE_PREFIXES = ["#", "about:", "data:", "mailto:", "javascript:"];
-
+
var BAD_PREFIXES;
-
+
function init_bad_prefixes(prefix) {
BAD_PREFIXES = ["http:" + prefix, "https:" + prefix,
"http:/" + prefix, "https:/" + prefix];
}
-
+
//============================================
function rewrite_url_(url) {
// If undefined, just return it
if (!url) {
return url;
}
-
+
var urltype_ = (typeof url);
-
+
// If object, use toString
if (urltype_ == "object") {
url = url.toString();
@@ -129,7 +129,7 @@ WB_wombat_init = (function() {
return url;
}
}
-
+
// just in case wombat reference made it into url!
url = url.replace("WB_wombat_", "");
@@ -166,10 +166,10 @@ WB_wombat_init = (function() {
}
return wb_replay_date_prefix + url;
}
-
+
// Check for common bad prefixes and remove them
prefix = starts_with(url, BAD_PREFIXES);
-
+
if (prefix) {
url = extract_orig(url);
return wb_replay_date_prefix + url;
@@ -189,16 +189,16 @@ WB_wombat_init = (function() {
if (!href) {
return "";
}
-
+
// proxy mode: no extraction needed
if (!wb_replay_prefix) {
return href;
}
-
+
href = href.toString();
var index = href.indexOf("/http", 1);
-
+
// extract original url from wburl
if (index > 0) {
href = href.substr(index + 1);
@@ -207,12 +207,12 @@ WB_wombat_init = (function() {
if (index >= 0) {
href = href.substr(index + wb_replay_prefix.length);
}
- if ((href.length > 4) &&
- (href.charAt(2) == "_") &&
+ if ((href.length > 4) &&
+ (href.charAt(2) == "_") &&
(href.charAt(3) == "/")) {
href = href.substr(4);
}
-
+
if (!starts_with(href, "http")) {
href = HTTP_PREFIX + href;
}
@@ -225,18 +225,18 @@ WB_wombat_init = (function() {
return href;
}
-
+
//============================================
// Define custom property
function def_prop(obj, prop, value, set_func, get_func) {
var key = "_" + prop;
obj[key] = value;
-
+
try {
Object.defineProperty(obj, prop, {
configurable: false,
enumerable: true,
- set: function(newval) {
+ set: function(newval) {
var result = set_func.call(obj, newval);
if (result != undefined) {
obj[key] = result;
@@ -256,12 +256,12 @@ WB_wombat_init = (function() {
obj[prop] = value;
return false;
}
- }
-
+ }
+
//============================================
//Define WombatLocation
-
- function WombatLocation(loc) {
+
+ function WombatLocation(loc) {
this._orig_loc = loc;
this._orig_href = loc.href;
@@ -273,53 +273,53 @@ WB_wombat_init = (function() {
return this._orig_loc.assign(rewrite_url(url));
}
this.reload = loc.reload;
-
+
// Adapted from:
// https://gist.github.com/jlong/2428561
var parser = document.createElement('a');
var href = extract_orig(this._orig_href);
parser.href = href;
-
+
this._autooverride = false;
-
+
var _set_hash = function(hash) {
this._orig_loc.hash = hash;
return this._orig_loc.hash;
}
-
+
var _get_hash = function() {
return this._orig_loc.hash;
}
-
+
var _get_url_with_hash = function(url) {
return url + this._orig_loc.hash;
}
-
+
href = parser.href;
var hash = parser.hash;
-
+
if (hash) {
var hidx = href.lastIndexOf("#");
if (hidx > 0) {
href = href.substring(0, hidx);
}
}
-
+
if (Object.defineProperty) {
var res1 = def_prop(this, "href", href,
this.assign,
_get_url_with_hash);
-
+
var res2 = def_prop(this, "hash", parser.hash,
_set_hash,
_get_hash);
-
+
this._autooverride = res1 && res2;
} else {
this.href = href;
this.hash = parser.hash;
}
-
+
this.host = parser.host;
this.hostname = parser.hostname;
@@ -335,17 +335,17 @@ WB_wombat_init = (function() {
this.toString = function() {
return this.href;
}
-
+
// Copy any remaining properties
for (prop in loc) {
if (this.hasOwnProperty(prop)) {
continue;
}
-
+
if ((typeof loc[prop]) != "function") {
this[prop] = loc[prop];
}
- }
+ }
}
//============================================
@@ -460,7 +460,7 @@ WB_wombat_init = (function() {
//============================================
function init_ajax_rewrite() {
- if (!window.XMLHttpRequest ||
+ if (!window.XMLHttpRequest ||
!window.XMLHttpRequest.prototype ||
!window.XMLHttpRequest.prototype.open) {
return;
@@ -469,7 +469,9 @@ WB_wombat_init = (function() {
var orig = window.XMLHttpRequest.prototype.open;
function open_rewritten(method, url, async, user, password) {
- url = rewrite_url(url);
+ if (!this._no_rewrite) {
+ url = rewrite_url(url);
+ }
// defaults to true
if (async != false) {
@@ -534,7 +536,7 @@ WB_wombat_init = (function() {
rewrite_attr(elem, "src", rewrite_url);
rewrite_attr(elem, "href", rewrite_url);
rewrite_attr(elem, "style", rewrite_style);
-
+
if (elem && elem.getAttribute && elem.getAttribute("crossorigin")) {
elem.removeAttribute("crossorigin");
}
@@ -545,7 +547,7 @@ WB_wombat_init = (function() {
if (!Node || !Node.prototype) {
return;
}
-
+
function override_attr(obj, attr) {
var setter = function(orig) {
var val = rewrite_url(orig);
@@ -553,15 +555,15 @@ WB_wombat_init = (function() {
this.setAttribute(attr, val);
return val;
}
-
+
var getter = function(val) {
var res = this.getAttribute(attr);
return res;
}
-
+
var curr_src = obj.getAttribute(attr);
-
- def_prop(obj, attr, curr_src, setter, getter);
+
+ def_prop(obj, attr, curr_src, setter, getter);
}
function replace_dom_func(funcname) {
@@ -569,7 +571,7 @@ WB_wombat_init = (function() {
Node.prototype[funcname] = function() {
var child = arguments[0];
-
+
rewrite_elem(child);
var desc;
@@ -587,19 +589,19 @@ WB_wombat_init = (function() {
}
var created = orig.apply(this, arguments);
-
- if (created.tagName == "IFRAME") {
+
+ if (created.tagName == "IFRAME") {
if (created.contentWindow) {
created.contentWindow.window.WB_wombat_location = created.contentWindow.window.location;
}
-
+
override_attr(created, "src");
}
-
+
// } else if (created.tagName == "A") {
// override_attr(created, "href");
// }
-
+
return created;
}
}
@@ -608,29 +610,29 @@ WB_wombat_init = (function() {
replace_dom_func("insertBefore");
replace_dom_func("replaceChild");
}
-
+
var postmessage_rewritten;
-
+
//============================================
function init_postmessage_override()
- {
+ {
if (!Window.prototype.postMessage) {
return;
}
-
+
var orig = Window.prototype.postMessage;
-
+
postmessage_rewritten = function(message, targetOrigin, transfer) {
if (targetOrigin && targetOrigin != "*") {
targetOrigin = window.location.origin;
}
-
+
return orig.call(this, message, targetOrigin, transfer);
}
-
+
window.postMessage = postmessage_rewritten;
window.Window.prototype.postMessage = postmessage_rewritten;
-
+
for (var i = 0; i < window.frames.length; i++) {
try {
window.frames[i].postMessage = postmessage_rewritten;
@@ -639,24 +641,24 @@ WB_wombat_init = (function() {
}
}
}
-
+
//============================================
function init_open_override()
- {
+ {
if (!Window.prototype.open) {
return;
}
-
+
var orig = Window.prototype.open;
-
+
var open_rewritten = function(strUrl, strWindowName, strWindowFeatures) {
strUrl = rewrite_url(strUrl);
return orig.call(this, strUrl, strWindowName, strWindowFeatures);
}
-
+
window.open = open_rewritten;
window.Window.prototype.open = open_rewritten;
-
+
for (var i = 0; i < window.frames.length; i++) {
try {
window.frames[i].open = open_rewritten;
@@ -665,41 +667,41 @@ WB_wombat_init = (function() {
}
}
}
-
+
function init_cookies_override()
{
var cookie_path_regex = /\bPath=\'?\"?([^;'"\s]+)/i;
-
+
var get_cookie = function() {
return document.cookie;
}
-
+
var set_cookie = function(value) {
var matched = value.match(cookie_path_regex);
-
+
// if has cookie path, rewrite and replace
if (matched) {
var rewritten = rewrite_url(matched[1]);
value = value.replace(matched[1], rewritten);
}
-
+
document.cookie = value;
}
-
+
def_prop(document, "WB_wombat_cookie", document.cookie,
set_cookie,
get_cookie);
}
-
+
//============================================
function init_write_override()
{
document.write = function(string) {
var doc = new DOMParser().parseFromString(string, "text/html");
-
+
if (doc) {
var children = doc.body.children;
-
+
for (var i = 0; i < children.length; i++) {
document.body.appendChild(children[i]);
}
@@ -710,52 +712,52 @@ WB_wombat_init = (function() {
//============================================
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp, mod) {
wb_replay_prefix = replay_prefix;
-
+
if (wb_replay_prefix) {
wb_replay_date_prefix = replay_prefix + capture_date + mod + "/";
-
+
if (capture_date.length > 0) {
wb_capture_date_part = "/" + capture_date + "/";
} else {
wb_capture_date_part = "";
}
-
+
wb_orig_scheme = orig_scheme + '://';
-
+
wb_orig_host = wb_orig_scheme + orig_host;
-
+
init_bad_prefixes(replay_prefix);
}
// Location
var wombat_location = new WombatLocation(window.self.location);
-
+
if (wombat_location._autooverride) {
-
+
var setter = function(val) {
- if (typeof(val) == "string") {
+ if (typeof(val) == "string") {
if (starts_with(val, "about:")) {
return undefined;
}
this._WB_wombat_location.href = val;
}
}
-
+
def_prop(window, "WB_wombat_location", wombat_location, setter);
def_prop(document, "WB_wombat_location", wombat_location, setter);
} else {
window.WB_wombat_location = wombat_location;
document.WB_wombat_location = wombat_location;
-
+
// Check quickly after page load
- setTimeout(check_all_locations, 500);
-
+ setTimeout(check_all_locations, 500);
+
// Check periodically every few seconds
setInterval(check_all_locations, 500);
}
-
+
var is_framed = (window.top.wbinfo && window.top.wbinfo.is_frame);
-
+
function find_next_top(win) {
while ((win.parent != win) && (win.parent != win.top)) {
win = win.parent;
@@ -766,9 +768,9 @@ WB_wombat_init = (function() {
if (window.self.location != window.top.location) {
if (is_framed) {
window.top.WB_wombat_location = window.WB_wombat_location;
-
+
window.WB_wombat_top = find_next_top(window.self);
-
+
} else {
window.top.WB_wombat_location = new WombatLocation(window.top.location);
window.WB_wombat_top = window.top;
@@ -788,20 +790,20 @@ WB_wombat_init = (function() {
// History
copy_history_func(window.history, 'pushState');
copy_history_func(window.history, 'replaceState');
-
+
// open
init_open_override();
// postMessage
init_postmessage_override();
-
+
// write
init_write_override();
-
+
// Ajax
init_ajax_rewrite();
init_worker_override();
-
+
// Cookies
init_cookies_override();
@@ -810,6 +812,9 @@ WB_wombat_init = (function() {
// Random
init_seeded_random(timestamp);
+
+ // expose functions
+ this.extract_orig = extract_orig;
}
return wombat_init;
diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html
index 4e53a5d0..36563167 100644
--- a/pywb/ui/head_insert.html
+++ b/pywb/ui/head_insert.html
@@ -3,7 +3,7 @@
+
{% include banner_html ignore missing %}
diff --git a/pywb/webapp/live_rewrite_handler.py b/pywb/webapp/live_rewrite_handler.py
index abaf80c2..e38d10e9 100644
--- a/pywb/webapp/live_rewrite_handler.py
+++ b/pywb/webapp/live_rewrite_handler.py
@@ -4,12 +4,17 @@ from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
+from pywb.rewrite.url_rewriter import HttpsUrlRewriter
from handlers import StaticHandler, SearchPageWbUrlHandler
from views import HeadInsertView
from pywb.utils.wbexception import WbException
+import json
+import requests
+from youtube_dl import YoutubeDL
+
#=================================================================
class LiveResourceException(WbException):
@@ -25,14 +30,16 @@ class RewriteHandler(SearchPageWbUrlHandler):
def __init__(self, config):
super(RewriteHandler, self).__init__(config)
- default_proxy = config.get('proxyhostport')
+ self.default_proxy = config.get('proxyhostport')
self.rewriter = LiveRewriter(is_framed_replay=self.is_frame_mode,
- default_proxy=default_proxy)
+ default_proxy=self.default_proxy)
self.head_insert_view = HeadInsertView.init_from_config(config)
self.live_cookie = config.get('live-cookie', self.LIVE_COOKIE)
+ self.ydl = None
+
def handle_request(self, wbrequest):
try:
return self.render_content(wbrequest)
@@ -50,6 +57,9 @@ class RewriteHandler(SearchPageWbUrlHandler):
return {}
def render_content(self, wbrequest):
+ if wbrequest.wb_url.mod == 'vi_':
+ return self.get_video_info(wbrequest)
+
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
req_headers = self._live_request_headers(wbrequest)
@@ -76,6 +86,34 @@ class RewriteHandler(SearchPageWbUrlHandler):
return WbResponse(status_headers, gen)
+
+    def get_video_info(self, wbrequest):
+        """Extract video metadata for the requested url with youtube-dl,
+        PUTMETA it through the configured proxy (if any) and return it
+        as the response body.
+        """
+        if not self.ydl:
+            self.ydl = YoutubeDL(dict(simulate=True,
+                                      youtube_include_dash_manifest=False))
+            self.ydl.add_default_info_extractors()
+
+        info = self.ydl.extract_info(wbrequest.wb_url.url)
+        content_type = 'application/vnd.youtube-dl_formats+json'
+        metadata = json.dumps(info)
+
+        # store the metadata via PUTMETA only when a proxy is configured;
+        # without one there is no proxies mapping to send the request through
+        if self.default_proxy:
+            url = HttpsUrlRewriter.remove_https(wbrequest.wb_url.url)
+            requests.request(method='PUTMETA',
+                             url=url,
+                             data=metadata,
+                             headers={'Content-Type': content_type},
+                             proxies={'http': self.default_proxy},
+                             verify=False)
+
+        return WbResponse.text_response(metadata, content_type=content_type)
+
def __str__(self):
return 'Live Web Rewrite Handler'
diff --git a/pywb/webapp/query_handler.py b/pywb/webapp/query_handler.py
index 6a404c17..052243d5 100644
--- a/pywb/webapp/query_handler.py
+++ b/pywb/webapp/query_handler.py
@@ -68,6 +68,14 @@ class QueryHandler(object):
params['url'] = wb_url.url
params['output'] = output
+ params['filter'].append('!mimetype:-')
+
+ # get metadata
+ if wb_url.mod == 'vi_':
+ # matching metadata explicitly with special scheme
+ params['url'] = wb_url.url.replace('http:/', 'metadata:/')
+ params['filter'].append('~original:metadata://')
+
cdx_iter = self.load_cdx(wbrequest, params)
return cdx_iter, output
@@ -132,6 +140,7 @@ class QueryHandler(object):
'limit': limit,
'fl': ('urlkey,original,timestamp,' +
'endtimestamp,groupcount,uniqcount'),
+ 'filter':[],
},
wburl.REPLAY:
@@ -147,6 +156,7 @@ class QueryHandler(object):
# Not appropriate as default
# Should be an option to configure status code filtering in general
# 'filter': ['statuscode:[23]..|-'],
+ 'filter': [],
'limit': '1',
'resolveRevisits': True,
}
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 3375329f..208ceb9c 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -98,7 +98,7 @@ class TestWb:
assert '"20140127171238"' in resp.body
assert 'wb.js' in resp.body
- assert 'WB_wombat_init' in resp.body
+ assert 'new _WBWombat' in resp.body, resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
def test_replay_non_frame_content(self):
@@ -149,7 +149,7 @@ class TestWb:
assert 'wb.js' in resp.body
# no wombat present
- assert 'WB_wombat_init' not in resp.body
+ assert '_WBWombat' not in resp.body
# url not rewritten
#assert '"http://www.iana.org/domains/example"' in resp.body