mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
video work: live rewrite pings proxy with full rewrite, proxies direct
range request; reorg rangecache to support is_ranged() check, yt-specific logic (experimental); wombat: add date override (experimental); bump tentative version to 0.7.0! yt replays work with native player! (though some issues still remain)
This commit is contained in:
parent
fea48fd27a
commit
88f553dce7
@ -1,3 +1,9 @@
|
||||
pywb 0.7.0 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Video Buffering Replay
|
||||
|
||||
|
||||
pywb 0.6.4 changelist
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
PyWb 0.6.4
|
||||
PyWb 0.7.0
|
||||
==========
|
||||
|
||||
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
|
||||
|
@ -88,7 +88,9 @@ class LiveRewriter(object):
|
||||
method = 'GET'
|
||||
data = None
|
||||
|
||||
if not proxies and self.default_proxy:
|
||||
if proxies == False:
|
||||
proxies = None
|
||||
elif not proxies and self.default_proxy:
|
||||
proxies = {'http': self.default_proxy,
|
||||
'https': self.default_proxy}
|
||||
|
||||
|
@ -126,10 +126,20 @@ rules:
|
||||
- video_id
|
||||
- html5
|
||||
|
||||
- url_prefix: 'com,youtube,s)/api/stats/qoe'
|
||||
|
||||
fuzzy_lookup:
|
||||
- docid
|
||||
|
||||
- url_prefix: 'com,youtube,s)/api/stats/watch'
|
||||
|
||||
fuzzy_lookup:
|
||||
- docid
|
||||
|
||||
- url_prefix: 'com,googlevideo,'
|
||||
|
||||
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
|
||||
#fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
|
||||
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&])'
|
||||
|
||||
|
||||
# testing rules -- not for valid domain
|
||||
|
@ -44,11 +44,19 @@ __wbvidrw = (function() {
|
||||
|
||||
if (wbinfo.url.indexOf("://www.youtube.com/watch") > 0) {
|
||||
var ytvideo = document.getElementsByTagName("video");
|
||||
/*
|
||||
if (ytvideo.length == 1) {
|
||||
if (ytvideo[0].getAttribute("data-youtube-id") != "") {
|
||||
check_replacement(ytvideo[0], wbinfo.url);
|
||||
// Wait to see if video is playing, if so, don't replace it
|
||||
window.setTimeout(function() {
|
||||
if (ytvideo[0].readyState == 0) {
|
||||
console.log("Replacing Broken Video");
|
||||
check_replacement(ytvideo[0], wbinfo.url);
|
||||
}
|
||||
}, 3000);
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -517,7 +517,21 @@ window._WBWombat = (function() {
|
||||
override_attr(image, "src");
|
||||
return image;
|
||||
}
|
||||
}(Image);
|
||||
}(window.Image);
|
||||
}
|
||||
|
||||
//============================================
|
||||
function init_date_override(timestamp) {
    // Replace window.Date so that "now" is pinned to the replay timestamp.
    //
    // timestamp: seconds since the epoch (number or numeric string).
    //
    // The conversion to milliseconds is done exactly once here. Doing it
    // inside the constructor would multiply the value by 1000 again on every
    // zero-argument call, so only the first `new Date()` would be correct.
    var fixed_ms = parseInt(timestamp, 10) * 1000;

    // Without a usable timestamp leave the native Date untouched rather than
    // making every `new Date()` an Invalid Date.
    if (isNaN(fixed_ms)) {
        return;
    }

    window.Date = function (OrigDate) {
        function FixedDate(A, B, C, D, E, F, G) {
            // Called as a plain function: native Date() returns a string.
            if (!(this instanceof FixedDate)) {
                return new OrigDate(fixed_ms).toString();
            }

            if (arguments.length == 0) {
                return new OrigDate(fixed_ms);
            }

            // Forward exactly the arguments that were supplied. Padding with
            // undefined would select the multi-argument (year, month, ...)
            // form and turn e.g. new Date("2014-01-01") into Invalid Date.
            var args = [null].concat(Array.prototype.slice.call(arguments));
            return new (Function.prototype.bind.apply(OrigDate, args))();
        }

        // Share the native prototype so that the Date objects returned above
        // still satisfy `instanceof Date` after the override.
        FixedDate.prototype = OrigDate.prototype;

        // Keep the static API available; now() is pinned like `new Date()`.
        FixedDate.now = function () {
            return fixed_ms;
        };
        FixedDate.parse = OrigDate.parse;
        FixedDate.UTC = OrigDate.UTC;

        return FixedDate;
    }(window.Date);
}
|
||||
|
||||
//============================================
|
||||
@ -859,6 +873,9 @@ window._WBWombat = (function() {
|
||||
// Random
|
||||
init_seeded_random(timestamp);
|
||||
|
||||
// Date
|
||||
init_date_override(timestamp);
|
||||
|
||||
// expose functions
|
||||
this.extract_orig = extract_orig;
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ from pywb.utils.wbexception import WbException
|
||||
|
||||
import json
|
||||
import requests
|
||||
import hashlib
|
||||
|
||||
from rangecache import range_cache
|
||||
|
||||
@ -70,25 +71,36 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
if ref_wburl_str:
|
||||
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
|
||||
|
||||
def do_req():
|
||||
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
|
||||
wbrequest.urlrewriter,
|
||||
head_insert_func=head_insert_func,
|
||||
req_headers=req_headers,
|
||||
env=wbrequest.env)
|
||||
proxies = None # default
|
||||
ping_url = None
|
||||
ping_cache_key = None
|
||||
|
||||
return self._make_response(wbrequest, *result)
|
||||
if self.default_proxy and range_cache:
|
||||
rangeres = range_cache.is_ranged(wbrequest)
|
||||
if rangeres:
|
||||
proxies = False
|
||||
|
||||
cdx = dict(url=wbrequest.wb_url.url)
|
||||
hash_ = hashlib.md5()
|
||||
hash_.update(rangeres[0])
|
||||
ping_cache_key = hash_.hexdigest()
|
||||
|
||||
range_status, range_iter = range_cache(wbrequest, cdx, do_req)
|
||||
if ping_cache_key not in range_cache.cache:
|
||||
ping_url = rangeres[0]
|
||||
|
||||
if not range_status or not range_iter:
|
||||
return do_req()
|
||||
else:
|
||||
result = range_status, range_iter, False
|
||||
return self._make_response(wbrequest, *result)
|
||||
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
|
||||
wbrequest.urlrewriter,
|
||||
head_insert_func=head_insert_func,
|
||||
req_headers=req_headers,
|
||||
env=wbrequest.env,
|
||||
proxies=proxies)
|
||||
|
||||
wbresponse = self._make_response(wbrequest, *result)
|
||||
|
||||
if ping_url:
|
||||
self._proxy_ping(wbrequest, wbresponse,
|
||||
ping_url, ping_cache_key)
|
||||
|
||||
return wbresponse
|
||||
|
||||
def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
|
||||
# if cookie set, pass recorded timestamp info via cookie
|
||||
@ -102,6 +114,37 @@ class RewriteHandler(SearchPageWbUrlHandler):
|
||||
|
||||
return WbResponse(status_headers, gen)
|
||||
|
||||
def _proxy_ping(self, wbrequest, wbresponse, url, key):
    """Arrange for `url` to be requested through the default proxy once
    the response body has been fully iterated.

    The body of `wbresponse` is wrapped in a generator that yields every
    chunk unchanged and, only after the last chunk, issues a streaming GET
    for `url` via ``self.default_proxy``. The proxied response is closed
    without being read. On success ``range_cache.cache[key]`` is set so
    that a membership check on `key` can tell the ping already happened.

    :param wbrequest: current request; passed to
        ``self._live_request_headers`` to build the ping headers
    :param wbresponse: response whose ``body`` iterator is wrapped
    :param url: url to request through the proxy
    :param key: cache key recorded after a successful ping
    :return: the same `wbresponse`, with its ``body`` replaced
    """
    import logging

    def do_proxy_ping():
        proxies = {'http': self.default_proxy,
                   'https': self.default_proxy}

        headers = self._live_request_headers(wbrequest)

        logging.debug('PINGING PROXY: ' + url)

        # The ping runs after the client response was already sent, so a
        # network failure here must not propagate into the body iteration.
        # NOTE(review): verify=False disables TLS certificate checks --
        # presumably because the proxy intercepts https; confirm.
        try:
            resp = requests.get(url=url,
                                headers=headers,
                                proxies=proxies,
                                verify=False,
                                stream=True)
        except requests.RequestException:
            logging.warning('Proxy ping failed: ' + url, exc_info=True)
            # not marked as pinged, so a later request can try again
            return None

        # don't actually read whole response, proxy response for writing it
        try:
            resp.raw.close()
        finally:
            resp.close()

        # mark as pinged
        range_cache.cache[key] = '1'

        return None

    def check_buff_gen(gen):
        for x in gen:
            yield x

        # reached only when the wrapped body was consumed to the end
        do_proxy_ping()

    wbresponse.body = check_buff_gen(wbresponse.body)
    return wbresponse
|
||||
|
||||
def get_video_info(self, wbrequest):
|
||||
if not self.youtubedl:
|
||||
self.youtubedl = YoutubeDLWrapper()
|
||||
|
@ -4,7 +4,6 @@ from pywb.framework.cache import create_cache
|
||||
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
import hashlib
|
||||
import yaml
|
||||
import os
|
||||
import re
|
||||
@ -28,16 +27,14 @@ class RangeCache(object):
|
||||
|
||||
new_url = RangeCache.YT_EXTRACT_RX.sub(repl_range, url)
|
||||
if range_h_res:
|
||||
print('MATCHED')
|
||||
return range_h_res[0], new_url
|
||||
else:
|
||||
return None, url
|
||||
|
||||
def __init__(self):
|
||||
self.cache = create_cache()
|
||||
print(type(self.cache))
|
||||
|
||||
def __call__(self, wbrequest, cdx, wbresponse_func):
|
||||
def is_ranged(self, wbrequest):
|
||||
url = wbrequest.wb_url.url
|
||||
range_h = None
|
||||
use_206 = False
|
||||
@ -45,37 +42,34 @@ class RangeCache(object):
|
||||
result = self.match_yt(url)
|
||||
if result:
|
||||
range_h, url = result
|
||||
wbrequest.wb_url.url = url
|
||||
print(range_h)
|
||||
|
||||
# check for standard range header
|
||||
if not range_h:
|
||||
range_h = wbrequest.env.get('HTTP_RANGE')
|
||||
if not range_h:
|
||||
return None, None
|
||||
range_h = True
|
||||
return None
|
||||
|
||||
return self.handle_range(wbrequest, cdx, url,
|
||||
wbresponse_func,
|
||||
range_h, use_206)
|
||||
use_206 = True
|
||||
|
||||
def handle_range(self, wbrequest, cdx, url, wbresponse_func,
|
||||
range_h, use_206):
|
||||
return url, range_h, use_206
|
||||
|
||||
def __call__(self, wbrequest, digest, wbresponse_func):
|
||||
result = self.is_ranged(wbrequest)
|
||||
if not result:
|
||||
return None, None
|
||||
|
||||
return self.handle_range(wbrequest, digest, wbresponse_func,
|
||||
*result)
|
||||
|
||||
def handle_range(self, wbrequest, digest, wbresponse_func,
|
||||
url, range_h, use_206):
|
||||
|
||||
range_h = range_h.split('=')[-1]
|
||||
key = cdx.get('digest')
|
||||
if not key:
|
||||
hash_ = hashlib.md5()
|
||||
hash_.update(url)
|
||||
#hash_.update(cdx['timestamp'])
|
||||
key = hash_.hexdigest()
|
||||
|
||||
print('KEY: ', key)
|
||||
print('RANGE: ', range_h)
|
||||
|
||||
key = digest
|
||||
if not key in self.cache:
|
||||
print('MISS')
|
||||
response = wbresponse_func()
|
||||
if not response:
|
||||
return None, None
|
||||
|
||||
with NamedTemporaryFile(delete=False) as fh:
|
||||
for obj in response.body:
|
||||
@ -86,21 +80,19 @@ class RangeCache(object):
|
||||
spec = dict(name=fh.name,
|
||||
headers=response.status_headers.headers)
|
||||
|
||||
print('SET CACHE: ' + key)
|
||||
self.cache[key] = yaml.dump(spec)
|
||||
else:
|
||||
print('HIT')
|
||||
spec = yaml.load(self.cache[key])
|
||||
if not spec:
|
||||
return None, None
|
||||
|
||||
spec['headers'] = [tuple(x) for x in spec['headers']]
|
||||
|
||||
print(spec['headers'])
|
||||
print('TEMP FILE: ' + spec['name'])
|
||||
filelen = os.path.getsize(spec['name'])
|
||||
|
||||
range_h = range_h.rstrip()
|
||||
|
||||
if range_h == '0-':
|
||||
print('FIX RANGE')
|
||||
range_h = '0-120000'
|
||||
|
||||
parts = range_h.rstrip().split('-')
|
||||
@ -119,7 +111,6 @@ class RangeCache(object):
|
||||
fh = LimitReader.wrap_stream(fh, maxlen)
|
||||
while True:
|
||||
buf = fh.read()
|
||||
print('READ: ', len(buf))
|
||||
if not buf:
|
||||
break
|
||||
|
||||
@ -129,17 +120,16 @@ class RangeCache(object):
|
||||
content_range = 'bytes {0}-{1}/{2}'.format(start,
|
||||
start + maxlen - 1,
|
||||
filelen)
|
||||
print('CONTENT_RANGE: ', content_range)
|
||||
|
||||
status_headers = StatusAndHeaders('206 Partial Content', spec['headers'])
|
||||
status_headers.replace_header('Content-Range', content_range)
|
||||
else:
|
||||
status_headers = StatusAndHeaders('200 OK', spec['headers'])
|
||||
|
||||
status_headers.headers.append(('Accept-Ranges', 'bytes'))
|
||||
status_headers.headers.append(('Access-Control-Allow-Credentials', 'true'))
|
||||
status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080'))
|
||||
status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080'))
|
||||
#status_headers.headers.append(('Accept-Ranges', 'bytes'))
|
||||
#status_headers.headers.append(('Access-Control-Allow-Credentials', 'true'))
|
||||
#status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080'))
|
||||
#status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080'))
|
||||
|
||||
status_headers.replace_header('Content-Length', str(maxlen))
|
||||
return status_headers, read_range()
|
||||
|
@ -107,7 +107,7 @@ class ReplayView(object):
|
||||
return self.replay_capture(wbrequest, cdx, cdx_loader, failed_files)
|
||||
|
||||
range_status, range_iter = range_cache(wbrequest,
|
||||
cdx,
|
||||
cdx.get('digest'),
|
||||
get_capture)
|
||||
if range_status and range_iter:
|
||||
response = self.response_class(range_status,
|
||||
|
Loading…
x
Reference in New Issue
Block a user