1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

video work: live rewrite pings proxy with full rewrite, proxies direct

range request
reorg rangecache to support is_range() check, yt-specific logic
(experimental)
wombat: add date override (experimental)
bump tentative version to 0.7.0!
yt replays work with native player! (though still issues remain)
This commit is contained in:
Ilya Kreymer 2014-11-04 22:11:25 -08:00
parent fea48fd27a
commit 88f553dce7
10 changed files with 140 additions and 64 deletions

View File

@ -1,3 +1,9 @@
pywb 0.7.0 changelist
~~~~~~~~~~~~~~~~~~~~~
Video Buffering Replay
pywb 0.6.4 changelist
~~~~~~~~~~~~~~~~~~~~~

View File

@ -1,4 +1,4 @@
PyWb 0.6.4
PyWb 0.7.0
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop

View File

@ -88,7 +88,9 @@ class LiveRewriter(object):
method = 'GET'
data = None
if not proxies and self.default_proxy:
if proxies == False:
proxies = None
elif not proxies and self.default_proxy:
proxies = {'http': self.default_proxy,
'https': self.default_proxy}

View File

@ -126,10 +126,20 @@ rules:
- video_id
- html5
- url_prefix: 'com,youtube,s)/api/stats/qoe'
fuzzy_lookup:
- docid
- url_prefix: 'com,youtube,s)/api/stats/watch'
fuzzy_lookup:
- docid
- url_prefix: 'com,googlevideo,'
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
#fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&])'
# testing rules -- not for valid domain

View File

@ -44,11 +44,19 @@ __wbvidrw = (function() {
if (wbinfo.url.indexOf("://www.youtube.com/watch") > 0) {
var ytvideo = document.getElementsByTagName("video");
/*
if (ytvideo.length == 1) {
if (ytvideo[0].getAttribute("data-youtube-id") != "") {
check_replacement(ytvideo[0], wbinfo.url);
// Wait to see if video is playing, if so, don't replace it
window.setTimeout(function() {
if (ytvideo[0].readyState == 0) {
console.log("Replacing Broken Video");
check_replacement(ytvideo[0], wbinfo.url);
}
}, 3000);
}
}
*/
}
}

View File

@ -517,7 +517,21 @@ window._WBWombat = (function() {
override_attr(image, "src");
return image;
}
}(Image);
}(window.Image);
}
//============================================
function init_date_override(timestamp) {
    // Replace window.Date with a wrapper whose zero-argument form always
    // yields the same fixed instant instead of the current wall-clock time.
    //
    // timestamp: number or numeric string; it is multiplied by 1000 before
    //            being handed to Date, i.e. it is treated as seconds.
    //
    // All other call forms are forwarded to the native Date unchanged.
    var OrigDate = window.Date;

    // Convert exactly once. Doing the conversion inside the constructor and
    // storing it back into `timestamp` would rescale the value by another
    // factor of 1000 on every subsequent `new Date()` call.
    var fixed_ms = parseInt(timestamp, 10) * 1000;

    function WbDate(A, B, C, D, E, F, G) {
        // Forward only the arguments that were actually supplied: the native
        // constructor selects its behavior by argument count, so padding a
        // one-argument call (epoch millis or a date string) with undefined
        // values would produce an Invalid Date.
        switch (arguments.length) {
        case 0:
            return new OrigDate(fixed_ms);
        case 1:
            return new OrigDate(A);
        case 2:
            return new OrigDate(A, B);
        case 3:
            return new OrigDate(A, B, C);
        case 4:
            return new OrigDate(A, B, C, D);
        case 5:
            return new OrigDate(A, B, C, D, E);
        case 6:
            return new OrigDate(A, B, C, D, E, F);
        default:
            return new OrigDate(A, B, C, D, E, F, G);
        }
    }

    // Share the native prototype so `x instanceof Date` and prototype
    // extensions keep working against the wrapper.
    WbDate.prototype = OrigDate.prototype;

    // Keep the static helpers available on the replacement.
    WbDate.UTC = OrigDate.UTC;
    WbDate.parse = OrigDate.parse;

    // Date.now() must agree with `new Date()`.
    WbDate.now = function () {
        return fixed_ms;
    };

    window.Date = WbDate;
}
//============================================
@ -859,6 +873,9 @@ window._WBWombat = (function() {
// Random
init_seeded_random(timestamp);
// Date
init_date_override(timestamp);
// expose functions
this.extract_orig = extract_orig;
}

View File

@ -13,6 +13,7 @@ from pywb.utils.wbexception import WbException
import json
import requests
import hashlib
from rangecache import range_cache
@ -70,25 +71,36 @@ class RewriteHandler(SearchPageWbUrlHandler):
if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
def do_req():
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
wbrequest.urlrewriter,
head_insert_func=head_insert_func,
req_headers=req_headers,
env=wbrequest.env)
proxies = None # default
ping_url = None
ping_cache_key = None
return self._make_response(wbrequest, *result)
if self.default_proxy and range_cache:
rangeres = range_cache.is_ranged(wbrequest)
if rangeres:
proxies = False
cdx = dict(url=wbrequest.wb_url.url)
hash_ = hashlib.md5()
hash_.update(rangeres[0])
ping_cache_key = hash_.hexdigest()
range_status, range_iter = range_cache(wbrequest, cdx, do_req)
if ping_cache_key not in range_cache.cache:
ping_url = rangeres[0]
if not range_status or not range_iter:
return do_req()
else:
result = range_status, range_iter, False
return self._make_response(wbrequest, *result)
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
wbrequest.urlrewriter,
head_insert_func=head_insert_func,
req_headers=req_headers,
env=wbrequest.env,
proxies=proxies)
wbresponse = self._make_response(wbrequest, *result)
if ping_url:
self._proxy_ping(wbrequest, wbresponse,
ping_url, ping_cache_key)
return wbresponse
def _make_response(self, wbrequest, status_headers, gen, is_rewritten):
# if cookie set, pass recorded timestamp info via cookie
@ -102,6 +114,37 @@ class RewriteHandler(SearchPageWbUrlHandler):
return WbResponse(status_headers, gen)
def _proxy_ping(self, wbrequest, wbresponse, url, key):
    """Arrange for ``url`` to be requested through the default proxy once
    the body of ``wbresponse`` has been fully iterated.

    :param wbrequest: current request; passed to
        ``self._live_request_headers`` to build the ping headers
    :param wbresponse: response whose ``body`` iterator is wrapped
    :param url: url to request through ``self.default_proxy``
    :param key: ``range_cache.cache`` key set to ``'1'`` after a
        successful ping
    :return: the same ``wbresponse``, with its ``body`` replaced by a
        generator that yields the original body and then pings
    """
    def do_proxy_ping():
        proxies = {'http': self.default_proxy,
                   'https': self.default_proxy}

        headers = self._live_request_headers(wbrequest)

        print('PINGING PROXY: ' + url)

        try:
            resp = requests.get(url=url,
                                headers=headers,
                                proxies=proxies,
                                verify=False,
                                stream=True)
        except requests.RequestException as exc:
            # The ping is best-effort and runs after the client has
            # already received the complete body, so a proxy failure
            # must not surface as an error from the body generator.
            # The key is left unset so a later request can retry.
            print('PROXY PING FAILED: ' + str(exc))
            return

        # don't actually read the whole response here; presumably the
        # proxy itself consumes and writes it -- TODO confirm
        try:
            resp.raw.close()
        finally:
            resp.close()

        # mark as pinged
        range_cache.cache[key] = '1'

    def check_buff_gen(gen):
        for x in gen:
            yield x

        # only reached once the original body has been exhausted
        do_proxy_ping()

    wbresponse.body = check_buff_gen(wbresponse.body)
    return wbresponse
def get_video_info(self, wbrequest):
if not self.youtubedl:
self.youtubedl = YoutubeDLWrapper()

View File

@ -4,7 +4,6 @@ from pywb.framework.cache import create_cache
from tempfile import NamedTemporaryFile
import hashlib
import yaml
import os
import re
@ -28,16 +27,14 @@ class RangeCache(object):
new_url = RangeCache.YT_EXTRACT_RX.sub(repl_range, url)
if range_h_res:
print('MATCHED')
return range_h_res[0], new_url
else:
return None, url
def __init__(self):
self.cache = create_cache()
print(type(self.cache))
def __call__(self, wbrequest, cdx, wbresponse_func):
def is_ranged(self, wbrequest):
url = wbrequest.wb_url.url
range_h = None
use_206 = False
@ -45,37 +42,34 @@ class RangeCache(object):
result = self.match_yt(url)
if result:
range_h, url = result
wbrequest.wb_url.url = url
print(range_h)
# check for standard range header
if not range_h:
range_h = wbrequest.env.get('HTTP_RANGE')
if not range_h:
return None, None
range_h = True
return None
return self.handle_range(wbrequest, cdx, url,
wbresponse_func,
range_h, use_206)
use_206 = True
def handle_range(self, wbrequest, cdx, url, wbresponse_func,
range_h, use_206):
return url, range_h, use_206
def __call__(self, wbrequest, digest, wbresponse_func):
    """Serve ``wbrequest`` as a range response when it is a range request.

    Returns whatever ``handle_range`` returns when ``is_ranged`` reports a
    range request, otherwise the pair ``(None, None)``.
    """
    ranged = self.is_ranged(wbrequest)

    if ranged:
        # ``ranged`` is forwarded positionally after the fixed arguments
        return self.handle_range(wbrequest,
                                 digest,
                                 wbresponse_func,
                                 *ranged)

    return None, None
def handle_range(self, wbrequest, digest, wbresponse_func,
url, range_h, use_206):
range_h = range_h.split('=')[-1]
key = cdx.get('digest')
if not key:
hash_ = hashlib.md5()
hash_.update(url)
#hash_.update(cdx['timestamp'])
key = hash_.hexdigest()
print('KEY: ', key)
print('RANGE: ', range_h)
key = digest
if not key in self.cache:
print('MISS')
response = wbresponse_func()
if not response:
return None, None
with NamedTemporaryFile(delete=False) as fh:
for obj in response.body:
@ -86,21 +80,19 @@ class RangeCache(object):
spec = dict(name=fh.name,
headers=response.status_headers.headers)
print('SET CACHE: ' + key)
self.cache[key] = yaml.dump(spec)
else:
print('HIT')
spec = yaml.load(self.cache[key])
if not spec:
return None, None
spec['headers'] = [tuple(x) for x in spec['headers']]
print(spec['headers'])
print('TEMP FILE: ' + spec['name'])
filelen = os.path.getsize(spec['name'])
range_h = range_h.rstrip()
if range_h == '0-':
print('FIX RANGE')
range_h = '0-120000'
parts = range_h.rstrip().split('-')
@ -119,7 +111,6 @@ class RangeCache(object):
fh = LimitReader.wrap_stream(fh, maxlen)
while True:
buf = fh.read()
print('READ: ', len(buf))
if not buf:
break
@ -129,17 +120,16 @@ class RangeCache(object):
content_range = 'bytes {0}-{1}/{2}'.format(start,
start + maxlen - 1,
filelen)
print('CONTENT_RANGE: ', content_range)
status_headers = StatusAndHeaders('206 Partial Content', spec['headers'])
status_headers.replace_header('Content-Range', content_range)
else:
status_headers = StatusAndHeaders('200 OK', spec['headers'])
status_headers.headers.append(('Accept-Ranges', 'bytes'))
status_headers.headers.append(('Access-Control-Allow-Credentials', 'true'))
status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080'))
status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080'))
#status_headers.headers.append(('Accept-Ranges', 'bytes'))
#status_headers.headers.append(('Access-Control-Allow-Credentials', 'true'))
#status_headers.headers.append(('Access-Control-Allow-Origin', 'http://localhost:8080'))
#status_headers.headers.append(('Timing-Allow-Origin', 'http://localhost:8080'))
status_headers.replace_header('Content-Length', str(maxlen))
return status_headers, read_range()

View File

@ -107,7 +107,7 @@ class ReplayView(object):
return self.replay_capture(wbrequest, cdx, cdx_loader, failed_files)
range_status, range_iter = range_cache(wbrequest,
cdx,
cdx.get('digest'),
get_capture)
if range_status and range_iter:
response = self.response_class(range_status,

View File

@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.6.4',
version='0.7.0',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',