1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

rangecache: always bound the range, set a default bound of 16384 bytes

wombat: work on date override, disable for now
head_insert: check that wombat has been initialized, to avoid an undefined-reference error
This commit is contained in:
Ilya Kreymer 2014-11-05 10:55:46 -08:00
parent 88f553dce7
commit c6a2c83b66
5 changed files with 80 additions and 37 deletions

View File

@ -138,8 +138,8 @@ rules:
- url_prefix: 'com,googlevideo,'
#fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&])'
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
#fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&])'
# testing rules -- not for valid domain

View File

@ -20,7 +20,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb
//============================================
// Wombat JS-Rewriting Library v2.1
//============================================
window._WBWombat = (function() {
_WBWombat = (function() {
// Globals
var wb_replay_prefix;
@ -511,6 +511,7 @@ window._WBWombat = (function() {
//============================================
function init_image_override() {
window.__Image = window.Image;
window.Image = function (Image) {
return function (width, height) {
var image = new Image(width, height);
@ -522,16 +523,40 @@ window._WBWombat = (function() {
//============================================
function init_date_override(timestamp) {
timestamp = parseInt(timestamp) * 1000;
var timediff = Date.now() - timestamp;
window.__Date = window.Date;
window.__Date_now = window.Date.now;
window.Date = function (Date) {
return function (A, B, C, D, E, F, G) {
if (arguments.length == 0) {
timestamp = parseInt(timestamp) * 1000;
return new Date(timestamp);
// Apply doesn't work for constructors and Date doesn't
// seem to like undefined args, so must explicitly
// call constructor for each possible args 0..7
if (A === undefined) {
return new Date(window.Date.now());
} else if (B === undefined) {
return new Date(A);
} else if (C === undefined) {
return new Date(A, B);
} else if (D === undefined) {
return new Date(A, B, C);
} else if (E === undefined) {
return new Date(A, B, C, D);
} else if (F === undefined) {
return new Date(A, B, C, D, E);
} else if (G === undefined) {
return new Date(A, B, C, D, E, F);
} else {
return new Date(A, B, C, D, E, F, G);
}
}
}(window.Date);
window.Date.now = function() {
return __Date_now() - timediff;
}
}
//============================================
@ -874,7 +899,7 @@ window._WBWombat = (function() {
init_seeded_random(timestamp);
// Date
init_date_override(timestamp);
// init_date_override(timestamp);
// expose functions
this.extract_orig = extract_orig;
@ -882,4 +907,6 @@ window._WBWombat = (function() {
return wombat_init;
})(this);
})();
window._WBWombat = _WBWombat;

View File

@ -3,12 +3,14 @@
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wombat.js'> </script>
<script>
{% set urlsplit = cdx.original | urlsplit %}
if (window && window._WBWombat) {
var _wb_wombat = new _WBWombat("{{ wbrequest.wb_prefix}}",
"{{ cdx['timestamp'] if include_ts else ''}}",
"{{ urlsplit.scheme }}",
"{{ urlsplit.netloc }}",
"{{ cdx.timestamp | format_ts('%s') }}",
"{{ wbrequest.wb_url.mod }}");
}
</script>
{% endif %}
<script>

View File

@ -74,18 +74,27 @@ class RewriteHandler(SearchPageWbUrlHandler):
proxies = None # default
ping_url = None
ping_cache_key = None
ping_range_header = None
if self.default_proxy and range_cache:
rangeres = range_cache.is_ranged(wbrequest)
if rangeres:
url, start, end, use_206 = rangeres
proxies = False
# force a bound on unbounded range
if use_206 and wbrequest.env['HTTP_RANGE'].endswith('-'):
range_h = 'bytes={0}-{1}'.format(start, end)
wbrequest.env['HTTP_RANGE'] = range_h
print('BOUNDING: ' + range_h)
hash_ = hashlib.md5()
hash_.update(rangeres[0])
hash_.update(url)
ping_cache_key = hash_.hexdigest()
if ping_cache_key not in range_cache.cache:
ping_url = rangeres[0]
ping_url = url
result = self.rewriter.fetch_request(wbrequest.wb_url.url,
wbrequest.urlrewriter,
@ -120,21 +129,25 @@ class RewriteHandler(SearchPageWbUrlHandler):
'https': self.default_proxy}
headers = self._live_request_headers(wbrequest)
print('PINGING PROXY: ' + url)
resp = requests.get(url=url,
headers=headers,
proxies=proxies,
verify=False,
stream=True)
headers['Connection'] = 'close'
# don't actually read whole response, proxy response for writing it
resp.raw.close()
resp.close()
if key in range_cache.cache:
return
# mark as pinged
range_cache.cache[key] = '1'
try:
# mark as pinged
range_cache.cache[key] = '1'
return None
resp = requests.get(url=url,
headers=headers,
proxies=proxies,
verify=False,
stream=True)
# don't actually read whole response, proxy response for writing it
resp.close()
except:
del range_cache.cache[key]
def check_buff_gen(gen):
for x in gen:

View File

@ -14,6 +14,8 @@ class RangeCache(object):
YOUTUBE_RX = re.compile('.*.googlevideo.com/videoplayback')
YT_EXTRACT_RX = re.compile('&range=([^&]+)')
DEFAULT_BUFF = 16384
@staticmethod
def match_yt(url):
if not RangeCache.YOUTUBE_RX.match(url):
@ -51,7 +53,17 @@ class RangeCache(object):
use_206 = True
return url, range_h, use_206
# force bounded range
range_h = range_h.split('=')[-1]
range_h = range_h.rstrip()
parts = range_h.split('-', 1)
start = int(parts[0])
if len(parts) == 2 and parts[1]:
end = int(parts[1])
else:
end = start + self.DEFAULT_BUFF - 1
return url, start, end, use_206
def __call__(self, wbrequest, digest, wbresponse_func):
result = self.is_ranged(wbrequest)
@ -62,9 +74,8 @@ class RangeCache(object):
*result)
def handle_range(self, wbrequest, digest, wbresponse_func,
url, range_h, use_206):
url, start, end, use_206):
range_h = range_h.split('=')[-1]
key = digest
if not key in self.cache:
response = wbresponse_func()
@ -90,20 +101,10 @@ class RangeCache(object):
filelen = os.path.getsize(spec['name'])
range_h = range_h.rstrip()
if range_h == '0-':
range_h = '0-120000'
parts = range_h.rstrip().split('-')
start = parts[0]
#start = start.split('=')[1]
start = int(start)
maxlen = filelen - start
if len(parts) == 2 and parts[1]:
maxlen = min(maxlen, int(parts[1]) - start + 1)
if end:
maxlen = min(maxlen, end - start + 1)
def read_range():
with open(spec['name']) as fh: