1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

urlrewrite: http range: support skipping record for range requests not starting at 0-

and performing async request,
support converting unbounded 0- to non-ranged and back
This commit is contained in:
Ilya Kreymer 2016-04-15 02:21:39 +00:00
parent 0b255819ff
commit 0370470e68
2 changed files with 136 additions and 14 deletions

View File

@ -3,10 +3,15 @@ from pywb.utils.loaders import extract_client_cookie
from six import iteritems
from six.moves.urllib.parse import urlsplit
import re
#=============================================================================
class RewriteInputRequest(DirectWSGIInputRequest):
RANGE_ARG_RX = re.compile('.*.googlevideo.com/videoplayback.*([&?]range=(\d+)-(\d+))')
RANGE_HEADER = re.compile('bytes=(\d+)-(\d+)?')
def __init__(self, env, urlkey, url, rewriter):
super(RewriteInputRequest, self).__init__(env)
self.urlkey = urlkey
@ -38,7 +43,7 @@ class RewriteInputRequest(DirectWSGIInputRequest):
elif name == 'HTTP_X_CSRFTOKEN':
name = 'X-CSRFToken'
cookie_val = extract_client_cookie(env, 'csrftoken')
cookie_val = extract_client_cookie(self.env, 'csrftoken')
if cookie_val:
value = cookie_val
@ -86,3 +91,39 @@ class RewriteInputRequest(DirectWSGIInputRequest):
return value
def extract_range(self):
use_206 = False
start = None
end = None
url = self.url
range_h = self.env.get('HTTP_RANGE')
if range_h:
m = self.RANGE_HEADER.match(range_h)
if m:
start = m.group(1)
end = m.group(2)
use_206 = True
else:
m = self.RANGE_ARG_RX.match(url)
if m:
start = m.group(2)
end = m.group(3)
url = url[:m.start(1)] + url[m.end(1):]
use_206 = False
if not start:
return None
start = int(start)
if end:
end = int(end)
else:
end = ''
result = (url, start, end, use_206)
return result

View File

@ -17,6 +17,7 @@ from urlrewrite.rewriteinputreq import RewriteInputRequest
from urlrewrite.templateview import JinjaEnv, HeadInsertView, TopFrameView
from io import BytesIO
import gevent
# ============================================================================
@ -65,22 +66,33 @@ class RewriterApp(object):
inputreq = RewriteInputRequest(request.environ, urlkey, url,
self.content_rewriter)
req_data = inputreq.reconstruct_request(url)
mod_url = None
use_206 = False
rangeres = None
headers = {'Content-Length': len(req_data),
'Content-Type': 'application/request'}
readd_range = False
async_record_url = None
if wb_url.is_latest_replay():
closest = 'now'
else:
closest = wb_url.timestamp
if kwargs.get('type') == 'record':
rangeres = inputreq.extract_range()
upstream_url = self.get_upstream_url(url, wb_url, closest, kwargs)
if rangeres:
mod_url, start, end, use_206 = rangeres
r = requests.post(upstream_url,
data=BytesIO(req_data),
headers=headers,
stream=True)
# if bytes=0- Range request,
# simply remove the range and still proxy
if start == 0 and not end and use_206:
url = mod_url
wb_url.url = mod_url
inputreq.url = mod_url
del request.environ['HTTP_RANGE']
readd_range = True
else:
async_record_url = mod_url
r = self._do_req(inputreq, url, wb_url, kwargs,
async_record_url is not None)
if r.status_code >= 400:
error = None
@ -98,6 +110,16 @@ class RewriterApp(object):
data = dict(url=url, args=kwargs, error=error)
raise HTTPError(r.status_code, exception=data)
if async_record_url:
#print('ASYNC REC', async_record_url)
request.environ.pop('HTTP_RANGE', '')
gevent.spawn(self._do_async_req,
inputreq,
async_record_url,
wb_url,
kwargs,
False)
record = self.loader.parse_record_stream(r.raw)
cdx = CDXObject()
@ -107,6 +129,16 @@ class RewriterApp(object):
self._add_custom_params(cdx, r.headers, kwargs)
if readd_range:
content_length = (record.status_headers.
get_header('Content-Length'))
try:
content_length = int(content_length)
record.status_headers.add_range(0, content_length,
content_length)
except (ValueError, TypeError):
pass
if self.is_ajax():
head_insert_func = None
else:
@ -133,6 +165,54 @@ class RewriterApp(object):
return gen
def _do_async_req(self, *args):
count = 0
#print('ASYNC')
try:
r = self._do_req(*args)
while True:
buff = r.raw.read(8192)
count += len(buff)
if not buff:
return
except:
import traceback
traceback.print_exc()
finally:
#print('CLOSING')
#print('READ ASYNC', count)
try:
r.raw.close()
except:
pass
def _do_req(self, inputreq, url, wb_url, kwargs, skip):
req_data = inputreq.reconstruct_request(url)
headers = {'Content-Length': len(req_data),
'Content-Type': 'application/request'}
if skip:
headers['Recorder-Skip'] = '1'
if wb_url.is_latest_replay():
closest = 'now'
else:
closest = wb_url.timestamp
upstream_url = self.get_upstream_url(url, wb_url, closest, kwargs)
r = requests.post(upstream_url,
data=BytesIO(req_data),
headers=headers,
stream=True)
return r
def get_host_prefix(self):
return request.urlparts.scheme + '://' + request.urlparts.netloc
@ -151,7 +231,7 @@ class RewriterApp(object):
if referrer.startswith(full_prefix):
referrer = referrer[len(full_prefix):]
request.environ['HTTP_REFERER'] = referrer
request.environ['HTTP_REFERER'] = WbUrl(referrer).url
return True
return False
@ -168,6 +248,7 @@ class RewriterApp(object):
raise NotImplemented()
def _add_custom_params(self, cdx, headers, kwargs):
cdx['is_live'] = 'true'
pass
def get_top_frame_params(self, wb_url, kwargs):