mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
range request fixes: (#266)
- fully support range requests on frontend, if range request reaches pywb - add OffsetLimitReader() to skip offset and limit read - disable rewriting for range requests - serve 416 if range outside of content-length - tests: add tests for range request handling dockerignore: add collections/
This commit is contained in:
parent
1bb1a32ee1
commit
ae56514c03
@ -7,5 +7,5 @@ dist/
|
|||||||
**/*.egg-info
|
**/*.egg-info
|
||||||
**/__pycache__
|
**/__pycache__
|
||||||
**/*.pyc
|
**/*.pyc
|
||||||
|
collections/
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
|
|||||||
from pywb.utils.wbexception import WbException
|
from pywb.utils.wbexception import WbException
|
||||||
from pywb.utils.canonicalize import canonicalize
|
from pywb.utils.canonicalize import canonicalize
|
||||||
from pywb.utils.loaders import extract_client_cookie
|
from pywb.utils.loaders import extract_client_cookie
|
||||||
from pywb.utils.io import BUFF_SIZE
|
from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
|
||||||
from pywb.utils.memento import MementoUtils
|
from pywb.utils.memento import MementoUtils
|
||||||
|
|
||||||
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
|
||||||
@ -134,6 +134,72 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
return is_timegate
|
return is_timegate
|
||||||
|
|
||||||
|
def _check_range(self, inputreq, wb_url):
    """Detect a byte-range request and prepare the request for proxying.

    Asks ``inputreq.extract_range()`` for a parsed range. When a range is
    present and should be answered with a partial response, the range-free
    url is written to both ``wb_url.url`` and ``inputreq.url`` and
    rewriting is switched off by setting ``wb_url.mod`` to ``'id_'``.

    :param inputreq: request object exposing ``extract_range()``, ``url``
        and the ``env`` mapping
    :param wb_url: url object whose ``url`` and ``mod`` may be updated
    :return: tuple ``(range_start, range_end, skip_record)``;
        ``(None, None, False)`` when no range handling is needed
    """
    parsed = inputreq.extract_range()

    # no range in the request at all
    if not parsed:
        return None, None, False

    range_url, first_byte, last_byte, use_206 = parsed

    # remove the range and still proxy
    if not use_206:
        return None, None, False

    wb_url.url = range_url
    inputreq.url = range_url

    # disable rewriting
    wb_url.mod = 'id_'

    # if start with 0, load from upstream, but add range after
    starts_at_zero = (first_byte == 0)
    if starts_at_zero:
        del inputreq.env['HTTP_RANGE']

    # any other start offset is flagged so the caller can skip recording
    return first_byte, last_byte, not starts_at_zero
|
||||||
|
|
||||||
|
def _add_range(self, record, wb_url, range_start, range_end):
    """Restrict a full ``200`` record to the requested byte range.

    Does nothing when no range was requested (both bounds are ``None``),
    when the record's status code is not ``'200'``, or when the
    ``Content-Length`` header is missing or not an integer.

    Otherwise the range is checked against the content length, the
    headers are updated through ``http_headers.add_range()`` and
    ``replace_header('Content-Length', ...)``, and ``record.raw_stream``
    is wrapped in an ``OffsetLimitReader`` so that only the requested
    bytes are read.

    :param record: parsed record exposing ``http_headers`` and
        ``raw_stream``
    :param wb_url: url object; ``wb_url.url`` is reported in the error
    :param range_start: first byte offset requested
    :param range_end: last byte offset requested (inclusive); ``None`` or
        an empty value means "to the end of the content"
    :raises UpstreamException: with status 416 when either bound is at or
        beyond the content length
    """
    if range_end is None and range_start is None:
        return

    if record.http_headers.get_statuscode() != '200':
        return

    content_length = record.http_headers.get_header('Content-Length')

    try:
        content_length = int(content_length)

        # Only a missing end means "open-ended". An explicit end of 0
        # (e.g. ``bytes=0-0``) is a valid one-byte range and must not be
        # widened to the full content.
        if range_end is None or range_end == '':
            range_end = content_length - 1

        if range_start >= content_length or range_end >= content_length:
            details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)

            # Best-effort release of the stream before failing. The
            # previous code closed an undefined name here, so nothing was
            # ever closed.
            # NOTE(review): assumes closing record.raw_stream also
            # releases the upstream response -- confirm.
            try:
                record.raw_stream.close()
            except Exception:
                pass

            raise UpstreamException(416, url=wb_url.url, details=details)

        # NOTE(review): an end offset smaller than the start (both within
        # bounds) yields a non-positive length here -- confirm the
        # intended handling.
        range_len = range_end - range_start + 1
        record.http_headers.add_range(range_start, range_len,
                                      content_length)

        record.http_headers.replace_header('Content-Length', str(range_len))

        record.raw_stream = OffsetLimitReader(record.raw_stream, range_start, range_len)

    except (ValueError, TypeError):
        # missing/non-numeric Content-Length or non-comparable bounds:
        # leave the record untouched
        pass
|
||||||
|
|
||||||
def render_content(self, wb_url, kwargs, environ):
|
def render_content(self, wb_url, kwargs, environ):
|
||||||
wb_url = wb_url.replace('#', '%23')
|
wb_url = wb_url.replace('#', '%23')
|
||||||
wb_url = WbUrl(wb_url)
|
wb_url = WbUrl(wb_url)
|
||||||
@ -191,31 +257,7 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
inputreq.include_method_query(wb_url.url)
|
inputreq.include_method_query(wb_url.url)
|
||||||
|
|
||||||
mod_url = None
|
range_start, range_end, skip_record = self._check_range(inputreq, wb_url)
|
||||||
use_206 = False
|
|
||||||
rangeres = None
|
|
||||||
|
|
||||||
readd_range = False
|
|
||||||
async_record_url = None
|
|
||||||
|
|
||||||
if kwargs.get('type') in ('record', 'patch'):
|
|
||||||
rangeres = inputreq.extract_range()
|
|
||||||
|
|
||||||
if rangeres:
|
|
||||||
mod_url, start, end, use_206 = rangeres
|
|
||||||
|
|
||||||
# if bytes=0- Range request,
|
|
||||||
# simply remove the range and still proxy
|
|
||||||
if start == 0 and not end and use_206:
|
|
||||||
wb_url.url = mod_url
|
|
||||||
inputreq.url = mod_url
|
|
||||||
|
|
||||||
del environ['HTTP_RANGE']
|
|
||||||
readd_range = True
|
|
||||||
else:
|
|
||||||
async_record_url = mod_url
|
|
||||||
|
|
||||||
skip = async_record_url is not None
|
|
||||||
|
|
||||||
setcookie_headers = None
|
setcookie_headers = None
|
||||||
if self.cookie_tracker:
|
if self.cookie_tracker:
|
||||||
@ -223,7 +265,7 @@ class RewriterApp(object):
|
|||||||
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
|
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
|
||||||
inputreq.extra_cookie, setcookie_headers = res
|
inputreq.extra_cookie, setcookie_headers = res
|
||||||
|
|
||||||
r = self._do_req(inputreq, wb_url, kwargs, skip)
|
r = self._do_req(inputreq, wb_url, kwargs, skip_record)
|
||||||
|
|
||||||
if r.status_code >= 400:
|
if r.status_code >= 400:
|
||||||
error = None
|
error = None
|
||||||
@ -241,17 +283,6 @@ class RewriterApp(object):
|
|||||||
details = dict(args=kwargs, error=error)
|
details = dict(args=kwargs, error=error)
|
||||||
raise UpstreamException(r.status_code, url=wb_url.url, details=details)
|
raise UpstreamException(r.status_code, url=wb_url.url, details=details)
|
||||||
|
|
||||||
if async_record_url:
|
|
||||||
environ.pop('HTTP_RANGE', '')
|
|
||||||
new_wb_url = copy(wb_url)
|
|
||||||
new_wb_url.url = async_record_url
|
|
||||||
|
|
||||||
gevent.spawn(self._do_async_req,
|
|
||||||
inputreq,
|
|
||||||
new_wb_url,
|
|
||||||
kwargs,
|
|
||||||
False)
|
|
||||||
|
|
||||||
stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
|
stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
|
||||||
record = self.loader.parse_record_stream(stream,
|
record = self.loader.parse_record_stream(stream,
|
||||||
ensure_http_headers=True)
|
ensure_http_headers=True)
|
||||||
@ -295,17 +326,10 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
self._add_custom_params(cdx, r.headers, kwargs)
|
self._add_custom_params(cdx, r.headers, kwargs)
|
||||||
|
|
||||||
if readd_range and record.http_headers.get_statuscode() == '200':
|
self._add_range(record, wb_url, range_start, range_end)
|
||||||
content_length = (record.http_headers.
|
|
||||||
get_header('Content-Length'))
|
|
||||||
try:
|
|
||||||
content_length = int(content_length)
|
|
||||||
record.http_headers.add_range(0, content_length,
|
|
||||||
content_length)
|
|
||||||
except (ValueError, TypeError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
is_ajax = self.is_ajax(environ)
|
is_ajax = self.is_ajax(environ)
|
||||||
|
|
||||||
if is_ajax:
|
if is_ajax:
|
||||||
head_insert_func = None
|
head_insert_func = None
|
||||||
urlrewriter.rewrite_opts['is_ajax'] = True
|
urlrewriter.rewrite_opts['is_ajax'] = True
|
||||||
@ -326,6 +350,7 @@ class RewriterApp(object):
|
|||||||
cookie_key)
|
cookie_key)
|
||||||
|
|
||||||
urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')
|
urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')
|
||||||
|
|
||||||
result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)
|
result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)
|
||||||
|
|
||||||
status_headers, gen, is_rw = result
|
status_headers, gen, is_rw = result
|
||||||
@ -424,25 +449,6 @@ class RewriterApp(object):
|
|||||||
top_url += wb_url.to_str(mod='')
|
top_url += wb_url.to_str(mod='')
|
||||||
return top_url
|
return top_url
|
||||||
|
|
||||||
def _do_async_req(self, *args):
|
|
||||||
count = 0
|
|
||||||
try:
|
|
||||||
r = self._do_req(*args)
|
|
||||||
while True:
|
|
||||||
buff = r.raw.read(8192)
|
|
||||||
count += len(buff)
|
|
||||||
if not buff:
|
|
||||||
return
|
|
||||||
except:
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
r.raw.close()
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def handle_error(self, environ, ue):
|
def handle_error(self, environ, ue):
|
||||||
if ue.status_code == 404:
|
if ue.status_code == 404:
|
||||||
return self._not_found_response(environ, ue.url)
|
return self._not_found_response(environ, ue.url)
|
||||||
@ -465,13 +471,13 @@ class RewriterApp(object):
|
|||||||
return WbResponse.text_response(resp, status=status, content_type='text/html')
|
return WbResponse.text_response(resp, status=status, content_type='text/html')
|
||||||
|
|
||||||
|
|
||||||
def _do_req(self, inputreq, wb_url, kwargs, skip):
|
def _do_req(self, inputreq, wb_url, kwargs, skip_record):
|
||||||
req_data = inputreq.reconstruct_request(wb_url.url)
|
req_data = inputreq.reconstruct_request(wb_url.url)
|
||||||
|
|
||||||
headers = {'Content-Length': str(len(req_data)),
|
headers = {'Content-Length': str(len(req_data)),
|
||||||
'Content-Type': 'application/request'}
|
'Content-Type': 'application/request'}
|
||||||
|
|
||||||
if skip:
|
if skip_record:
|
||||||
headers['Recorder-Skip'] = '1'
|
headers['Recorder-Skip'] = '1'
|
||||||
|
|
||||||
if wb_url.is_latest_replay():
|
if wb_url.is_latest_replay():
|
||||||
|
@ -2,6 +2,7 @@ import zlib
|
|||||||
from contextlib import closing, contextmanager
|
from contextlib import closing, contextmanager
|
||||||
|
|
||||||
from warcio.utils import BUFF_SIZE
|
from warcio.utils import BUFF_SIZE
|
||||||
|
from warcio.limitreader import LimitReader
|
||||||
from tempfile import SpooledTemporaryFile
|
from tempfile import SpooledTemporaryFile
|
||||||
|
|
||||||
|
|
||||||
@ -77,3 +78,27 @@ def compress_gzip_iter(orig_iter):
|
|||||||
yield compressobj.flush()
|
yield compressobj.flush()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
class OffsetLimitReader(LimitReader):
    """A ``LimitReader`` that first discards ``offset`` bytes of the stream.

    The skip is lazy: nothing is consumed from the underlying stream until
    the first ``read()`` or ``readline()`` call. After the skip, reads are
    limited to ``length`` bytes by the ``LimitReader`` base class.
    """

    def __init__(self, stream, offset, length):
        """
        :param stream: underlying stream to read from
        :param offset: number of leading bytes to discard before reading
        :param length: maximum number of bytes to return after the offset
        """
        super(OffsetLimitReader, self).__init__(stream, length)
        self.offset = offset
        if offset > 0:
            # a second limited reader over the same stream, used only to
            # consume (and throw away) the first ``offset`` bytes
            self._skip_reader = LimitReader(stream, offset)
        else:
            self._skip_reader = None

    def _skip(self):
        """Discard the pending offset, if any, in bounded-size chunks."""
        while self._skip_reader:
            # Read at most BUFF_SIZE at a time so that a large offset is
            # not pulled into memory as a single buffer.
            buff = self._skip_reader.read(BUFF_SIZE)
            if not buff:
                # offset exhausted (or stream ended early): stop skipping
                self._skip_reader = None

    def read(self, length=None):
        """Skip the offset if still pending, then read up to ``length`` bytes."""
        self._skip()
        return super(OffsetLimitReader, self).read(length)

    def readline(self, length=None):
        """Skip the offset if still pending, then read one line."""
        self._skip()
        return super(OffsetLimitReader, self).readline(length)
|
||||||
|
|
||||||
|
@ -189,42 +189,6 @@ class TestWbIntegration(BaseConfigTest):
|
|||||||
# original unrewritten url present
|
# original unrewritten url present
|
||||||
assert '"http://www.iana.org/domains/example"' in resp.text
|
assert '"http://www.iana.org/domains/example"' in resp.text
|
||||||
|
|
||||||
def _test_replay_range_cache_content(self):
|
|
||||||
headers = [('Range', 'bytes=0-200')]
|
|
||||||
resp = self.testapp.get('/pywb/20140127171250id_/http://example.com', headers=headers)
|
|
||||||
|
|
||||||
assert resp.status_int == 206
|
|
||||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
|
||||||
assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range']
|
|
||||||
assert resp.content_length == 201, resp.content_length
|
|
||||||
|
|
||||||
assert 'wombat.js' not in resp.text
|
|
||||||
|
|
||||||
def _test_replay_content_ignore_range(self):
|
|
||||||
headers = [('Range', 'bytes=0-200')]
|
|
||||||
resp = self.testapp.get('/pywb-norange/20140127171251id_/http://example.com', headers=headers)
|
|
||||||
|
|
||||||
# range request ignored
|
|
||||||
assert resp.status_int == 200
|
|
||||||
|
|
||||||
# full response
|
|
||||||
assert resp.content_length == 1270, resp.content_length
|
|
||||||
|
|
||||||
# identity, no header insertion
|
|
||||||
assert 'wombat.js' not in resp.text
|
|
||||||
|
|
||||||
def _test_replay_range_cache_content_bound_end(self):
|
|
||||||
headers = [('Range', 'bytes=10-10000')]
|
|
||||||
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers)
|
|
||||||
|
|
||||||
assert resp.status_int == 206
|
|
||||||
assert resp.headers['Accept-Ranges'] == 'bytes'
|
|
||||||
assert resp.headers['Content-Range'] == 'bytes 10-1269/1270', resp.headers['Content-Range']
|
|
||||||
assert resp.content_length == 1260, resp.content_length
|
|
||||||
assert len(resp.text) == resp.content_length
|
|
||||||
|
|
||||||
assert 'wombat.js' not in resp.text
|
|
||||||
|
|
||||||
def _test_replay_redir_no_cache(self):
|
def _test_replay_redir_no_cache(self):
|
||||||
headers = [('Range', 'bytes=10-10000')]
|
headers = [('Range', 'bytes=10-10000')]
|
||||||
# Range ignored
|
# Range ignored
|
||||||
|
78
tests/test_range.py
Normal file
78
tests/test_range.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
from .base_config_test import BaseConfigTest, fmod
|
||||||
|
from pywb.warcserver.warcserver import BaseWarcServer
|
||||||
|
from mock import patch
|
||||||
|
|
||||||
|
orig_call = BaseWarcServer.__call__
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
def mock_call(self, environ, start_response):
    """Stand-in for ``BaseWarcServer.__call__`` used by the range tests.

    Stores the value of the ``HTTP_RECORDER_SKIP`` environ key (``None``
    when absent) on ``TestReplayRange.recorder_skip`` and then delegates
    to the original ``__call__`` saved in ``orig_call``.
    """
    skip_header = environ.get('HTTP_RECORDER_SKIP')
    TestReplayRange.recorder_skip = skip_header
    return orig_call(self, environ, start_response)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
@patch('pywb.warcserver.basewarcserver.BaseWarcServer.__call__', mock_call)
class TestReplayRange(BaseConfigTest):
    """Range-request replay tests.

    ``BaseWarcServer.__call__`` is patched with ``mock_call``, which stores
    the ``Recorder-Skip`` request header seen by the warcserver in
    ``recorder_skip`` so that each test can assert on it.
    """

    # value of the HTTP_RECORDER_SKIP environ key from the latest request
    recorder_skip = None
    recorder_range = None

    @classmethod
    def setup_class(cls):
        super(TestReplayRange, cls).setup_class('config_test.yaml')

    def test_replay_range_start_end(self, fmod):
        """Range starting at 0 with an explicit end: 206, no Recorder-Skip."""
        headers = [('Range', 'bytes=0-200')]
        resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range']
        assert resp.content_length == 201, resp.content_length

        assert self.recorder_skip is None

        assert 'wombat.js' not in resp.text

    def test_replay_range_start_end_2(self, fmod):
        """Range with a non-zero start: 206 and Recorder-Skip is sent."""
        headers = [('Range', 'bytes=10-200')]
        resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 10-200/1270', resp.headers['Content-Range']
        assert resp.content_length == 191, resp.content_length

        assert self.recorder_skip == '1'

        assert 'wombat.js' not in resp.text

    def test_replay_range_start_only(self, fmod):
        """Open-ended range from 0: 206 covering the whole content."""
        headers = [('Range', 'bytes=0-')]
        resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 0-1269/1270', resp.headers['Content-Range']
        assert resp.content_length == 1270, resp.content_length

        assert self.recorder_skip is None

        assert 'wombat.js' not in resp.text

    def test_error_range_out_of_bounds_1(self, fmod):
        """End offset beyond the content length: 416."""
        headers = [('Range', 'bytes=10-2000')]
        resp = self.get('/pywb/20140127171251{0}/http://example.com/', fmod, headers=headers, status=416)

        assert resp.status_int == 416

        assert self.recorder_skip == '1'

    def test_error_range_out_of_bounds_2(self, fmod):
        """Start offset beyond the content length: 416."""
        headers = [('Range', 'bytes=2000-10')]
        resp = self.get('/pywb/20140127171251{0}/http://example.com/', fmod, headers=headers, status=416)

        assert resp.status_int == 416

        assert self.recorder_skip == '1'
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user