1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

range request fixes: (#266)

- fully support range requests on frontend, if range request reaches pywb
- add OffsetLimitReader() to skip offset and limit read
- disable rewriting for range requests
- serve 416 if range outside of content-length
- tests: add tests for range request handling
dockerignore: add collections/
This commit is contained in:
Ilya Kreymer 2017-11-21 17:57:38 -08:00 committed by GitHub
parent 1bb1a32ee1
commit ae56514c03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 178 additions and 105 deletions

View File

@ -7,5 +7,5 @@ dist/
**/*.egg-info **/*.egg-info
**/__pycache__ **/__pycache__
**/*.pyc **/*.pyc
collections/

View File

@ -11,7 +11,7 @@ from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter
from pywb.utils.wbexception import WbException from pywb.utils.wbexception import WbException
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.io import BUFF_SIZE from pywb.utils.io import BUFF_SIZE, OffsetLimitReader
from pywb.utils.memento import MementoUtils from pywb.utils.memento import MementoUtils
from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date
@ -134,6 +134,72 @@ class RewriterApp(object):
return is_timegate return is_timegate
def _check_range(self, inputreq, wb_url):
skip_record = False
range_start = None
range_end = None
rangeres = inputreq.extract_range()
if not rangeres:
return range_start, range_end, skip_record
mod_url, start, end, use_206 = rangeres
# remove the range and still proxy
if not use_206:
return range_start, range_end, skip_record
wb_url.url = mod_url
inputreq.url = mod_url
range_start = start
range_end = end
# disable rewriting
wb_url.mod = 'id_'
#if start with 0, load from upstream, but add range after
if start == 0:
del inputreq.env['HTTP_RANGE']
else:
skip_record = True
return range_start, range_end, skip_record
def _add_range(self, record, wb_url, range_start, range_end):
    """Convert a full ``200`` record into a partial response for a byte range.

    Nothing is changed when no range start was requested, when the record's
    status code is not ``'200'``, or when the record has no integer
    ``Content-Length`` header. Otherwise the range is registered through
    ``record.http_headers.add_range``, ``Content-Length`` is replaced by the
    range length, and ``record.raw_stream`` is wrapped in an
    ``OffsetLimitReader`` so that only the requested bytes are served.

    :param record: parsed record exposing ``http_headers`` and ``raw_stream``
    :param wb_url: url object; only ``wb_url.url`` is read, for error reporting
    :param range_start: first byte offset requested, or ``None`` for no range
    :param range_end: last byte offset requested (inclusive); ``None`` or an
        empty value means "until the end of the content"
    :raises UpstreamException: with status 416 when the range starts or ends
        at/after the content length, or when it ends before it starts
    """
    # Without a start offset there is no range to apply.
    if range_start is None:
        return

    if record.http_headers.get_statuscode() != '200':
        return

    # Only the header parsing is guarded: a missing or non-numeric
    # Content-Length means the range cannot be computed, so the full
    # response is left untouched.
    try:
        content_length = int(record.http_headers.get_header('Content-Length'))
    except (ValueError, TypeError):
        return

    # Only a *missing* end is open-ended. An explicit end of 0 (bytes=0-0)
    # is a valid one-byte range and must not be treated as missing.
    if range_end is None or range_end == '':
        range_end = content_length - 1

    details = None
    if range_start >= content_length or range_end >= content_length:
        details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
    elif range_start > range_end:
        # A reversed range would produce a negative length below.
        details = 'Invalid Range: {0} > {1}'.format(range_start, range_end)

    if details is not None:
        # Best-effort release of the unread record stream before failing.
        # NOTE(review): the previous code called ``r.raw.close()`` on a name
        # that does not exist in this method, so nothing was closed; confirm
        # that closing ``record.raw_stream`` releases the upstream response.
        close = getattr(record.raw_stream, 'close', None)
        if callable(close):
            try:
                close()
            except Exception:
                pass

        raise UpstreamException(416, url=wb_url.url, details=details)

    range_len = range_end - range_start + 1

    record.http_headers.add_range(range_start, range_len,
                                  content_length)

    record.http_headers.replace_header('Content-Length', str(range_len))

    record.raw_stream = OffsetLimitReader(record.raw_stream, range_start, range_len)
def render_content(self, wb_url, kwargs, environ): def render_content(self, wb_url, kwargs, environ):
wb_url = wb_url.replace('#', '%23') wb_url = wb_url.replace('#', '%23')
wb_url = WbUrl(wb_url) wb_url = WbUrl(wb_url)
@ -191,31 +257,7 @@ class RewriterApp(object):
inputreq.include_method_query(wb_url.url) inputreq.include_method_query(wb_url.url)
mod_url = None range_start, range_end, skip_record = self._check_range(inputreq, wb_url)
use_206 = False
rangeres = None
readd_range = False
async_record_url = None
if kwargs.get('type') in ('record', 'patch'):
rangeres = inputreq.extract_range()
if rangeres:
mod_url, start, end, use_206 = rangeres
# if bytes=0- Range request,
# simply remove the range and still proxy
if start == 0 and not end and use_206:
wb_url.url = mod_url
inputreq.url = mod_url
del environ['HTTP_RANGE']
readd_range = True
else:
async_record_url = mod_url
skip = async_record_url is not None
setcookie_headers = None setcookie_headers = None
if self.cookie_tracker: if self.cookie_tracker:
@ -223,7 +265,7 @@ class RewriterApp(object):
res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key) res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
inputreq.extra_cookie, setcookie_headers = res inputreq.extra_cookie, setcookie_headers = res
r = self._do_req(inputreq, wb_url, kwargs, skip) r = self._do_req(inputreq, wb_url, kwargs, skip_record)
if r.status_code >= 400: if r.status_code >= 400:
error = None error = None
@ -241,17 +283,6 @@ class RewriterApp(object):
details = dict(args=kwargs, error=error) details = dict(args=kwargs, error=error)
raise UpstreamException(r.status_code, url=wb_url.url, details=details) raise UpstreamException(r.status_code, url=wb_url.url, details=details)
if async_record_url:
environ.pop('HTTP_RANGE', '')
new_wb_url = copy(wb_url)
new_wb_url.url = async_record_url
gevent.spawn(self._do_async_req,
inputreq,
new_wb_url,
kwargs,
False)
stream = BufferedReader(r.raw, block_size=BUFF_SIZE) stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
record = self.loader.parse_record_stream(stream, record = self.loader.parse_record_stream(stream,
ensure_http_headers=True) ensure_http_headers=True)
@ -295,17 +326,10 @@ class RewriterApp(object):
self._add_custom_params(cdx, r.headers, kwargs) self._add_custom_params(cdx, r.headers, kwargs)
if readd_range and record.http_headers.get_statuscode() == '200': self._add_range(record, wb_url, range_start, range_end)
content_length = (record.http_headers.
get_header('Content-Length'))
try:
content_length = int(content_length)
record.http_headers.add_range(0, content_length,
content_length)
except (ValueError, TypeError):
pass
is_ajax = self.is_ajax(environ) is_ajax = self.is_ajax(environ)
if is_ajax: if is_ajax:
head_insert_func = None head_insert_func = None
urlrewriter.rewrite_opts['is_ajax'] = True urlrewriter.rewrite_opts['is_ajax'] = True
@ -326,6 +350,7 @@ class RewriterApp(object):
cookie_key) cookie_key)
urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT') urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')
result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx) result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)
status_headers, gen, is_rw = result status_headers, gen, is_rw = result
@ -424,25 +449,6 @@ class RewriterApp(object):
top_url += wb_url.to_str(mod='') top_url += wb_url.to_str(mod='')
return top_url return top_url
def _do_async_req(self, *args):
count = 0
try:
r = self._do_req(*args)
while True:
buff = r.raw.read(8192)
count += len(buff)
if not buff:
return
except:
import traceback
traceback.print_exc()
finally:
try:
r.raw.close()
except:
pass
def handle_error(self, environ, ue): def handle_error(self, environ, ue):
if ue.status_code == 404: if ue.status_code == 404:
return self._not_found_response(environ, ue.url) return self._not_found_response(environ, ue.url)
@ -465,13 +471,13 @@ class RewriterApp(object):
return WbResponse.text_response(resp, status=status, content_type='text/html') return WbResponse.text_response(resp, status=status, content_type='text/html')
def _do_req(self, inputreq, wb_url, kwargs, skip): def _do_req(self, inputreq, wb_url, kwargs, skip_record):
req_data = inputreq.reconstruct_request(wb_url.url) req_data = inputreq.reconstruct_request(wb_url.url)
headers = {'Content-Length': str(len(req_data)), headers = {'Content-Length': str(len(req_data)),
'Content-Type': 'application/request'} 'Content-Type': 'application/request'}
if skip: if skip_record:
headers['Recorder-Skip'] = '1' headers['Recorder-Skip'] = '1'
if wb_url.is_latest_replay(): if wb_url.is_latest_replay():

View File

@ -2,6 +2,7 @@ import zlib
from contextlib import closing, contextmanager from contextlib import closing, contextmanager
from warcio.utils import BUFF_SIZE from warcio.utils import BUFF_SIZE
from warcio.limitreader import LimitReader
from tempfile import SpooledTemporaryFile from tempfile import SpooledTemporaryFile
@ -77,3 +78,27 @@ def compress_gzip_iter(orig_iter):
yield compressobj.flush() yield compressobj.flush()
# ============================================================================
class OffsetLimitReader(LimitReader):
    """``LimitReader`` that discards ``offset`` leading bytes before reading.

    The skip is lazy: nothing is consumed from ``stream`` until the first
    ``read()`` or ``readline()`` call. After the skip, reads are delegated to
    ``LimitReader`` constructed with ``length`` as its limit.
    """

    def __init__(self, stream, offset, length):
        """
        :param stream: underlying stream to read from
        :param offset: number of leading bytes to discard before any data
        :param length: limit passed to the ``LimitReader`` base class
        """
        super(OffsetLimitReader, self).__init__(stream, length)
        self.offset = offset
        # A second limited reader over the same stream consumes the prefix.
        if offset > 0:
            self._skip_reader = LimitReader(stream, offset)
        else:
            self._skip_reader = None

    def _skip(self):
        """Consume the pending offset, if any, from the underlying stream."""
        if not self._skip_reader:
            return

        # Discard in BUFF_SIZE chunks so a large offset is never held in
        # memory as a single buffer.
        while self._skip_reader.read(BUFF_SIZE):
            pass

        self._skip_reader = None

    def read(self, length=None):
        """Skip the offset if still pending, then read as ``LimitReader``."""
        self._skip()
        return super(OffsetLimitReader, self).read(length)

    def readline(self, length=None):
        """Skip the offset if still pending, then read a line as ``LimitReader``."""
        self._skip()
        return super(OffsetLimitReader, self).readline(length)

View File

@ -189,42 +189,6 @@ class TestWbIntegration(BaseConfigTest):
# original unrewritten url present # original unrewritten url present
assert '"http://www.iana.org/domains/example"' in resp.text assert '"http://www.iana.org/domains/example"' in resp.text
def _test_replay_range_cache_content(self):
headers = [('Range', 'bytes=0-200')]
resp = self.testapp.get('/pywb/20140127171250id_/http://example.com', headers=headers)
assert resp.status_int == 206
assert resp.headers['Accept-Ranges'] == 'bytes'
assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range']
assert resp.content_length == 201, resp.content_length
assert 'wombat.js' not in resp.text
def _test_replay_content_ignore_range(self):
headers = [('Range', 'bytes=0-200')]
resp = self.testapp.get('/pywb-norange/20140127171251id_/http://example.com', headers=headers)
# range request ignored
assert resp.status_int == 200
# full response
assert resp.content_length == 1270, resp.content_length
# identity, no header insertion
assert 'wombat.js' not in resp.text
def _test_replay_range_cache_content_bound_end(self):
headers = [('Range', 'bytes=10-10000')]
resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers)
assert resp.status_int == 206
assert resp.headers['Accept-Ranges'] == 'bytes'
assert resp.headers['Content-Range'] == 'bytes 10-1269/1270', resp.headers['Content-Range']
assert resp.content_length == 1260, resp.content_length
assert len(resp.text) == resp.content_length
assert 'wombat.js' not in resp.text
def _test_replay_redir_no_cache(self): def _test_replay_redir_no_cache(self):
headers = [('Range', 'bytes=10-10000')] headers = [('Range', 'bytes=10-10000')]
# Range ignored # Range ignored

78
tests/test_range.py Normal file
View File

@ -0,0 +1,78 @@
from .base_config_test import BaseConfigTest, fmod
from pywb.warcserver.warcserver import BaseWarcServer
from mock import patch
# Unpatched ``__call__`` so the mock below can delegate to it.
orig_call = BaseWarcServer.__call__


# ============================================================================
def mock_call(self, environ, start_response):
    """Record the ``HTTP_RECORDER_SKIP`` environ value, then call the original.

    The value (``None`` when the key is absent) is stored on
    ``TestReplayRange.recorder_skip`` so tests can inspect it afterwards.
    """
    skip_value = environ.get('HTTP_RECORDER_SKIP')
    TestReplayRange.recorder_skip = skip_value

    return orig_call(self, environ, start_response)
# ============================================================================
@patch('pywb.warcserver.basewarcserver.BaseWarcServer.__call__', mock_call)
class TestReplayRange(BaseConfigTest):
    """Range-request replay tests.

    ``BaseWarcServer.__call__`` is patched with ``mock_call``, which stores
    the ``HTTP_RECORDER_SKIP`` environ value of each request in
    ``recorder_skip`` for the tests to check.
    """

    # value of HTTP_RECORDER_SKIP captured by mock_call on the latest request
    recorder_skip = None
    recorder_range = None

    @classmethod
    def setup_class(cls):
        super(TestReplayRange, cls).setup_class('config_test.yaml')

    def test_replay_range_start_end(self, fmod):
        """``bytes=0-200``: 206 with 201 bytes, no skip header sent."""
        headers = [('Range', 'bytes=0-200')]
        resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range']
        assert resp.content_length == 201, resp.content_length

        assert self.recorder_skip is None

        assert 'wombat.js' not in resp.text

    def test_replay_range_start_end_2(self, fmod):
        """``bytes=10-200``: 206 with 191 bytes, skip header sent."""
        headers = [('Range', 'bytes=10-200')]
        resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 10-200/1270', resp.headers['Content-Range']
        assert resp.content_length == 191, resp.content_length

        assert self.recorder_skip == '1'

        assert 'wombat.js' not in resp.text

    def test_replay_range_start_only(self, fmod):
        """``bytes=0-``: 206 covering the whole 1270 bytes, no skip header."""
        headers = [('Range', 'bytes=0-')]
        resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers)

        assert resp.status_int == 206
        assert resp.headers['Accept-Ranges'] == 'bytes'
        assert resp.headers['Content-Range'] == 'bytes 0-1269/1270', resp.headers['Content-Range']
        assert resp.content_length == 1270, resp.content_length

        assert self.recorder_skip is None

        assert 'wombat.js' not in resp.text

    def test_error_range_out_of_bounds_1(self, fmod):
        """``bytes=10-2000``: end beyond the content yields 416."""
        headers = [('Range', 'bytes=10-2000')]
        resp = self.get('/pywb/20140127171251{0}/http://example.com/', fmod, headers=headers, status=416)

        assert resp.status_int == 416

        assert self.recorder_skip == '1'

    def test_error_range_out_of_bounds_2(self, fmod):
        """``bytes=2000-10``: start beyond the content yields 416."""
        headers = [('Range', 'bytes=2000-10')]
        resp = self.get('/pywb/20140127171251{0}/http://example.com/', fmod, headers=headers, status=416)

        assert resp.status_int == 416

        assert self.recorder_skip == '1'