From ae56514c038a3e884ce2269e3c0735090b402bd8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 21 Nov 2017 17:57:38 -0800 Subject: [PATCH] range request fixes: (#266) - fully support range requests on frontend, if range request reaches pywb - add OffsetLimitReader() to skip offset and limit read - disbale rewriting for range requests - serve 416 if range outside of content-length - tests: add tests for range request handling dockerignore: add collections/ --- .dockerignore | 2 +- pywb/apps/rewriterapp.py | 142 ++++++++++++++++++++------------------ pywb/utils/io.py | 25 +++++++ tests/test_integration.py | 36 ---------- tests/test_range.py | 78 +++++++++++++++++++++ 5 files changed, 178 insertions(+), 105 deletions(-) create mode 100644 tests/test_range.py diff --git a/.dockerignore b/.dockerignore index 3266615d..4edbcb56 100644 --- a/.dockerignore +++ b/.dockerignore @@ -7,5 +7,5 @@ dist/ **/*.egg-info **/__pycache__ **/*.pyc - +collections/ diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 68690b1a..fb0a000e 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -11,7 +11,7 @@ from pywb.rewrite.url_rewriter import UrlRewriter, IdentityUrlRewriter from pywb.utils.wbexception import WbException from pywb.utils.canonicalize import canonicalize from pywb.utils.loaders import extract_client_cookie -from pywb.utils.io import BUFF_SIZE +from pywb.utils.io import BUFF_SIZE, OffsetLimitReader from pywb.utils.memento import MementoUtils from warcio.timeutils import http_date_to_timestamp, timestamp_to_http_date @@ -134,6 +134,72 @@ class RewriterApp(object): return is_timegate + def _check_range(self, inputreq, wb_url): + skip_record = False + range_start = None + range_end = None + + rangeres = inputreq.extract_range() + + if not rangeres: + return range_start, range_end, skip_record + + mod_url, start, end, use_206 = rangeres + + # remove the range and still proxy + if not use_206: + return range_start, range_end, skip_record + + wb_url.url = mod_url + inputreq.url = mod_url + + range_start = start + range_end = end + + # disable rewriting + wb_url.mod = 'id_' + + #if start with 0, load from upstream, but add range after + if start == 0: + del inputreq.env['HTTP_RANGE'] + else: + skip_record = True + + return range_start, range_end, skip_record + + def _add_range(self, record, wb_url, range_start, range_end): + if range_end is None and range_start is None: + return + + if record.http_headers.get_statuscode() != '200': + return + + content_length = (record.http_headers. + get_header('Content-Length')) + try: + content_length = int(content_length) + if not range_end: + range_end = content_length - 1 + + if range_start >= content_length or range_end >= content_length: + details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length) + try: + r.raw.close() + except: + pass + + raise UpstreamException(416, url=wb_url.url, details=details) + + range_len = range_end - range_start + 1 + record.http_headers.add_range(range_start, range_len, + content_length) + + record.http_headers.replace_header('Content-Length', str(range_len)) + + record.raw_stream = OffsetLimitReader(record.raw_stream, range_start, range_len) + except (ValueError, TypeError): + pass + def render_content(self, wb_url, kwargs, environ): wb_url = wb_url.replace('#', '%23') wb_url = WbUrl(wb_url) @@ -191,31 +257,7 @@ class RewriterApp(object): inputreq.include_method_query(wb_url.url) - mod_url = None - use_206 = False - rangeres = None - - readd_range = False - async_record_url = None - - if kwargs.get('type') in ('record', 'patch'): - rangeres = inputreq.extract_range() - - if rangeres: - mod_url, start, end, use_206 = rangeres - - # if bytes=0- Range request, - # simply remove the range and still proxy - if start == 0 and not end and use_206: - wb_url.url = mod_url - inputreq.url = mod_url - - del environ['HTTP_RANGE'] - readd_range = True - else: - async_record_url = mod_url - - skip = async_record_url is not None + range_start, range_end, skip_record = self._check_range(inputreq, wb_url) setcookie_headers = None if self.cookie_tracker: @@ -223,7 +265,7 @@ class RewriterApp(object): res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key) inputreq.extra_cookie, setcookie_headers = res - r = self._do_req(inputreq, wb_url, kwargs, skip) + r = self._do_req(inputreq, wb_url, kwargs, skip_record) if r.status_code >= 400: error = None @@ -241,17 +283,6 @@ class RewriterApp(object): details = dict(args=kwargs, error=error) raise UpstreamException(r.status_code, url=wb_url.url, details=details) - if async_record_url: - environ.pop('HTTP_RANGE', '') - new_wb_url = copy(wb_url) - new_wb_url.url = async_record_url - - gevent.spawn(self._do_async_req, - inputreq, - new_wb_url, - kwargs, - False) - stream = BufferedReader(r.raw, block_size=BUFF_SIZE) record = self.loader.parse_record_stream(stream, ensure_http_headers=True) @@ -295,17 +326,10 @@ class RewriterApp(object): self._add_custom_params(cdx, r.headers, kwargs) - if readd_range and record.http_headers.get_statuscode() == '200': - content_length = (record.http_headers. - get_header('Content-Length')) - try: - content_length = int(content_length) - record.http_headers.add_range(0, content_length, - content_length) - except (ValueError, TypeError): - pass + self._add_range(record, wb_url, range_start, range_end) is_ajax = self.is_ajax(environ) + if is_ajax: head_insert_func = None urlrewriter.rewrite_opts['is_ajax'] = True @@ -326,6 +350,7 @@ class RewriterApp(object): cookie_key) urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT') + result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx) status_headers, gen, is_rw = result @@ -424,25 +449,6 @@ class RewriterApp(object): top_url += wb_url.to_str(mod='') return top_url - def _do_async_req(self, *args): - count = 0 - try: - r = self._do_req(*args) - while True: - buff = r.raw.read(8192) - count += len(buff) - if not buff: - return - except: - import traceback - traceback.print_exc() - - finally: - try: - r.raw.close() - except: - pass - def handle_error(self, environ, ue): if ue.status_code == 404: return self._not_found_response(environ, ue.url) @@ -465,13 +471,13 @@ class RewriterApp(object): return WbResponse.text_response(resp, status=status, content_type='text/html') - def _do_req(self, inputreq, wb_url, kwargs, skip): + def _do_req(self, inputreq, wb_url, kwargs, skip_record): req_data = inputreq.reconstruct_request(wb_url.url) headers = {'Content-Length': str(len(req_data)), 'Content-Type': 'application/request'} - if skip: + if skip_record: headers['Recorder-Skip'] = '1' if wb_url.is_latest_replay(): diff --git a/pywb/utils/io.py b/pywb/utils/io.py index 4b6b6ba5..a651a75e 100644 --- a/pywb/utils/io.py +++ b/pywb/utils/io.py @@ -2,6 +2,7 @@ import zlib from contextlib import closing, contextmanager from warcio.utils import BUFF_SIZE +from warcio.limitreader import LimitReader from tempfile import SpooledTemporaryFile @@ -77,3 +78,27 @@ def compress_gzip_iter(orig_iter): yield compressobj.flush() +# ============================================================================ +class OffsetLimitReader(LimitReader): + def __init__(self, stream, offset, length): + super(OffsetLimitReader, self).__init__(stream, length) + self.offset = offset + if offset > 0: + self._skip_reader = LimitReader(stream, offset) + else: + self._skip_reader = None + + def _skip(self): + while self._skip_reader: + buff = self._skip_reader.read() + if not buff: + self._skip_reader = None + + def read(self, length=None): + self._skip() + return super(OffsetLimitReader, self).read(length) + + def readline(self, length=None): + self._skip() + return super(OffsetLimitReader, self).readline(length) + diff --git a/tests/test_integration.py b/tests/test_integration.py index a929a950..ee5733f4 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -189,42 +189,6 @@ class TestWbIntegration(BaseConfigTest): # original unrewritten url present assert '"http://www.iana.org/domains/example"' in resp.text - def _test_replay_range_cache_content(self): - headers = [('Range', 'bytes=0-200')] - resp = self.testapp.get('/pywb/20140127171250id_/http://example.com', headers=headers) - - assert resp.status_int == 206 - assert resp.headers['Accept-Ranges'] == 'bytes' - assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range'] - assert resp.content_length == 201, resp.content_length - - assert 'wombat.js' not in resp.text - - def _test_replay_content_ignore_range(self): - headers = [('Range', 'bytes=0-200')] - resp = self.testapp.get('/pywb-norange/20140127171251id_/http://example.com', headers=headers) - - # range request ignored - assert resp.status_int == 200 - - # full response - assert resp.content_length == 1270, resp.content_length - - # identity, no header insertion - assert 'wombat.js' not in resp.text - - def _test_replay_range_cache_content_bound_end(self): - headers = [('Range', 'bytes=10-10000')] - resp = self.testapp.get('/pywb/20140127171251id_/http://example.com', headers=headers) - - assert resp.status_int == 206 - assert resp.headers['Accept-Ranges'] == 'bytes' - assert resp.headers['Content-Range'] == 'bytes 10-1269/1270', resp.headers['Content-Range'] - assert resp.content_length == 1260, resp.content_length - assert len(resp.text) == resp.content_length - - assert 'wombat.js' not in resp.text - def _test_replay_redir_no_cache(self): headers = [('Range', 'bytes=10-10000')] # Range ignored diff --git a/tests/test_range.py b/tests/test_range.py new file mode 100644 index 00000000..dc81965b --- /dev/null +++ b/tests/test_range.py @@ -0,0 +1,78 @@ +from .base_config_test import BaseConfigTest, fmod +from pywb.warcserver.warcserver import BaseWarcServer +from mock import patch + +orig_call = BaseWarcServer.__call__ + +# ============================================================================ +def mock_call(self, environ, start_response): + TestReplayRange.recorder_skip = environ.get('HTTP_RECORDER_SKIP') + return orig_call(self, environ, start_response) + + +# ============================================================================ +@patch('pywb.warcserver.basewarcserver.BaseWarcServer.__call__', mock_call) +class TestReplayRange(BaseConfigTest): + recorder_skip = None + recorder_range = None + + @classmethod + def setup_class(cls): + super(TestReplayRange, cls).setup_class('config_test.yaml') + + def test_replay_range_start_end(self, fmod): + headers = [('Range', 'bytes=0-200')] + resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers) + + assert resp.status_int == 206 + assert resp.headers['Accept-Ranges'] == 'bytes' + assert resp.headers['Content-Range'] == 'bytes 0-200/1270', resp.headers['Content-Range'] + assert resp.content_length == 201, resp.content_length + + assert self.recorder_skip == None + + assert 'wombat.js' not in resp.text + + def test_replay_range_start_end_2(self, fmod): + headers = [('Range', 'bytes=10-200')] + resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers) + + assert resp.status_int == 206 + assert resp.headers['Accept-Ranges'] == 'bytes' + assert resp.headers['Content-Range'] == 'bytes 10-200/1270', resp.headers['Content-Range'] + assert resp.content_length == 191, resp.content_length + + assert self.recorder_skip == '1' + + assert 'wombat.js' not in resp.text + + def test_replay_range_start_only(self, fmod): + headers = [('Range', 'bytes=0-')] + resp = self.get('/pywb/20140127171250{0}/http://example.com/', fmod, headers=headers) + + assert resp.status_int == 206 + assert resp.headers['Accept-Ranges'] == 'bytes' + assert resp.headers['Content-Range'] == 'bytes 0-1269/1270', resp.headers['Content-Range'] + assert resp.content_length == 1270, resp.content_length + + assert self.recorder_skip == None + + assert 'wombat.js' not in resp.text + + def test_error_range_out_of_bounds_1(self, fmod): + headers = [('Range', 'bytes=10-2000')] + resp = self.get('/pywb/20140127171251{0}/http://example.com/', fmod, headers=headers, status=416) + + assert resp.status_int == 416 + + assert self.recorder_skip == '1' + + + def test_error_range_out_of_bounds_2(self, fmod): + headers = [('Range', 'bytes=2000-10')] + resp = self.get('/pywb/20140127171251{0}/http://example.com/', fmod, headers=headers, status=416) + + assert resp.status_int == 416 + + assert self.recorder_skip == '1' +