diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py
index a6565993..2b93272b 100644
--- a/pywb/apps/rewriterapp.py
+++ b/pywb/apps/rewriterapp.py
@@ -60,6 +60,12 @@ class RewriterApp(object):
             self.frame_mod = None
             self.replay_mod = ''
 
+        self.enable_prefer = self.config.get('enable_prefer', False)
+        # maps each supported Prefer value to the url modifier that serves it
+        self.prefs = {'raw': 'id_',
+                      'rewritten': self.replay_mod
+                     }
+
         self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                           config=config)
 
@@ -139,6 +145,28 @@ class RewriterApp(object):
 
         return is_timegate
 
+    def _get_prefer_mod(self, wb_url, environ):
+        """Resolve the request's Prefer header to a url modifier.
+
+        Returns a ``(mod, preference)`` tuple: ``(None, None)`` when Prefer
+        support is disabled, ``(None, 'raw' or 'rewritten')`` (inferred from
+        the requested url) when no Prefer header was sent, otherwise the
+        modifier configured for the requested preference and its name.
+
+        Raises UpstreamException (400) for an unsupported preference.
+        """
+        if not self.enable_prefer:
+            return None, None
+
+        prefer = environ.get('HTTP_PREFER')
+        if not prefer:
+            return None, 'raw' if wb_url.is_identity else 'rewritten'
+
+        try:
+            return self.prefs[prefer], prefer
+        except KeyError:
+            raise UpstreamException(400, url=wb_url.url, details='Invalid Prefer: ' + prefer)
+
     def _check_range(self, inputreq, wb_url):
         skip_record = False
         range_start = None
@@ -268,6 +296,24 @@ class RewriterApp(object):
         if not url_parts.path:
             return self.send_redirect('/', url_parts, urlrewriter)
 
+        # Check Prefer
+        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ)
+
+        # fast-redirect to preferred
+        if pref_mod is not None:
+            if self.redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
+                new_url = urlrewriter.get_new_url(url=wb_url.url,
+                                                  timestamp=wb_url.timestamp,
+                                                  mod=pref_mod)
+                headers = [('Preference-Applied', pref_applied),
+                           ('Vary', 'Prefer')]
+
+                return WbResponse.redir_response(new_url,
+                                                 '307 Temporary Redirect',
+                                                 headers=headers)
+            else:
+                wb_url.mod = pref_mod
+
         self.unrewrite_referrer(environ, full_prefix)
 
         urlkey = canonicalize(wb_url.url)
@@ -357,7 +403,9 @@ class RewriterApp(object):
                 self._add_memento_links(target_uri, full_prefix,
                                         memento_dt, cdx['timestamp'],
                                         resp.status_headers,
-                                        is_timegate, is_proxy)
+                                        is_timegate, is_proxy,
+                                        pref_applied=pref_applied,
+                                        mod=pref_mod)
             else:
                 resp.status_headers['Link'] = MementoUtils.make_link(target_uri,
                                                                      'original')
@@ -421,13 +469,15 @@ class RewriterApp(object):
         if not is_ajax and self.enable_memento:
             self._add_memento_links(cdx['url'], full_prefix,
                                     memento_dt, cdx['timestamp'], status_headers,
-                                    is_timegate, is_proxy, cdx.get('source-coll'))
+                                    is_timegate, is_proxy, cdx.get('source-coll'),
+                                    mod=pref_mod, pref_applied=pref_applied)
 
             set_content_loc = True
 
         if set_content_loc and not self.redirect_to_exact:
             status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                                                        url=cdx['url'])))
+
         if not is_proxy:
             self.add_csp_header(wb_url, status_headers)
 
@@ -454,7 +504,11 @@ class RewriterApp(object):
         return response
 
     def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
-                           status_headers, is_timegate, is_proxy, coll=None):
+                           status_headers, is_timegate, is_proxy, coll=None,
+                           pref_applied=None, mod=None):
+
+        # no Prefer-selected modifier: link to the default replay modifier
+        mod = mod or self.replay_mod
 
         # memento url + header
         if not memento_dt and memento_ts:
@@ -466,12 +520,12 @@ class RewriterApp(object):
             if is_proxy:
                 memento_url = url
             else:
-                memento_url = full_prefix + memento_ts + self.replay_mod
+                memento_url = full_prefix + memento_ts + mod
                 memento_url += '/' + url
         else:
             memento_url = None
 
-        timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)
+        timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix, mod)
 
         link = []
         if not is_proxy:
@@ -486,14 +540,23 @@ class RewriterApp(object):
 
         status_headers.headers.append(('Link', link_str))
 
+        # build Vary from every request header that shaped this response
+        vary = ''
         if is_timegate:
-            status_headers.headers.append(('Vary', 'accept-datetime'))
+            vary = 'accept-datetime'
 
-    def _get_timegate_timemap(self, url, full_prefix):
+        if pref_applied:
+            vary = 'Prefer' if not vary else vary + ', Prefer'
+            status_headers.headers.append(('Preference-Applied', pref_applied))
+
+        if vary:
+            status_headers.headers.append(('Vary', vary))
+
+    def _get_timegate_timemap(self, url, full_prefix, mod):
         # timegate url
         timegate_url = full_prefix
-        if self.replay_mod:
-            timegate_url += self.replay_mod + '/'
+        if mod:
+            timegate_url += mod + '/'
 
         timegate_url += url
 
@@ -585,7 +648,7 @@ class RewriterApp(object):
         status = str(res.status_code) + ' ' + res.reason
 
         if res.status_code == 200 and output == 'link':
-            timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix)
+            timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix, self.replay_mod)
 
             text = MementoUtils.wrap_timemap_header(wb_url.url,
                                                     timegate,
diff --git a/tests/config_test_redirect_classic.yaml b/tests/config_test_redirect_classic.yaml
index 21321416..c3a6000a 100644
--- a/tests/config_test_redirect_classic.yaml
+++ b/tests/config_test_redirect_classic.yaml
@@ -14,5 +14,6 @@ enable_flash_video_rewrite: true
 
 redirect_to_exact: true
 enable_memento: true
+enable_prefer: true
 
 debug: true
diff --git a/tests/test_prefer_header.py b/tests/test_prefer_header.py
new file mode 100644
index 00000000..ff672100
--- /dev/null
+++ b/tests/test_prefer_header.py
@@ -0,0 +1,166 @@
+from .base_config_test import BaseConfigTest, fmod
+
+from pywb.warcserver.index.cdxobject import CDXObject
+
+
+# ============================================================================
+class TestPreferWithRedirects(BaseConfigTest):
+    """Prefer header handling when redirect_to_exact is enabled."""
+    @classmethod
+    def setup_class(cls):
+        super(TestPreferWithRedirects, cls).setup_class('config_test_redirect_classic.yaml')
+
+    def _assert_pref_headers(self, resp, pref):
+        assert resp.headers['Preference-Applied'] == pref
+        assert 'Prefer' in resp.headers['Vary']
+
+    def _assert_raw_memento(self, resp):
+        self._assert_pref_headers(resp, 'raw')
+        assert '"/time-zones"' in resp.text, resp.text
+        assert 'wombat.js' not in resp.text
+
+    def _assert_rewritten(self, resp, fmod):
+        self._assert_pref_headers(resp, 'rewritten')
+
+        assert '"20140127171238"' in resp.text
+        assert 'wombat.js' in resp.text
+        assert 'new _WBWombat' in resp.text, resp.text
+        assert '/20140127171238{0}/http://www.iana.org/time-zones"'.format(fmod) in resp.text
+
+    def _assert_redir_to_raw(self, resp):
+        self._assert_pref_headers(resp, 'raw')
+
+        assert resp.location.endswith('/pywb/20140127171238id_/http://www.iana.org/')
+        resp = resp.follow()
+
+        self._assert_raw_memento(resp)
+
+    def _assert_redir_to_rewritten(self, resp, fmod):
+        self._assert_pref_headers(resp, 'rewritten')
+
+        assert resp.location.endswith('/pywb/20140127171238{0}/http://www.iana.org/'.format(fmod))
+        resp = resp.follow()
+
+        self._assert_rewritten(resp, fmod)
+
+    def test_prefer_redir_timegate_raw(self, fmod):
+        headers = {'Prefer': 'raw'}
+        fmod_slash = fmod + '/' if fmod else ''
+        resp = self.get('/pywb/{0}http://www.iana.org/', fmod_slash, headers=headers, status=307)
+
+        self._assert_redir_to_raw(resp)
+
+    def test_prefer_redir_timegate_rewritten(self, fmod):
+        headers = {'Prefer': 'rewritten'}
+        fmod_slash = fmod + '/' if fmod else ''
+        resp = self.get('/pywb/{0}http://www.iana.org/', fmod_slash, headers=headers, status=307)
+
+        self._assert_redir_to_rewritten(resp, fmod)
+
+    def test_prefer_redir_memento_to_raw(self, fmod):
+        headers = {'Prefer': 'raw'}
+        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=307)
+
+        self._assert_redir_to_raw(resp)
+
+    def test_prefer_redir_memento_redir_to_rewritten_diff_mod(self, fmod):
+        headers = {'Prefer': 'rewritten'}
+        resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=307)
+
+        self._assert_redir_to_rewritten(resp, fmod)
+
+    def test_prefer_redir_memento_matches_rewritten(self, fmod):
+        headers = {'Prefer': 'rewritten'}
+        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=200)
+
+        self._assert_rewritten(resp, fmod)
+
+    def test_prefer_redir_memento_matches_raw(self):
+        headers = {'Prefer': 'raw'}
+        resp = self.testapp.get('/pywb/20140127171238id_/http://www.iana.org/', headers=headers, status=200)
+
+        self._assert_raw_memento(resp)
+
+    def test_prefer_redir_invalid(self, fmod):
+        headers = {'Prefer': 'unknown'}
+        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=400)
+
+
+# ============================================================================
+class TestPreferWithNoRedirects(BaseConfigTest):
+    """Prefer header handling where matching responses are served directly (status 200)."""
+    @classmethod
+    def setup_class(cls):
+        super(TestPreferWithNoRedirects, cls).setup_class('config_test.yaml',
+                                                          custom_config={'enable_prefer': True})
+
+    def _assert_pref_headers(self, resp, pref):
+        assert resp.headers['Preference-Applied'] == pref
+        assert 'Prefer' in resp.headers['Vary']
+
+    def _assert_raw(self, resp):
+        self._assert_pref_headers(resp, 'raw')
+        assert '"/time-zones"' in resp.text, resp.text
+        assert 'wombat.js' not in resp.text
+
+        assert resp.headers['Content-Location'].endswith('/pywb/20140127171238id_/http://www.iana.org/')
+
+    def _assert_rewritten(self, resp, fmod):
+        self._assert_pref_headers(resp, 'rewritten')
+
+        assert '"20140127171238"' in resp.text
+        assert 'wombat.js' in resp.text
+        assert 'new _WBWombat' in resp.text, resp.text
+
+        assert resp.headers['Content-Location'].endswith('/pywb/20140127171238{0}/http://www.iana.org/'.format(fmod))
+
+    def test_prefer_timegate_raw(self, fmod):
+        headers = {'Prefer': 'raw'}
+        fmod_slash = fmod + '/' if fmod else ''
+        resp = self.get('/pywb/{0}http://www.iana.org/', fmod_slash, headers=headers, status=200)
+
+        self._assert_raw(resp)
+
+    def test_prefer_timegate_rewritten(self, fmod):
+        headers = {'Prefer': 'rewritten'}
+        fmod_slash = fmod + '/' if fmod else ''
+        resp = self.get('/pywb/{0}http://www.iana.org/', fmod_slash, headers=headers, status=200)
+
+        assert '/pywb/{0}http://www.iana.org/time-zones"'.format(fmod_slash) in resp.text
+        self._assert_rewritten(resp, fmod)
+
+    def test_prefer_memento_raw(self, fmod):
+        headers = {'Prefer': 'raw'}
+        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=200)
+
+        self._assert_raw(resp)
+
+    def test_prefer_memento_rewritten(self, fmod):
+        headers = {'Prefer': 'rewritten'}
+        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=200)
+
+        self._assert_rewritten(resp, fmod)
+
+    def test_prefer_memento_raw_id_mod(self):
+        headers = {'Prefer': 'raw'}
+        resp = self.testapp.get('/pywb/20140127171238id_/http://www.iana.org/', headers=headers, status=200)
+
+        self._assert_raw(resp)
+
+    def test_prefer_memento_rewritten_id_mod(self, fmod):
+        headers = {'Prefer': 'rewritten'}
+        resp = self.get('/pywb/20140127171238id_/http://www.iana.org/', fmod, headers=headers, status=200)
+
+        self._assert_rewritten(resp, fmod)
+
+    def test_prefer_memento_rewritten_diff_mod(self, fmod):
+        headers = {'Prefer': 'raw'}
+        resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=200)
+
+        self._assert_raw(resp)
+
+    def test_prefer_invalid(self, fmod):
+        headers = {'Prefer': 'unknown'}
+        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=400)
+
+