From 871cef26a896101ab1c3bfaf5f950185dc5e1852 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 27 Feb 2018 23:06:08 -0800 Subject: [PATCH] proxy mode and prefer header: (ukwa/ukwa-pywb#16) - fix proxy mode when 'redirect_to_exact=True' is set config, don't redirect in proxy mode - more general prefer support, moved to content_rewriter to support preference<->mod mappings - add 'banner-only' preference mapped to bn_ modifier - proxy mode: allow 'raw' and 'banner-only' preferences - proxy mode: 'Prefer: rewritten' forced to 'banner-only', served with 'Preference-Applied: banner-only' - tests: test proxy with prefer header, 'redirect_to_exact=True', add 'banner-only' to Prefer header tests in rewriting mode --- pywb/apps/rewriterapp.py | 41 ++++++++------- pywb/rewrite/content_rewriter.py | 17 +++++++ pywb/rewrite/default_rewriter.py | 5 ++ tests/test_prefer_header.py | 76 +++++++++++++++++++++++++++- tests/test_proxy.py | 85 ++++++++++++++++++++++++++++++++ 5 files changed, 206 insertions(+), 18 deletions(-) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 5c0ca1cc..e41b5f3c 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -61,9 +61,6 @@ class RewriterApp(object): self.replay_mod = '' self.enable_prefer = self.config.get('enable_prefer', False) - self.prefs = {'raw': 'id_', - 'rewritten': self.replay_mod - } self.default_rw = DefaultRewriter(replay_mod=self.replay_mod, config=config) @@ -144,19 +141,25 @@ class RewriterApp(object): return is_timegate - def _get_prefer_mod(self, wb_url, environ): + def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy): if not self.enable_prefer: return None, None prefer = environ.get('HTTP_PREFER') if not prefer: - return None, 'raw' if wb_url.is_identity else 'rewritten' + return None, content_rw.mod_to_prefer(wb_url.mod) - try: - return self.prefs[prefer], prefer - except: + mod = content_rw.prefer_to_mod(prefer) + + if mod is None: raise UpstreamException(400, url=wb_url.url, details='Invalid Prefer: ' + prefer) + if is_proxy and mod == self.replay_mod: + mod = 'bn_' + prefer = content_rw.mod_to_prefer('bn_') + + return mod, prefer + def _check_range(self, inputreq, wb_url): skip_record = False range_start = None @@ -261,15 +264,24 @@ class RewriterApp(object): 'pywb.static_prefix', '/static/') is_proxy = ('wsgiprox.proxy_host' in environ) + if self.use_js_obj_proxy: + content_rw = self.js_proxy_rw + else: + content_rw = self.default_rw + + # no redirects if in proxy + redirect_to_exact = self.redirect_to_exact and not is_proxy + # Check Prefer - pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ) + pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ, + content_rw, is_proxy) response = None # prefer overrides custom response? if pref_mod is not None: # fast-redirect to preferred - if self.redirect_to_exact and not is_timegate and pref_mod != wb_url.mod: + if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod: new_url = full_prefix + wb_url.to_str(mod=pref_mod) headers = [('Preference-Applied', pref_applied), ('Vary', 'Prefer')] @@ -309,11 +321,6 @@ class RewriterApp(object): urlkey = canonicalize(wb_url.url) - if self.use_js_obj_proxy: - content_rw = self.js_proxy_rw - else: - content_rw = self.default_rw - inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw) inputreq.include_method_query(wb_url.url) @@ -380,7 +387,7 @@ class RewriterApp(object): set_content_loc = True # if redir to exact, redir if url or ts are different - if self.redirect_to_exact: + if redirect_to_exact: if (set_content_loc or (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))): @@ -465,7 +472,7 @@ class RewriterApp(object): set_content_loc = True - if set_content_loc and not self.redirect_to_exact: + if set_content_loc and not redirect_to_exact and not is_proxy: status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'], url=cdx['url']))) diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 8b8d0593..b5fc0b75 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -52,6 +52,23 @@ class BaseContentRewriter(object): self.load_rules(rules_file) self.replay_mod = replay_mod + self._mod_to_pref = {} + self._pref_to_mod = {} + + def add_prefer_mod(self, pref, mod): + self._mod_to_pref[mod] = pref + self._pref_to_mod[pref] = mod + + def mod_to_prefer(self, mod): + pref = self._mod_to_pref.get(mod) + if not pref: + pref = self._mod_to_pref.get(self.replay_mod) + + return pref + + def prefer_to_mod(self, pref): + return self._pref_to_mod.get(pref) + def add_rewriter(self, rw): self.all_rewriters[rw.name] = rw diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py index f93eb065..f05ecc70 100644 --- a/pywb/rewrite/default_rewriter.py +++ b/pywb/rewrite/default_rewriter.py @@ -98,9 +98,14 @@ class DefaultRewriter(BaseContentRewriter): def __init__(self, replay_mod='', config=None): config = config or {} rules_file = config.get('rules_file', DEFAULT_RULES_FILE) + super(DefaultRewriter, self).__init__(rules_file, replay_mod) self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS) + self.add_prefer_mod('raw', 'id_') + self.add_prefer_mod('banner-only', 'bn_') + self.add_prefer_mod('rewritten', replay_mod) + def init_js_regex(self, regexs): return RegexRewriter.parse_rules_from_config(regexs) diff --git a/tests/test_prefer_header.py b/tests/test_prefer_header.py index 22279c6f..51f103f3 100644 --- a/tests/test_prefer_header.py +++ b/tests/test_prefer_header.py @@ -39,10 +39,21 @@ class BasePreferTests(BaseConfigTest): assert '"/time-zones"' in resp.text, resp.text assert 'wombat.js' not in resp.text + def _assert_banner_only(self, resp): + self._assert_pref_headers(resp, 'banner-only') + + assert '"20140127171238"' in resp.text + assert 'WB Insert' in resp.text + + assert 'wombat.js' not in resp.text + assert 'new _WBWombat' not in resp.text, resp.text + def _assert_rewritten(self, resp): self._assert_pref_headers(resp, 'rewritten') assert '"20140127171238"' in resp.text + assert 'WB Insert' in resp.text + assert 'wombat.js' in resp.text assert 'new _WBWombat' in resp.text, resp.text @@ -66,6 +77,14 @@ class TestPreferWithRedirects(BasePreferTests): self._assert_raw(resp) + def _assert_redir_to_banner_only(self, resp): + self._assert_pref_headers(resp, 'banner-only') + + assert resp.location.endswith('/pywb/20140127171238bn_/http://www.iana.org/') + resp = resp.follow() + + self._assert_banner_only(resp) + def _assert_redir_to_rewritten(self, resp, fmod): self._assert_pref_headers(resp, 'rewritten') @@ -80,6 +99,12 @@ class TestPreferWithRedirects(BasePreferTests): self._assert_redir_to_raw(resp) + def test_prefer_redir_timegate_banner_only(self, fmod): + headers = {'Prefer': 'banner-only'} + resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=307) + + self._assert_redir_to_banner_only(resp) + def test_prefer_redir_timegate_rewritten(self, fmod): headers = {'Prefer': 'rewritten'} resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=307) @@ -92,6 +117,18 @@ class TestPreferWithRedirects(BasePreferTests): self._assert_redir_to_raw(resp) + def test_prefer_redir_memento_to_banner_only(self, fmod): + headers = {'Prefer': 'banner-only'} + resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=307) + + self._assert_redir_to_banner_only(resp) + + def test_prefer_redir_memento_redir_to_banner_only_diff_mod(self, fmod): + headers = {'Prefer': 'banner-only'} + resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=307) + + self._assert_redir_to_banner_only(resp) + def test_prefer_redir_memento_redir_to_rewritten_diff_mod(self, fmod): headers = {'Prefer': 'rewritten'} resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=307) @@ -121,6 +158,12 @@ class TestPreferWithRedirects(BasePreferTests): self._assert_raw(resp) + def test_prefer_redir_memento_matches_banner_only(self): + headers = {'Prefer': 'banner-only'} + resp = self.testapp.get('/pywb/20140127171238bn_/http://www.iana.org/', headers=headers, status=200) + + self._assert_banner_only(resp) + def test_prefer_redir_invalid(self, fmod): headers = {'Prefer': 'unknown'} resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=400) @@ -143,12 +186,25 @@ class TestPreferWithNoRedirects(BasePreferTests): assert resp.headers['Content-Location'].endswith(self.format('/pywb/20140127171238{0}/http://www.iana.org/', fmod[0])) + def _assert_banner_only(self, resp): + super(TestPreferWithNoRedirects, self)._assert_banner_only(resp) + + assert resp.headers['Content-Location'].endswith('/pywb/20140127171238bn_/http://www.iana.org/') + def test_prefer_timegate_raw(self, fmod): headers = {'Prefer': 'raw'} resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=200) + assert '"/time-zones"' in resp.text self._assert_raw(resp) + def test_prefer_timegate_banner_only(self, fmod): + headers = {'Prefer': 'banner-only'} + resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=200) + + assert '"/time-zones"' in resp.text + self._assert_banner_only(resp) + def test_prefer_timegate_rewritten(self, fmod): headers = {'Prefer': 'rewritten'} resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=200) @@ -162,6 +218,12 @@ class TestPreferWithNoRedirects(BasePreferTests): self._assert_raw(resp) + def test_prefer_memento_banner_only(self, fmod): + headers = {'Prefer': 'banner-only'} + resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=200) + + self._assert_banner_only(resp) + def test_prefer_memento_rewritten(self, fmod): headers = {'Prefer': 'rewritten'} resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=200) @@ -175,18 +237,30 @@ class TestPreferWithNoRedirects(BasePreferTests): self._assert_raw(resp) - def test_prefer_memento_rewritten_id_mod(self, fmod): + def test_prefer_memento_rewritten_from_id_mod(self, fmod): headers = {'Prefer': 'rewritten'} resp = self.get('/pywb/20140127171238id_/http://www.iana.org/', fmod, headers=headers, status=200) self._assert_rewritten(resp, fmod) + def test_prefer_memento_banner_only_no_mod(self, fmod): + headers = {'Prefer': 'banner-only'} + resp = self.get('/pywb/20140127171238/http://www.iana.org/', fmod, headers=headers, status=200) + + self._assert_banner_only(resp) + def test_prefer_memento_rewritten_diff_mod(self, fmod): headers = {'Prefer': 'raw'} resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=200) self._assert_raw(resp) + def test_prefer_memento_banner_only_diff_mod(self, fmod): + headers = {'Prefer': 'banner-only'} + resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=200) + + self._assert_banner_only(resp) + def test_prefer_invalid(self, fmod): headers = {'Prefer': 'unknown'} resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=400) diff --git a/tests/test_proxy.py b/tests/test_proxy.py index ba34d140..e013a04c 100644 --- a/tests/test_proxy.py +++ b/tests/test_proxy.py @@ -81,6 +81,7 @@ class TestProxy(BaseTestProxy): assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"' assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' + assert 'Content-Location' not in res.headers def test_proxy_replay_change_dt(self, scheme): headers = {'Accept-Datetime': 'Mon, 26 Dec 2011 17:12:51 GMT'} @@ -138,6 +139,90 @@ class TestProxyDefaultDate(BaseTestProxy): assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 29 Jul 2013 19:51:51 GMT"; collection="pywb"' assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT' + assert 'Content-Location' not in res.headers + + +# ============================================================================ +class TestRedirectClassicProxy(TestProxy): + @classmethod + def setup_class(cls): + super(TestRedirectClassicProxy, cls).setup_class('pywb', config_file='config_test_redirect_classic.yaml') + + def test_proxy_replay_prefer_raw(self, scheme): + headers = {'Prefer': 'raw'} + res = requests.get('{0}://example.com/'.format(scheme), + proxies=self.proxies, + headers=headers, + verify=self.root_ca_file) + + # no rewriting + assert 'WB Insert' not in res.text + assert 'wbinfo' not in res.text + + # content + assert 'Example Domain' in res.text + + # no wombat.js + assert 'wombat.js' not in res.text + + # no redirect check + assert 'window == window.top' not in res.text + + assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"' + assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' + assert res.headers['Preference-Applied'] == 'raw' + assert 'Content-Location' not in res.headers + + def test_proxy_replay_prefer_rewritten_to_banner_only(self, scheme): + headers = {'Prefer': 'rewritten'} + res = requests.get('{0}://example.com/'.format(scheme), + proxies=self.proxies, + headers=headers, + verify=self.root_ca_file) + + assert 'WB Insert' in res.text + assert 'Example Domain' in res.text + + # no wombat.js + assert 'wombat.js' not in res.text + + # no redirect check + assert 'window == window.top' not in res.text + + assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"' + assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' + assert res.headers['Preference-Applied'] == 'banner-only' + assert 'Content-Location' not in res.headers + + def test_proxy_replay_prefer_banner_only(self, scheme): + headers = {'Prefer': 'banner-only'} + res = requests.get('{0}://example.com/'.format(scheme), + proxies=self.proxies, + headers=headers, + verify=self.root_ca_file) + + assert 'WB Insert' in res.text + assert 'Example Domain' in res.text + + # no wombat.js + assert 'wombat.js' not in res.text + + # no redirect check + assert 'window == window.top' not in res.text + + assert res.headers['Link'] == '; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"' + assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT' + assert res.headers['Preference-Applied'] == 'banner-only' + + def test_proxy_replay_prefer_invalid(self, scheme): + headers = {'Prefer': 'invalid'} + res = requests.get('{0}://example.com/'.format(scheme), + proxies=self.proxies, + headers=headers, + verify=self.root_ca_file) + + assert 'Preference-Applied' not in res.headers + assert res.status_code == 400 # ============================================================================