mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
proxy mode and prefer header: (ukwa/ukwa-pywb#16)
- fix proxy mode when 'redirect_to_exact=True' is set in config, don't redirect in proxy mode
- more general prefer support, moved to content_rewriter to support preference<->mod mappings
- add 'banner-only' preference mapped to bn_ modifier
- proxy mode: allow 'raw' and 'banner-only' preferences
- proxy mode: 'Prefer: rewritten' forced to 'banner-only', served with 'Preference-Applied: banner-only'
- tests: test proxy with prefer header, 'redirect_to_exact=True', add 'banner-only' to Prefer header tests in rewriting mode
This commit is contained in:
parent
a301dda0fb
commit
871cef26a8
@ -61,9 +61,6 @@ class RewriterApp(object):
|
||||
self.replay_mod = ''
|
||||
|
||||
self.enable_prefer = self.config.get('enable_prefer', False)
|
||||
self.prefs = {'raw': 'id_',
|
||||
'rewritten': self.replay_mod
|
||||
}
|
||||
|
||||
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
|
||||
config=config)
|
||||
@ -144,19 +141,25 @@ class RewriterApp(object):
|
||||
|
||||
return is_timegate
|
||||
|
||||
def _get_prefer_mod(self, wb_url, environ):
|
||||
def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
|
||||
if not self.enable_prefer:
|
||||
return None, None
|
||||
|
||||
prefer = environ.get('HTTP_PREFER')
|
||||
if not prefer:
|
||||
return None, 'raw' if wb_url.is_identity else 'rewritten'
|
||||
return None, content_rw.mod_to_prefer(wb_url.mod)
|
||||
|
||||
try:
|
||||
return self.prefs[prefer], prefer
|
||||
except:
|
||||
mod = content_rw.prefer_to_mod(prefer)
|
||||
|
||||
if mod is None:
|
||||
raise UpstreamException(400, url=wb_url.url, details='Invalid Prefer: ' + prefer)
|
||||
|
||||
if is_proxy and mod == self.replay_mod:
|
||||
mod = 'bn_'
|
||||
prefer = content_rw.mod_to_prefer('bn_')
|
||||
|
||||
return mod, prefer
|
||||
|
||||
def _check_range(self, inputreq, wb_url):
|
||||
skip_record = False
|
||||
range_start = None
|
||||
@ -261,15 +264,24 @@ class RewriterApp(object):
|
||||
'pywb.static_prefix', '/static/')
|
||||
is_proxy = ('wsgiprox.proxy_host' in environ)
|
||||
|
||||
if self.use_js_obj_proxy:
|
||||
content_rw = self.js_proxy_rw
|
||||
else:
|
||||
content_rw = self.default_rw
|
||||
|
||||
# no redirects if in proxy
|
||||
redirect_to_exact = self.redirect_to_exact and not is_proxy
|
||||
|
||||
# Check Prefer
|
||||
pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ)
|
||||
pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
|
||||
content_rw, is_proxy)
|
||||
|
||||
response = None
|
||||
|
||||
# prefer overrides custom response?
|
||||
if pref_mod is not None:
|
||||
# fast-redirect to preferred
|
||||
if self.redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
|
||||
if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
|
||||
new_url = full_prefix + wb_url.to_str(mod=pref_mod)
|
||||
headers = [('Preference-Applied', pref_applied),
|
||||
('Vary', 'Prefer')]
|
||||
@ -309,11 +321,6 @@ class RewriterApp(object):
|
||||
|
||||
urlkey = canonicalize(wb_url.url)
|
||||
|
||||
if self.use_js_obj_proxy:
|
||||
content_rw = self.js_proxy_rw
|
||||
else:
|
||||
content_rw = self.default_rw
|
||||
|
||||
inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)
|
||||
|
||||
inputreq.include_method_query(wb_url.url)
|
||||
@ -380,7 +387,7 @@ class RewriterApp(object):
|
||||
set_content_loc = True
|
||||
|
||||
# if redir to exact, redir if url or ts are different
|
||||
if self.redirect_to_exact:
|
||||
if redirect_to_exact:
|
||||
if (set_content_loc or
|
||||
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
|
||||
|
||||
@ -465,7 +472,7 @@ class RewriterApp(object):
|
||||
|
||||
set_content_loc = True
|
||||
|
||||
if set_content_loc and not self.redirect_to_exact:
|
||||
if set_content_loc and not redirect_to_exact and not is_proxy:
|
||||
status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
|
||||
url=cdx['url'])))
|
||||
|
||||
|
@ -52,6 +52,23 @@ class BaseContentRewriter(object):
|
||||
self.load_rules(rules_file)
|
||||
self.replay_mod = replay_mod
|
||||
|
||||
self._mod_to_pref = {}
|
||||
self._pref_to_mod = {}
|
||||
|
||||
def add_prefer_mod(self, pref, mod):
|
||||
self._mod_to_pref[mod] = pref
|
||||
self._pref_to_mod[pref] = mod
|
||||
|
||||
def mod_to_prefer(self, mod):
|
||||
pref = self._mod_to_pref.get(mod)
|
||||
if not pref:
|
||||
pref = self._mod_to_pref.get(self.replay_mod)
|
||||
|
||||
return pref
|
||||
|
||||
def prefer_to_mod(self, pref):
|
||||
return self._pref_to_mod.get(pref)
|
||||
|
||||
def add_rewriter(self, rw):
|
||||
self.all_rewriters[rw.name] = rw
|
||||
|
||||
|
@ -98,9 +98,14 @@ class DefaultRewriter(BaseContentRewriter):
|
||||
def __init__(self, replay_mod='', config=None):
|
||||
config = config or {}
|
||||
rules_file = config.get('rules_file', DEFAULT_RULES_FILE)
|
||||
|
||||
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
||||
self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)
|
||||
|
||||
self.add_prefer_mod('raw', 'id_')
|
||||
self.add_prefer_mod('banner-only', 'bn_')
|
||||
self.add_prefer_mod('rewritten', replay_mod)
|
||||
|
||||
def init_js_regex(self, regexs):
|
||||
return RegexRewriter.parse_rules_from_config(regexs)
|
||||
|
||||
|
@ -39,10 +39,21 @@ class BasePreferTests(BaseConfigTest):
|
||||
assert '"/time-zones"' in resp.text, resp.text
|
||||
assert 'wombat.js' not in resp.text
|
||||
|
||||
def _assert_banner_only(self, resp):
|
||||
self._assert_pref_headers(resp, 'banner-only')
|
||||
|
||||
assert '"20140127171238"' in resp.text
|
||||
assert 'WB Insert' in resp.text
|
||||
|
||||
assert 'wombat.js' not in resp.text
|
||||
assert 'new _WBWombat' not in resp.text, resp.text
|
||||
|
||||
def _assert_rewritten(self, resp):
|
||||
self._assert_pref_headers(resp, 'rewritten')
|
||||
|
||||
assert '"20140127171238"' in resp.text
|
||||
assert 'WB Insert' in resp.text
|
||||
|
||||
assert 'wombat.js' in resp.text
|
||||
assert 'new _WBWombat' in resp.text, resp.text
|
||||
|
||||
@ -66,6 +77,14 @@ class TestPreferWithRedirects(BasePreferTests):
|
||||
|
||||
self._assert_raw(resp)
|
||||
|
||||
def _assert_redir_to_banner_only(self, resp):
|
||||
self._assert_pref_headers(resp, 'banner-only')
|
||||
|
||||
assert resp.location.endswith('/pywb/20140127171238bn_/http://www.iana.org/')
|
||||
resp = resp.follow()
|
||||
|
||||
self._assert_banner_only(resp)
|
||||
|
||||
def _assert_redir_to_rewritten(self, resp, fmod):
|
||||
self._assert_pref_headers(resp, 'rewritten')
|
||||
|
||||
@ -80,6 +99,12 @@ class TestPreferWithRedirects(BasePreferTests):
|
||||
|
||||
self._assert_redir_to_raw(resp)
|
||||
|
||||
def test_prefer_redir_timegate_banner_only(self, fmod):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=307)
|
||||
|
||||
self._assert_redir_to_banner_only(resp)
|
||||
|
||||
def test_prefer_redir_timegate_rewritten(self, fmod):
|
||||
headers = {'Prefer': 'rewritten'}
|
||||
resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=307)
|
||||
@ -92,6 +117,18 @@ class TestPreferWithRedirects(BasePreferTests):
|
||||
|
||||
self._assert_redir_to_raw(resp)
|
||||
|
||||
def test_prefer_redir_memento_to_banner_only(self, fmod):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=307)
|
||||
|
||||
self._assert_redir_to_banner_only(resp)
|
||||
|
||||
def test_prefer_redir_memento_redir_to_banner_only_diff_mod(self, fmod):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=307)
|
||||
|
||||
self._assert_redir_to_banner_only(resp)
|
||||
|
||||
def test_prefer_redir_memento_redir_to_rewritten_diff_mod(self, fmod):
|
||||
headers = {'Prefer': 'rewritten'}
|
||||
resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=307)
|
||||
@ -121,6 +158,12 @@ class TestPreferWithRedirects(BasePreferTests):
|
||||
|
||||
self._assert_raw(resp)
|
||||
|
||||
def test_prefer_redir_memento_matches_banner_only(self):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.testapp.get('/pywb/20140127171238bn_/http://www.iana.org/', headers=headers, status=200)
|
||||
|
||||
self._assert_banner_only(resp)
|
||||
|
||||
def test_prefer_redir_invalid(self, fmod):
|
||||
headers = {'Prefer': 'unknown'}
|
||||
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=400)
|
||||
@ -143,12 +186,25 @@ class TestPreferWithNoRedirects(BasePreferTests):
|
||||
|
||||
assert resp.headers['Content-Location'].endswith(self.format('/pywb/20140127171238{0}/http://www.iana.org/', fmod[0]))
|
||||
|
||||
def _assert_banner_only(self, resp):
|
||||
super(TestPreferWithNoRedirects, self)._assert_banner_only(resp)
|
||||
|
||||
assert resp.headers['Content-Location'].endswith('/pywb/20140127171238bn_/http://www.iana.org/')
|
||||
|
||||
def test_prefer_timegate_raw(self, fmod):
|
||||
headers = {'Prefer': 'raw'}
|
||||
resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=200)
|
||||
|
||||
assert '"/time-zones"' in resp.text
|
||||
self._assert_raw(resp)
|
||||
|
||||
def test_prefer_timegate_banner_only(self, fmod):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=200)
|
||||
|
||||
assert '"/time-zones"' in resp.text
|
||||
self._assert_banner_only(resp)
|
||||
|
||||
def test_prefer_timegate_rewritten(self, fmod):
|
||||
headers = {'Prefer': 'rewritten'}
|
||||
resp = self.get('/pywb/{0}http://www.iana.org/', fmod, with_slash=True, headers=headers, status=200)
|
||||
@ -162,6 +218,12 @@ class TestPreferWithNoRedirects(BasePreferTests):
|
||||
|
||||
self._assert_raw(resp)
|
||||
|
||||
def test_prefer_memento_banner_only(self, fmod):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=200)
|
||||
|
||||
self._assert_banner_only(resp)
|
||||
|
||||
def test_prefer_memento_rewritten(self, fmod):
|
||||
headers = {'Prefer': 'rewritten'}
|
||||
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=200)
|
||||
@ -175,18 +237,30 @@ class TestPreferWithNoRedirects(BasePreferTests):
|
||||
|
||||
self._assert_raw(resp)
|
||||
|
||||
def test_prefer_memento_rewritten_id_mod(self, fmod):
|
||||
def test_prefer_memento_rewritten_from_id_mod(self, fmod):
|
||||
headers = {'Prefer': 'rewritten'}
|
||||
resp = self.get('/pywb/20140127171238id_/http://www.iana.org/', fmod, headers=headers, status=200)
|
||||
|
||||
self._assert_rewritten(resp, fmod)
|
||||
|
||||
def test_prefer_memento_banner_only_no_mod(self, fmod):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.get('/pywb/20140127171238/http://www.iana.org/', fmod, headers=headers, status=200)
|
||||
|
||||
self._assert_banner_only(resp)
|
||||
|
||||
def test_prefer_memento_rewritten_diff_mod(self, fmod):
|
||||
headers = {'Prefer': 'raw'}
|
||||
resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=200)
|
||||
|
||||
self._assert_raw(resp)
|
||||
|
||||
def test_prefer_memento_banner_only_diff_mod(self, fmod):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
resp = self.get('/pywb/20140127171238js_/http://www.iana.org/', fmod, headers=headers, status=200)
|
||||
|
||||
self._assert_banner_only(resp)
|
||||
|
||||
def test_prefer_invalid(self, fmod):
|
||||
headers = {'Prefer': 'unknown'}
|
||||
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod, headers=headers, status=400)
|
||||
|
@ -81,6 +81,7 @@ class TestProxy(BaseTestProxy):
|
||||
|
||||
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
|
||||
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
assert 'Content-Location' not in res.headers
|
||||
|
||||
def test_proxy_replay_change_dt(self, scheme):
|
||||
headers = {'Accept-Datetime': 'Mon, 26 Dec 2011 17:12:51 GMT'}
|
||||
@ -138,6 +139,90 @@ class TestProxyDefaultDate(BaseTestProxy):
|
||||
|
||||
assert res.headers['Link'] == '<http://test@example.com/>; rel="memento"; datetime="Mon, 29 Jul 2013 19:51:51 GMT"; collection="pywb"'
|
||||
assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
|
||||
assert 'Content-Location' not in res.headers
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class TestRedirectClassicProxy(TestProxy):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super(TestRedirectClassicProxy, cls).setup_class('pywb', config_file='config_test_redirect_classic.yaml')
|
||||
|
||||
def test_proxy_replay_prefer_raw(self, scheme):
|
||||
headers = {'Prefer': 'raw'}
|
||||
res = requests.get('{0}://example.com/'.format(scheme),
|
||||
proxies=self.proxies,
|
||||
headers=headers,
|
||||
verify=self.root_ca_file)
|
||||
|
||||
# no rewriting
|
||||
assert 'WB Insert' not in res.text
|
||||
assert 'wbinfo' not in res.text
|
||||
|
||||
# content
|
||||
assert 'Example Domain' in res.text
|
||||
|
||||
# no wombat.js
|
||||
assert 'wombat.js' not in res.text
|
||||
|
||||
# no redirect check
|
||||
assert 'window == window.top' not in res.text
|
||||
|
||||
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
|
||||
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
assert res.headers['Preference-Applied'] == 'raw'
|
||||
assert 'Content-Location' not in res.headers
|
||||
|
||||
def test_proxy_replay_prefer_rewritten_to_banner_only(self, scheme):
|
||||
headers = {'Prefer': 'rewritten'}
|
||||
res = requests.get('{0}://example.com/'.format(scheme),
|
||||
proxies=self.proxies,
|
||||
headers=headers,
|
||||
verify=self.root_ca_file)
|
||||
|
||||
assert 'WB Insert' in res.text
|
||||
assert 'Example Domain' in res.text
|
||||
|
||||
# no wombat.js
|
||||
assert 'wombat.js' not in res.text
|
||||
|
||||
# no redirect check
|
||||
assert 'window == window.top' not in res.text
|
||||
|
||||
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
|
||||
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
assert res.headers['Preference-Applied'] == 'banner-only'
|
||||
assert 'Content-Location' not in res.headers
|
||||
|
||||
def test_proxy_replay_prefer_banner_only(self, scheme):
|
||||
headers = {'Prefer': 'banner-only'}
|
||||
res = requests.get('{0}://example.com/'.format(scheme),
|
||||
proxies=self.proxies,
|
||||
headers=headers,
|
||||
verify=self.root_ca_file)
|
||||
|
||||
assert 'WB Insert' in res.text
|
||||
assert 'Example Domain' in res.text
|
||||
|
||||
# no wombat.js
|
||||
assert 'wombat.js' not in res.text
|
||||
|
||||
# no redirect check
|
||||
assert 'window == window.top' not in res.text
|
||||
|
||||
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
|
||||
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
|
||||
assert res.headers['Preference-Applied'] == 'banner-only'
|
||||
|
||||
def test_proxy_replay_prefer_invalid(self, scheme):
|
||||
headers = {'Prefer': 'invalid'}
|
||||
res = requests.get('{0}://example.com/'.format(scheme),
|
||||
proxies=self.proxies,
|
||||
headers=headers,
|
||||
verify=self.root_ca_file)
|
||||
|
||||
assert 'Preference-Applied' not in res.headers
|
||||
assert res.status_code == 400
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
Loading…
x
Reference in New Issue
Block a user