1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

support for 'classic' pywb features and misc improvements: (#261)

* support for 'classic' pywb features and misc improvements:
- add support for redirect to exact timestamp mode via 'redirect_to_exact: true' config setting
- tests: ensure memento headers added for redirect-to-exact
- memento: ensure Link header added for intermediate resources, check for 'enable_memento' before adding
- config: config passed to head_insert template as 'config'
- insert legacy 'vidrw.js' script if 'enable_flash_video_rewrite' config is set to true
- config: use_js_obj_proxy now defaults to true
- memento/tests: add proxy with custom accept-datetime test
This commit is contained in:
Ilya Kreymer 2017-10-23 17:13:48 -07:00 committed by GitHub
parent 459cd706d3
commit 4b60dd5dda
6 changed files with 157 additions and 17 deletions

View File

@ -75,6 +75,8 @@ class RewriterApp(object):
self.jinja_env = jinja_env
self.redirect_to_exact = config.get('redirect_to_exact')
self.banner_view = BaseInsertView(self.jinja_env, self._html_templ('banner_html'))
self.head_insert_view = HeadInsertView(self.jinja_env,
@ -89,7 +91,7 @@ class RewriterApp(object):
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))
self.use_js_obj_proxy = config.get('use_js_obj_proxy', False)
self.use_js_obj_proxy = config.get('use_js_obj_proxy', True)
self.cookie_tracker = None
@ -167,9 +169,14 @@ class RewriterApp(object):
scheme, netloc, path, query, frag = url_parts
path = '/'
url = urlunsplit((scheme, netloc, path, query, frag))
return WbResponse.redir_response(urlrewriter.rewrite(url),
resp = WbResponse.redir_response(urlrewriter.rewrite(url),
'307 Temporary Redirect')
if self.enable_memento:
resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')
return resp
self.unrewrite_referrer(environ, full_prefix)
urlkey = canonicalize(wb_url.url)
@ -263,8 +270,27 @@ class RewriterApp(object):
if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
set_content_loc = True
# return WbResponse.redir_response(urlrewriter.rewrite(target_uri),
# '307 Temporary Redirect')
# if redir to exact, redir if url or ts are different
if self.redirect_to_exact:
if (set_content_loc or
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
new_url = urlrewriter.get_new_url(url=target_uri,
timestamp=cdx['timestamp'],
mod=wb_url.mod)
resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
if self.enable_memento:
if is_timegate and not is_proxy:
self._add_memento_links(target_uri, full_prefix,
memento_dt, cdx['timestamp'],
resp.status_headers,
is_timegate, is_proxy)
else:
resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')
return resp
self._add_custom_params(cdx, r.headers, kwargs)
@ -290,7 +316,8 @@ class RewriterApp(object):
host_prefix,
top_url,
environ,
framed_replay))
framed_replay,
config=self.config))
cookie_rewriter = None
if self.cookie_tracker:
@ -315,10 +342,9 @@ class RewriterApp(object):
set_content_loc = True
if set_content_loc:
if set_content_loc and not self.redirect_to_exact:
status_headers.headers.append(('Content-Location', urlrewriter.get_new_url(timestamp=cdx['timestamp'],
url=cdx['url'])))
if not is_proxy:
self.add_csp_header(wb_url, status_headers)
@ -339,8 +365,9 @@ class RewriterApp(object):
response = WbResponse.text_response(response, content_type=content_type)
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
response.status_headers, is_timegate, is_proxy)
if self.enable_memento:
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
response.status_headers, is_timegate, is_proxy)
return response
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,

View File

@ -153,15 +153,16 @@ class HeadInsertView(BaseInsertView):
env,
is_framed,
coll='',
include_ts=True):
include_ts=True,
**kwargs):
params = {'host_prefix': host_prefix,
'wb_prefix': wb_prefix,
'wb_url': wb_url,
'coll': coll,
'is_framed': 'true' if is_framed else 'false',
'top_url': top_url,
}
params = kwargs
params['host_prefix'] = host_prefix
params['wb_prefix'] = wb_prefix
params['wb_url'] = wb_url
params['top_url'] = top_url
params['coll'] = coll
params['is_framed'] = 'true' if is_framed else 'false'
def make_head_insert(rule, cdx):
params['wombat_ts'] = cdx['timestamp'] if include_ts else ''

View File

@ -34,6 +34,10 @@
</script>
{% if config.enable_flash_video_rewrite %}
<script src='{{ host_prefix }}/{{ static_path }}/vidrw.js'> </script>
{% endif %}
{{ banner_html }}
<!-- End WB Insert -->

View File

@ -0,0 +1,18 @@
# config similar to old pywb setup
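# - redirect requests to the exact capture timestamp (redirect_to_exact)
# - include the legacy flash video rewrite script (enable_flash_video_rewrite)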
collections:
pywb:
index: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
live: $live
enable_flash_video_rewrite: true
redirect_to_exact: true
enable_memento: true
debug: true

View File

@ -63,6 +63,19 @@ class TestProxy(BaseTestProxy):
assert res.headers['Link'] == '<http://example.com>; rel="memento"; datetime="Mon, 27 Jan 2014 17:12:51 GMT"; collection="pywb"'
assert res.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'
def test_proxy_replay_change_dt(self, scheme):
headers = {'Accept-Datetime': 'Mon, 26 Dec 2011 17:12:51 GMT'}
res = requests.get('{0}://example.com/'.format(scheme),
proxies=self.proxies,
headers=headers,
verify=self.root_ca_file)
assert 'WB Insert' in res.text
assert 'Example Domain' in res.text
assert res.headers['Link'] == '<http://test@example.com/>; rel="memento"; datetime="Mon, 29 Jul 2013 19:51:51 GMT"; collection="pywb"'
assert res.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'
# ============================================================================
class TestRecordingProxy(CollsDirMixin, BaseTestProxy):

View File

@ -0,0 +1,77 @@
from .base_config_test import BaseConfigTest, fmod
# ============================================================================
class TestRedirectClassic(BaseConfigTest):
    """Replay tests for the 'classic' redirect-to-exact configuration.

    The app is set up from ``config_test_redirect_classic.yaml``. The tests
    below expect that config to make replay requests answer with a
    ``307`` redirect to the exact capture timestamp, to add Memento headers,
    and to inject the ``vidrw.js`` script into replayed HTML.

    Every test takes the ``fmod`` fixture imported from ``base_config_test``;
    it supplies the URL modifier that is spliced into the request path.
    """

    @classmethod
    def setup_class(cls):
        """Initialize the test app from the redirect-classic config file."""
        super(TestRedirectClassic, cls).setup_class('config_test_redirect_classic.yaml')

    def test_replay_content_inexact(self, fmod):
        """A near-miss timestamp redirects to the exact capture, then replays."""
        resp = self.get('/pywb/20140127171235{0}/http://www.iana.org/', fmod)
        assert resp.status_code == 307
        assert resp.headers['Location'].endswith('/20140127171238{0}/http://www.iana.org/'.format(fmod))
        assert resp.headers['Link'] == '<http://www.iana.org/>; rel="original"'

        resp = resp.follow()
        self._assert_basic_html(resp)

        assert '"20140127171238"' in resp.text, resp.text
        assert 'wombat.js' in resp.text
        assert 'new _WBWombat' in resp.text, resp.text
        assert '/pywb/20140127171238{0}/http://www.iana.org/time-zones"'.format(fmod) in resp.text

        assert ('wbinfo.is_framed = ' + ('true' if fmod else 'false')) in resp.text

        csp = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"
        assert resp.headers['Content-Security-Policy'] == csp

        # verify the script enabled by enable_flash_video_rewrite is injected
        assert 'vidrw.js' in resp.text

    def test_latest_replay_redirect(self, fmod):
        """A request without a timestamp is redirected twice before replay."""
        fmod_slash = fmod + '/' if fmod else ''
        resp = self.get('/pywb/{0}http://example.com/', fmod_slash)
        assert resp.status_code == 307
        assert resp.headers['Location'].endswith('/20140127171251{0}/http://example.com'.format(fmod))
        assert resp.headers['Link'] != ''

        # trailing slash redir
        resp = resp.follow()
        assert resp.status_code == 307
        assert resp.headers['Location'].endswith('/20140127171251{0}/http://example.com/'.format(fmod))
        assert resp.headers['Link'] != ''

        resp = resp.follow()
        self._assert_basic_html(resp)

        assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

        assert '"20140127171251"' in resp.text
        assert '/pywb/20140127171251{0}/http://www.iana.org/domains/example'.format(fmod) in resp.text, resp.text

    def test_replay_memento_accept_dt(self, fmod):
        """An Accept-Datetime header selects which capture the redirect targets."""
        fmod_slash = fmod + '/' if fmod else ''
        headers = {'Accept-Datetime': 'Mon, 26 Dec 2011 17:12:51 GMT'}
        resp = self.get('/pywb/{0}http://example.com/', fmod_slash, headers=headers)
        assert resp.status_code == 307
        assert resp.headers['Location'].endswith('/20130729195151{0}/http://test@example.com/'.format(fmod))
        assert resp.headers['Link'] != ''

        resp = resp.follow()
        self._assert_basic_html(resp)

        assert resp.headers['Memento-Datetime'] == 'Mon, 29 Jul 2013 19:51:51 GMT'

    def test_replay_fuzzy_1_redirect(self, fmod):
        """A query-string URL is redirected to the capture without the query."""
        resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/?_=123', fmod)
        assert resp.status_int == 307
        assert resp.headers['Location'].endswith('/pywb/20140126200624{0}/http://www.iana.org/'.format(fmod))

    def test_live_no_redir(self, fmod):
        """Requests to the live collection are answered directly, with no redirect."""
        fmod_slash = fmod + '/' if fmod else ''
        resp = self.get('/live/{0}http://example.com/?test=test', fmod_slash)
        assert resp.status_int == 200