1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

memento-fix: fix for ukwa/ukwa-pywb#37.

- support memento timegate on top-frame (when no timestamp is provided)
- treat top-frame no-timestamp url as canonical timegate
- tests: update tests, add memento redirect mode tests for timegate, timegate with accept-dt header
This commit is contained in:
Ilya Kreymer 2019-02-14 15:33:07 -08:00 committed by John Berlin
parent 0c08b9b5d5
commit ce0ed610bd
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
2 changed files with 90 additions and 15 deletions

View File

@ -302,7 +302,9 @@ class RewriterApp(object):
kwargs) kwargs)
if response: if response:
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy) # don't return top-frame response for timegate with exact redirects
if not is_timegate or not redirect_to_exact:
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)
if is_proxy: if is_proxy:
environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host'] environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
@ -395,11 +397,9 @@ class RewriterApp(object):
if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1': if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
set_content_loc = True set_content_loc = True
# if redir to exact, redir if url or ts are different # if redirect to exact timestamp, but only if not live
if redirect_to_exact: if redirect_to_exact and not cdx.get('is_live'):
if (set_content_loc or if set_content_loc or is_timegate or wb_url.timestamp != cdx.get('timestamp'):
(wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):
new_url = urlrewriter.get_new_url(url=target_uri, new_url = urlrewriter.get_new_url(url=target_uri,
timestamp=cdx['timestamp'], timestamp=cdx['timestamp'],
mod=wb_url.mod) mod=wb_url.mod)
@ -412,7 +412,8 @@ class RewriterApp(object):
resp.status_headers, resp.status_headers,
is_timegate, is_proxy, is_timegate, is_proxy,
pref_applied=pref_applied, pref_applied=pref_applied,
mod=pref_mod) mod=pref_mod,
is_memento=False)
else: else:
resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original') resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')
@ -512,21 +513,22 @@ class RewriterApp(object):
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts, def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
status_headers, is_timegate, is_proxy, coll=None, status_headers, is_timegate, is_proxy, coll=None,
pref_applied=None, mod=None): pref_applied=None, mod=None, is_memento=True):
mod = mod or self.replay_mod replay_mod = mod or self.replay_mod
# memento url + header # memento url + header
if not memento_dt and memento_ts: if not memento_dt and memento_ts:
memento_dt = timestamp_to_http_date(memento_ts) memento_dt = timestamp_to_http_date(memento_ts)
if memento_dt: if memento_dt:
status_headers.headers.append(('Memento-Datetime', memento_dt)) if is_memento:
status_headers.headers.append(('Memento-Datetime', memento_dt))
if is_proxy: if is_proxy:
memento_url = url memento_url = url
else: else:
memento_url = full_prefix + memento_ts + mod memento_url = full_prefix + memento_ts + replay_mod
memento_url += '/' + url memento_url += '/' + url
else: else:
memento_url = None memento_url = None
@ -560,6 +562,7 @@ class RewriterApp(object):
def _get_timegate_timemap(self, url, full_prefix, mod): def _get_timegate_timemap(self, url, full_prefix, mod):
# timegate url # timegate url
timegate_url = full_prefix timegate_url = full_prefix
mod = ''
if mod: if mod:
timegate_url += mod + '/' timegate_url += mod + '/'
@ -653,7 +656,7 @@ class RewriterApp(object):
status = str(res.status_code) + ' ' + res.reason status = str(res.status_code) + ' ' + res.reason
if res.status_code == 200 and output == 'link': if res.status_code == 200 and output == 'link':
timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix, self.replay_mod) timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix, wb_url.mod)
text = MementoUtils.wrap_timemap_header(wb_url.url, text = MementoUtils.wrap_timemap_header(wb_url.url,
timegate, timegate,

View File

@ -33,7 +33,7 @@ class TestMemento(MementoMixin, BaseConfigTest):
assert resp.headers['Content-Location'] in memento_link assert resp.headers['Content-Location'] in memento_link
# timegate link # timegate link
assert self.make_timegate_link(url, fmod) in links assert self.make_timegate_link(url, '') in links
# timemap link # timemap link
assert self.make_timemap_link(url) in links assert self.make_timemap_link(url) in links
@ -60,7 +60,7 @@ class TestMemento(MementoMixin, BaseConfigTest):
assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links assert self.make_memento_link(url, '20140127171238', dt, 'mp_', include_coll=False) in links
#timegate link #timegate link
assert self.make_timegate_link(url, 'mp_') in links assert self.make_timegate_link(url, '') in links
# Body # Body
assert '"20140127171238"' in resp.text assert '"20140127171238"' in resp.text
@ -132,7 +132,7 @@ class TestMemento(MementoMixin, BaseConfigTest):
exp = """\ exp = """\
<http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT", <http://localhost:80/pywb/timemap/link/http://example.com?example=1>; rel="self"; type="application/link-format"; from="Fri, 03 Jan 2014 03:03:21 GMT",
<http://localhost:80/pywb/mp_/http://example.com?example=1>; rel="timegate", <http://localhost:80/pywb/http://example.com?example=1>; rel="timegate",
<http://example.com?example=1>; rel="original", <http://example.com?example=1>; rel="original",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb", <http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:21 GMT"; collection="pywb",
<http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb" <http://example.com?example=1>; rel="memento"; datetime="Fri, 03 Jan 2014 03:03:41 GMT"; collection="pywb"
@ -186,3 +186,75 @@ com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "
assert resp.status_int == 400 assert resp.status_int == 400
# ============================================================================
class TestMementoRedirectClassic(MementoMixin, BaseConfigTest):
    """Memento timegate tests run against 'config_test_redirect_classic.yaml'."""

    @classmethod
    def setup_class(cls):
        """Load the redirect-classic test configuration for this class."""
        config_name = 'config_test_redirect_classic.yaml'
        super(TestMementoRedirectClassic, cls).setup_class(config_name)

    def _check_timegate_redirect(self, resp, timestamp, http_date):
        """Verify a top-frame timegate redirect and the page it leads to.

        :param resp: response to a timestamp-less '/pywb/<url>' request
        :param timestamp: 14-digit timestamp expected in the redirect target
        :param http_date: the same instant as an HTTP date, as expected in
            the memento link
        """
        target = 'http://www.iana.org/'
        headers = resp.headers

        # the timegate answers with a redirect to the exact capture
        assert resp.status_code == 307
        assert headers['Location'].endswith('/' + timestamp + '/' + target)
        assert headers['Link'] != ''

        # Memento headers: the redirect varies on accept-datetime but is
        # not itself a memento, so it carries no Memento-Datetime
        assert VARY in headers
        assert MEMENTO_DATETIME not in headers

        link_values = self.get_links(resp)

        # memento link
        expected_memento = self.make_memento_link(target, timestamp, http_date,
                                                  'mp_', include_coll=False)
        assert expected_memento in link_values

        # timegate link
        expected_timegate = self.make_timegate_link(target, '')
        assert expected_timegate in link_values

        # follow the redirect and check the body of the resulting page
        followed = resp.follow()
        assert '"' + timestamp + '"' in followed.text
        assert '"' + target + '"' in followed.text, followed.text

    def test_memento_top_frame_timegate(self, fmod):
        """Timestamp-less request without Accept-Datetime redirects to 20140127171238."""
        first = self.testapp.get('/pywb/http://www.iana.org/')

        self._check_timegate_redirect(first,
                                      '20140127171238',
                                      'Mon, 27 Jan 2014 17:12:38 GMT')

    def test_memento_top_frame_timegate_accept_dt(self, fmod):
        """Timestamp-less request with Accept-Datetime redirects to 20140126200624."""
        request_headers = {'Accept-Datetime': 'Sun, 26 Jan 2014 20:06:24 GMT'}
        first = self.testapp.get('/pywb/http://www.iana.org/',
                                 headers=request_headers)

        self._check_timegate_redirect(first,
                                      '20140126200624',
                                      'Sun, 26 Jan 2014 20:06:24 GMT')

    def test_memento_not_time_gate(self, fmod):
        """Request for '/pywb/2/<url>' with Accept-Datetime is served with a 200."""
        # NOTE(review): the '2' segment is presumably parsed as a partial
        # timestamp, so the request is not treated as a timegate -- confirm
        request_headers = {'Accept-Datetime': 'Sun, 26 Jan 2014 20:06:24 GMT'}
        resp = self.testapp.get('/pywb/2/http://www.iana.org/',
                                headers=request_headers)

        assert resp.status_code == 200

    def test_timegate_error_not_found(self):
        """Timestamp-less request for '/x-not-found' on example.com yields a 404."""
        resp = self.testapp.get('/pywb/http://example.com/x-not-found',
                                status=404)

        assert resp.status_code == 404