1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

memento timegate: make timegate headers for /<coll>/<url> behave correctly per the Memento spec (#564)

Return 404 if the URL is not found; otherwise return the latest memento header. This is done by performing the actual response lookup,
and then returning the top-frame response if the lookup succeeded. Addresses ukwa/ukwa-pywb#58
This commit is contained in:
Ilya Kreymer 2020-06-08 13:26:20 -07:00 committed by GitHub
parent 5e9b13e267
commit 3c53c2731b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 16 deletions

View File

@ -345,6 +345,7 @@ class RewriterApp(object):
content_rw, is_proxy)
response = None
keep_frame_response = False
# prefer overrides custom response?
if pref_mod is not None:
@ -360,13 +361,22 @@ class RewriterApp(object):
else:
wb_url.mod = pref_mod
else:
# don't return top-frame response for timegate with exact redirects
kwargs['is_timegate_redir'] = is_timegate and redirect_to_exact
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
kwargs)
if kwargs.get('output'):
response = self.handle_timemap(wb_url, kwargs, full_prefix)
if response:
elif wb_url.is_query():
response = self.handle_query(environ, wb_url, kwargs, full_prefix)
else:
# don't return top-frame response for timegate with exact redirects
if not (is_timegate and redirect_to_exact):
keep_frame_response = is_timegate and not redirect_to_exact and not is_proxy
response = self.handle_custom_response(environ, wb_url,
full_prefix, host_prefix,
kwargs)
if response and not keep_frame_response:
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)
if is_proxy:
@ -443,6 +453,11 @@ class RewriterApp(object):
return self.send_redirect(new_path, url_parts, urlrewriter)
# return top-frame timegate response, with timestamp from cdx
if response and keep_frame_response:
no_except_close(r.raw)
return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp'])
stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
record = self.loader.parse_record_stream(stream,
ensure_http_headers=True)
@ -560,7 +575,7 @@ class RewriterApp(object):
return response
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy, timegate_closest_ts=None):
memento_ts = None
if not isinstance(response, WbResponse):
content_type = 'text/html'
@ -569,13 +584,13 @@ class RewriterApp(object):
if not self.is_framed_replay(wb_url):
content_type += '; charset=utf-8'
else:
memento_ts = wb_url.timestamp
memento_ts = timegate_closest_ts or wb_url.timestamp
response = WbResponse.text_response(response, content_type=content_type)
if self.enable_memento and response.status_headers.statusline.startswith('200'):
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
response.status_headers, is_timegate, is_proxy)
response.status_headers, is_timegate, is_proxy, is_memento=not is_timegate)
return response
def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
@ -873,13 +888,7 @@ class RewriterApp(object):
return {'metadata': kwargs.get('metadata', {})}
def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
if kwargs.get('output'):
return self.handle_timemap(wb_url, kwargs, full_prefix)
if wb_url.is_query():
return self.handle_query(environ, wb_url, kwargs, full_prefix)
if self.is_framed_replay(wb_url) and not kwargs.get('is_timegate_redir'):
if self.is_framed_replay(wb_url):
extra_params = self.get_top_frame_params(wb_url, kwargs)
return self.frame_insert_view.get_top_frame(wb_url,
full_prefix,

View File

@ -63,6 +63,28 @@ class TestMemento(MementoMixin, BaseConfigTest):
assert '"20140127171238"' in resp.text
assert '"http://www.iana.org/"' in resp.text, resp.text
# Timegate request with no timestamp in the path (/pywb/<url>): checks the
# Memento-related headers returned with the response.
def test_memento_top_frame_timegate(self):
resp = self.testapp.get('/pywb/http://www.iana.org/_css/2013.1/screen.css')
# vary header
assert VARY in resp.headers
# no memento header, as not really a memento (top-frame)
assert MEMENTO_DATETIME not in resp.headers
# Memento Headers
# memento link: expected to carry the 20140127171239 timestamp and the
# 'mp_' modifier, even though the request itself had no timestamp
dt = 'Mon, 27 Jan 2014 17:12:39 GMT'
url = 'http://www.iana.org/_css/2013.1/screen.css'
links = self.get_links(resp)
assert self.make_memento_link(url, '20140127171239', dt, 'mp_', include_coll=False) in links
# timegate link
assert self.make_timegate_link(url, '') in links
def test_memento_content_replay_exact(self, fmod):
resp = self.get('/pywb/20140127171238{0}/http://www.iana.org/', fmod)
@ -175,6 +197,15 @@ com,example)/?example=1 20140103030341 {"url": "http://example.com?example=1", "
resp = self._timemap_get('/pywb/timemap/foo/http://example.com', status=400)
assert resp.json == {'message': 'output=foo not supported'}
# Timegate request for a URL expected to be absent from the collection:
# the response must be a 404 and must not carry any Memento headers.
def test_timegate_error_not_found(self):
# status=404 is passed to the test client — presumably the expected
# response status; confirm against the test client's API
resp = self.testapp.get('/pywb/http://example.com/x-not-found', status=404)
assert resp.status_code == 404
# No Memento Headers
assert VARY not in resp.headers
assert MEMENTO_DATETIME not in resp.headers
assert 'Link' not in resp.headers
def test_error_bad_accept_datetime(self):
"""
400 response for bad accept_datetime