diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index dc43d51e..653e74ac 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -3,7 +3,6 @@ Fetch a url from live web and apply rewriting rules """ import requests -import datetime import mimetypes import logging import os @@ -12,7 +11,7 @@ from urlparse import urlsplit from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url from pywb.utils.loaders import extract_client_cookie -from pywb.utils.timeutils import datetime_to_timestamp +from pywb.utils.timeutils import timestamp_now from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.canonicalize import canonicalize @@ -205,7 +204,7 @@ class LiveRewriter(object): (status_headers, stream) = self.fetch_local_file(url) if timestamp is None: - timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) + timestamp = timestamp_now() cdx = {'urlkey': urlkey, 'timestamp': timestamp, diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index e43bab37..87e6c982 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -61,6 +61,9 @@ class BaseWbUrl(object): def is_replay(self): return self.is_replay_type(self.type) + def is_latest_replay(self): + return (self.type == BaseWbUrl.LATEST_REPLAY) + def is_query(self): return self.is_query_type(self.type) diff --git a/pywb/static/wb.js b/pywb/static/wb.js index d67e50c9..3f9e3d90 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -122,6 +122,7 @@ function notify_top() { if (window.__orig_parent && window.__orig_parent.update_wb_url) { window.__orig_parent.update_wb_url(window.WB_wombat_location.href, wbinfo.timestamp, + wbinfo.request_ts, wbinfo.is_live); } diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 3e599058..79642db1 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -42,7 +42,7 @@ function make_inner_url(url, ts) } } -function push_state(url, timestamp, capture_str, is_live) { +function push_state(url, timestamp, request_ts, capture_str, is_live) { if (window.frames[0].WB_wombat_location) { var curr_href = window.frames[0].WB_wombat_location.href; @@ -54,8 +54,9 @@ function push_state(url, timestamp, capture_str, is_live) { var state = {} state.timestamp = timestamp; - state.outer_url = make_outer_url(url, state.timestamp); - state.inner_url = make_inner_url(url, state.timestamp); + state.request_ts = request_ts; + state.outer_url = make_outer_url(url, state.request_ts); + state.inner_url = make_inner_url(url, state.request_ts); state.url = url; state.capture_str = capture_str; state.is_live = is_live; @@ -130,6 +131,7 @@ function extract_ts_cookie(value) { function iframe_loaded(event) { var url; var ts; + var request_ts; var capture_str; var is_live = false; var iframe = window.frames[0]; @@ -142,6 +144,7 @@ function iframe_loaded(event) { if (iframe.wbinfo) { ts = iframe.wbinfo.timestamp; + request_ts = iframe.wbinfo.request_ts; is_live = iframe.wbinfo.is_live; } else { ts = extract_ts_cookie(iframe.document.cookie); @@ -150,19 +153,20 @@ function iframe_loaded(event) { } else { ts = extract_ts(iframe.location.href); } + request_ts = ts; } - update_wb_url(url, ts, is_live); + update_wb_url(url, ts, request_ts, is_live); } -function update_wb_url(url, ts, is_live) { +function update_wb_url(url, ts, request_ts, is_live) { if (curr_state.url == url && curr_state.timestamp == ts) { return; } capture_str = _wb_js.ts_to_date(ts, true); - push_state(url, ts, capture_str, is_live); + push_state(url, ts, request_ts, capture_str, is_live); } // Load Banner diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index 0ebf2fba..cda0ea80 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -17,6 +17,7 @@ wbinfo = {} wbinfo.url = "{{ cdx.url }}"; wbinfo.timestamp = "{{ cdx.timestamp }}"; + wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}"; wbinfo.prefix = "{{ wbrequest.wb_prefix }}"; wbinfo.mod = "{{ wbrequest.wb_url.mod }}"; wbinfo.top_url = "{{ top_url }}"; diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index 0dcc7e22..dcca13d6 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -72,6 +72,14 @@ def datetime_to_timestamp(the_datetime): return the_datetime.strftime(TIMESTAMP_14) +def timestamp_now(): + """ + >>> len(timestamp_now()) + 14 + """ + return datetime_to_timestamp(datetime.datetime.utcnow()) + + def iso_date_to_timestamp(string): """ >>> iso_date_to_timestamp('2013-12-26T10:11:12Z') diff --git a/pywb/webapp/replay_views.py b/pywb/webapp/replay_views.py index 5bab0ae1..d474e178 100644 --- a/pywb/webapp/replay_views.py +++ b/pywb/webapp/replay_views.py @@ -7,6 +7,7 @@ from urlparse import urlsplit from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.loaders import LimitReader +from pywb.utils.timeutils import timestamp_now from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.memento import MementoResponse @@ -219,8 +220,11 @@ class ReplayView(object): if wbrequest.custom_params.get('noredir'): return None - is_memento_timegate = (wbrequest.options.get('is_timegate', False)) - redir_needed = is_memento_timegate + is_timegate = (wbrequest.options.get('is_timegate', False)) + if not is_timegate: + is_timegate = wbrequest.wb_url.is_latest_replay() + + redir_needed = is_timegate if not redir_needed and self.redir_to_exact: redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp) @@ -231,15 +235,20 @@ class ReplayView(object): if self.enable_range_cache and wbrequest.extract_range(): return None + if is_timegate and not self.redir_to_exact: + timestamp = timestamp_now() + else: + timestamp = cdx['timestamp'] + new_url = (wbrequest.urlrewriter. - get_new_url(timestamp=cdx['timestamp'], + get_new_url(timestamp=timestamp, url=cdx['original'])) if wbrequest.method == 'POST': # FF shows a confirm dialog, so can't use 307 effectively # was: statusline = '307 Same-Method Internal Redirect' return None - elif is_memento_timegate: + elif is_timegate: statusline = '302 Found' else: # clear cdx line to indicate internal redirect diff --git a/tests/test_config.yaml b/tests/test_config.yaml index 1d034671..a2d0e9f6 100644 --- a/tests/test_config.yaml +++ b/tests/test_config.yaml @@ -40,7 +40,13 @@ collections: pywb-norange: index_paths: ./sample_archive/cdx/ - enable_ranges: False + enable_ranges: false + + pywb-non-exact: + index_paths: ./sample_archive/cdx/ + redir_to_exact: false + + # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # SURT keys are recommended for future indices, but non-SURT cdxs diff --git a/tests/test_config_memento.yaml b/tests/test_config_memento.yaml index e8d0eb21..91d4c624 100644 --- a/tests/test_config_memento.yaml +++ b/tests/test_config_memento.yaml @@ -6,6 +6,10 @@ collections: pywb: ./sample_archive/cdx/ + pywb-non-exact: + index_paths: ./sample_archive/cdx/ + redir_to_exact: false + archive_paths: ['./sample_archive/warcs/'] # Test memento diff --git a/tests/test_integration.py b/tests/test_integration.py index ef06cf91..cc1393d8 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -4,7 +4,7 @@ import base64 from pywb.webapp.pywb_init import create_wb_router from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject - +from pywb.utils.timeutils import timestamp_now class TestWb: TEST_CONFIG = 'tests/test_config.yaml' @@ -256,14 +256,24 @@ class TestWb: assert resp.content_length == 0 assert resp.content_type == 'application/x-javascript' - def test_redirect_1(self): + def test_redirect_exact(self): resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/') assert resp.status_int == 302 assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org') + def test_no_redirect_non_exact(self): + # non-exact mode, don't redirect to exact capture + resp = self.testapp.get('/pywb-non-exact/20140127171237/http://www.iana.org/') + assert resp.status_int == 200 - def test_redirect_replay_2(self): + self._assert_basic_html(resp) + assert '"20140127171237"' in resp.body + # actual timestamp set in JS + assert 'timestamp = "20140127171238"' in resp.body + assert '/pywb-non-exact/20140127171237/http://www.iana.org/about/' in resp.body + + def test_redirect_latest_replay(self): resp = self.testapp.get('/pywb/http://example.com/') assert resp.status_int == 302 @@ -275,6 +285,26 @@ class TestWb: assert '"20140127171251"' in resp.body assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body + def test_redirect_non_exact_latest_replay_ts(self): + resp = self.testapp.get('/pywb-non-exact/http://example.com/') + assert resp.status_int == 302 + + assert resp.headers['Location'].endswith('/http://example.com') + + # extract ts, which should be current time + ts = resp.headers['Location'].rsplit('/http://')[0].rsplit('/', 1)[-1] + assert len(ts) == 14, ts + resp = resp.follow() + + self._assert_basic_html(resp) + + # ensure the current ts is present in the links + assert '"{0}"'.format(ts) in resp.body + assert '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(ts) in resp.body + + # ensure ts is current ts + assert timestamp_now() >= ts, ts + def test_redirect_relative_3(self): # webtest uses Host: localhost:80 by default # first two requests should result in same redirect diff --git a/tests/test_memento.py b/tests/test_memento.py index 129f425b..0152d16e 100644 --- a/tests/test_memento.py +++ b/tests/test_memento.py @@ -3,6 +3,7 @@ import re from pywb.webapp.pywb_init import create_wb_router from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject +from pywb.utils.timeutils import timestamp_now MEMENTO_DATETIME = 'Memento-Datetime' ACCEPT_DATETIME = 'Accept-Datetime' @@ -23,13 +24,13 @@ class TestWb: def get_links(self, resp): return map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK])) - def make_timemap_link(self, url): - format_ = '; rel="timemap"; type="{1}"' - return format_.format(url, LINK_FORMAT) + def make_timemap_link(self, url, coll='pywb'): + format_ = '; rel="timemap"; type="{1}"' + return format_.format(url, LINK_FORMAT, coll) - def make_memento_link(self, url, ts, dt): - format_ = '; rel="memento"; datetime="{2}"' - return format_.format(url, ts, dt) + def make_memento_link(self, url, ts, dt, coll='pywb'): + format_ = '; rel="memento"; datetime="{2}"' + return format_.format(url, ts, dt, coll) # Below functionality is for archival (non-proxy) mode # It is designed to conform to Memento protocol Pattern 2.1 @@ -57,7 +58,35 @@ class TestWb: assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] - def test_timegate_accept_datetime(self): + # timegate with latest memento, but redirect to current timestamp url instead of + # memento timestamp + def test_timegate_latest_request_timestamp(self): + """ + TimeGate with no Accept-Datetime header + """ + + dt = 'Mon, 27 Jan 2014 17:12:39 GMT' + resp = self.testapp.get('/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css') + + assert resp.status_int == 302 + + assert resp.headers[VARY] == 'accept-datetime' + + links = self.get_links(resp) + assert '; rel="original"' in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css', coll='pywb-non-exact') in links + assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140127171239', dt, coll='pywb-non-exact') in links + + assert MEMENTO_DATETIME not in resp.headers + + assert '/pywb-non-exact/' in resp.headers['Location'] + + wburl = resp.headers['Location'].split('/pywb-non-exact/')[-1] + ts = wburl.split('/')[0] + assert len(ts) == 14 + assert timestamp_now() >= ts + + def test_timegate_accept_datetime_exact(self): """ TimeGate with Accept-Datetime header, matching exactly """ @@ -78,6 +107,28 @@ class TestWb: assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] + def test_timegate_accept_datetime_inexact(self): + """ + TimeGate with Accept-Datetime header, not matching a memento exactly + """ + dt = 'Sun, 26 Jan 2014 20:08:04 GMT' + request_dt = 'Sun, 26 Jan 2014 20:08:00 GMT' + headers = {ACCEPT_DATETIME: request_dt} + resp = self.testapp.get('/pywb//http://www.iana.org/_css/2013.1/screen.css', headers=headers) + + assert resp.status_int == 302 + + assert resp.headers[VARY] == 'accept-datetime' + + links = self.get_links(resp) + assert '; rel="original"' in links + assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links + assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140126200804', dt) == links[0], links[0] + + assert MEMENTO_DATETIME not in resp.headers + + assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] + def test_non_timegate_intermediate_redir(self): """