1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rewrite: improvements to non-exact replay mode (when the redir_to_exact option is set to false)

frames: add request_ts to wbinfo and use it as the timestamp in the top frame. For exact replay, request_ts == timestamp.
For latest replay / no timestamp / memento timegate in non-exact mode, redirect to the current time instead of the time of the last capture, while still serving
the last capture.
timeutils: add timestamp_now() function to return the timestamp of the current UTC datetime
tests: add extra tests for the non-exact replay mode
Tracked via #72
This commit is contained in:
Ilya Kreymer 2015-02-17 17:47:30 -08:00
parent 9623f95439
commit 80dcb6ff27
11 changed files with 140 additions and 24 deletions

View File

@ -3,7 +3,6 @@ Fetch a url from live web and apply rewriting rules
""" """
import requests import requests
import datetime
import mimetypes import mimetypes
import logging import logging
import os import os
@ -12,7 +11,7 @@ from urlparse import urlsplit
from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie from pywb.utils.loaders import extract_client_cookie
from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.timeutils import timestamp_now
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize from pywb.utils.canonicalize import canonicalize
@ -205,7 +204,7 @@ class LiveRewriter(object):
(status_headers, stream) = self.fetch_local_file(url) (status_headers, stream) = self.fetch_local_file(url)
if timestamp is None: if timestamp is None:
timestamp = datetime_to_timestamp(datetime.datetime.utcnow()) timestamp = timestamp_now()
cdx = {'urlkey': urlkey, cdx = {'urlkey': urlkey,
'timestamp': timestamp, 'timestamp': timestamp,

View File

@ -61,6 +61,9 @@ class BaseWbUrl(object):
def is_replay(self): def is_replay(self):
return self.is_replay_type(self.type) return self.is_replay_type(self.type)
def is_latest_replay(self):
    """Return True when this url's type is the latest-replay type."""
    latest_type = BaseWbUrl.LATEST_REPLAY
    return self.type == latest_type
def is_query(self): def is_query(self):
return self.is_query_type(self.type) return self.is_query_type(self.type)

View File

@ -122,6 +122,7 @@ function notify_top() {
if (window.__orig_parent && window.__orig_parent.update_wb_url) { if (window.__orig_parent && window.__orig_parent.update_wb_url) {
window.__orig_parent.update_wb_url(window.WB_wombat_location.href, window.__orig_parent.update_wb_url(window.WB_wombat_location.href,
wbinfo.timestamp, wbinfo.timestamp,
wbinfo.request_ts,
wbinfo.is_live); wbinfo.is_live);
} }

View File

@ -42,7 +42,7 @@ function make_inner_url(url, ts)
} }
} }
function push_state(url, timestamp, capture_str, is_live) { function push_state(url, timestamp, request_ts, capture_str, is_live) {
if (window.frames[0].WB_wombat_location) { if (window.frames[0].WB_wombat_location) {
var curr_href = window.frames[0].WB_wombat_location.href; var curr_href = window.frames[0].WB_wombat_location.href;
@ -54,8 +54,9 @@ function push_state(url, timestamp, capture_str, is_live) {
var state = {} var state = {}
state.timestamp = timestamp; state.timestamp = timestamp;
state.outer_url = make_outer_url(url, state.timestamp); state.request_ts = request_ts;
state.inner_url = make_inner_url(url, state.timestamp); state.outer_url = make_outer_url(url, state.request_ts);
state.inner_url = make_inner_url(url, state.request_ts);
state.url = url; state.url = url;
state.capture_str = capture_str; state.capture_str = capture_str;
state.is_live = is_live; state.is_live = is_live;
@ -130,6 +131,7 @@ function extract_ts_cookie(value) {
function iframe_loaded(event) { function iframe_loaded(event) {
var url; var url;
var ts; var ts;
var request_ts;
var capture_str; var capture_str;
var is_live = false; var is_live = false;
var iframe = window.frames[0]; var iframe = window.frames[0];
@ -142,6 +144,7 @@ function iframe_loaded(event) {
if (iframe.wbinfo) { if (iframe.wbinfo) {
ts = iframe.wbinfo.timestamp; ts = iframe.wbinfo.timestamp;
request_ts = iframe.wbinfo.request_ts;
is_live = iframe.wbinfo.is_live; is_live = iframe.wbinfo.is_live;
} else { } else {
ts = extract_ts_cookie(iframe.document.cookie); ts = extract_ts_cookie(iframe.document.cookie);
@ -150,19 +153,20 @@ function iframe_loaded(event) {
} else { } else {
ts = extract_ts(iframe.location.href); ts = extract_ts(iframe.location.href);
} }
request_ts = ts;
} }
update_wb_url(url, ts, is_live); update_wb_url(url, ts, request_ts, is_live);
} }
function update_wb_url(url, ts, is_live) { function update_wb_url(url, ts, request_ts, is_live) {
if (curr_state.url == url && curr_state.timestamp == ts) { if (curr_state.url == url && curr_state.timestamp == ts) {
return; return;
} }
capture_str = _wb_js.ts_to_date(ts, true); capture_str = _wb_js.ts_to_date(ts, true);
push_state(url, ts, capture_str, is_live); push_state(url, ts, request_ts, capture_str, is_live);
} }
// Load Banner // Load Banner

View File

@ -17,6 +17,7 @@
wbinfo = {} wbinfo = {}
wbinfo.url = "{{ cdx.url }}"; wbinfo.url = "{{ cdx.url }}";
wbinfo.timestamp = "{{ cdx.timestamp }}"; wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}"; wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.mod = "{{ wbrequest.wb_url.mod }}"; wbinfo.mod = "{{ wbrequest.wb_url.mod }}";
wbinfo.top_url = "{{ top_url }}"; wbinfo.top_url = "{{ top_url }}";

View File

@ -72,6 +72,14 @@ def datetime_to_timestamp(the_datetime):
return the_datetime.strftime(TIMESTAMP_14) return the_datetime.strftime(TIMESTAMP_14)
def timestamp_now():
    """
    Return the current UTC date and time as a 14-character timestamp string.

    >>> len(timestamp_now())
    14
    """
    utc_now = datetime.datetime.utcnow()
    return datetime_to_timestamp(utc_now)
def iso_date_to_timestamp(string): def iso_date_to_timestamp(string):
""" """
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z') >>> iso_date_to_timestamp('2013-12-26T10:11:12Z')

View File

@ -7,6 +7,7 @@ from urlparse import urlsplit
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException, NotFoundException from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import LimitReader from pywb.utils.loaders import LimitReader
from pywb.utils.timeutils import timestamp_now
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse from pywb.framework.memento import MementoResponse
@ -219,8 +220,11 @@ class ReplayView(object):
if wbrequest.custom_params.get('noredir'): if wbrequest.custom_params.get('noredir'):
return None return None
is_memento_timegate = (wbrequest.options.get('is_timegate', False)) is_timegate = (wbrequest.options.get('is_timegate', False))
redir_needed = is_memento_timegate if not is_timegate:
is_timegate = wbrequest.wb_url.is_latest_replay()
redir_needed = is_timegate
if not redir_needed and self.redir_to_exact: if not redir_needed and self.redir_to_exact:
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp) redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
@ -231,15 +235,20 @@ class ReplayView(object):
if self.enable_range_cache and wbrequest.extract_range(): if self.enable_range_cache and wbrequest.extract_range():
return None return None
if is_timegate and not self.redir_to_exact:
timestamp = timestamp_now()
else:
timestamp = cdx['timestamp']
new_url = (wbrequest.urlrewriter. new_url = (wbrequest.urlrewriter.
get_new_url(timestamp=cdx['timestamp'], get_new_url(timestamp=timestamp,
url=cdx['original'])) url=cdx['original']))
if wbrequest.method == 'POST': if wbrequest.method == 'POST':
# FF shows a confirm dialog, so can't use 307 effectively # FF shows a confirm dialog, so can't use 307 effectively
# was: statusline = '307 Same-Method Internal Redirect' # was: statusline = '307 Same-Method Internal Redirect'
return None return None
elif is_memento_timegate: elif is_timegate:
statusline = '302 Found' statusline = '302 Found'
else: else:
# clear cdx line to indicate internal redirect # clear cdx line to indicate internal redirect

View File

@ -40,7 +40,13 @@ collections:
pywb-norange: pywb-norange:
index_paths: ./sample_archive/cdx/ index_paths: ./sample_archive/cdx/
enable_ranges: False enable_ranges: false
pywb-non-exact:
index_paths: ./sample_archive/cdx/
redir_to_exact: false
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs # SURT keys are recommended for future indices, but non-SURT cdxs

View File

@ -6,6 +6,10 @@
collections: collections:
pywb: ./sample_archive/cdx/ pywb: ./sample_archive/cdx/
pywb-non-exact:
index_paths: ./sample_archive/cdx/
redir_to_exact: false
archive_paths: ['./sample_archive/warcs/'] archive_paths: ['./sample_archive/warcs/']
# Test memento # Test memento

View File

@ -4,7 +4,7 @@ import base64
from pywb.webapp.pywb_init import create_wb_router from pywb.webapp.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.utils.timeutils import timestamp_now
class TestWb: class TestWb:
TEST_CONFIG = 'tests/test_config.yaml' TEST_CONFIG = 'tests/test_config.yaml'
@ -256,14 +256,24 @@ class TestWb:
assert resp.content_length == 0 assert resp.content_length == 0
assert resp.content_type == 'application/x-javascript' assert resp.content_type == 'application/x-javascript'
def test_redirect_1(self): def test_redirect_exact(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/') resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
assert resp.status_int == 302 assert resp.status_int == 302
assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org') assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org')
def test_no_redirect_non_exact(self):
# non-exact mode, don't redirect to exact capture
resp = self.testapp.get('/pywb-non-exact/20140127171237/http://www.iana.org/')
assert resp.status_int == 200
def test_redirect_replay_2(self): self._assert_basic_html(resp)
assert '"20140127171237"' in resp.body
# actual timestamp set in JS
assert 'timestamp = "20140127171238"' in resp.body
assert '/pywb-non-exact/20140127171237/http://www.iana.org/about/' in resp.body
def test_redirect_latest_replay(self):
resp = self.testapp.get('/pywb/http://example.com/') resp = self.testapp.get('/pywb/http://example.com/')
assert resp.status_int == 302 assert resp.status_int == 302
@ -275,6 +285,26 @@ class TestWb:
assert '"20140127171251"' in resp.body assert '"20140127171251"' in resp.body
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
def test_redirect_non_exact_latest_replay_ts(self):
    """
    Latest replay on the non-exact collection: the 302 target carries a
    14-character timestamp that is not later than timestamp_now(), and the
    followed page embeds that same timestamp in its links.
    """
    redirect = self.testapp.get('/pywb-non-exact/http://example.com/')
    assert redirect.status_int == 302

    location = redirect.headers['Location']
    assert location.endswith('/http://example.com')

    # the path segment right before '/http://' is the redirect timestamp
    before_url = location.rsplit('/http://')[0]
    ts = before_url.rsplit('/', 1)[-1]
    assert len(ts) == 14, ts

    page = redirect.follow()
    self._assert_basic_html(page)

    # the redirect timestamp must show up in the rewritten page
    quoted_ts = '"{0}"'.format(ts)
    assert quoted_ts in page.body
    rewritten_link = '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(ts)
    assert rewritten_link in page.body

    # the redirect timestamp can not be later than the current time
    assert timestamp_now() >= ts, ts
def test_redirect_relative_3(self): def test_redirect_relative_3(self):
# webtest uses Host: localhost:80 by default # webtest uses Host: localhost:80 by default
# first two requests should result in same redirect # first two requests should result in same redirect

View File

@ -3,6 +3,7 @@ import re
from pywb.webapp.pywb_init import create_wb_router from pywb.webapp.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from pywb.utils.timeutils import timestamp_now
MEMENTO_DATETIME = 'Memento-Datetime' MEMENTO_DATETIME = 'Memento-Datetime'
ACCEPT_DATETIME = 'Accept-Datetime' ACCEPT_DATETIME = 'Accept-Datetime'
@ -23,13 +24,13 @@ class TestWb:
def get_links(self, resp): def get_links(self, resp):
return map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK])) return map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK]))
def make_timemap_link(self, url): def make_timemap_link(self, url, coll='pywb'):
format_ = '<http://localhost:80/pywb/timemap/*/{0}>; rel="timemap"; type="{1}"' format_ = '<http://localhost:80/{2}/timemap/*/{0}>; rel="timemap"; type="{1}"'
return format_.format(url, LINK_FORMAT) return format_.format(url, LINK_FORMAT, coll)
def make_memento_link(self, url, ts, dt): def make_memento_link(self, url, ts, dt, coll='pywb'):
format_ = '<http://localhost:80/pywb/{1}/{0}>; rel="memento"; datetime="{2}"' format_ = '<http://localhost:80/{3}/{1}/{0}>; rel="memento"; datetime="{2}"'
return format_.format(url, ts, dt) return format_.format(url, ts, dt, coll)
# Below functionality is for archival (non-proxy) mode # Below functionality is for archival (non-proxy) mode
# It is designed to conform to Memento protocol Pattern 2.1 # It is designed to conform to Memento protocol Pattern 2.1
@ -57,7 +58,35 @@ class TestWb:
assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
def test_timegate_accept_datetime(self): # timegate with latest memento, but redirect to current timestamp url instead of
# memento timestamp
def test_timegate_latest_request_timestamp(self):
    """
    TimeGate with no Accept-Datetime header
    """
    coll = 'pywb-non-exact'
    target = 'http://www.iana.org/_css/2013.1/screen.css'
    dt = 'Mon, 27 Jan 2014 17:12:39 GMT'

    resp = self.testapp.get('/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css')

    assert resp.status_int == 302
    assert resp.headers[VARY] == 'accept-datetime'

    # Link header must list the original, the timemap and the memento
    links = self.get_links(resp)
    assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
    timemap_link = self.make_timemap_link(target, coll=coll)
    assert timemap_link in links
    memento_link = self.make_memento_link(target, '20140127171239', dt, coll=coll)
    assert memento_link in links

    assert MEMENTO_DATETIME not in resp.headers

    location = resp.headers['Location']
    assert '/pywb-non-exact/' in location

    # timestamp is the first path segment after the collection prefix
    wburl = location.split('/pywb-non-exact/')[-1]
    ts = wburl.split('/')[0]
    assert len(ts) == 14
    assert timestamp_now() >= ts
def test_timegate_accept_datetime_exact(self):
""" """
TimeGate with Accept-Datetime header, matching exactly TimeGate with Accept-Datetime header, matching exactly
""" """
@ -78,6 +107,28 @@ class TestWb:
assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location'] assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
def test_timegate_accept_datetime_inexact(self):
    """
    TimeGate with Accept-Datetime header, not matching a memento exactly
    """
    target = 'http://www.iana.org/_css/2013.1/screen.css'
    # requested datetime is four seconds before the memento datetime
    request_dt = 'Sun, 26 Jan 2014 20:08:00 GMT'
    dt = 'Sun, 26 Jan 2014 20:08:04 GMT'

    resp = self.testapp.get('/pywb//http://www.iana.org/_css/2013.1/screen.css',
                            headers={ACCEPT_DATETIME: request_dt})

    assert resp.status_int == 302
    assert resp.headers[VARY] == 'accept-datetime'

    links = self.get_links(resp)
    assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
    assert self.make_timemap_link(target) in links

    # the memento link is expected to be the first link
    expected_memento = self.make_memento_link(target, '20140126200804', dt)
    assert expected_memento == links[0], links[0]

    assert MEMENTO_DATETIME not in resp.headers

    location = resp.headers['Location']
    assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in location
def test_non_timegate_intermediate_redir(self): def test_non_timegate_intermediate_redir(self):
""" """