1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: improvements to non-exact replay mode (when the redir_to_exact option is set to false)

frames: add request_ts to wbinfo and use it as the timestamp in the top frame. For exact replay, request_ts == timestamp.
For latest replay (no timestamp) or a memento timegate, redirect to the current time instead of the time of the last capture,
while still serving the last capture.
timeutils: add timestamp_now() function to return timestamp of current datetime
Add extra tests for this mode
Tracked via #72
This commit is contained in:
Ilya Kreymer 2015-02-17 17:47:30 -08:00
parent 9623f95439
commit 80dcb6ff27
11 changed files with 140 additions and 24 deletions

View File

@ -3,7 +3,6 @@ Fetch a url from live web and apply rewriting rules
"""
import requests
import datetime
import mimetypes
import logging
import os
@ -12,7 +11,7 @@ from urlparse import urlsplit
from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url
from pywb.utils.loaders import extract_client_cookie
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.timeutils import timestamp_now
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
@ -205,7 +204,7 @@ class LiveRewriter(object):
(status_headers, stream) = self.fetch_local_file(url)
if timestamp is None:
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
timestamp = timestamp_now()
cdx = {'urlkey': urlkey,
'timestamp': timestamp,

View File

@ -61,6 +61,9 @@ class BaseWbUrl(object):
def is_replay(self):
return self.is_replay_type(self.type)
def is_latest_replay(self):
    # True when this url's type is the LATEST_REPLAY type
    latest_type = BaseWbUrl.LATEST_REPLAY
    return self.type == latest_type
def is_query(self):
return self.is_query_type(self.type)

View File

@ -122,6 +122,7 @@ function notify_top() {
if (window.__orig_parent && window.__orig_parent.update_wb_url) {
window.__orig_parent.update_wb_url(window.WB_wombat_location.href,
wbinfo.timestamp,
wbinfo.request_ts,
wbinfo.is_live);
}

View File

@ -42,7 +42,7 @@ function make_inner_url(url, ts)
}
}
function push_state(url, timestamp, capture_str, is_live) {
function push_state(url, timestamp, request_ts, capture_str, is_live) {
if (window.frames[0].WB_wombat_location) {
var curr_href = window.frames[0].WB_wombat_location.href;
@ -54,8 +54,9 @@ function push_state(url, timestamp, capture_str, is_live) {
var state = {}
state.timestamp = timestamp;
state.outer_url = make_outer_url(url, state.timestamp);
state.inner_url = make_inner_url(url, state.timestamp);
state.request_ts = request_ts;
state.outer_url = make_outer_url(url, state.request_ts);
state.inner_url = make_inner_url(url, state.request_ts);
state.url = url;
state.capture_str = capture_str;
state.is_live = is_live;
@ -130,6 +131,7 @@ function extract_ts_cookie(value) {
function iframe_loaded(event) {
var url;
var ts;
var request_ts;
var capture_str;
var is_live = false;
var iframe = window.frames[0];
@ -142,6 +144,7 @@ function iframe_loaded(event) {
if (iframe.wbinfo) {
ts = iframe.wbinfo.timestamp;
request_ts = iframe.wbinfo.request_ts;
is_live = iframe.wbinfo.is_live;
} else {
ts = extract_ts_cookie(iframe.document.cookie);
@ -150,19 +153,20 @@ function iframe_loaded(event) {
} else {
ts = extract_ts(iframe.location.href);
}
request_ts = ts;
}
update_wb_url(url, ts, is_live);
update_wb_url(url, ts, request_ts, is_live);
}
function update_wb_url(url, ts, is_live) {
function update_wb_url(url, ts, request_ts, is_live) {
if (curr_state.url == url && curr_state.timestamp == ts) {
return;
}
capture_str = _wb_js.ts_to_date(ts, true);
push_state(url, ts, capture_str, is_live);
push_state(url, ts, request_ts, capture_str, is_live);
}
// Load Banner

View File

@ -17,6 +17,7 @@
wbinfo = {}
wbinfo.url = "{{ cdx.url }}";
wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.mod = "{{ wbrequest.wb_url.mod }}";
wbinfo.top_url = "{{ top_url }}";

View File

@ -72,6 +72,14 @@ def datetime_to_timestamp(the_datetime):
return the_datetime.strftime(TIMESTAMP_14)
def timestamp_now():
    """
    Return the current UTC datetime formatted as a 14-character timestamp.

    >>> len(timestamp_now())
    14
    """
    utc_now = datetime.datetime.utcnow()
    return datetime_to_timestamp(utc_now)
def iso_date_to_timestamp(string):
"""
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')

View File

@ -7,6 +7,7 @@ from urlparse import urlsplit
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import LimitReader
from pywb.utils.timeutils import timestamp_now
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
@ -219,8 +220,11 @@ class ReplayView(object):
if wbrequest.custom_params.get('noredir'):
return None
is_memento_timegate = (wbrequest.options.get('is_timegate', False))
redir_needed = is_memento_timegate
is_timegate = (wbrequest.options.get('is_timegate', False))
if not is_timegate:
is_timegate = wbrequest.wb_url.is_latest_replay()
redir_needed = is_timegate
if not redir_needed and self.redir_to_exact:
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
@ -231,15 +235,20 @@ class ReplayView(object):
if self.enable_range_cache and wbrequest.extract_range():
return None
if is_timegate and not self.redir_to_exact:
timestamp = timestamp_now()
else:
timestamp = cdx['timestamp']
new_url = (wbrequest.urlrewriter.
get_new_url(timestamp=cdx['timestamp'],
get_new_url(timestamp=timestamp,
url=cdx['original']))
if wbrequest.method == 'POST':
# FF shows a confirm dialog, so can't use 307 effectively
# was: statusline = '307 Same-Method Internal Redirect'
return None
elif is_memento_timegate:
elif is_timegate:
statusline = '302 Found'
else:
# clear cdx line to indicate internal redirect

View File

@ -40,7 +40,13 @@ collections:
pywb-norange:
index_paths: ./sample_archive/cdx/
enable_ranges: False
enable_ranges: false
pywb-non-exact:
index_paths: ./sample_archive/cdx/
redir_to_exact: false
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs

View File

@ -6,6 +6,10 @@
collections:
pywb: ./sample_archive/cdx/
pywb-non-exact:
index_paths: ./sample_archive/cdx/
redir_to_exact: false
archive_paths: ['./sample_archive/warcs/']
# Test memento

View File

@ -4,7 +4,7 @@ import base64
from pywb.webapp.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.timeutils import timestamp_now
class TestWb:
TEST_CONFIG = 'tests/test_config.yaml'
@ -256,14 +256,24 @@ class TestWb:
assert resp.content_length == 0
assert resp.content_type == 'application/x-javascript'
def test_redirect_1(self):
def test_redirect_exact(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
assert resp.status_int == 302
assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org')
def test_no_redirect_non_exact(self):
# non-exact mode, don't redirect to exact capture
resp = self.testapp.get('/pywb-non-exact/20140127171237/http://www.iana.org/')
assert resp.status_int == 200
def test_redirect_replay_2(self):
self._assert_basic_html(resp)
assert '"20140127171237"' in resp.body
# actual timestamp set in JS
assert 'timestamp = "20140127171238"' in resp.body
assert '/pywb-non-exact/20140127171237/http://www.iana.org/about/' in resp.body
def test_redirect_latest_replay(self):
resp = self.testapp.get('/pywb/http://example.com/')
assert resp.status_int == 302
@ -275,6 +285,26 @@ class TestWb:
assert '"20140127171251"' in resp.body
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
def test_redirect_non_exact_latest_replay_ts(self):
    """
    Latest replay on the non-exact collection redirects to a url carrying
    a 14-character timestamp, and that timestamp is used in the served page.
    """
    redirect = self.testapp.get('/pywb-non-exact/http://example.com/')
    assert redirect.status_int == 302

    location = redirect.headers['Location']
    assert location.endswith('/http://example.com')

    # extract ts (the last path segment before the archived url),
    # which should be current time
    prefix = location.rsplit('/http://')[0]
    ts = prefix.rsplit('/', 1)[-1]
    assert len(ts) == 14, ts

    page = redirect.follow()
    self._assert_basic_html(page)

    # ensure the current ts is present in the links
    quoted_ts = '"{0}"'.format(ts)
    assert quoted_ts in page.body
    rewritten_link = '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(ts)
    assert rewritten_link in page.body

    # ensure ts is not later than the current ts
    assert timestamp_now() >= ts, ts
def test_redirect_relative_3(self):
# webtest uses Host: localhost:80 by default
# first two requests should result in same redirect

View File

@ -3,6 +3,7 @@ import re
from pywb.webapp.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
from pywb.utils.timeutils import timestamp_now
MEMENTO_DATETIME = 'Memento-Datetime'
ACCEPT_DATETIME = 'Accept-Datetime'
@ -23,13 +24,13 @@ class TestWb:
def get_links(self, resp):
return map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK]))
def make_timemap_link(self, url):
format_ = '<http://localhost:80/pywb/timemap/*/{0}>; rel="timemap"; type="{1}"'
return format_.format(url, LINK_FORMAT)
def make_timemap_link(self, url, coll='pywb'):
    """
    Build the expected rel="timemap" Link header entry for ``url``
    in the collection ``coll``.
    """
    template = '<http://localhost:80/{2}/timemap/*/{0}>; rel="timemap"; type="{1}"'
    timemap_link = template.format(url, LINK_FORMAT, coll)
    return timemap_link
def make_memento_link(self, url, ts, dt):
format_ = '<http://localhost:80/pywb/{1}/{0}>; rel="memento"; datetime="{2}"'
return format_.format(url, ts, dt)
def make_memento_link(self, url, ts, dt, coll='pywb'):
    """
    Build the expected rel="memento" Link header entry for ``url`` at
    timestamp ``ts`` with datetime string ``dt`` in the collection ``coll``.
    """
    template = '<http://localhost:80/{3}/{1}/{0}>; rel="memento"; datetime="{2}"'
    memento_link = template.format(url, ts, dt, coll)
    return memento_link
# Below functionality is for archival (non-proxy) mode
# It is designed to conform to Memento protocol Pattern 2.1
@ -57,7 +58,35 @@ class TestWb:
assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
def test_timegate_accept_datetime(self):
# timegate with latest memento, but redirect to current timestamp url instead of
# memento timestamp
def test_timegate_latest_request_timestamp(self):
    """
    TimeGate with no Accept-Datetime header
    """
    memento_dt = 'Mon, 27 Jan 2014 17:12:39 GMT'
    orig_url = 'http://www.iana.org/_css/2013.1/screen.css'
    coll = 'pywb-non-exact'

    resp = self.testapp.get('/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css')
    assert resp.status_int == 302
    assert resp.headers[VARY] == 'accept-datetime'

    links = self.get_links(resp)
    assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links

    expected_timemap = self.make_timemap_link(orig_url, coll=coll)
    assert expected_timemap in links

    # the memento link still carries the timestamp of the capture itself
    expected_memento = self.make_memento_link(orig_url, '20140127171239',
                                              memento_dt, coll=coll)
    assert expected_memento in links

    assert MEMENTO_DATETIME not in resp.headers

    location = resp.headers['Location']
    assert '/pywb-non-exact/' in location

    # the redirect target carries a 14-character timestamp
    # that is not later than the current time
    wburl = location.split('/pywb-non-exact/')[-1]
    ts = wburl.split('/')[0]
    assert len(ts) == 14
    assert timestamp_now() >= ts
def test_timegate_accept_datetime_exact(self):
"""
TimeGate with Accept-Datetime header, matching exactly
"""
@ -78,6 +107,28 @@ class TestWb:
assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
def test_timegate_accept_datetime_inexact(self):
    """
    TimeGate with Accept-Datetime header, not matching a memento exactly
    """
    memento_dt = 'Sun, 26 Jan 2014 20:08:04 GMT'
    accept_dt = 'Sun, 26 Jan 2014 20:08:00 GMT'
    orig_url = 'http://www.iana.org/_css/2013.1/screen.css'

    resp = self.testapp.get('/pywb//http://www.iana.org/_css/2013.1/screen.css',
                            headers={ACCEPT_DATETIME: accept_dt})
    assert resp.status_int == 302
    assert resp.headers[VARY] == 'accept-datetime'

    links = self.get_links(resp)
    assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
    assert self.make_timemap_link(orig_url) in links

    # the expected memento link must be the first Link entry
    expected_memento = self.make_memento_link(orig_url, '20140126200804', memento_dt)
    first_link = links[0]
    assert expected_memento == first_link, first_link

    assert MEMENTO_DATETIME not in resp.headers

    location = resp.headers['Location']
    assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in location
def test_non_timegate_intermediate_redir(self):
"""