mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: improvements to non-exact replay mode, redir_to_exact option set to false
frames: add request_ts to wbinfo and use that as the timestamp in the top-frame. For exact replay, request_ts == timestamp. For latest replay / no timestamp / memento timegate, redirect to current time instead of time of last capture, while serving last capture. timeutils: add timestamp_now() function to return timestamp of current datetime. Add extra tests for this mode. Tracked via #72.
This commit is contained in:
parent
9623f95439
commit
80dcb6ff27
@ -3,7 +3,6 @@ Fetch a url from live web and apply rewriting rules
|
||||
"""
|
||||
|
||||
import requests
|
||||
import datetime
|
||||
import mimetypes
|
||||
import logging
|
||||
import os
|
||||
@ -12,7 +11,7 @@ from urlparse import urlsplit
|
||||
|
||||
from pywb.utils.loaders import is_http, LimitReader, BlockLoader, to_file_url
|
||||
from pywb.utils.loaders import extract_client_cookie
|
||||
from pywb.utils.timeutils import datetime_to_timestamp
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.canonicalize import canonicalize
|
||||
|
||||
@ -205,7 +204,7 @@ class LiveRewriter(object):
|
||||
(status_headers, stream) = self.fetch_local_file(url)
|
||||
|
||||
if timestamp is None:
|
||||
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
|
||||
timestamp = timestamp_now()
|
||||
|
||||
cdx = {'urlkey': urlkey,
|
||||
'timestamp': timestamp,
|
||||
|
@ -61,6 +61,9 @@ class BaseWbUrl(object):
|
||||
def is_replay(self):
|
||||
return self.is_replay_type(self.type)
|
||||
|
||||
def is_latest_replay(self):
|
||||
return (self.type == BaseWbUrl.LATEST_REPLAY)
|
||||
|
||||
def is_query(self):
|
||||
return self.is_query_type(self.type)
|
||||
|
||||
|
@ -122,6 +122,7 @@ function notify_top() {
|
||||
if (window.__orig_parent && window.__orig_parent.update_wb_url) {
|
||||
window.__orig_parent.update_wb_url(window.WB_wombat_location.href,
|
||||
wbinfo.timestamp,
|
||||
wbinfo.request_ts,
|
||||
wbinfo.is_live);
|
||||
}
|
||||
|
||||
|
@ -42,7 +42,7 @@ function make_inner_url(url, ts)
|
||||
}
|
||||
}
|
||||
|
||||
function push_state(url, timestamp, capture_str, is_live) {
|
||||
function push_state(url, timestamp, request_ts, capture_str, is_live) {
|
||||
if (window.frames[0].WB_wombat_location) {
|
||||
var curr_href = window.frames[0].WB_wombat_location.href;
|
||||
|
||||
@ -54,8 +54,9 @@ function push_state(url, timestamp, capture_str, is_live) {
|
||||
|
||||
var state = {}
|
||||
state.timestamp = timestamp;
|
||||
state.outer_url = make_outer_url(url, state.timestamp);
|
||||
state.inner_url = make_inner_url(url, state.timestamp);
|
||||
state.request_ts = request_ts;
|
||||
state.outer_url = make_outer_url(url, state.request_ts);
|
||||
state.inner_url = make_inner_url(url, state.request_ts);
|
||||
state.url = url;
|
||||
state.capture_str = capture_str;
|
||||
state.is_live = is_live;
|
||||
@ -130,6 +131,7 @@ function extract_ts_cookie(value) {
|
||||
function iframe_loaded(event) {
|
||||
var url;
|
||||
var ts;
|
||||
var request_ts;
|
||||
var capture_str;
|
||||
var is_live = false;
|
||||
var iframe = window.frames[0];
|
||||
@ -142,6 +144,7 @@ function iframe_loaded(event) {
|
||||
|
||||
if (iframe.wbinfo) {
|
||||
ts = iframe.wbinfo.timestamp;
|
||||
request_ts = iframe.wbinfo.request_ts;
|
||||
is_live = iframe.wbinfo.is_live;
|
||||
} else {
|
||||
ts = extract_ts_cookie(iframe.document.cookie);
|
||||
@ -150,19 +153,20 @@ function iframe_loaded(event) {
|
||||
} else {
|
||||
ts = extract_ts(iframe.location.href);
|
||||
}
|
||||
request_ts = ts;
|
||||
}
|
||||
|
||||
update_wb_url(url, ts, is_live);
|
||||
update_wb_url(url, ts, request_ts, is_live);
|
||||
}
|
||||
|
||||
function update_wb_url(url, ts, is_live) {
|
||||
function update_wb_url(url, ts, request_ts, is_live) {
|
||||
if (curr_state.url == url && curr_state.timestamp == ts) {
|
||||
return;
|
||||
}
|
||||
|
||||
capture_str = _wb_js.ts_to_date(ts, true);
|
||||
|
||||
push_state(url, ts, capture_str, is_live);
|
||||
push_state(url, ts, request_ts, capture_str, is_live);
|
||||
}
|
||||
|
||||
// Load Banner
|
||||
|
@ -17,6 +17,7 @@
|
||||
wbinfo = {}
|
||||
wbinfo.url = "{{ cdx.url }}";
|
||||
wbinfo.timestamp = "{{ cdx.timestamp }}";
|
||||
wbinfo.request_ts = "{{ wbrequest.wb_url.timestamp }}";
|
||||
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
||||
wbinfo.mod = "{{ wbrequest.wb_url.mod }}";
|
||||
wbinfo.top_url = "{{ top_url }}";
|
||||
|
@ -72,6 +72,14 @@ def datetime_to_timestamp(the_datetime):
|
||||
return the_datetime.strftime(TIMESTAMP_14)
|
||||
|
||||
|
||||
def timestamp_now():
|
||||
"""
|
||||
>>> len(timestamp_now())
|
||||
14
|
||||
"""
|
||||
return datetime_to_timestamp(datetime.datetime.utcnow())
|
||||
|
||||
|
||||
def iso_date_to_timestamp(string):
|
||||
"""
|
||||
>>> iso_date_to_timestamp('2013-12-26T10:11:12Z')
|
||||
|
@ -7,6 +7,7 @@ from urlparse import urlsplit
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.wbexception import WbException, NotFoundException
|
||||
from pywb.utils.loaders import LimitReader
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.memento import MementoResponse
|
||||
@ -219,8 +220,11 @@ class ReplayView(object):
|
||||
if wbrequest.custom_params.get('noredir'):
|
||||
return None
|
||||
|
||||
is_memento_timegate = (wbrequest.options.get('is_timegate', False))
|
||||
redir_needed = is_memento_timegate
|
||||
is_timegate = (wbrequest.options.get('is_timegate', False))
|
||||
if not is_timegate:
|
||||
is_timegate = wbrequest.wb_url.is_latest_replay()
|
||||
|
||||
redir_needed = is_timegate
|
||||
|
||||
if not redir_needed and self.redir_to_exact:
|
||||
redir_needed = (cdx['timestamp'] != wbrequest.wb_url.timestamp)
|
||||
@ -231,15 +235,20 @@ class ReplayView(object):
|
||||
if self.enable_range_cache and wbrequest.extract_range():
|
||||
return None
|
||||
|
||||
if is_timegate and not self.redir_to_exact:
|
||||
timestamp = timestamp_now()
|
||||
else:
|
||||
timestamp = cdx['timestamp']
|
||||
|
||||
new_url = (wbrequest.urlrewriter.
|
||||
get_new_url(timestamp=cdx['timestamp'],
|
||||
get_new_url(timestamp=timestamp,
|
||||
url=cdx['original']))
|
||||
|
||||
if wbrequest.method == 'POST':
|
||||
# FF shows a confirm dialog, so can't use 307 effectively
|
||||
# was: statusline = '307 Same-Method Internal Redirect'
|
||||
return None
|
||||
elif is_memento_timegate:
|
||||
elif is_timegate:
|
||||
statusline = '302 Found'
|
||||
else:
|
||||
# clear cdx line to indicate internal redirect
|
||||
|
@ -40,7 +40,13 @@ collections:
|
||||
|
||||
pywb-norange:
|
||||
index_paths: ./sample_archive/cdx/
|
||||
enable_ranges: False
|
||||
enable_ranges: false
|
||||
|
||||
pywb-non-exact:
|
||||
index_paths: ./sample_archive/cdx/
|
||||
redir_to_exact: false
|
||||
|
||||
|
||||
|
||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||
|
@ -6,6 +6,10 @@
|
||||
collections:
|
||||
pywb: ./sample_archive/cdx/
|
||||
|
||||
pywb-non-exact:
|
||||
index_paths: ./sample_archive/cdx/
|
||||
redir_to_exact: false
|
||||
|
||||
archive_paths: ['./sample_archive/warcs/']
|
||||
|
||||
# Test memento
|
||||
|
@ -4,7 +4,7 @@ import base64
|
||||
from pywb.webapp.pywb_init import create_wb_router
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
class TestWb:
|
||||
TEST_CONFIG = 'tests/test_config.yaml'
|
||||
@ -256,14 +256,24 @@ class TestWb:
|
||||
assert resp.content_length == 0
|
||||
assert resp.content_type == 'application/x-javascript'
|
||||
|
||||
def test_redirect_1(self):
|
||||
def test_redirect_exact(self):
|
||||
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
|
||||
assert resp.status_int == 302
|
||||
|
||||
assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org')
|
||||
|
||||
def test_no_redirect_non_exact(self):
|
||||
# non-exact mode, don't redirect to exact capture
|
||||
resp = self.testapp.get('/pywb-non-exact/20140127171237/http://www.iana.org/')
|
||||
assert resp.status_int == 200
|
||||
|
||||
def test_redirect_replay_2(self):
|
||||
self._assert_basic_html(resp)
|
||||
assert '"20140127171237"' in resp.body
|
||||
# actual timestamp set in JS
|
||||
assert 'timestamp = "20140127171238"' in resp.body
|
||||
assert '/pywb-non-exact/20140127171237/http://www.iana.org/about/' in resp.body
|
||||
|
||||
def test_redirect_latest_replay(self):
|
||||
resp = self.testapp.get('/pywb/http://example.com/')
|
||||
assert resp.status_int == 302
|
||||
|
||||
@ -275,6 +285,26 @@ class TestWb:
|
||||
assert '"20140127171251"' in resp.body
|
||||
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
|
||||
|
||||
def test_redirect_non_exact_latest_replay_ts(self):
|
||||
resp = self.testapp.get('/pywb-non-exact/http://example.com/')
|
||||
assert resp.status_int == 302
|
||||
|
||||
assert resp.headers['Location'].endswith('/http://example.com')
|
||||
|
||||
# extract ts, which should be current time
|
||||
ts = resp.headers['Location'].rsplit('/http://')[0].rsplit('/', 1)[-1]
|
||||
assert len(ts) == 14, ts
|
||||
resp = resp.follow()
|
||||
|
||||
self._assert_basic_html(resp)
|
||||
|
||||
# ensure the current ts is present in the links
|
||||
assert '"{0}"'.format(ts) in resp.body
|
||||
assert '/pywb-non-exact/{0}/http://www.iana.org/domains/example'.format(ts) in resp.body
|
||||
|
||||
# ensure ts is current ts
|
||||
assert timestamp_now() >= ts, ts
|
||||
|
||||
def test_redirect_relative_3(self):
|
||||
# webtest uses Host: localhost:80 by default
|
||||
# first two requests should result in same redirect
|
||||
|
@ -3,6 +3,7 @@ import re
|
||||
from pywb.webapp.pywb_init import create_wb_router
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.utils.timeutils import timestamp_now
|
||||
|
||||
MEMENTO_DATETIME = 'Memento-Datetime'
|
||||
ACCEPT_DATETIME = 'Accept-Datetime'
|
||||
@ -23,13 +24,13 @@ class TestWb:
|
||||
def get_links(self, resp):
|
||||
return map(lambda x: x.strip(), re.split(', (?![0-9])', resp.headers[LINK]))
|
||||
|
||||
def make_timemap_link(self, url):
|
||||
format_ = '<http://localhost:80/pywb/timemap/*/{0}>; rel="timemap"; type="{1}"'
|
||||
return format_.format(url, LINK_FORMAT)
|
||||
def make_timemap_link(self, url, coll='pywb'):
|
||||
format_ = '<http://localhost:80/{2}/timemap/*/{0}>; rel="timemap"; type="{1}"'
|
||||
return format_.format(url, LINK_FORMAT, coll)
|
||||
|
||||
def make_memento_link(self, url, ts, dt):
|
||||
format_ = '<http://localhost:80/pywb/{1}/{0}>; rel="memento"; datetime="{2}"'
|
||||
return format_.format(url, ts, dt)
|
||||
def make_memento_link(self, url, ts, dt, coll='pywb'):
|
||||
format_ = '<http://localhost:80/{3}/{1}/{0}>; rel="memento"; datetime="{2}"'
|
||||
return format_.format(url, ts, dt, coll)
|
||||
|
||||
# Below functionality is for archival (non-proxy) mode
|
||||
# It is designed to conform to Memento protocol Pattern 2.1
|
||||
@ -57,7 +58,35 @@ class TestWb:
|
||||
assert '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
|
||||
|
||||
def test_timegate_accept_datetime(self):
|
||||
# timegate with latest memento, but redirect to current timestamp url instead of
|
||||
# memento timestamp
|
||||
def test_timegate_latest_request_timestamp(self):
|
||||
"""
|
||||
TimeGate with no Accept-Datetime header
|
||||
"""
|
||||
|
||||
dt = 'Mon, 27 Jan 2014 17:12:39 GMT'
|
||||
resp = self.testapp.get('/pywb-non-exact/http://www.iana.org/_css/2013.1/screen.css')
|
||||
|
||||
assert resp.status_int == 302
|
||||
|
||||
assert resp.headers[VARY] == 'accept-datetime'
|
||||
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css', coll='pywb-non-exact') in links
|
||||
assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140127171239', dt, coll='pywb-non-exact') in links
|
||||
|
||||
assert MEMENTO_DATETIME not in resp.headers
|
||||
|
||||
assert '/pywb-non-exact/' in resp.headers['Location']
|
||||
|
||||
wburl = resp.headers['Location'].split('/pywb-non-exact/')[-1]
|
||||
ts = wburl.split('/')[0]
|
||||
assert len(ts) == 14
|
||||
assert timestamp_now() >= ts
|
||||
|
||||
def test_timegate_accept_datetime_exact(self):
|
||||
"""
|
||||
TimeGate with Accept-Datetime header, matching exactly
|
||||
"""
|
||||
@ -78,6 +107,28 @@ class TestWb:
|
||||
|
||||
assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
|
||||
def test_timegate_accept_datetime_inexact(self):
|
||||
"""
|
||||
TimeGate with Accept-Datetime header, not matching a memento exactly
|
||||
"""
|
||||
dt = 'Sun, 26 Jan 2014 20:08:04 GMT'
|
||||
request_dt = 'Sun, 26 Jan 2014 20:08:00 GMT'
|
||||
headers = {ACCEPT_DATETIME: request_dt}
|
||||
resp = self.testapp.get('/pywb//http://www.iana.org/_css/2013.1/screen.css', headers=headers)
|
||||
|
||||
assert resp.status_int == 302
|
||||
|
||||
assert resp.headers[VARY] == 'accept-datetime'
|
||||
|
||||
links = self.get_links(resp)
|
||||
assert '<http://www.iana.org/_css/2013.1/screen.css>; rel="original"' in links
|
||||
assert self.make_timemap_link('http://www.iana.org/_css/2013.1/screen.css') in links
|
||||
assert self.make_memento_link('http://www.iana.org/_css/2013.1/screen.css', '20140126200804', dt) == links[0], links[0]
|
||||
|
||||
assert MEMENTO_DATETIME not in resp.headers
|
||||
|
||||
assert '/pywb/20140126200804/http://www.iana.org/_css/2013.1/screen.css' in resp.headers['Location']
|
||||
|
||||
|
||||
def test_non_timegate_intermediate_redir(self):
|
||||
"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user