1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

replay: support 'framed_replay' option in config for both replay and live rewrite

split replay view into BaseContentView and ReplayView
refactor RewriteLiveHandler into RewriteLiveView
add additional tests for framed and non-framed mode
default to framed replay!
This commit is contained in:
Ilya Kreymer 2014-06-14 18:26:19 -07:00
parent d21f8079ca
commit 80e80e97d3
9 changed files with 237 additions and 152 deletions

View File

@ -104,3 +104,5 @@ enable_memento: true
# Use lxml parser, if available
use_lxml_parser: false
# Replay content in an iframe
framed_replay: true

View File

@ -17,9 +17,12 @@ This file is part of pywb.
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/
_wb_js = (function() {
function init_banner() {
var PLAIN_BANNER_ID = "_wb_plain_banner";
var FRAME_BANNER_ID = "_wb_frame_top_banner";
var bid;
if (wbinfo.is_embed) {
return;
@ -44,7 +47,7 @@ function init_banner() {
text = "This is an archived page ";
if (wbinfo && wbinfo.capture_str) {
text += " from <b>" + wbinfo.capture_str + "</b>";
text += " from <b id='_wb_capture_info'>" + wbinfo.capture_str + "</b>";
}
banner.innerHTML = text;
@ -76,31 +79,51 @@ function remove_event(name, func, object) {
}
}
var notified_top = false;
var detect_on_init = function() {
if (!notified_top && window && window.top && (window.self != window.top) && window.WB_wombat_location) {
if (!wbinfo.is_embed) {
window.top.postMessage(window.WB_wombat_location.href, "*");
}
notified_top = true;
function notify_top(event) {
if (window.self == window.top) {
return;
}
if (document.readyState === "interactive" ||
document.readyState === "complete") {
if (window.top.top != window.top) {
return;
}
if (!window.WB_wombat_location) {
return;
}
if (wbinfo.is_embed) {
return;
}
if (event.target != window.document) {
return;
}
if (typeof(window.WB_wombat_location.href) != "string") {
return;
}
if (window.top.update_wb_url) {
window.top.update_wb_url(window.WB_wombat_location.href, wbinfo.timestamp, wbinfo.capture_str);
}
}
var detect_on_init = function(event) {
init_banner();
notify_top(event);
remove_event("readystatechange", detect_on_init, document);
}
}
add_event("readystatechange", detect_on_init, document);
if (wbinfo.is_frame_mp && wbinfo.canon_url &&
(window.self == window.top) &&
(window.self == window.top) && (window.self.top == window.top) &&
window.location.href != wbinfo.canon_url) {
window.location.replace(wbinfo.canon_url);
}
})();

View File

@ -3,7 +3,8 @@
<!-- Start WB Insert -->
<script>
wbinfo = {}
wbinfo.capture_str = "{{ timestamp | format_ts }}";
// wbinfo.capture_str = "{{ timestamp | format_ts }}";
wbinfo.capture_str = " ";
wbinfo.is_embed = false;
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.capture_url = "{{ url }}";
@ -12,27 +13,49 @@
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<script>
window.addEventListener("message", update_url, false);
var update_wb_url = push_state;
function push_state(url) {
state = {}
state.outer_url = wbinfo.prefix + url;
state.inner_url = wbinfo.prefix + "mp_/" + url;
if (url == wbinfo.capture_url) {
return;
// Compose the "outer" replay URL: wbinfo.prefix, followed by "<ts>/" when a
// timestamp is supplied, followed by the target url.
function make_outer_url(url, ts)
{
    // No (or empty) timestamp: the url hangs directly off the prefix.
    if (!ts) {
        return wbinfo.prefix + url;
    }

    return wbinfo.prefix + ts + "/" + url;
}
// Compose the "inner" replay URL: wbinfo.prefix, then the optional timestamp
// immediately followed by the "mp_/" modifier segment, then the target url.
function make_inner_url(url, ts)
{
    // No (or empty) timestamp: only the "mp_/" modifier precedes the url.
    if (!ts) {
        return wbinfo.prefix + "mp_/" + url;
    }

    return wbinfo.prefix + ts + "mp_/" + url;
}
// Replace the current history entry with a state object describing the given
// url/timestamp (outer URL, inner URL and capture string), then refresh the
// capture-info text via update_status().
function push_state(url, timestamp, capture_str) {
    var state = {
        outer_url: make_outer_url(url, timestamp),
        inner_url: make_inner_url(url, timestamp),
        capture_str: capture_str
    };

    // The entry is replaced unconditionally -- there is no early return when
    // url equals wbinfo.capture_url.
    window.history.replaceState(state, "", state.outer_url);

    update_status(capture_str);
}
function pop_state(url) {
window.frames[0].src = url;
function pop_state(state) {
update_status(state.capture_str);
window.frames[0].src = state.outer_url;
}
function update_url(event) {
if (event.source == window.frames[0]) {
push_state(event.data);
// Write str into the element with id "_wb_capture_info" (as innerHTML).
// Does nothing when no such element is present in the document.
function update_status(str) {
    var capture_info = document.getElementById("_wb_capture_info");

    if (!capture_info) {
        return;
    }

    capture_info.innerHTML = str;
}
@ -40,7 +63,7 @@ window.onpopstate = function(event) {
var curr_state = event.state;
if (curr_state) {
pop_state(curr_state.outer_url);
pop_state(curr_state);
}
}

View File

@ -12,6 +12,7 @@
{% endif %}
<script>
wbinfo = {}
wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.capture_str = "{{ cdx.timestamp | format_ts }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};

View File

@ -2,75 +2,24 @@ from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from handlers import StaticHandler
from pywb.utils.canonicalize import canonicalize
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.rewrite.rewriterules import use_lxml_parser
import datetime
from views import J2TemplateView, HeadInsertView
from replay_views import RewriteLiveView
#=================================================================
class RewriteHandler(WbUrlHandler):
def __init__(self, config={}):
#use_lxml_parser()
self.rewriter = LiveRewriter(defmod='mp_')
view = config.get('head_insert_view')
if not view:
head_insert = config.get('head_insert_html',
'ui/head_insert.html')
view = HeadInsertView.create_template(head_insert, 'Head Insert')
self.head_insert_view = view
view = config.get('frame_insert_view')
if not view:
frame_insert = config.get('frame_insert_html',
'ui/frame_insert.html')
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
self.frame_insert_view = view
def __init__(self, config=dict(framed_replay=True)):
self.rewrite_proxy_view = RewriteLiveView(config)
def __call__(self, wbrequest):
url = wbrequest.wb_url.url
if not wbrequest.wb_url.mod:
embed_url = wbrequest.wb_url.to_str(mod='mp_')
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
return self.frame_insert_view.render_response(embed_url=embed_url,
wbrequest=wbrequest,
timestamp=timestamp,
url=url)
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
head_insert_func=head_insert_func,
env=wbrequest.env)
status_headers, gen, is_rewritten = result
return WbResponse(status_headers, gen)
return self.rewrite_proxy_view(wbrequest)
#=================================================================
# Build and return an ArchivalRouter with two routes:
#   'rewrite'        -> a RewriteHandler constructed with its default config
#   'static/default' -> a StaticHandler for the 'pywb/static/' path
def create_live_rewriter_app():
routes = [Route('rewrite', RewriteHandler()),
Route('static/default', StaticHandler('pywb/static/'))
]
# hostpaths is fixed to a single local address here
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])

View File

@ -9,7 +9,6 @@ from pywb.framework.basehandlers import BaseHandler
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewriterules import use_lxml_parser
from views import J2TemplateView, add_env_globals
@ -66,8 +65,7 @@ class DictChain:
#=================================================================
def create_wb_handler(query_handler, config,
ds_rules_file=DEFAULT_RULES_FILE):
def create_wb_handler(query_handler, config):
cookie_maker = config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
@ -81,28 +79,7 @@ def create_wb_handler(query_handler, config,
if template_globals:
add_env_globals(template_globals)
head_insert_view = (HeadInsertView.
create_template(config.get('head_insert_html'),
'Head Insert'))
defmod = config.get('default_mod', '')
replayer = ReplayView(
content_loader=resolving_loader,
content_rewriter=RewriteContent(ds_rules_file=ds_rules_file,
defmod=defmod),
head_insert_view=head_insert_view,
buffer_response=config.get('buffer_response', True),
redir_to_exact=config.get('redir_to_exact', True),
memento=config.get('enable_memento', False),
reporter=config.get('reporter')
)
replayer = ReplayView(resolving_loader, config)
search_view = (J2TemplateView.
create_template(config.get('search_html'),
@ -137,7 +114,7 @@ def init_collection(value, config):
ds_rules_file,
html_view)
return route_config, query_handler, ds_rules_file
return route_config, query_handler
#=================================================================
@ -167,7 +144,7 @@ def create_cdx_server_app(passed_config):
for name, value in collections.iteritems():
result = init_collection(value, config)
route_config, query_handler, ds_rules_file = result
route_config, query_handler = result
cdx_api_suffix = route_config.get('enable_cdx_api', True)
@ -210,12 +187,11 @@ def create_wb_router(passed_config={}):
continue
result = init_collection(value, config)
route_config, query_handler, ds_rules_file = result
route_config, query_handler = result
wb_handler = create_wb_handler(
query_handler=query_handler,
config=route_config,
ds_rules_file=ds_rules_file,
config=route_config
)
logging.debug('Adding Collection: ' + name)

View File

@ -1,15 +1,24 @@
import re
import datetime
from io import BytesIO
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.wbexception import WbException, NotFoundException
from pywb.utils.loaders import LimitReader
from pywb.utils.timeutils import datetime_to_timestamp
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.memento import MementoResponse
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewrite_live import LiveRewriter
from pywb.rewrite.wburl import WbUrl
from pywb.warc.recordloader import ArchiveLoadFailed
from views import J2TemplateView, add_env_globals
from views import J2HtmlCapturesView, HeadInsertView
#=================================================================
class CaptureException(WbException):
@ -23,33 +32,108 @@ class CaptureException(WbException):
#=================================================================
class ReplayView(object):
# Base class for content views that can optionally serve a top-level frame.
# __init__ reads the 'framed_replay' setting and prepares the insert views;
# __call__ either renders the frame page or delegates to render_content(),
# which is not defined on this class and is expected from subclasses.
class BaseContentView(object):
# config: dict-like settings object; the keys read here are 'framed_replay',
# 'head_insert_view', 'head_insert_html', 'frame_insert_view' and
# 'frame_insert_html'.
def __init__(self, config):
# Framed mode is off unless 'framed_replay' is set in config
self.is_frame_mode = config.get('framed_replay', False)
# Modifier used for the embedded content url: 'mp_' in frame mode, else empty
if self.is_frame_mode:
self._mp_mod = 'mp_'
else:
self._mp_mod = ''
# Use a pre-built head insert view if one is supplied; otherwise create one
# from the 'head_insert_html' template (default 'ui/head_insert.html')
view = config.get('head_insert_view')
if not view:
head_insert = config.get('head_insert_html',
'ui/head_insert.html')
view = HeadInsertView.create_template(head_insert, 'Head Insert')
self.head_insert_view = view
# No frame insert view is needed outside of frame mode
if not self.is_frame_mode:
self.frame_insert_view = None
return
# Same lookup for the frame insert view: a supplied view wins, otherwise one is
# created from 'frame_insert_html' (default 'ui/frame_insert.html')
view = config.get('frame_insert_view')
if not view:
frame_insert = config.get('frame_insert_html',
'ui/frame_insert.html')
view = J2TemplateView.create_template(frame_insert, 'Frame Insert')
self.frame_insert_view = view
# Handle a request: returns the rendered frame page when in frame mode, the
# request is not a proxy request and its wb_url has no modifier; otherwise
# returns whatever render_content(wbrequest, *args) returns.
def __call__(self, wbrequest, *args):
# render top level frame if in frame mode
# (not supported in proxy mode)
if (self.is_frame_mode and
not wbrequest.is_proxy and
not wbrequest.wb_url.mod):
# URL for the embedded content: the same wb_url with the frame-mode modifier
embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod)
# The timestamp passed to the template is the current UTC time
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
url = wbrequest.wb_url.url
return self.frame_insert_view.render_response(embed_url=embed_url,
wbrequest=wbrequest,
timestamp=timestamp,
url=url)
return self.render_content(wbrequest, *args)
#=================================================================
# Content view that fetches the requested url through a LiveRewriter and
# returns the result as a WbResponse.
class RewriteLiveView(BaseContentView):
def __init__(self, config):
super(RewriteLiveView, self).__init__(config)
# The rewriter's default modifier follows the frame-mode setting
# established by the base class ('mp_' or '')
self.rewriter = LiveRewriter(defmod=self._mp_mod)
# Fetch wbrequest.wb_url.url via the rewriter and wrap the returned status
# headers and content generator in a WbResponse. Extra *args are not used.
def render_content(self, wbrequest, *args):
head_insert_func = self.head_insert_view.create_insert_func(wbrequest)
# If a referrer wb-url string can be extracted from the request, store its
# url in the request environment under 'REL_REFERER'
ref_wburl_str = wbrequest.extract_referrer_wburl_str()
if ref_wburl_str:
wbrequest.env['REL_REFERER'] = WbUrl(ref_wburl_str).url
url = wbrequest.wb_url.url
result = self.rewriter.fetch_request(url, wbrequest.urlrewriter,
head_insert_func=head_insert_func,
env=wbrequest.env)
# is_rewritten is unpacked but not used below
status_headers, gen, is_rewritten = result
return WbResponse(status_headers, gen)
#=================================================================
class ReplayView(BaseContentView):
STRIP_SCHEME = re.compile('^([\w]+:[/]*)?(.*?)$')
def __init__(self, content_loader, content_rewriter, head_insert_view=None,
redir_to_exact=True, buffer_response=False, reporter=None,
memento=False):
def __init__(self, content_loader, config):
super(ReplayView, self).__init__(config)
self.content_loader = content_loader
self.content_rewriter = content_rewriter
self.content_rewriter=RewriteContent(defmod=self._mp_mod)
self.head_insert_view = head_insert_view
self.buffer_response = config.get('buffer_response', True)
self.redir_to_exact = redir_to_exact
# buffer or stream rewritten response
self.buffer_response = buffer_response
self._reporter = reporter
self.redir_to_exact = config.get('redir_to_exact', True)
memento = config.get('enable_memento', False)
if memento:
self.response_class = MementoResponse
else:
self.response_class = WbResponse
def __call__(self, wbrequest, cdx_lines, cdx_loader):
self._reporter = config.get('reporter')
def render_content(self, wbrequest, *args):
last_e = None
first = True
cdx_lines = args[0]
cdx_loader = args[1]
# List of already failed w/arcs
failed_files = []

View File

@ -13,10 +13,18 @@ collections:
pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe'
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
pywb-filt:
index_paths: './sample_archive/cdx/'
filters: ['filename:dupe*']
pywb-nonframe:
index_paths: './sample_archive/cdx/'
framed_replay: false
# collection of non-surt CDX
pywb-nosurt: {'index_paths': './sample_archive/non-surt-cdx/', 'surt_ordered': False}
pywb-nosurt:
index_paths: './sample_archive/non-surt-cdx/'
surt_ordered: false
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
@ -101,6 +109,12 @@ reporter: !!python/object/new:tests.fixture.PrintReporter []
# custom rules for domain specific matching
#domain_specific_rules: rules.yaml
# Use lxml parser, if available
use_lxml_parser: false
# Replay content in an iframe
framed_replay: true
# ==== New / Experimental Settings ====
# Not yet production ready -- used primarily for testing

View File

@ -85,30 +85,43 @@ class TestWb:
actual_len = len(str(resp.body).rstrip().split('\n'))
assert actual_len == 3, actual_len
def test_replay_1(self):
def test_replay_top_frame(self):
resp = self.testapp.get('/pywb/20140127171238/http://www.iana.org/')
assert '<iframe ' in resp.body
assert '/pywb/20140127171238mp_/http://www.iana.org/' in resp.body
def test_replay_content(self):
resp = self.testapp.get('/pywb/20140127171238mp_/http://www.iana.org/')
self._assert_basic_html(resp)
assert 'Mon, Jan 27 2014 17:12:38' in resp.body
assert 'wb.js' in resp.body
assert '/pywb/20140127171238/http://www.iana.org/time-zones"' in resp.body
assert '/pywb/20140127171238mp_/http://www.iana.org/time-zones"' in resp.body
def test_replay_non_frame_content(self):
resp = self.testapp.get('/pywb-nonframe/20140127171238/http://www.iana.org/')
self._assert_basic_html(resp)
assert 'Mon, Jan 27 2014 17:12:38' in resp.body
assert 'wb.js' in resp.body
assert '/pywb-nonframe/20140127171238/http://www.iana.org/time-zones"' in resp.body
def test_replay_non_surt(self):
resp = self.testapp.get('/pywb-nosurt/20140103030321/http://example.com?example=1')
resp = self.testapp.get('/pywb-nosurt/20140103030321mp_/http://example.com?example=1')
self._assert_basic_html(resp)
assert 'Fri, Jan 03 2014 03:03:21' in resp.body
assert 'wb.js' in resp.body
assert '/pywb-nosurt/20140103030321/http://www.iana.org/domains/example' in resp.body
assert '/pywb-nosurt/20140103030321mp_/http://www.iana.org/domains/example' in resp.body
def test_replay_url_agnostic_revisit(self):
resp = self.testapp.get('/pywb/20130729195151/http://www.example.com/')
resp = self.testapp.get('/pywb/20130729195151mp_/http://www.example.com/')
self._assert_basic_html(resp)
assert 'Mon, Jul 29 2013 19:51:51' in resp.body
assert 'wb.js' in resp.body
assert '/pywb/20130729195151/http://www.iana.org/domains/example"' in resp.body
assert '/pywb/20130729195151mp_/http://www.iana.org/domains/example"' in resp.body
def test_replay_cdx_mod(self):
resp = self.testapp.get('/pywb/20140127171239cdx_/http://www.iana.org/_css/2013.1/print.css')
@ -164,42 +177,42 @@ class TestWb:
assert resp.content_type == 'application/x-javascript'
def test_redirect_1(self):
resp = self.testapp.get('/pywb/20140127171237/http://www.iana.org/')
resp = self.testapp.get('/pywb/20140127171237mp_/http://www.iana.org/')
assert resp.status_int == 302
assert resp.headers['Location'].endswith('/pywb/20140127171238/http://iana.org')
assert resp.headers['Location'].endswith('/pywb/20140127171238mp_/http://iana.org')
def test_redirect_replay_2(self):
resp = self.testapp.get('/pywb/http://example.com/')
resp = self.testapp.get('/pywb/mp_/http://example.com/')
assert resp.status_int == 302
assert resp.headers['Location'].endswith('/20140127171251/http://example.com')
assert resp.headers['Location'].endswith('/20140127171251mp_/http://example.com')
resp = resp.follow()
#check resp
self._assert_basic_html(resp)
assert 'Mon, Jan 27 2014 17:12:51' in resp.body
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
assert '/pywb/20140127171251mp_/http://www.iana.org/domains/example' in resp.body
def test_redirect_relative_3(self):
# first two requests should result in same redirect
target = 'http://localhost:8080/pywb/2014/http://iana.org/_css/2013.1/screen.css'
target = 'http://localhost:8080/pywb/2014mp_/http://iana.org/_css/2013.1/screen.css'
# without timestamp
resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014/http://iana.org/')])
resp = self.testapp.get('/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')])
assert resp.status_int == 302
assert resp.headers['Location'] == target, resp.headers['Location']
# with timestamp
resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014/http://iana.org/')])
resp = self.testapp.get('/2014/_css/2013.1/screen.css', headers = [('Referer', 'http://localhost:8080/pywb/2014mp_/http://iana.org/')])
assert resp.status_int == 302
assert resp.headers['Location'] == target, resp.headers['Location']
resp = resp.follow()
assert resp.status_int == 302
assert resp.headers['Location'].endswith('/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css')
assert resp.headers['Location'].endswith('/pywb/20140127171239mp_/http://www.iana.org/_css/2013.1/screen.css')
resp = resp.follow()
assert resp.status_int == 200
@ -207,7 +220,7 @@ class TestWb:
def test_referrer_self_redirect(self):
uri = '/pywb/20140127171239/http://www.iana.org/_css/2013.1/screen.css'
uri = '/pywb/20140127171239mp_/http://www.iana.org/_css/2013.1/screen.css'
host = 'somehost:8082'
referrer = 'http://' + host + uri
@ -221,7 +234,7 @@ class TestWb:
def test_post_1(self):
resp = self.testapp.post('/pywb/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
resp = self.testapp.post('/pywb/mp_/httpbin.org/post', {'foo': 'bar', 'test': 'abc'})
# no redirects for POST, as some browsers (FF) show modal confirmation dialog!
#assert resp.status_int == 307
@ -236,13 +249,13 @@ class TestWb:
assert '"test": "abc"' in resp.body
def test_post_2(self):
resp = self.testapp.post('/pywb/20140610001255/http://httpbin.org/post?foo=bar', {'data': '^'})
resp = self.testapp.post('/pywb/20140610001255mp_/http://httpbin.org/post?foo=bar', {'data': '^'})
assert resp.status_int == 200
assert '"data": "^"' in resp.body
def test_post_redirect(self):
# post handled without redirect (since 307 not allowed)
resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014/http://httpbin.org/post')])
resp = self.testapp.post('/post', {'foo': 'bar', 'test': 'abc'}, headers=[('Referer', 'http://localhost:8080/pywb/2014mp_/http://httpbin.org/post')])
assert resp.status_int == 200
assert '"foo": "bar"' in resp.body
assert '"test": "abc"' in resp.body