1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch 'develop' for 0.7.6

This commit is contained in:
Ilya Kreymer 2015-01-26 10:38:35 -08:00
commit 8b5a6be956
13 changed files with 163 additions and 44 deletions

View File

@ -1,13 +1,26 @@
pywb 0.7.6 changelist
~~~~~~~~~~~~~~~~~~~~~
* new not found Jinja2 template: Add per-collection-overridable ``not_found.html`` template, specified via ``not_found_html`` option. For missing resources, the ``not_found_html`` template is now used instead of the generic ``error_html``
* client-side rewrite: improved wombat rewrite of postMessage events, unrewrite target on receive, improved Vine replay
* packaging: allow adding multiple packages for Jinja2 template resolving
pywb 0.7.5 changelist
~~~~~~~~~~~~~~~~~~~~~
* Cross platform fixes to support Windows -- all tests pass on Linux, OS X and Windows now. Improved cross-platform support includes:
- read all files as binary to avoid line ending issues
- properly convert url <-> file
- avoid platform dependent apis
- properly convert between platform dependent file paths and urls
- add .gitattributes to ensure line endings on *.warc*, *.arc*, *.cdx* files are unaltered
- avoid platform-dependent APIs (e.g. %s for strftime)
* Change any unhandled exceptions to result in a 500 error, instead of 400.
* Setup: switch to ``zip_safe=True`` to allow for embedding pywb egg in one-file app with `pyinstaller <https://github.com/pyinstaller/pyinstaller>`_
* More comprehensive client side ``src`` attribute rewriting (via wombat.js), additional server-side HTML tag rewriting.
@ -39,6 +52,7 @@ pywb 0.7.1 changelist
* Use `youtube-dl <http://rg3.github.io/youtube-dl/>`_ to find actual video streams from page urls, record video info.
* New, improved wombat 2.1 -- improved rewriting of dynamic content, including:
- setAttribute override
- Date override sets date to replay timestamp
- Image() object override

View File

@ -1,4 +1,4 @@
PyWb 0.7.5
PyWb 0.7.6
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master

View File

@ -126,7 +126,15 @@ rules:
- videoFileId
- signature
# vine
- url_prefix: 'co,vine,cdn,'
rewrite:
js_rewrite_location: urls
js_regexs:
- match: 'window.location'
replace: 'WB_wombat_location'
# youtube rules
#=================================================================

View File

@ -18,7 +18,7 @@ This file is part of pywb, https://github.com/ikreymer/pywb
*/
//============================================
// Wombat JS-Rewriting Library v2.1
// Wombat JS-Rewriting Library v2.2
//============================================
_WBWombat = (function() {
@ -68,6 +68,16 @@ _WBWombat = (function() {
return undefined;
}
//============================================
// Exact-match lookup: scans arr from the front and compares each entry
// to string with strict equality (no prefix or case-insensitive matching).
// Returns the first entry that matches, or undefined when nothing does.
function equals_any(string, arr) {
    var idx = 0;

    while (idx < arr.length) {
        var candidate = arr[idx];

        if (candidate === string) {
            return candidate;
        }

        idx += 1;
    }

    return undefined;
}
//============================================
function ends_with(str, suffix) {
if (str.indexOf(suffix, str.length - suffix.length) !== -1) {
@ -107,6 +117,8 @@ _WBWombat = (function() {
var SRC_TAGS = ["IMG", "SCRIPT", "VIDEO", "AUDIO", "SOURCE", "EMBED", "INPUT"];
var REWRITE_ATTRS = ["src", "href", "poster"];
//============================================
function rewrite_url_(url) {
// If undefined, just return it
@ -158,12 +170,19 @@ _WBWombat = (function() {
return wb_replay_date_prefix + wb_orig_host + url;
}
// If full url starting with http://, add prefix
// If full url starting with http://, https:// or //
// add rewrite prefix
var prefix = starts_with(url, VALID_PREFIXES);
if (prefix) {
// if already rewriting url, must still check scheme
if (starts_with(url, prefix + window.location.host + '/')) {
var curr_scheme = window.location.protocol + '//';
// replace scheme to ensure using the correct server scheme
if (starts_with(url, wb_orig_scheme) && (wb_orig_scheme != curr_scheme)) {
url = curr_scheme + url.substring(wb_orig_scheme.length);
}
return url;
}
return wb_replay_date_prefix + url;
@ -254,7 +273,11 @@ _WBWombat = (function() {
});
return true;
} catch (e) {
console.log(e);
var info = "Can't redefine prop " + prop;
if (obj && obj.tagName) {
info += " on " + obj.tagName;
}
console.log(info);
obj[prop] = value;
return false;
}
@ -480,13 +503,6 @@ _WBWombat = (function() {
async = true;
}
// extra check for correct scheme here.. maybe move to rewrite_url..
var curr_scheme = window.location.protocol + '//';
if (starts_with(url, wb_orig_scheme) && (wb_orig_scheme != curr_scheme)) {
url = curr_scheme + url.substring(wb_orig_scheme.length);
}
return orig.call(this, method, url, async, user, password);
}
@ -507,7 +523,7 @@ _WBWombat = (function() {
Element.prototype.setAttribute = function(name, value) {
if (name) {
var lowername = name.toLowerCase();
if (lowername == "src" || lowername == "href") {
if (equals_any(lowername, REWRITE_ATTRS)) {
if (!this._no_rewrite) {
value = rewrite_url(value);
}
@ -518,6 +534,21 @@ _WBWombat = (function() {
};
}
//============================================
// Wraps document.createElementNS so the namespace URI is passed through
// extract_orig() before the native call, ensuring namespace urls are NOT
// left in rewritten form.
// The native function is kept on document._orig_createElementNS.
// Does nothing when the document has no createElementNS.
function init_createElementNS_fix()
{
    if (!document.createElementNS) {
        return;
    }

    document._orig_createElementNS = document.createElementNS;

    // options is the optional third argument of createElementNS; it is
    // forwarded untouched so callers that pass it are not silently ignored.
    document.createElementNS = function(namespaceURI, qualifiedName, options)
    {
        namespaceURI = extract_orig(namespaceURI);

        // Only pass a third argument when the caller supplied one, so the
        // common two-argument call reaches the native function unchanged.
        if (arguments.length > 2) {
            return document._orig_createElementNS(namespaceURI, qualifiedName, options);
        }

        return document._orig_createElementNS(namespaceURI, qualifiedName);
    };
}
//============================================
function init_image_override() {
window.__Image = window.Image;
@ -612,8 +643,6 @@ _WBWombat = (function() {
//============================================
function rewrite_style(value)
{
//console.log("style rewrite: " + value);
STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/g;
function style_replacer(match, n1, n2, n3, offset, string) {
@ -694,7 +723,7 @@ _WBWombat = (function() {
}
override_attr(created, "src");
} else if (created.tagName && starts_with(created.tagName, SRC_TAGS)) {
} else if (created.tagName && equals_any(created.tagName, SRC_TAGS)) {
override_attr(created, "src");
}
@ -710,22 +739,28 @@ _WBWombat = (function() {
//============================================
function init_postmessage_override()
{
if (!Window.prototype.postMessage) {
if (!window.postMessage) {
return;
}
var orig = Window.prototype.postMessage;
var orig = window.postMessage;
var postmessage_rewritten = function(message, targetOrigin, transfer) {
message = {"origin": targetOrigin, "message": message};
if (targetOrigin && targetOrigin != "*") {
targetOrigin = window.location.origin;
}
return orig.call(this, message, targetOrigin, transfer);
}
window.postMessage = postmessage_rewritten;
window.Window.prototype.postMessage = postmessage_rewritten;
if (Window.prototype.postMessage) {
window.Window.prototype.postMessage = postmessage_rewritten;
}
for (var i = 0; i < window.frames.length; i++) {
try {
@ -734,6 +769,30 @@ _WBWombat = (function() {
console.log(e);
}
}
// "message" listeners registered on the window are wrapped so that they
// receive the message as it was originally posted: the rewritten
// postMessage sends an envelope {"origin": ..., "message": ...}, and the
// wrapper unpacks it into a fresh MessageEvent before calling the listener.
// NOTE(review): because the registered function is a wrapper, calling
// removeEventListener() with the original listener will not detach it --
// confirm whether callers depend on that.
window._orig_addEventListener = window.addEventListener;

window.addEventListener = function(type, listener, useCapture) {
    // a null/undefined listener is passed straight through, unwrapped
    if (type == "message" && listener) {
        var orig_listener = listener;

        listener = function(event) {
            var ne = event;
            var data = event.data;

            // Only unpack data that has the envelope shape; anything else
            // (null, primitives, unwrapped objects) is delivered unchanged
            // instead of throwing or losing its payload.
            if (data && typeof(data) == "object" &&
                ("message" in data) && ("origin" in data)) {
                ne = new MessageEvent("message",
                                      {"bubbles": event.bubbles,
                                       "cancelable": event.cancelable,
                                       "data": data.message,
                                       "origin": data.origin,
                                       "lastEventId": event.lastEventId,
                                       "source": event.source,
                                       "ports": event.ports});
            }

            // listeners may be plain functions or objects with handleEvent();
            // functions keep the "this" they would get from native dispatch
            if (typeof(orig_listener) == "function") {
                return orig_listener.call(this, ne);
            }

            return orig_listener.handleEvent(ne);
        };
    }

    return window._orig_addEventListener(type, listener, useCapture);
};
}
//============================================
@ -882,6 +941,7 @@ _WBWombat = (function() {
document.WB_wombat_domain = orig_host;
document.WB_wombat_referrer = extract_orig(document.referrer);
// History
copy_history_func(window.history, 'pushState');
copy_history_func(window.history, 'replaceState');
@ -902,6 +962,9 @@ _WBWombat = (function() {
// setAttribute
init_setAttribute_override();
// ensure namespace urls are NOT rewritten
init_createElementNS_fix();
// Image
init_image_override();

View File

@ -9,10 +9,3 @@
</pre>
</p>
{% endif %}
{% if env.pywb_proxy_magic and err_url and status == '404 Not Found' %}
<p>
<a href="//select.{{ env.pywb_proxy_magic }}/{{ err_url }}">Try Different Collection</a>
</p>
{% endif %}

10
pywb/ui/not_found.html Normal file
View File

@ -0,0 +1,10 @@
<h2>Url Not Found</h2>
The url <b>{{ url }}</b> could not be found in this collection.
{% if env.pywb_proxy_magic and url %}
<p>
<a href="//select.{{ env.pywb_proxy_magic }}/{{ url }}">Try Different Collection</a>
</p>
{% endif %}

View File

@ -24,7 +24,8 @@ function ts_to_date(ts, is_gmt)
</script>
</head>
<body>
<h2>pywb Sample Calendar Results</h2>
<h2>pywb Query Results</h2>
{% if cdx_lines | length > 0 %}
<b>{{ cdx_lines | length }}</b> captures of <b>{{ url }}</b>
<table id="captures" style="border-spacing: 10px;">
<tr>
@ -47,5 +48,8 @@ function ts_to_date(ts, is_gmt)
<p>
<i><b>* Unique captures are bold.</b> Other captures are duplicates of a previous capture.</i>
</p>
{% else %}
<i>No captures found for <b>{{ url }}</b></i>
{% endif %}
</body>
</html>

View File

@ -49,12 +49,9 @@ class SearchPageWbUrlHandler(WbUrlHandler):
self.banner_html = None
def render_search_page(self, wbrequest, **kwargs):
if self.search_view:
return self.search_view.render_response(wbrequest=wbrequest,
prefix=wbrequest.wb_prefix,
**kwargs)
else:
return WbResponse.text_response('No Lookup Url Specified')
return self.search_view.render_response(wbrequest=wbrequest,
prefix=wbrequest.wb_prefix,
**kwargs)
def __call__(self, wbrequest):
# root search page
@ -110,6 +107,9 @@ class WBHandler(SearchPageWbUrlHandler):
super(WBHandler, self).__init__(config)
self.index_reader = query_handler
self.not_found_view = (J2TemplateView.
create_template(config.get('not_found_html'),
'Not Found Error'))
cookie_maker = config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
@ -152,12 +152,19 @@ class WBHandler(SearchPageWbUrlHandler):
cdx_callback)
def handle_not_found(self, wbrequest, nfe):
if (not self.fallback_handler or
wbrequest.wb_url.is_query() or
wbrequest.wb_url.is_identity):
raise
# check fallback: only for replay queries and not for identity
if (self.fallback_handler and
not wbrequest.wb_url.is_query() and
not wbrequest.wb_url.is_identity):
return self.fallback_handler(wbrequest)
return self.fallback_handler(wbrequest)
# if capture query, just return capture page
if wbrequest.wb_url.is_query():
return self.index_reader.make_cdx_response(wbrequest, [], 'html')
else:
return self.not_found_view.render_response(status='404 Not Found',
env=wbrequest.env,
url=wbrequest.wb_url.url)
def __str__(self):
return 'Web Archive Replay Handler'

View File

@ -34,6 +34,7 @@ DEFAULTS = {
'search_html': 'ui/search.html',
'home_html': 'ui/index.html',
'error_html': 'ui/error.html',
'not_found_html': 'ui/not_found.html',
'proxy_select_html': 'ui/proxy_select.html',
'proxy_cert_download_html': 'ui/proxy_cert_download.html',

View File

@ -61,7 +61,7 @@ def is_wb_handler(obj):
#=================================================================
class J2TemplateView(object):
env_globals = {'static_path': 'static/default',
'package': 'pywb'}
'packages': ['pywb']}
def __init__(self, filename):
template_dir, template_file = path.split(filename)
@ -84,8 +84,11 @@ class J2TemplateView(object):
# add relative and absolute path loaders for banner support
loaders.append(FileSystemLoader('.'))
loaders.append(FileSystemLoader('/'))
loaders.append(PackageLoader(self.env_globals['package'],
template_dir))
# add loaders for all specified packages
for package in self.env_globals['packages']:
loaders.append(PackageLoader(package,
template_dir))
return loaders
def render_to_string(self, **kwargs):

View File

@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.7.5',
version='0.7.6',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',

View File

@ -86,6 +86,10 @@ home_html: ui/index.html
# if omitted, a text response is returned
error_html: ui/error.html
# template for 404 not found error, may be customized per collection
not_found_html: ui/not_found.html
# ==== Other Paths ====
# Rewrite urls with absolute paths instead of relative

View File

@ -77,6 +77,13 @@ class TestWb:
# 17 Captures + header
assert len(resp.html.find_all('tr')) == 18
def test_calendar_not_found(self):
# query with no results
resp = self.testapp.get('/pywb/*/http://not-exist.example.com')
self._assert_basic_html(resp)
assert 'No captures found' in resp.body, resp.body
assert len(resp.html.find_all('tr')) == 0
def test_cdx_query(self):
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
self._assert_basic_text(resp)
@ -374,6 +381,11 @@ class TestWb:
assert resp.status_int == 403
assert 'Excluded' in resp.body
def test_replay_not_found(self):
resp = self.testapp.head('/pywb/http://not-exist.example.com', status=404)
assert resp.content_type == 'text/html'
assert resp.status_int == 404
def test_static_content(self):
resp = self.testapp.get('/static/test/route/wb.css')
assert resp.status_int == 200