1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge branch '0.5.4-fixes' into develop

This commit is contained in:
Ilya Kreymer 2014-08-05 10:46:18 -07:00
commit 95c3f080c3
14 changed files with 148 additions and 83 deletions

View File

@ -1,11 +1,11 @@
PyWb 0.6.0
==========
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
:target: https://travis-ci.org/ikreymer/pywb
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy
:target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
:target: https://coveralls.io/r/ikreymer/pywb?branch=develop
pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'.

View File

@ -78,7 +78,8 @@ class HTMLRewriterMixin(object):
head_insert=None,
js_rewriter_class=JSRewriter,
css_rewriter_class=CSSRewriter,
defmod=''):
defmod='',
parse_comments=False):
self.url_rewriter = url_rewriter
self._wb_parse_context = None
@ -87,6 +88,8 @@ class HTMLRewriterMixin(object):
self.css_rewriter = css_rewriter_class(url_rewriter)
self.head_insert = head_insert
self.parse_comments = parse_comments
self.rewrite_tags = self._init_rewrite_tags(defmod)
# ===========================
@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def handle_comment(self, data):
self.out.write('<!--')
self.parse_data(data)
if self.parse_comments:
data = self._rewrite_script(data)
self.out.write(data)
else:
self.parse_data(data)
self.out.write('-->')
def handle_decl(self, data):

View File

@ -25,7 +25,7 @@ class RegexRewriter(object):
@staticmethod
def archival_rewrite(rewriter):
return lambda string: rewriter.rewrite(string, 'em_')
return lambda string: rewriter.rewrite(string, 'mp_')
#@staticmethod
#def replacer(other):
@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter):
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
"""
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules=[]):
rules = rules + [

View File

@ -122,6 +122,23 @@ class RewriteContent:
if head_insert_func:
head_insert_str = head_insert_func(rule, cdx)
head_insert_str = head_insert_str.encode('utf-8')
if wb_url.is_banner_only:
gen = self._head_insert_only_gen(head_insert_str, stream)
content_len = headers.get_header('Content-Length')
try:
content_len = int(content_len)
except Exception:
content_len = None
if content_len and content_len >= 0:
content_len = str(content_len + len(head_insert_str))
status_headers.replace_header('Content-Length',
content_len)
return (status_headers, gen, False)
if wb_url.is_banner_only:
gen = self._head_insert_only_gen(head_insert_str, stream)
@ -131,7 +148,8 @@ class RewriteContent:
js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'],
head_insert=head_insert_str,
defmod=self.defmod)
defmod=self.defmod,
parse_comments=rule.parse_comments)
else:
if wb_url.is_banner_only:
@ -165,7 +183,8 @@ class RewriteContent:
matcher = self.HEAD_REGEX.search(buff)
if matcher:
yield buff[:matcher.end()] + insert_str
yield buff[:matcher.end()]
yield insert_str
yield buff[matcher.end():]
else:
yield insert_str

View File

@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter
import itertools
HTML = HTMLRewriter
#=================================================================
class RewriteRules(BaseRule):
@ -23,8 +21,9 @@ class RewriteRules(BaseRule):
self.rewriters['header'] = config.get('header_class', HeaderRewriter)
self.rewriters['css'] = config.get('css_class', CSSRewriter)
self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
self.rewriters['html'] = config.get('html_class', HTML)
#self.rewriters['html'] = config.get('html_class', HTMLRewriter)
self.rewriters['html'] = config.get('html_class', HTMLRewriter)
self.parse_comments = config.get('parse_comments', False)
# Custom handling for js rewriting, often the most complex
self.js_rewrite_location = config.get('js_rewrite_location', True)

View File

@ -62,7 +62,7 @@ ur"""
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script>
<script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</script>
# Script tag + crossorigin
>>> parse('<script src="/js/scripts.js" crossorigin="anonymous"></script>')
@ -70,21 +70,21 @@ ur"""
# Unterminated script tag, handle and auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script>
<script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script>
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
<div style="background: url('/web/20131226101010mp_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
# Style
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
<style>@import "/web/20131226101010mp_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010mp_/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag, handle and auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style>
<style>@import url(/web/20131226101010mp_/http://example.com/some/path/styles.css)</style>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')

View File

@ -12,16 +12,16 @@ r"""
#=================================================================
>>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"'
'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"'
'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"'
'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'"
"WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
@ -37,94 +37,98 @@ r"""
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010em_/http://example.com/abc.html"'
'cool_Location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */'
'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment'
# document.cookie test
>>> _test_js('document.cookie = "a=b; Path=/"')
'document.WB_wombat_cookie = "a=b; Path=/"'
# js-escaped
>>> _test_js('&quot;http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;')
'&quot;/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;'
#=================================================================
# XML Rewriting
#=================================================================
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010em_/http://example.com"></tag>'
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010mp_/http://example.com"></tag>'
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010em_/http://example.com"></tag>'
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010mp_/http://example.com"></tag>'
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010em_/http://example.com<other>abchttp://example.com</other></tag>'
'<tag> /web/20131010mp_/http://example.com<other>abchttp://example.com</other></tag>'
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010em_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010em_/http://example.com </main>'
'<main> /web/20131010mp_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010mp_/http://example.com </main>'
#=================================================================
# CSS Rewriting
#=================================================================
>>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010em_/http://example.com/some/path.html')"
"background: url('/web/20131010mp_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')")
"background: url('/web/20131010em_/http://example.com/path.html')"
"background: url('/web/20131010mp_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010em_/http://domain.com/path.html")'
'background: url("/web/20131010mp_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010em_/http://example.com/file.jpeg)'
'background: url(/web/20131010mp_/http://example.com/file.jpeg)'
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')"
"background:#abc url('/web/20131010mp_/http://example.com/static/images/layout/logo.png')"
>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')")
"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')"
"background:#000 url('/web/20131010mp_/http://example.com/images/layout/logo.png')"
>>> _test_css("background: url('')")
"background: url('')"
>>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010em_/http://example.com/weirdpath\')'
'background: url ("/web/20131010mp_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010em_/http://example.com/path.css')"
"@import url ('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')")
"@import url('/web/20131010em_/http://example.com/path.css')"
"@import url('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010em_/http://example.com/path.css')"
"@import ( '/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"")
'@import "/web/20131010em_/http://example.com/path.css"'
'@import "/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010em_/http://example.com/path.css"'
'@import (\'/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010em_/http://example.com/url.css"'
'@import (\'/web/20131010mp_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010em_/http://example.com/url.css")'
'@import ("/web/20131010mp_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
'@import url(/web/20131010mp_/http://example.com/url.css)\n@import url(/web/20131010mp_/http://example.com/anotherurl.css)\n @import url(/web/20131010mp_/http://example.com/and_a_third.css)'
"""

View File

@ -31,6 +31,8 @@ rules:
- match: 'Bootloader\.configurePage.*?;'
replace: '/* {0} */'
parse_comments: true
# flickr rules
#=================================================================

View File

@ -30,10 +30,6 @@ function init_banner() {
var FRAME_BANNER_ID = "_wb_frame_top_banner";
var bid;
if (wbinfo.is_embed) {
return;
}
if (window.top != window.self) {
return;
}
@ -131,23 +127,42 @@ function remove_event(name, func, object) {
}
}
var detect_on_init = function(event) {
init_banner();
function notify_top() {
if (window.parent != window.top) {
return;
}
remove_event("readystatechange", detect_on_init, document);
if (!window.WB_wombat_location) {
return;
}
if (typeof(window.WB_wombat_location.href) != "string") {
return;
}
window.parent.update_wb_url(window.WB_wombat_location.href,
wbinfo.timestamp,
wbinfo.is_live);
remove_event("readystatechange", notify_top, document);
}
add_event("readystatechange", detect_on_init, document);
if (wbinfo.is_frame_mp && wbinfo.canon_url &&
(window.self == window.top) && (window.self.top == window.top) &&
window.location.href != wbinfo.canon_url) {
window.location.replace(wbinfo.canon_url);
if ((window.self == window.top) && wbinfo) {
if (wbinfo.canon_url && (window.location.href != wbinfo.canon_url) && wbinfo.mod != "bn_") {
// Auto-redirect to top frame
window.location.replace(wbinfo.canon_url);
} else {
// Init Banner (no frame or top frame)
add_event("readystatechange", init_banner, document);
}
} else if (window.self != window.parent && window.parent.update_wb_url) {
add_event("readystatechange", notify_top, document);
}
return {'labels': labels,
'ts_to_date': ts_to_date};
return {
'labels': labels,
'ts_to_date': ts_to_date
};
})();

View File

@ -1,7 +1,7 @@
var update_wb_url = push_state;
var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/;
var curr_state = {};
function make_outer_url(url, ts)
{
@ -22,6 +22,15 @@ function make_inner_url(url, ts)
}
function push_state(url, timestamp, capture_str, is_live) {
if (window.frames[0].WB_wombat_location) {
curr_href = window.frames[0].WB_wombat_location.href;
// If not current url, don't update
if (url != curr_href) {
return;
}
}
var state = {}
state.timestamp = timestamp;
state.outer_url = make_outer_url(url, state.timestamp);
@ -32,13 +41,13 @@ function push_state(url, timestamp, capture_str, is_live) {
window.history.replaceState(state, "", state.outer_url);
update_status(state.capture_str, is_live);
set_state(state);
}
function pop_state(state) {
update_status(state.capture_str, state.is_live);
set_state(state);
window.frames[0].src = state.outer_url;
window.frames[0].src = state.inner_url;
}
function extract_ts(url)
@ -66,15 +75,15 @@ function extract_replay_url(url) {
return url.substring(inx + 1);
}
function update_status(str, is_live) {
function set_state(state) {
var capture_info = document.getElementById("_wb_capture_info");
if (capture_info) {
capture_info.innerHTML = str;
capture_info.innerHTML = state.capture_str;
}
var label = document.getElementById("_wb_label");
if (label) {
if (is_live) {
if (state.is_live) {
label.innerHTML = _wb_js.labels.LIVE_MSG;
} else {
label.innerHTML = _wb_js.labels.REPLAY_MSG;
@ -83,10 +92,10 @@ function update_status(str, is_live) {
}
window.onpopstate = function(event) {
var curr_state = event.state;
var state = event.state;
if (curr_state) {
pop_state(curr_state);
if (state) {
pop_state(state);
}
}
@ -100,14 +109,14 @@ function extract_ts_cookie(value) {
}
function iframe_loaded(event) {
var iframe = window.frames[0];
var url;
var ts;
var capture_str;
var is_live = false;
var iframe = window.frames[0];
if (iframe.WB_wombat_location) {
url = window.WB_wombat_location.href;
url = iframe.WB_wombat_location.href;
} else {
url = extract_replay_url(iframe.location.href);
}
@ -123,7 +132,16 @@ function iframe_loaded(event) {
ts = extract_ts(iframe.location.href);
}
}
update_wb_url(url, ts, is_live);
}
function update_wb_url(url, ts, is_live) {
if (curr_state.url == url && curr_state.timestamp == ts) {
return;
}
capture_str = _wb_js.ts_to_date(ts, true);
update_wb_url(url, ts, capture_str, is_live);
push_state(url, ts, capture_str, is_live);
}

View File

@ -699,7 +699,7 @@ WB_wombat_init = (function() {
wb_replay_prefix = replay_prefix;
if (wb_replay_prefix) {
wb_replay_date_prefix = replay_prefix + capture_date + "em_/";
wb_replay_date_prefix = replay_prefix + capture_date + "mp_/";
if (capture_date.length > 0) {
wb_capture_date_part = "/" + capture_date + "/";

View File

@ -3,7 +3,6 @@
<!-- Start WB Insert -->
<script>
wbinfo = {}
wbinfo.is_embed = false;
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.capture_url = "{{ url }}";
wbinfo.is_frame = true;

View File

@ -15,12 +15,10 @@
wbinfo.url = "{{ cdx.original }}";
wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.mod = "{{ wbrequest.wb_url.mod }}";
wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
wbinfo.coll = "{{ wbrequest.coll }}";
wbinfo.proxy_magic = "{{ wbrequest.env.pywb_proxy_magic }}";
wbinfo.is_proxy_mode = {{ "true" if wbrequest.options.is_proxy else "false" }};
</script>
<script src='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/{{ static_path }}/wb.css'/>

View File

@ -34,7 +34,11 @@ class PyTest(TestCommand):
setup(
name='pywb',
<<<<<<< HEAD
version='0.6.0',
=======
version='0.5.3',
>>>>>>> 0.5.4-fixes
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',