diff --git a/README.rst b/README.rst
index 3640c69d..88d78f96 100644
--- a/README.rst
+++ b/README.rst
@@ -1,11 +1,11 @@
PyWb 0.6.0
==========
-.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy
+.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop
:target: https://travis-ci.org/ikreymer/pywb
-.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy
- :target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy
+.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
+ :target: https://coveralls.io/r/ikreymer/pywb?branch=develop
pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'.
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 5a316016..f7575fa5 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -78,7 +78,8 @@ class HTMLRewriterMixin(object):
head_insert=None,
js_rewriter_class=JSRewriter,
css_rewriter_class=CSSRewriter,
- defmod=''):
+ defmod='',
+ parse_comments=False):
self.url_rewriter = url_rewriter
self._wb_parse_context = None
@@ -87,6 +88,8 @@ class HTMLRewriterMixin(object):
self.css_rewriter = css_rewriter_class(url_rewriter)
self.head_insert = head_insert
+ self.parse_comments = parse_comments
+
self.rewrite_tags = self._init_rewrite_tags(defmod)
# ===========================
@@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def handle_comment(self, data):
self.out.write('')
def handle_decl(self, data):
diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py
index 3f440eee..df7a128e 100644
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@@ -25,7 +25,7 @@ class RegexRewriter(object):
@staticmethod
def archival_rewrite(rewriter):
- return lambda string: rewriter.rewrite(string, 'em_')
+ return lambda string: rewriter.rewrite(string, 'mp_')
#@staticmethod
#def replacer(other):
@@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter):
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
"""
- JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
+ JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules=[]):
rules = rules + [
diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py
index 2225bbaf..b7254f76 100644
--- a/pywb/rewrite/rewrite_content.py
+++ b/pywb/rewrite/rewrite_content.py
@@ -122,6 +122,23 @@ class RewriteContent:
if head_insert_func:
head_insert_str = head_insert_func(rule, cdx)
+ head_insert_str = head_insert_str.encode('utf-8')
+
+ if wb_url.is_banner_only:
+ gen = self._head_insert_only_gen(head_insert_str, stream)
+
+ content_len = headers.get_header('Content-Length')
+ try:
+ content_len = int(content_len)
+ except (TypeError, ValueError):  # header missing (None) or not numeric
+ content_len = None
+
+ if content_len and content_len >= 0:
+ content_len = str(content_len + len(head_insert_str))
+ status_headers.replace_header('Content-Length',
+ content_len)
+
+ return (status_headers, gen, False)
if wb_url.is_banner_only:
gen = self._head_insert_only_gen(head_insert_str, stream)
@@ -131,7 +148,8 @@ class RewriteContent:
js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'],
head_insert=head_insert_str,
- defmod=self.defmod)
+ defmod=self.defmod,
+ parse_comments=rule.parse_comments)
else:
if wb_url.is_banner_only:
@@ -165,7 +183,8 @@ class RewriteContent:
matcher = self.HEAD_REGEX.search(buff)
if matcher:
- yield buff[:matcher.end()] + insert_str
+ yield buff[:matcher.end()]
+ yield insert_str
yield buff[matcher.end():]
else:
yield insert_str
diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py
index d70d2d08..226942c5 100644
--- a/pywb/rewrite/rewriterules.py
+++ b/pywb/rewrite/rewriterules.py
@@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter
import itertools
-HTML = HTMLRewriter
-
#=================================================================
class RewriteRules(BaseRule):
@@ -23,8 +21,9 @@ class RewriteRules(BaseRule):
self.rewriters['header'] = config.get('header_class', HeaderRewriter)
self.rewriters['css'] = config.get('css_class', CSSRewriter)
self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
- self.rewriters['html'] = config.get('html_class', HTML)
- #self.rewriters['html'] = config.get('html_class', HTMLRewriter)
+ self.rewriters['html'] = config.get('html_class', HTMLRewriter)
+
+ self.parse_comments = config.get('parse_comments', False)
# Custom handling for js rewriting, often the most complex
self.js_rewrite_location = config.get('js_rewrite_location', True)
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index 9ea8edc0..f3a5d38d 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -62,7 +62,7 @@ ur"""
# Script tag
>>> parse('')
-
+
# Script tag + crossorigin
>>> parse('')
@@ -70,21 +70,21 @@ ur"""
# Unterminated script tag, handle and auto-terminate
>>> parse('
+
>>> parse('')
-
+
>>> parse('
')
-
+
# Style
>>> parse('')
-
+
# Unterminated style tag, handle and auto-terminate
>>> parse('
+
# Head Insertion
>>> parse('Test', head_insert = '')
diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py
index 3f3b4638..69a367a9 100644
--- a/pywb/rewrite/test/test_regex_rewriters.py
+++ b/pywb/rewrite/test/test_regex_rewriters.py
@@ -12,16 +12,16 @@ r"""
#=================================================================
>>> _test_js('location = "http://example.com/abc.html"')
-'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"'
+'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
-'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"'
+'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
-'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"'
+'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'")
-"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'"
+"WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
@@ -37,94 +37,98 @@ r"""
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"')
-'cool_Location = "/web/20131010em_/http://example.com/abc.html"'
+'cool_Location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
-'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
+'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
-'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */'
+'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
-'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
+'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment'
# document.cookie test
>>> _test_js('document.cookie = "a=b; Path=/"')
'document.WB_wombat_cookie = "a=b; Path=/"'
+# js-escaped
+>>> _test_js('"http:\\/\\/www.example.com\\/some\\/path\\/?query=1"')
+'"/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1"'
+
#=================================================================
# XML Rewriting
#=================================================================
>>> _test_xml('')
-''
+''
>>> _test_xml('')
-''
+''
>>> _test_xml(' http://example.comabchttp://example.com')
-' /web/20131010em_/http://example.comabchttp://example.com'
+' /web/20131010mp_/http://example.comabchttp://example.com'
>>> _test_xml(' http://www.example.com/blah http://example.com ')
-' /web/20131010em_/http://www.example.com/blah /web/20131010em_/http://example.com '
+' /web/20131010mp_/http://www.example.com/blah /web/20131010mp_/http://example.com '
#=================================================================
# CSS Rewriting
#=================================================================
>>> _test_css("background: url('/some/path.html')")
-"background: url('/web/20131010em_/http://example.com/some/path.html')"
+"background: url('/web/20131010mp_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')")
-"background: url('/web/20131010em_/http://example.com/path.html')"
+"background: url('/web/20131010mp_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")")
-'background: url("/web/20131010em_/http://domain.com/path.html")'
+'background: url("/web/20131010mp_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)")
-'background: url(/web/20131010em_/http://example.com/file.jpeg)'
+'background: url(/web/20131010mp_/http://example.com/file.jpeg)'
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
-"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')"
+"background:#abc url('/web/20131010mp_/http://example.com/static/images/layout/logo.png')"
>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')")
-"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')"
+"background:#000 url('/web/20131010mp_/http://example.com/images/layout/logo.png')"
>>> _test_css("background: url('')")
"background: url('')"
>>> _test_css("background: url (\"weirdpath\')")
-'background: url ("/web/20131010em_/http://example.com/weirdpath\')'
+'background: url ("/web/20131010mp_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')")
-"@import url ('/web/20131010em_/http://example.com/path.css')"
+"@import url ('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')")
-"@import url('/web/20131010em_/http://example.com/path.css')"
+"@import url('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')")
-"@import ( '/web/20131010em_/http://example.com/path.css')"
+"@import ( '/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"")
-'@import "/web/20131010em_/http://example.com/path.css"'
+'@import "/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"")
-'@import (\'/web/20131010em_/http://example.com/path.css"'
+'@import (\'/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"")
-'@import (\'/web/20131010em_/http://example.com/url.css"'
+'@import (\'/web/20131010mp_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")")
-'@import ("/web/20131010em_/http://example.com/url.css")'
+'@import ("/web/20131010mp_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
-'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
+'@import url(/web/20131010mp_/http://example.com/url.css)\n@import url(/web/20131010mp_/http://example.com/anotherurl.css)\n @import url(/web/20131010mp_/http://example.com/and_a_third.css)'
"""
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
index 03e2e33b..4e6ac514 100644
--- a/pywb/rules.yaml
+++ b/pywb/rules.yaml
@@ -31,6 +31,8 @@ rules:
- match: 'Bootloader\.configurePage.*?;'
replace: '/* {0} */'
+ parse_comments: true
+
# flickr rules
#=================================================================
diff --git a/pywb/static/wb.js b/pywb/static/wb.js
index 0511f983..203f0b6c 100644
--- a/pywb/static/wb.js
+++ b/pywb/static/wb.js
@@ -30,10 +30,6 @@ function init_banner() {
var FRAME_BANNER_ID = "_wb_frame_top_banner";
var bid;
- if (wbinfo.is_embed) {
- return;
- }
-
if (window.top != window.self) {
return;
}
@@ -131,23 +127,42 @@ function remove_event(name, func, object) {
}
}
-var detect_on_init = function(event) {
- init_banner();
+function notify_top() {
+ if (window.parent != window.top) {
+ return;
+ }
- remove_event("readystatechange", detect_on_init, document);
+ if (!window.WB_wombat_location) {
+ return;
+ }
+
+ if (typeof(window.WB_wombat_location.href) != "string") {
+ return;
+ }
+
+ window.parent.update_wb_url(window.WB_wombat_location.href,
+ wbinfo.timestamp,
+ wbinfo.is_live);
+
+ remove_event("readystatechange", notify_top, document);
}
-add_event("readystatechange", detect_on_init, document);
-
-
-if (wbinfo.is_frame_mp && wbinfo.canon_url &&
- (window.self == window.top) && (window.self.top == window.top) &&
- window.location.href != wbinfo.canon_url) {
-
- window.location.replace(wbinfo.canon_url);
+if ((window.self == window.top) && wbinfo) {
+ if (wbinfo.canon_url && (window.location.href != wbinfo.canon_url) && wbinfo.mod != "bn_") {
+ // Auto-redirect to top frame
+ window.location.replace(wbinfo.canon_url);
+ } else {
+ // Init Banner (no frame or top frame)
+ add_event("readystatechange", init_banner, document);
+ }
+} else if (window.self != window.parent && window.parent.update_wb_url) {
+ add_event("readystatechange", notify_top, document);
}
-return {'labels': labels,
- 'ts_to_date': ts_to_date};
+
+return {
+ 'labels': labels,
+ 'ts_to_date': ts_to_date
+ };
})();
diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js
index 2dd88228..1ec173c7 100644
--- a/pywb/static/wb_frame.js
+++ b/pywb/static/wb_frame.js
@@ -1,7 +1,7 @@
-var update_wb_url = push_state;
-
var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/;
+var curr_state = {};
+
function make_outer_url(url, ts)
{
@@ -22,6 +22,15 @@ function make_inner_url(url, ts)
}
function push_state(url, timestamp, capture_str, is_live) {
if (window.frames[0].WB_wombat_location) {
var curr_href = window.frames[0].WB_wombat_location.href; // local, not an implicit global

// If not current url, don't update
if (url != curr_href) {
return;
}
}
+
var state = {}
state.timestamp = timestamp;
state.outer_url = make_outer_url(url, state.timestamp);
@@ -32,13 +41,13 @@ function push_state(url, timestamp, capture_str, is_live) {
window.history.replaceState(state, "", state.outer_url);
- update_status(state.capture_str, is_live);
+ set_state(state);
}
function pop_state(state) {
- update_status(state.capture_str, state.is_live);
+ set_state(state);
- window.frames[0].src = state.outer_url;
+ window.frames[0].src = state.inner_url;
}
function extract_ts(url)
@@ -66,15 +75,15 @@ function extract_replay_url(url) {
return url.substring(inx + 1);
}
-function update_status(str, is_live) {
+function set_state(state) {
var capture_info = document.getElementById("_wb_capture_info");
if (capture_info) {
- capture_info.innerHTML = str;
+ capture_info.innerHTML = state.capture_str;
}
var label = document.getElementById("_wb_label");
if (label) {
- if (is_live) {
+ if (state.is_live) {
label.innerHTML = _wb_js.labels.LIVE_MSG;
} else {
label.innerHTML = _wb_js.labels.REPLAY_MSG;
@@ -83,10 +92,10 @@ function update_status(str, is_live) {
}
window.onpopstate = function(event) {
- var curr_state = event.state;
+ var state = event.state;
- if (curr_state) {
- pop_state(curr_state);
+ if (state) {
+ pop_state(state);
}
}
@@ -100,14 +109,14 @@ function extract_ts_cookie(value) {
}
function iframe_loaded(event) {
- var iframe = window.frames[0];
var url;
var ts;
var capture_str;
var is_live = false;
+ var iframe = window.frames[0];
if (iframe.WB_wombat_location) {
- url = window.WB_wombat_location.href;
+ url = iframe.WB_wombat_location.href;
} else {
url = extract_replay_url(iframe.location.href);
}
@@ -123,7 +132,16 @@ function iframe_loaded(event) {
ts = extract_ts(iframe.location.href);
}
}
+
+ update_wb_url(url, ts, is_live);
+}
+
+function update_wb_url(url, ts, is_live) {
+ if (curr_state.url == url && curr_state.timestamp == ts) {
+ return;
+ }
+
- capture_str = _wb_js.ts_to_date(ts, true);
+ var capture_str = _wb_js.ts_to_date(ts, true); // declare locally; the old var lived in iframe_loaded
- update_wb_url(url, ts, capture_str, is_live);
+ push_state(url, ts, capture_str, is_live);
}
diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js
index e14c9d7d..962a32cd 100644
--- a/pywb/static/wombat.js
+++ b/pywb/static/wombat.js
@@ -699,7 +699,7 @@ WB_wombat_init = (function() {
wb_replay_prefix = replay_prefix;
if (wb_replay_prefix) {
- wb_replay_date_prefix = replay_prefix + capture_date + "em_/";
+ wb_replay_date_prefix = replay_prefix + capture_date + "mp_/";
if (capture_date.length > 0) {
wb_capture_date_part = "/" + capture_date + "/";
diff --git a/pywb/ui/frame_insert.html b/pywb/ui/frame_insert.html
index ce8d4ccd..9f232972 100644
--- a/pywb/ui/frame_insert.html
+++ b/pywb/ui/frame_insert.html
@@ -3,7 +3,6 @@
diff --git a/setup.py b/setup.py
index 6b5482bf..a69a7c47 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ class PyTest(TestCommand):
setup(
name='pywb',
version='0.6.0',
url='https://github.com/ikreymer/pywb',
author='Ilya Kreymer',
author_email='ikreymer@gmail.com',