diff --git a/README.rst b/README.rst index 3640c69d..88d78f96 100644 --- a/README.rst +++ b/README.rst @@ -1,11 +1,11 @@ PyWb 0.6.0 ========== -.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=https-proxy +.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=develop :target: https://travis-ci.org/ikreymer/pywb -.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=https-proxy - :target: https://coveralls.io/r/ikreymer/pywb?branch=https-proxy +.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop + :target: https://coveralls.io/r/ikreymer/pywb?branch=develop pywb is a python implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 5a316016..f7575fa5 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -78,7 +78,8 @@ class HTMLRewriterMixin(object): head_insert=None, js_rewriter_class=JSRewriter, css_rewriter_class=CSSRewriter, - defmod=''): + defmod='', + parse_comments=False): self.url_rewriter = url_rewriter self._wb_parse_context = None @@ -87,6 +88,8 @@ class HTMLRewriterMixin(object): self.css_rewriter = css_rewriter_class(url_rewriter) self.head_insert = head_insert + self.parse_comments = parse_comments + self.rewrite_tags = self._init_rewrite_tags(defmod) # =========================== @@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def handle_comment(self, data): self.out.write('') def handle_decl(self, data): diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 3f440eee..df7a128e 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -25,7 +25,7 @@ class RegexRewriter(object): @staticmethod def archival_rewrite(rewriter): - return lambda string: rewriter.rewrite(string, 'em_') + return lambda string: rewriter.rewrite(string, 'mp_') #@staticmethod #def replacer(other): @@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter): JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ - JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' + JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+' def __init__(self, rewriter, rules=[]): rules = rules + [ diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 2225bbaf..b7254f76 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -122,6 +122,23 @@ class RewriteContent: if head_insert_func: head_insert_str = head_insert_func(rule, cdx) + head_insert_str = head_insert_str.encode('utf-8') + + if wb_url.is_banner_only: + gen = self._head_insert_only_gen(head_insert_str, stream) + + content_len = headers.get_header('Content-Length') + try: + content_len = int(content_len) + except Exception: + content_len = None + + if content_len and content_len >= 0: + content_len = str(content_len + len(head_insert_str)) + status_headers.replace_header('Content-Length', + content_len) + + return (status_headers, gen, False) if wb_url.is_banner_only: gen = self._head_insert_only_gen(head_insert_str, stream) @@ -131,7 +148,8 @@ class RewriteContent: js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], head_insert=head_insert_str, - defmod=self.defmod) + defmod=self.defmod, + parse_comments=rule.parse_comments) else: if wb_url.is_banner_only: @@ -165,7 +183,8 @@ class RewriteContent: matcher = self.HEAD_REGEX.search(buff) if matcher: - yield buff[:matcher.end()] + insert_str + yield buff[:matcher.end()] + yield insert_str yield buff[matcher.end():] else: yield insert_str diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index d70d2d08..226942c5 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter import itertools -HTML = HTMLRewriter - #================================================================= class RewriteRules(BaseRule): @@ -23,8 +21,9 @@ class RewriteRules(BaseRule): self.rewriters['header'] = config.get('header_class', HeaderRewriter) self.rewriters['css'] = config.get('css_class', CSSRewriter) self.rewriters['xml'] = config.get('xml_class', XMLRewriter) - self.rewriters['html'] = config.get('html_class', HTML) - #self.rewriters['html'] = config.get('html_class', HTMLRewriter) + self.rewriters['html'] = config.get('html_class', HTMLRewriter) + + self.parse_comments = config.get('parse_comments', False) # Custom handling for js rewriting, often the most complex self.js_rewrite_location = config.get('js_rewrite_location', True) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 9ea8edc0..f3a5d38d 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -62,7 +62,7 @@ ur""" # Script tag >>> parse('') - + # Script tag + crossorigin >>> parse('') @@ -70,21 +70,21 @@ ur""" # Unterminated script tag, handle and auto-terminate >>> parse(' + >>> parse('') - + >>> parse('
') -
+
# Style >>> parse('') - + # Unterminated style tag, handle and auto-terminate >>> parse(' + # Head Insertion >>> parse('Test', head_insert = '') diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 3f3b4638..69a367a9 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -12,16 +12,16 @@ r""" #================================================================= >>> _test_js('location = "http://example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"' >>> _test_js(r'location = "http:\/\/example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"' >>> _test_js(r'location = "http:\\/\\/example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"' >>> _test_js(r"location = 'http://example.com/abc.html/'") -"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'" +"WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'" >>> _test_js(r'location = http://example.com/abc.html/') 'WB_wombat_location = http://example.com/abc.html/' @@ -37,94 +37,98 @@ r""" '"/location" == some_location_val; locations = WB_wombat_location;' >>> _test_js('cool_Location = "http://example.com/abc.html"') -'cool_Location = "/web/20131010em_/http://example.com/abc.html"' +'cool_Location = "/web/20131010mp_/http://example.com/abc.html"' >>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') -'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' +'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' # custom rules added >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) -'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */' +'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic >>> _test_js('cool_Location = "//example.com/abc.html" //comment') -'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' +'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment' # document.cookie test >>> _test_js('document.cookie = "a=b; Path=/"') 'document.WB_wombat_cookie = "a=b; Path=/"' +# js-escaped +>>> _test_js('"http:\\/\\/www.example.com\\/some\\/path\\/?query=1"') +'"/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1"' + #================================================================= # XML Rewriting #================================================================= >>> _test_xml('') -'' +'' >>> _test_xml('') -'' +'' >>> _test_xml(' http://example.comabchttp://example.com') -' /web/20131010em_/http://example.comabchttp://example.com' +' /web/20131010mp_/http://example.comabchttp://example.com' >>> _test_xml('
http://www.example.com/blah http://example.com
') -'
/web/20131010em_/http://www.example.com/blah /web/20131010em_/http://example.com
' +'
/web/20131010mp_/http://www.example.com/blah /web/20131010mp_/http://example.com
' #================================================================= # CSS Rewriting #================================================================= >>> _test_css("background: url('/some/path.html')") -"background: url('/web/20131010em_/http://example.com/some/path.html')" +"background: url('/web/20131010mp_/http://example.com/some/path.html')" >>> _test_css("background: url('../path.html')") -"background: url('/web/20131010em_/http://example.com/path.html')" +"background: url('/web/20131010mp_/http://example.com/path.html')" >>> _test_css("background: url(\"http://domain.com/path.html\")") -'background: url("/web/20131010em_/http://domain.com/path.html")' +'background: url("/web/20131010mp_/http://domain.com/path.html")' >>> _test_css("background: url(file.jpeg)") -'background: url(/web/20131010em_/http://example.com/file.jpeg)' +'background: url(/web/20131010mp_/http://example.com/file.jpeg)' >>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')") -"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')" +"background:#abc url('/web/20131010mp_/http://example.com/static/images/layout/logo.png')" >>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')") -"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')" +"background:#000 url('/web/20131010mp_/http://example.com/images/layout/logo.png')" >>> _test_css("background: url('')") "background: url('')" >>> _test_css("background: url (\"weirdpath\')") -'background: url ("/web/20131010em_/http://example.com/weirdpath\')' +'background: url ("/web/20131010mp_/http://example.com/weirdpath\')' >>> _test_css("@import url ('path.css')") -"@import url ('/web/20131010em_/http://example.com/path.css')" +"@import url ('/web/20131010mp_/http://example.com/path.css')" >>> _test_css("@import url('path.css')") -"@import url('/web/20131010em_/http://example.com/path.css')" +"@import url('/web/20131010mp_/http://example.com/path.css')" >>> _test_css("@import ( 'path.css')") -"@import ( '/web/20131010em_/http://example.com/path.css')" +"@import ( '/web/20131010mp_/http://example.com/path.css')" >>> _test_css("@import \"path.css\"") -'@import "/web/20131010em_/http://example.com/path.css"' +'@import "/web/20131010mp_/http://example.com/path.css"' >>> _test_css("@import ('../path.css\"") -'@import (\'/web/20131010em_/http://example.com/path.css"' +'@import (\'/web/20131010mp_/http://example.com/path.css"' >>> _test_css("@import ('../url.css\"") -'@import (\'/web/20131010em_/http://example.com/url.css"' +'@import (\'/web/20131010mp_/http://example.com/url.css"' >>> _test_css("@import (\"url.css\")") -'@import ("/web/20131010em_/http://example.com/url.css")' +'@import ("/web/20131010mp_/http://example.com/url.css")' >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") -'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)' +'@import url(/web/20131010mp_/http://example.com/url.css)\n@import url(/web/20131010mp_/http://example.com/anotherurl.css)\n @import url(/web/20131010mp_/http://example.com/and_a_third.css)' """ diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 03e2e33b..4e6ac514 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -31,6 +31,8 @@ rules: - match: 'Bootloader\.configurePage.*?;' replace: '/* {0} */' + parse_comments: true + # flickr rules #================================================================= diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 0511f983..203f0b6c 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -30,10 +30,6 @@ function init_banner() { var FRAME_BANNER_ID = "_wb_frame_top_banner"; var bid; - if (wbinfo.is_embed) { - return; - } - if (window.top != window.self) { return; } @@ -131,23 +127,42 @@ function remove_event(name, func, object) { } } -var detect_on_init = function(event) { - init_banner(); +function notify_top() { + if (window.parent != window.top) { + return; + } - remove_event("readystatechange", detect_on_init, document); + if (!window.WB_wombat_location) { + return; + } + + if (typeof(window.WB_wombat_location.href) != "string") { + return; + } + + window.parent.update_wb_url(window.WB_wombat_location.href, + wbinfo.timestamp, + wbinfo.is_live); + + remove_event("readystatechange", notify_top, document); } -add_event("readystatechange", detect_on_init, document); - - -if (wbinfo.is_frame_mp && wbinfo.canon_url && - (window.self == window.top) && (window.self.top == window.top) && - window.location.href != wbinfo.canon_url) { - - window.location.replace(wbinfo.canon_url); +if ((window.self == window.top) && wbinfo) { + if (wbinfo.canon_url && (window.location.href != wbinfo.canon_url) && wbinfo.mod != "bn_") { + // Auto-redirect to top frame + window.location.replace(wbinfo.canon_url); + } else { + // Init Banner (no frame or top frame) + add_event("readystatechange", init_banner, document); + } +} else if (window.self != window.parent && window.parent.update_wb_url) { + add_event("readystatechange", notify_top, document); } -return {'labels': labels, - 'ts_to_date': ts_to_date}; + +return { + 'labels': labels, + 'ts_to_date': ts_to_date + }; })(); diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 2dd88228..1ec173c7 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -1,7 +1,7 @@ -var update_wb_url = push_state; - var LIVE_COOKIE_REGEX = /pywb.timestamp=([\d]{1,14})/; +var curr_state = {}; + function make_outer_url(url, ts) { @@ -22,6 +22,15 @@ function make_inner_url(url, ts) } function push_state(url, timestamp, capture_str, is_live) { + if (window.frames[0].WB_wombat_location) { + curr_href = window.frames[0].WB_wombat_location.href; + + // If not current url, don't update + if (url != curr_href) { + return; + } + } + var state = {} state.timestamp = timestamp; state.outer_url = make_outer_url(url, state.timestamp); @@ -32,13 +41,13 @@ function push_state(url, timestamp, capture_str, is_live) { window.history.replaceState(state, "", state.outer_url); - update_status(state.capture_str, is_live); + set_state(state); } function pop_state(state) { - update_status(state.capture_str, state.is_live); + set_state(state); - window.frames[0].src = state.outer_url; + window.frames[0].src = state.inner_url; } function extract_ts(url) @@ -66,15 +75,15 @@ function extract_replay_url(url) { return url.substring(inx + 1); } -function update_status(str, is_live) { +function set_state(state) { var capture_info = document.getElementById("_wb_capture_info"); if (capture_info) { - capture_info.innerHTML = str; + capture_info.innerHTML = state.capture_str; } var label = document.getElementById("_wb_label"); if (label) { - if (is_live) { + if (state.is_live) { label.innerHTML = _wb_js.labels.LIVE_MSG; } else { label.innerHTML = _wb_js.labels.REPLAY_MSG; @@ -83,10 +92,10 @@ function update_status(str, is_live) { } window.onpopstate = function(event) { - var curr_state = event.state; + var state = event.state; - if (curr_state) { - pop_state(curr_state); + if (state) { + pop_state(state); } } @@ -100,14 +109,14 @@ function extract_ts_cookie(value) { } function iframe_loaded(event) { - var iframe = window.frames[0]; var url; var ts; var capture_str; var is_live = false; + var iframe = window.frames[0]; if (iframe.WB_wombat_location) { - url = window.WB_wombat_location.href; + url = iframe.WB_wombat_location.href; } else { url = extract_replay_url(iframe.location.href); } @@ -123,7 +132,16 @@ function iframe_loaded(event) { ts = extract_ts(iframe.location.href); } } + + update_wb_url(url, ts, is_live); +} + +function update_wb_url(url, ts, is_live) { + if (curr_state.url == url && curr_state.timestamp == ts) { + return; + } + capture_str = _wb_js.ts_to_date(ts, true); - update_wb_url(url, ts, capture_str, is_live); + push_state(url, ts, capture_str, is_live); } diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index e14c9d7d..962a32cd 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -699,7 +699,7 @@ WB_wombat_init = (function() { wb_replay_prefix = replay_prefix; if (wb_replay_prefix) { - wb_replay_date_prefix = replay_prefix + capture_date + "em_/"; + wb_replay_date_prefix = replay_prefix + capture_date + "mp_/"; if (capture_date.length > 0) { wb_capture_date_part = "/" + capture_date + "/"; diff --git a/pywb/ui/frame_insert.html b/pywb/ui/frame_insert.html index ce8d4ccd..9f232972 100644 --- a/pywb/ui/frame_insert.html +++ b/pywb/ui/frame_insert.html @@ -3,7 +3,6 @@ diff --git a/setup.py b/setup.py index 6b5482bf..a69a7c47 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,11 @@ class PyTest(TestCommand): setup( name='pywb', +<<<<<<< HEAD version='0.6.0', +======= + version='0.5.3', +>>>>>>> 0.5.4-fixes url='https://github.com/ikreymer/pywb', author='Ilya Kreymer', author_email='ikreymer@gmail.com',