diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 5a316016..f7575fa5 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -78,7 +78,8 @@ class HTMLRewriterMixin(object): head_insert=None, js_rewriter_class=JSRewriter, css_rewriter_class=CSSRewriter, - defmod=''): + defmod='', + parse_comments=False): self.url_rewriter = url_rewriter self._wb_parse_context = None @@ -87,6 +88,8 @@ class HTMLRewriterMixin(object): self.css_rewriter = css_rewriter_class(url_rewriter) self.head_insert = head_insert + self.parse_comments = parse_comments + self.rewrite_tags = self._init_rewrite_tags(defmod) # =========================== @@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def handle_comment(self, data): self.out.write('') def handle_decl(self, data): diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 3f440eee..df7a128e 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -25,7 +25,7 @@ class RegexRewriter(object): @staticmethod def archival_rewrite(rewriter): - return lambda string: rewriter.rewrite(string, 'em_') + return lambda string: rewriter.rewrite(string, 'mp_') #@staticmethod #def replacer(other): @@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter): JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ - JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' + JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+' def __init__(self, rewriter, rules=[]): rules = rules + [ diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 207d879e..c852633b 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -127,7 +127,8 @@ class RewriteContent: js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], head_insert=head_insert_str, - defmod=self.defmod) + defmod=self.defmod, + parse_comments=rule.parse_comments) else: if wb_url.is_banner_only: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index d70d2d08..226942c5 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter import itertools -HTML = HTMLRewriter - #================================================================= class RewriteRules(BaseRule): @@ -23,8 +21,9 @@ class RewriteRules(BaseRule): self.rewriters['header'] = config.get('header_class', HeaderRewriter) self.rewriters['css'] = config.get('css_class', CSSRewriter) self.rewriters['xml'] = config.get('xml_class', XMLRewriter) - self.rewriters['html'] = config.get('html_class', HTML) - #self.rewriters['html'] = config.get('html_class', HTMLRewriter) + self.rewriters['html'] = config.get('html_class', HTMLRewriter) + + self.parse_comments = config.get('parse_comments', False) # Custom handling for js rewriting, often the most complex self.js_rewrite_location = config.get('js_rewrite_location', True) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 9ea8edc0..f3a5d38d 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -62,7 +62,7 @@ ur""" # Script tag >>> parse('') - + # Script tag + crossorigin >>> parse('') @@ -70,21 +70,21 @@ ur""" # Unterminated script tag, handle and auto-terminate >>> parse(' + >>> parse('') - + >>> parse('
') -
+
# Style >>> parse('') - + # Unterminated style tag, handle and auto-terminate >>> parse(' + # Head Insertion >>> parse('Test', head_insert = '') diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 3f3b4638..69a367a9 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -12,16 +12,16 @@ r""" #================================================================= >>> _test_js('location = "http://example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"' >>> _test_js(r'location = "http:\/\/example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"' >>> _test_js(r'location = "http:\\/\\/example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"' >>> _test_js(r"location = 'http://example.com/abc.html/'") -"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'" +"WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'" >>> _test_js(r'location = http://example.com/abc.html/') 'WB_wombat_location = http://example.com/abc.html/' @@ -37,94 +37,98 @@ r""" '"/location" == some_location_val; locations = WB_wombat_location;' >>> _test_js('cool_Location = "http://example.com/abc.html"') -'cool_Location = "/web/20131010em_/http://example.com/abc.html"' +'cool_Location = "/web/20131010mp_/http://example.com/abc.html"' >>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') -'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' +'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' # custom rules added >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) -'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */' +'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic >>> _test_js('cool_Location = "//example.com/abc.html" //comment') -'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' +'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment' # document.cookie test >>> _test_js('document.cookie = "a=b; Path=/"') 'document.WB_wombat_cookie = "a=b; Path=/"' +# js-escaped +>>> _test_js('"http:\\/\\/www.example.com\\/some\\/path\\/?query=1"') +'"/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1"' + #================================================================= # XML Rewriting #================================================================= >>> _test_xml('') -'' +'' >>> _test_xml('') -'' +'' >>> _test_xml(' http://example.comabchttp://example.com') -' /web/20131010em_/http://example.comabchttp://example.com' +' /web/20131010mp_/http://example.comabchttp://example.com' >>> _test_xml('
http://www.example.com/blah http://example.com
') -'
/web/20131010em_/http://www.example.com/blah /web/20131010em_/http://example.com
' +'
/web/20131010mp_/http://www.example.com/blah /web/20131010mp_/http://example.com
' #================================================================= # CSS Rewriting #================================================================= >>> _test_css("background: url('/some/path.html')") -"background: url('/web/20131010em_/http://example.com/some/path.html')" +"background: url('/web/20131010mp_/http://example.com/some/path.html')" >>> _test_css("background: url('../path.html')") -"background: url('/web/20131010em_/http://example.com/path.html')" +"background: url('/web/20131010mp_/http://example.com/path.html')" >>> _test_css("background: url(\"http://domain.com/path.html\")") -'background: url("/web/20131010em_/http://domain.com/path.html")' +'background: url("/web/20131010mp_/http://domain.com/path.html")' >>> _test_css("background: url(file.jpeg)") -'background: url(/web/20131010em_/http://example.com/file.jpeg)' +'background: url(/web/20131010mp_/http://example.com/file.jpeg)' >>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')") -"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')" +"background:#abc url('/web/20131010mp_/http://example.com/static/images/layout/logo.png')" >>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')") -"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')" +"background:#000 url('/web/20131010mp_/http://example.com/images/layout/logo.png')" >>> _test_css("background: url('')") "background: url('')" >>> _test_css("background: url (\"weirdpath\')") -'background: url ("/web/20131010em_/http://example.com/weirdpath\')' +'background: url ("/web/20131010mp_/http://example.com/weirdpath\')' >>> _test_css("@import url ('path.css')") -"@import url ('/web/20131010em_/http://example.com/path.css')" +"@import url ('/web/20131010mp_/http://example.com/path.css')" >>> _test_css("@import url('path.css')") -"@import url('/web/20131010em_/http://example.com/path.css')" +"@import url('/web/20131010mp_/http://example.com/path.css')" >>> _test_css("@import ( 'path.css')") -"@import ( '/web/20131010em_/http://example.com/path.css')" +"@import ( '/web/20131010mp_/http://example.com/path.css')" >>> _test_css("@import \"path.css\"") -'@import "/web/20131010em_/http://example.com/path.css"' +'@import "/web/20131010mp_/http://example.com/path.css"' >>> _test_css("@import ('../path.css\"") -'@import (\'/web/20131010em_/http://example.com/path.css"' +'@import (\'/web/20131010mp_/http://example.com/path.css"' >>> _test_css("@import ('../url.css\"") -'@import (\'/web/20131010em_/http://example.com/url.css"' +'@import (\'/web/20131010mp_/http://example.com/url.css"' >>> _test_css("@import (\"url.css\")") -'@import ("/web/20131010em_/http://example.com/url.css")' +'@import ("/web/20131010mp_/http://example.com/url.css")' >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") -'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)' +'@import url(/web/20131010mp_/http://example.com/url.css)\n@import url(/web/20131010mp_/http://example.com/anotherurl.css)\n @import url(/web/20131010mp_/http://example.com/and_a_third.css)' """ diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 03e2e33b..4e6ac514 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -31,6 +31,8 @@ rules: - match: 'Bootloader\.configurePage.*?;' replace: '/* {0} */' + parse_comments: true + # flickr rules #================================================================= diff --git a/pywb/static/wb.js b/pywb/static/wb.js index 2d8b2470..6d261748 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -30,10 +30,6 @@ function init_banner() { var FRAME_BANNER_ID = "_wb_frame_top_banner"; var bid; - if (wbinfo.is_embed) { - return; - } - if (window.top != window.self) { return; } @@ -123,23 +119,20 @@ function remove_event(name, func, object) { } } -var detect_on_init = function(event) { - init_banner(); - - remove_event("readystatechange", detect_on_init, document); +if ((window.self == window.top) && (window.self.top == window.top) && wbinfo) { + if (wbinfo.canon_url && (window.location.href != wbinfo.canon_url)) { + // Auto-redirect to top frame + window.location.replace(wbinfo.canon_url); + } else { + // Init Banner (no frame or top frame) + add_event("readystatechange", init_banner, document); + } } -add_event("readystatechange", detect_on_init, document); - -if (wbinfo.is_frame_mp && wbinfo.canon_url && - (window.self == window.top) && (window.self.top == window.top) && - window.location.href != wbinfo.canon_url) { - - window.location.replace(wbinfo.canon_url); -} - -return {'labels': labels, - 'ts_to_date': ts_to_date}; +return { + 'labels': labels, + 'ts_to_date': ts_to_date + }; })(); diff --git a/pywb/static/wb_frame.js b/pywb/static/wb_frame.js index 2dd88228..48484e06 100644 --- a/pywb/static/wb_frame.js +++ b/pywb/static/wb_frame.js @@ -22,6 +22,15 @@ function make_inner_url(url, ts) } function push_state(url, timestamp, capture_str, is_live) { + if (window.frames[0].WB_wombat_location) { + curr_href = window.frames[0].WB_wombat_location.href; + + // If not current url, don't update + if (url != curr_href) { + return; + } + } + var state = {} state.timestamp = timestamp; state.outer_url = make_outer_url(url, state.timestamp); @@ -100,14 +109,14 @@ function extract_ts_cookie(value) { } function iframe_loaded(event) { - var iframe = window.frames[0]; var url; var ts; var capture_str; var is_live = false; + var iframe = window.frames[0]; if (iframe.WB_wombat_location) { - url = window.WB_wombat_location.href; + url = iframe.WB_wombat_location.href; } else { url = extract_replay_url(iframe.location.href); } diff --git a/pywb/ui/frame_insert.html b/pywb/ui/frame_insert.html index ce8d4ccd..9f232972 100644 --- a/pywb/ui/frame_insert.html +++ b/pywb/ui/frame_insert.html @@ -3,7 +3,6 @@