mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: add support for js rewriting ';http:\\/' urls
add 'parse_comments' rule option for parsing comment contents via regex
banner: simplify banner insertion check, only insert for top frame, and check for canon_url matching current href at top before redirecting to top
replace em_ -> mp_ as default embedded mod
This commit is contained in:
parent
243de1f086
commit
4f9310fe4d
@ -78,7 +78,8 @@ class HTMLRewriterMixin(object):
|
||||
head_insert=None,
|
||||
js_rewriter_class=JSRewriter,
|
||||
css_rewriter_class=CSSRewriter,
|
||||
defmod=''):
|
||||
defmod='',
|
||||
parse_comments=False):
|
||||
|
||||
self.url_rewriter = url_rewriter
|
||||
self._wb_parse_context = None
|
||||
@ -87,6 +88,8 @@ class HTMLRewriterMixin(object):
|
||||
self.css_rewriter = css_rewriter_class(url_rewriter)
|
||||
|
||||
self.head_insert = head_insert
|
||||
self.parse_comments = parse_comments
|
||||
|
||||
self.rewrite_tags = self._init_rewrite_tags(defmod)
|
||||
|
||||
# ===========================
|
||||
@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.out.write('<!--')
|
||||
self.parse_data(data)
|
||||
if self.parse_comments:
|
||||
data = self._rewrite_script(data)
|
||||
self.out.write(data)
|
||||
else:
|
||||
self.parse_data(data)
|
||||
self.out.write('-->')
|
||||
|
||||
def handle_decl(self, data):
|
||||
|
@ -25,7 +25,7 @@ class RegexRewriter(object):
|
||||
|
||||
@staticmethod
|
||||
def archival_rewrite(rewriter):
|
||||
return lambda string: rewriter.rewrite(string, 'em_')
|
||||
return lambda string: rewriter.rewrite(string, 'mp_')
|
||||
|
||||
#@staticmethod
|
||||
#def replacer(other):
|
||||
@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter):
|
||||
JS Rewriter which rewrites absolute http://, https:// and // urls
|
||||
at the beginning of a string
|
||||
"""
|
||||
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
|
||||
JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
|
||||
|
||||
def __init__(self, rewriter, rules=[]):
|
||||
rules = rules + [
|
||||
|
@ -127,7 +127,8 @@ class RewriteContent:
|
||||
js_rewriter_class=rule.rewriters['js'],
|
||||
css_rewriter_class=rule.rewriters['css'],
|
||||
head_insert=head_insert_str,
|
||||
defmod=self.defmod)
|
||||
defmod=self.defmod,
|
||||
parse_comments=rule.parse_comments)
|
||||
|
||||
else:
|
||||
if wb_url.is_banner_only:
|
||||
|
@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter
|
||||
|
||||
import itertools
|
||||
|
||||
HTML = HTMLRewriter
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriteRules(BaseRule):
|
||||
@ -23,8 +21,9 @@ class RewriteRules(BaseRule):
|
||||
self.rewriters['header'] = config.get('header_class', HeaderRewriter)
|
||||
self.rewriters['css'] = config.get('css_class', CSSRewriter)
|
||||
self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
|
||||
self.rewriters['html'] = config.get('html_class', HTML)
|
||||
#self.rewriters['html'] = config.get('html_class', HTMLRewriter)
|
||||
self.rewriters['html'] = config.get('html_class', HTMLRewriter)
|
||||
|
||||
self.parse_comments = config.get('parse_comments', False)
|
||||
|
||||
# Custom handling for js rewriting, often the most complex
|
||||
self.js_rewrite_location = config.get('js_rewrite_location', True)
|
||||
|
@ -62,7 +62,7 @@ ur"""
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script>
|
||||
<script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</script>
|
||||
|
||||
# Script tag + crossorigin
|
||||
>>> parse('<script src="/js/scripts.js" crossorigin="anonymous"></script>')
|
||||
@ -70,21 +70,21 @@ ur"""
|
||||
|
||||
# Unterminated script tag, handle and auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script>
|
||||
<script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</sc></script>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
<div style="background: url('/web/20131226101010mp_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
# Style
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
|
||||
<style>@import "/web/20131226101010mp_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010mp_/http://example.com/some/path/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag, handle and auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style>
|
||||
<style>@import url(/web/20131226101010mp_/http://example.com/some/path/styles.css)</style>
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
|
@ -12,16 +12,16 @@ r"""
|
||||
#=================================================================
|
||||
|
||||
>>> _test_js('location = "http://example.com/abc.html"')
|
||||
'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"'
|
||||
'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"'
|
||||
|
||||
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
|
||||
'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"'
|
||||
'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"'
|
||||
|
||||
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
|
||||
'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"'
|
||||
'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"'
|
||||
|
||||
>>> _test_js(r"location = 'http://example.com/abc.html/'")
|
||||
"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'"
|
||||
"WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'"
|
||||
|
||||
>>> _test_js(r'location = http://example.com/abc.html/')
|
||||
'WB_wombat_location = http://example.com/abc.html/'
|
||||
@ -37,94 +37,98 @@ r"""
|
||||
'"/location" == some_location_val; locations = WB_wombat_location;'
|
||||
|
||||
>>> _test_js('cool_Location = "http://example.com/abc.html"')
|
||||
'cool_Location = "/web/20131010em_/http://example.com/abc.html"'
|
||||
'cool_Location = "/web/20131010mp_/http://example.com/abc.html"'
|
||||
|
||||
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
|
||||
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
||||
'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
|
||||
|
||||
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
|
||||
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
|
||||
|
||||
# custom rules added
|
||||
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
|
||||
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */'
|
||||
'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */'
|
||||
|
||||
# scheme-agnostic
|
||||
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
|
||||
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
|
||||
'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment'
|
||||
|
||||
# document.cookie test
|
||||
>>> _test_js('document.cookie = "a=b; Path=/"')
|
||||
'document.WB_wombat_cookie = "a=b; Path=/"'
|
||||
|
||||
# js-escaped
|
||||
>>> _test_js('"http:\\/\\/www.example.com\\/some\\/path\\/?query=1"')
|
||||
'"/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1"'
|
||||
|
||||
|
||||
#=================================================================
|
||||
# XML Rewriting
|
||||
#=================================================================
|
||||
|
||||
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
|
||||
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010em_/http://example.com"></tag>'
|
||||
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010mp_/http://example.com"></tag>'
|
||||
|
||||
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
|
||||
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010em_/http://example.com"></tag>'
|
||||
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010mp_/http://example.com"></tag>'
|
||||
|
||||
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
|
||||
'<tag> /web/20131010em_/http://example.com<other>abchttp://example.com</other></tag>'
|
||||
'<tag> /web/20131010mp_/http://example.com<other>abchttp://example.com</other></tag>'
|
||||
|
||||
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
|
||||
'<main> /web/20131010em_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010em_/http://example.com </main>'
|
||||
'<main> /web/20131010mp_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010mp_/http://example.com </main>'
|
||||
|
||||
#=================================================================
|
||||
# CSS Rewriting
|
||||
#=================================================================
|
||||
|
||||
>>> _test_css("background: url('/some/path.html')")
|
||||
"background: url('/web/20131010em_/http://example.com/some/path.html')"
|
||||
"background: url('/web/20131010mp_/http://example.com/some/path.html')"
|
||||
|
||||
>>> _test_css("background: url('../path.html')")
|
||||
"background: url('/web/20131010em_/http://example.com/path.html')"
|
||||
"background: url('/web/20131010mp_/http://example.com/path.html')"
|
||||
|
||||
>>> _test_css("background: url(\"http://domain.com/path.html\")")
|
||||
'background: url("/web/20131010em_/http://domain.com/path.html")'
|
||||
'background: url("/web/20131010mp_/http://domain.com/path.html")'
|
||||
|
||||
>>> _test_css("background: url(file.jpeg)")
|
||||
'background: url(/web/20131010em_/http://example.com/file.jpeg)'
|
||||
'background: url(/web/20131010mp_/http://example.com/file.jpeg)'
|
||||
|
||||
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
|
||||
"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')"
|
||||
"background:#abc url('/web/20131010mp_/http://example.com/static/images/layout/logo.png')"
|
||||
|
||||
>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')")
|
||||
"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')"
|
||||
"background:#000 url('/web/20131010mp_/http://example.com/images/layout/logo.png')"
|
||||
|
||||
>>> _test_css("background: url('')")
|
||||
"background: url('')"
|
||||
|
||||
>>> _test_css("background: url (\"weirdpath\')")
|
||||
'background: url ("/web/20131010em_/http://example.com/weirdpath\')'
|
||||
'background: url ("/web/20131010mp_/http://example.com/weirdpath\')'
|
||||
|
||||
>>> _test_css("@import url ('path.css')")
|
||||
"@import url ('/web/20131010em_/http://example.com/path.css')"
|
||||
"@import url ('/web/20131010mp_/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import url('path.css')")
|
||||
"@import url('/web/20131010em_/http://example.com/path.css')"
|
||||
"@import url('/web/20131010mp_/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import ( 'path.css')")
|
||||
"@import ( '/web/20131010em_/http://example.com/path.css')"
|
||||
"@import ( '/web/20131010mp_/http://example.com/path.css')"
|
||||
|
||||
>>> _test_css("@import \"path.css\"")
|
||||
'@import "/web/20131010em_/http://example.com/path.css"'
|
||||
'@import "/web/20131010mp_/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../path.css\"")
|
||||
'@import (\'/web/20131010em_/http://example.com/path.css"'
|
||||
'@import (\'/web/20131010mp_/http://example.com/path.css"'
|
||||
|
||||
>>> _test_css("@import ('../url.css\"")
|
||||
'@import (\'/web/20131010em_/http://example.com/url.css"'
|
||||
'@import (\'/web/20131010mp_/http://example.com/url.css"'
|
||||
|
||||
>>> _test_css("@import (\"url.css\")")
|
||||
'@import ("/web/20131010em_/http://example.com/url.css")'
|
||||
'@import ("/web/20131010mp_/http://example.com/url.css")'
|
||||
|
||||
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
|
||||
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
|
||||
'@import url(/web/20131010mp_/http://example.com/url.css)\n@import url(/web/20131010mp_/http://example.com/anotherurl.css)\n @import url(/web/20131010mp_/http://example.com/and_a_third.css)'
|
||||
|
||||
"""
|
||||
|
||||
|
@ -31,6 +31,8 @@ rules:
|
||||
- match: 'Bootloader\.configurePage.*?;'
|
||||
replace: '/* {0} */'
|
||||
|
||||
parse_comments: true
|
||||
|
||||
|
||||
# flickr rules
|
||||
#=================================================================
|
||||
|
@ -30,10 +30,6 @@ function init_banner() {
|
||||
var FRAME_BANNER_ID = "_wb_frame_top_banner";
|
||||
var bid;
|
||||
|
||||
if (wbinfo.is_embed) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (window.top != window.self) {
|
||||
return;
|
||||
}
|
||||
@ -123,23 +119,20 @@ function remove_event(name, func, object) {
|
||||
}
|
||||
}
|
||||
|
||||
var detect_on_init = function(event) {
|
||||
init_banner();
|
||||
|
||||
remove_event("readystatechange", detect_on_init, document);
|
||||
if ((window.self == window.top) && (window.self.top == window.top) && wbinfo) {
|
||||
if (wbinfo.canon_url && (window.location.href != wbinfo.canon_url)) {
|
||||
// Auto-redirect to top frame
|
||||
window.location.replace(wbinfo.canon_url);
|
||||
} else {
|
||||
// Init Banner (no frame or top frame)
|
||||
add_event("readystatechange", init_banner, document);
|
||||
}
|
||||
}
|
||||
|
||||
add_event("readystatechange", detect_on_init, document);
|
||||
|
||||
|
||||
if (wbinfo.is_frame_mp && wbinfo.canon_url &&
|
||||
(window.self == window.top) && (window.self.top == window.top) &&
|
||||
window.location.href != wbinfo.canon_url) {
|
||||
|
||||
window.location.replace(wbinfo.canon_url);
|
||||
}
|
||||
|
||||
return {'labels': labels,
|
||||
'ts_to_date': ts_to_date};
|
||||
return {
|
||||
'labels': labels,
|
||||
'ts_to_date': ts_to_date
|
||||
};
|
||||
|
||||
})();
|
||||
|
@ -22,6 +22,15 @@ function make_inner_url(url, ts)
|
||||
}
|
||||
|
||||
function push_state(url, timestamp, capture_str, is_live) {
|
||||
if (window.frames[0].WB_wombat_location) {
|
||||
curr_href = window.frames[0].WB_wombat_location.href;
|
||||
|
||||
// If not current url, don't update
|
||||
if (url != curr_href) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
var state = {}
|
||||
state.timestamp = timestamp;
|
||||
state.outer_url = make_outer_url(url, state.timestamp);
|
||||
@ -100,14 +109,14 @@ function extract_ts_cookie(value) {
|
||||
}
|
||||
|
||||
function iframe_loaded(event) {
|
||||
var iframe = window.frames[0];
|
||||
var url;
|
||||
var ts;
|
||||
var capture_str;
|
||||
var is_live = false;
|
||||
var iframe = window.frames[0];
|
||||
|
||||
if (iframe.WB_wombat_location) {
|
||||
url = window.WB_wombat_location.href;
|
||||
url = iframe.WB_wombat_location.href;
|
||||
} else {
|
||||
url = extract_replay_url(iframe.location.href);
|
||||
}
|
||||
|
@ -3,7 +3,6 @@
|
||||
<!-- Start WB Insert -->
|
||||
<script>
|
||||
wbinfo = {}
|
||||
wbinfo.is_embed = false;
|
||||
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
||||
wbinfo.capture_url = "{{ url }}";
|
||||
wbinfo.is_frame = true;
|
||||
|
@ -14,7 +14,6 @@
|
||||
wbinfo = {}
|
||||
wbinfo.timestamp = "{{ cdx.timestamp }}";
|
||||
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
|
||||
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
|
||||
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
|
||||
wbinfo.canon_url = "{{ canon_url }}";
|
||||
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};
|
||||
|
Loading…
x
Reference in New Issue
Block a user