1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: add support for js rewriting of ';http:\\/' urls (escaped urls preceded by ';', e.g. right after an '&quot;' entity)

add 'parse_comments' rule option for rewriting the contents of HTML comments via regex
banner: simplify banner insertion check: only insert the banner in the top frame, and
only redirect to canon_url when already at the top frame and the current href does not match canon_url
replace em_ with mp_ as the default embedded mod
This commit is contained in:
Ilya Kreymer 2014-08-05 01:47:52 -07:00
parent 243de1f086
commit 4f9310fe4d
11 changed files with 78 additions and 65 deletions

View File

@ -78,7 +78,8 @@ class HTMLRewriterMixin(object):
head_insert=None,
js_rewriter_class=JSRewriter,
css_rewriter_class=CSSRewriter,
defmod=''):
defmod='',
parse_comments=False):
self.url_rewriter = url_rewriter
self._wb_parse_context = None
@ -87,6 +88,8 @@ class HTMLRewriterMixin(object):
self.css_rewriter = css_rewriter_class(url_rewriter)
self.head_insert = head_insert
self.parse_comments = parse_comments
self.rewrite_tags = self._init_rewrite_tags(defmod)
# ===========================
@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def handle_comment(self, data):
self.out.write('<!--')
self.parse_data(data)
if self.parse_comments:
data = self._rewrite_script(data)
self.out.write(data)
else:
self.parse_data(data)
self.out.write('-->')
def handle_decl(self, data):

View File

@ -25,7 +25,7 @@ class RegexRewriter(object):
@staticmethod
def archival_rewrite(rewriter):
return lambda string: rewriter.rewrite(string, 'em_')
return lambda string: rewriter.rewrite(string, 'mp_')
#@staticmethod
#def replacer(other):
@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter):
JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string
"""
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+'
JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules=[]):
rules = rules + [

View File

@ -127,7 +127,8 @@ class RewriteContent:
js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'],
head_insert=head_insert_str,
defmod=self.defmod)
defmod=self.defmod,
parse_comments=rule.parse_comments)
else:
if wb_url.is_banner_only:

View File

@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter
import itertools
HTML = HTMLRewriter
#=================================================================
class RewriteRules(BaseRule):
@ -23,8 +21,9 @@ class RewriteRules(BaseRule):
self.rewriters['header'] = config.get('header_class', HeaderRewriter)
self.rewriters['css'] = config.get('css_class', CSSRewriter)
self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
self.rewriters['html'] = config.get('html_class', HTML)
#self.rewriters['html'] = config.get('html_class', HTMLRewriter)
self.rewriters['html'] = config.get('html_class', HTMLRewriter)
self.parse_comments = config.get('parse_comments', False)
# Custom handling for js rewriting, often the most complex
self.js_rewrite_location = config.get('js_rewrite_location', True)

View File

@ -62,7 +62,7 @@ ur"""
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script>
<script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</script>
# Script tag + crossorigin
>>> parse('<script src="/js/scripts.js" crossorigin="anonymous"></script>')
@ -70,21 +70,21 @@ ur"""
# Unterminated script tag, handle and auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script>
<script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script>
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
<div style="background: url('/web/20131226101010mp_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
# Style
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
<style>@import "/web/20131226101010mp_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010mp_/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag, handle and auto-terminate
>>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style>
<style>@import url(/web/20131226101010mp_/http://example.com/some/path/styles.css)</style>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')

View File

@ -12,16 +12,16 @@ r"""
#=================================================================
>>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"'
'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"'
'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"'
'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'"
"WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/'
@ -37,94 +37,98 @@ r"""
'"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010em_/http://example.com/abc.html"'
'cool_Location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */'
'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment'
'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment'
# document.cookie test
>>> _test_js('document.cookie = "a=b; Path=/"')
'document.WB_wombat_cookie = "a=b; Path=/"'
# js-escaped
>>> _test_js('&quot;http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;')
'&quot;/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;'
#=================================================================
# XML Rewriting
#=================================================================
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010em_/http://example.com"></tag>'
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010mp_/http://example.com"></tag>'
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010em_/http://example.com"></tag>'
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010mp_/http://example.com"></tag>'
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010em_/http://example.com<other>abchttp://example.com</other></tag>'
'<tag> /web/20131010mp_/http://example.com<other>abchttp://example.com</other></tag>'
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010em_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010em_/http://example.com </main>'
'<main> /web/20131010mp_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010mp_/http://example.com </main>'
#=================================================================
# CSS Rewriting
#=================================================================
>>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010em_/http://example.com/some/path.html')"
"background: url('/web/20131010mp_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')")
"background: url('/web/20131010em_/http://example.com/path.html')"
"background: url('/web/20131010mp_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010em_/http://domain.com/path.html")'
'background: url("/web/20131010mp_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010em_/http://example.com/file.jpeg)'
'background: url(/web/20131010mp_/http://example.com/file.jpeg)'
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')"
"background:#abc url('/web/20131010mp_/http://example.com/static/images/layout/logo.png')"
>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')")
"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')"
"background:#000 url('/web/20131010mp_/http://example.com/images/layout/logo.png')"
>>> _test_css("background: url('')")
"background: url('')"
>>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010em_/http://example.com/weirdpath\')'
'background: url ("/web/20131010mp_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010em_/http://example.com/path.css')"
"@import url ('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')")
"@import url('/web/20131010em_/http://example.com/path.css')"
"@import url('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010em_/http://example.com/path.css')"
"@import ( '/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"")
'@import "/web/20131010em_/http://example.com/path.css"'
'@import "/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010em_/http://example.com/path.css"'
'@import (\'/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010em_/http://example.com/url.css"'
'@import (\'/web/20131010mp_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010em_/http://example.com/url.css")'
'@import ("/web/20131010mp_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)'
'@import url(/web/20131010mp_/http://example.com/url.css)\n@import url(/web/20131010mp_/http://example.com/anotherurl.css)\n @import url(/web/20131010mp_/http://example.com/and_a_third.css)'
"""

View File

@ -31,6 +31,8 @@ rules:
- match: 'Bootloader\.configurePage.*?;'
replace: '/* {0} */'
parse_comments: true
# flickr rules
#=================================================================

View File

@ -30,10 +30,6 @@ function init_banner() {
var FRAME_BANNER_ID = "_wb_frame_top_banner";
var bid;
if (wbinfo.is_embed) {
return;
}
if (window.top != window.self) {
return;
}
@ -123,23 +119,20 @@ function remove_event(name, func, object) {
}
}
var detect_on_init = function(event) {
init_banner();
remove_event("readystatechange", detect_on_init, document);
if ((window.self == window.top) && (window.self.top == window.top) && wbinfo) {
if (wbinfo.canon_url && (window.location.href != wbinfo.canon_url)) {
// Auto-redirect to top frame
window.location.replace(wbinfo.canon_url);
} else {
// Init Banner (no frame or top frame)
add_event("readystatechange", init_banner, document);
}
}
add_event("readystatechange", detect_on_init, document);
if (wbinfo.is_frame_mp && wbinfo.canon_url &&
(window.self == window.top) && (window.self.top == window.top) &&
window.location.href != wbinfo.canon_url) {
window.location.replace(wbinfo.canon_url);
}
return {'labels': labels,
'ts_to_date': ts_to_date};
return {
'labels': labels,
'ts_to_date': ts_to_date
};
})();

View File

@ -22,6 +22,15 @@ function make_inner_url(url, ts)
}
function push_state(url, timestamp, capture_str, is_live) {
if (window.frames[0].WB_wombat_location) {
curr_href = window.frames[0].WB_wombat_location.href;
// If not current url, don't update
if (url != curr_href) {
return;
}
}
var state = {}
state.timestamp = timestamp;
state.outer_url = make_outer_url(url, state.timestamp);
@ -100,14 +109,14 @@ function extract_ts_cookie(value) {
}
function iframe_loaded(event) {
var iframe = window.frames[0];
var url;
var ts;
var capture_str;
var is_live = false;
var iframe = window.frames[0];
if (iframe.WB_wombat_location) {
url = window.WB_wombat_location.href;
url = iframe.WB_wombat_location.href;
} else {
url = extract_replay_url(iframe.location.href);
}

View File

@ -3,7 +3,6 @@
<!-- Start WB Insert -->
<script>
wbinfo = {}
wbinfo.is_embed = false;
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.capture_url = "{{ url }}";
wbinfo.is_frame = true;

View File

@ -14,7 +14,6 @@
wbinfo = {}
wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};