1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rewrite: add support for js rewriting ';http:\\/' urls

add 'parse_comments' rule option for rewriting comment contents via regex
banner: simplify banner insertion check: only insert the banner in the top frame, and
redirect the top frame to canon_url when the current href does not match it
replace em_ -> mp_ as default embedded mod
This commit is contained in:
Ilya Kreymer 2014-08-05 01:47:52 -07:00
parent 243de1f086
commit 4f9310fe4d
11 changed files with 78 additions and 65 deletions

View File

@ -78,7 +78,8 @@ class HTMLRewriterMixin(object):
head_insert=None, head_insert=None,
js_rewriter_class=JSRewriter, js_rewriter_class=JSRewriter,
css_rewriter_class=CSSRewriter, css_rewriter_class=CSSRewriter,
defmod=''): defmod='',
parse_comments=False):
self.url_rewriter = url_rewriter self.url_rewriter = url_rewriter
self._wb_parse_context = None self._wb_parse_context = None
@ -87,6 +88,8 @@ class HTMLRewriterMixin(object):
self.css_rewriter = css_rewriter_class(url_rewriter) self.css_rewriter = css_rewriter_class(url_rewriter)
self.head_insert = head_insert self.head_insert = head_insert
self.parse_comments = parse_comments
self.rewrite_tags = self._init_rewrite_tags(defmod) self.rewrite_tags = self._init_rewrite_tags(defmod)
# =========================== # ===========================
@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
def handle_comment(self, data): def handle_comment(self, data):
self.out.write('<!--') self.out.write('<!--')
self.parse_data(data) if self.parse_comments:
data = self._rewrite_script(data)
self.out.write(data)
else:
self.parse_data(data)
self.out.write('-->') self.out.write('-->')
def handle_decl(self, data): def handle_decl(self, data):

View File

@ -25,7 +25,7 @@ class RegexRewriter(object):
@staticmethod @staticmethod
def archival_rewrite(rewriter): def archival_rewrite(rewriter):
return lambda string: rewriter.rewrite(string, 'em_') return lambda string: rewriter.rewrite(string, 'mp_')
#@staticmethod #@staticmethod
#def replacer(other): #def replacer(other):
@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter):
JS Rewriter which rewrites absolute http://, https:// and // urls JS Rewriter which rewrites absolute http://, https:// and // urls
at the beginning of a string at the beginning of a string
""" """
JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
def __init__(self, rewriter, rules=[]): def __init__(self, rewriter, rules=[]):
rules = rules + [ rules = rules + [

View File

@ -127,7 +127,8 @@ class RewriteContent:
js_rewriter_class=rule.rewriters['js'], js_rewriter_class=rule.rewriters['js'],
css_rewriter_class=rule.rewriters['css'], css_rewriter_class=rule.rewriters['css'],
head_insert=head_insert_str, head_insert=head_insert_str,
defmod=self.defmod) defmod=self.defmod,
parse_comments=rule.parse_comments)
else: else:
if wb_url.is_banner_only: if wb_url.is_banner_only:

View File

@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter
import itertools import itertools
HTML = HTMLRewriter
#================================================================= #=================================================================
class RewriteRules(BaseRule): class RewriteRules(BaseRule):
@ -23,8 +21,9 @@ class RewriteRules(BaseRule):
self.rewriters['header'] = config.get('header_class', HeaderRewriter) self.rewriters['header'] = config.get('header_class', HeaderRewriter)
self.rewriters['css'] = config.get('css_class', CSSRewriter) self.rewriters['css'] = config.get('css_class', CSSRewriter)
self.rewriters['xml'] = config.get('xml_class', XMLRewriter) self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
self.rewriters['html'] = config.get('html_class', HTML) self.rewriters['html'] = config.get('html_class', HTMLRewriter)
#self.rewriters['html'] = config.get('html_class', HTMLRewriter)
self.parse_comments = config.get('parse_comments', False)
# Custom handling for js rewriting, often the most complex # Custom handling for js rewriting, often the most complex
self.js_rewrite_location = config.get('js_rewrite_location', True) self.js_rewrite_location = config.get('js_rewrite_location', True)

View File

@ -62,7 +62,7 @@ ur"""
# Script tag # Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>') >>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script> <script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</script>
# Script tag + crossorigin # Script tag + crossorigin
>>> parse('<script src="/js/scripts.js" crossorigin="anonymous"></script>') >>> parse('<script src="/js/scripts.js" crossorigin="anonymous"></script>')
@ -70,21 +70,21 @@ ur"""
# Unterminated script tag, handle and auto-terminate # Unterminated script tag, handle and auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>') >>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script> <script>window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html"</sc></script>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>') >>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script> <script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010mp_/http://example.com/a/b/c.html;/*]]>*/"</script>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>') >>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div> <div style="background: url('/web/20131226101010mp_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
# Style # Style
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>') >>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style> <style>@import "/web/20131226101010mp_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010mp_/http://example.com/some/path/myfont.ttf') }</style>
# Unterminated style tag, handle and auto-terminate # Unterminated style tag, handle and auto-terminate
>>> parse('<style>@import url(styles.css)') >>> parse('<style>@import url(styles.css)')
<style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style> <style>@import url(/web/20131226101010mp_/http://example.com/some/path/styles.css)</style>
# Head Insertion # Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>') >>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')

View File

@ -12,16 +12,16 @@ r"""
#================================================================= #=================================================================
>>> _test_js('location = "http://example.com/abc.html"') >>> _test_js('location = "http://example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"' 'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js(r'location = "http:\/\/example.com/abc.html"') >>> _test_js(r'location = "http:\/\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"' 'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"'
>>> _test_js(r'location = "http:\\/\\/example.com/abc.html"') >>> _test_js(r'location = "http:\\/\\/example.com/abc.html"')
'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"' 'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"'
>>> _test_js(r"location = 'http://example.com/abc.html/'") >>> _test_js(r"location = 'http://example.com/abc.html/'")
"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'" "WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'"
>>> _test_js(r'location = http://example.com/abc.html/') >>> _test_js(r'location = http://example.com/abc.html/')
'WB_wombat_location = http://example.com/abc.html/' 'WB_wombat_location = http://example.com/abc.html/'
@ -37,94 +37,98 @@ r"""
'"/location" == some_location_val; locations = WB_wombat_location;' '"/location" == some_location_val; locations = WB_wombat_location;'
>>> _test_js('cool_Location = "http://example.com/abc.html"') >>> _test_js('cool_Location = "http://example.com/abc.html"')
'cool_Location = "/web/20131010em_/http://example.com/abc.html"' 'cool_Location = "/web/20131010mp_/http://example.com/abc.html"'
>>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') >>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"')
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' 'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"'
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
# custom rules added # custom rules added
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */' 'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */'
# scheme-agnostic # scheme-agnostic
>>> _test_js('cool_Location = "//example.com/abc.html" //comment') >>> _test_js('cool_Location = "//example.com/abc.html" //comment')
'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' 'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment'
# document.cookie test # document.cookie test
>>> _test_js('document.cookie = "a=b; Path=/"') >>> _test_js('document.cookie = "a=b; Path=/"')
'document.WB_wombat_cookie = "a=b; Path=/"' 'document.WB_wombat_cookie = "a=b; Path=/"'
# js-escaped
>>> _test_js('&quot;http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;')
'&quot;/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;'
#================================================================= #=================================================================
# XML Rewriting # XML Rewriting
#================================================================= #=================================================================
>>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>') >>> _test_xml('<tag xmlns="http://www.example.com/ns" attr="http://example.com"></tag>')
'<tag xmlns="http://www.example.com/ns" attr="/web/20131010em_/http://example.com"></tag>' '<tag xmlns="http://www.example.com/ns" attr="/web/20131010mp_/http://example.com"></tag>'
>>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>') >>> _test_xml('<tag xmlns:xsi="http://www.example.com/ns" attr=" http://example.com"></tag>')
'<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010em_/http://example.com"></tag>' '<tag xmlns:xsi="http://www.example.com/ns" attr=" /web/20131010mp_/http://example.com"></tag>'
>>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>') >>> _test_xml('<tag> http://example.com<other>abchttp://example.com</other></tag>')
'<tag> /web/20131010em_/http://example.com<other>abchttp://example.com</other></tag>' '<tag> /web/20131010mp_/http://example.com<other>abchttp://example.com</other></tag>'
>>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>') >>> _test_xml('<main> http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> http://example.com </main>')
'<main> /web/20131010em_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010em_/http://example.com </main>' '<main> /web/20131010mp_/http://www.example.com/blah</tag> <other xmlns:abcdef= " http://example.com"/> /web/20131010mp_/http://example.com </main>'
#================================================================= #=================================================================
# CSS Rewriting # CSS Rewriting
#================================================================= #=================================================================
>>> _test_css("background: url('/some/path.html')") >>> _test_css("background: url('/some/path.html')")
"background: url('/web/20131010em_/http://example.com/some/path.html')" "background: url('/web/20131010mp_/http://example.com/some/path.html')"
>>> _test_css("background: url('../path.html')") >>> _test_css("background: url('../path.html')")
"background: url('/web/20131010em_/http://example.com/path.html')" "background: url('/web/20131010mp_/http://example.com/path.html')"
>>> _test_css("background: url(\"http://domain.com/path.html\")") >>> _test_css("background: url(\"http://domain.com/path.html\")")
'background: url("/web/20131010em_/http://domain.com/path.html")' 'background: url("/web/20131010mp_/http://domain.com/path.html")'
>>> _test_css("background: url(file.jpeg)") >>> _test_css("background: url(file.jpeg)")
'background: url(/web/20131010em_/http://example.com/file.jpeg)' 'background: url(/web/20131010mp_/http://example.com/file.jpeg)'
>>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')") >>> _test_css("background:#abc url('/static/styles/../images/layout/logo.png')")
"background:#abc url('/web/20131010em_/http://example.com/static/images/layout/logo.png')" "background:#abc url('/web/20131010mp_/http://example.com/static/images/layout/logo.png')"
>>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')") >>> _test_css("background:#000 url('/static/styles/../../images/layout/logo.png')")
"background:#000 url('/web/20131010em_/http://example.com/images/layout/logo.png')" "background:#000 url('/web/20131010mp_/http://example.com/images/layout/logo.png')"
>>> _test_css("background: url('')") >>> _test_css("background: url('')")
"background: url('')" "background: url('')"
>>> _test_css("background: url (\"weirdpath\')") >>> _test_css("background: url (\"weirdpath\')")
'background: url ("/web/20131010em_/http://example.com/weirdpath\')' 'background: url ("/web/20131010mp_/http://example.com/weirdpath\')'
>>> _test_css("@import url ('path.css')") >>> _test_css("@import url ('path.css')")
"@import url ('/web/20131010em_/http://example.com/path.css')" "@import url ('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import url('path.css')") >>> _test_css("@import url('path.css')")
"@import url('/web/20131010em_/http://example.com/path.css')" "@import url('/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import ( 'path.css')") >>> _test_css("@import ( 'path.css')")
"@import ( '/web/20131010em_/http://example.com/path.css')" "@import ( '/web/20131010mp_/http://example.com/path.css')"
>>> _test_css("@import \"path.css\"") >>> _test_css("@import \"path.css\"")
'@import "/web/20131010em_/http://example.com/path.css"' '@import "/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../path.css\"") >>> _test_css("@import ('../path.css\"")
'@import (\'/web/20131010em_/http://example.com/path.css"' '@import (\'/web/20131010mp_/http://example.com/path.css"'
>>> _test_css("@import ('../url.css\"") >>> _test_css("@import ('../url.css\"")
'@import (\'/web/20131010em_/http://example.com/url.css"' '@import (\'/web/20131010mp_/http://example.com/url.css"'
>>> _test_css("@import (\"url.css\")") >>> _test_css("@import (\"url.css\")")
'@import ("/web/20131010em_/http://example.com/url.css")' '@import ("/web/20131010mp_/http://example.com/url.css")'
>>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)") >>> _test_css("@import url(/url.css)\n@import url(/anotherurl.css)\n @import url(/and_a_third.css)")
'@import url(/web/20131010em_/http://example.com/url.css)\n@import url(/web/20131010em_/http://example.com/anotherurl.css)\n @import url(/web/20131010em_/http://example.com/and_a_third.css)' '@import url(/web/20131010mp_/http://example.com/url.css)\n@import url(/web/20131010mp_/http://example.com/anotherurl.css)\n @import url(/web/20131010mp_/http://example.com/and_a_third.css)'
""" """

View File

@ -31,6 +31,8 @@ rules:
- match: 'Bootloader\.configurePage.*?;' - match: 'Bootloader\.configurePage.*?;'
replace: '/* {0} */' replace: '/* {0} */'
parse_comments: true
# flickr rules # flickr rules
#================================================================= #=================================================================

View File

@ -30,10 +30,6 @@ function init_banner() {
var FRAME_BANNER_ID = "_wb_frame_top_banner"; var FRAME_BANNER_ID = "_wb_frame_top_banner";
var bid; var bid;
if (wbinfo.is_embed) {
return;
}
if (window.top != window.self) { if (window.top != window.self) {
return; return;
} }
@ -123,23 +119,20 @@ function remove_event(name, func, object) {
} }
} }
var detect_on_init = function(event) { if ((window.self == window.top) && (window.self.top == window.top) && wbinfo) {
init_banner(); if (wbinfo.canon_url && (window.location.href != wbinfo.canon_url)) {
// Auto-redirect to top frame
remove_event("readystatechange", detect_on_init, document); window.location.replace(wbinfo.canon_url);
} else {
// Init Banner (no frame or top frame)
add_event("readystatechange", init_banner, document);
}
} }
add_event("readystatechange", detect_on_init, document);
return {
if (wbinfo.is_frame_mp && wbinfo.canon_url && 'labels': labels,
(window.self == window.top) && (window.self.top == window.top) && 'ts_to_date': ts_to_date
window.location.href != wbinfo.canon_url) { };
window.location.replace(wbinfo.canon_url);
}
return {'labels': labels,
'ts_to_date': ts_to_date};
})(); })();

View File

@ -22,6 +22,15 @@ function make_inner_url(url, ts)
} }
function push_state(url, timestamp, capture_str, is_live) { function push_state(url, timestamp, capture_str, is_live) {
if (window.frames[0].WB_wombat_location) {
curr_href = window.frames[0].WB_wombat_location.href;
// If not current url, don't update
if (url != curr_href) {
return;
}
}
var state = {} var state = {}
state.timestamp = timestamp; state.timestamp = timestamp;
state.outer_url = make_outer_url(url, state.timestamp); state.outer_url = make_outer_url(url, state.timestamp);
@ -100,14 +109,14 @@ function extract_ts_cookie(value) {
} }
function iframe_loaded(event) { function iframe_loaded(event) {
var iframe = window.frames[0];
var url; var url;
var ts; var ts;
var capture_str; var capture_str;
var is_live = false; var is_live = false;
var iframe = window.frames[0];
if (iframe.WB_wombat_location) { if (iframe.WB_wombat_location) {
url = window.WB_wombat_location.href; url = iframe.WB_wombat_location.href;
} else { } else {
url = extract_replay_url(iframe.location.href); url = extract_replay_url(iframe.location.href);
} }

View File

@ -3,7 +3,6 @@
<!-- Start WB Insert --> <!-- Start WB Insert -->
<script> <script>
wbinfo = {} wbinfo = {}
wbinfo.is_embed = false;
wbinfo.prefix = "{{ wbrequest.wb_prefix }}"; wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.capture_url = "{{ url }}"; wbinfo.capture_url = "{{ url }}";
wbinfo.is_frame = true; wbinfo.is_frame = true;

View File

@ -14,7 +14,6 @@
wbinfo = {} wbinfo = {}
wbinfo.timestamp = "{{ cdx.timestamp }}"; wbinfo.timestamp = "{{ cdx.timestamp }}";
wbinfo.prefix = "{{ wbrequest.wb_prefix }}"; wbinfo.prefix = "{{ wbrequest.wb_prefix }}";
wbinfo.is_embed = {{"true" if wbrequest.wb_url.is_embed else "false"}};
wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}}; wbinfo.is_frame_mp = {{"true" if wbrequest.wb_url.mod == 'mp_' else "false"}};
wbinfo.canon_url = "{{ canon_url }}"; wbinfo.canon_url = "{{ canon_url }}";
wbinfo.is_live = {{ "true" if cdx.is_live else "false" }}; wbinfo.is_live = {{ "true" if cdx.is_live else "false" }};