1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

js insert rewrite improvements:

- client-side script: only rewrite if overridden objects are found in script text
- server-side inline js rewrite: only rewrite if overridden objects are found, don't insert before 'javascript:' marker
- tests: add improved tests for html js in attribute rewriting
This commit is contained in:
Ilya Kreymer 2017-10-18 10:51:24 -07:00
parent 1dbabef410
commit 70a09e2804
5 changed files with 74 additions and 10 deletions

View File

@ -233,7 +233,7 @@ class StreamingRewriter(object):
def rewrite(self, string):
return string
def rewrite_complete(self, string):
def rewrite_complete(self, string, **kwargs):
return self.first_buff + self.rewrite(string) + self.final_read()
def final_read(self):

View File

@ -248,12 +248,13 @@ class HTMLRewriterMixin(StreamingRewriter):
else:
return ''
def _rewrite_script(self, script_content, ensure_window=False):
def _rewrite_script(self, script_content, inline_attr=False):
if not script_content:
return ''
content = self.js_rewriter.rewrite_complete(script_content)
if ensure_window:
content = self.js_rewriter.rewrite_complete(script_content,
inline_attr=inline_attr)
if inline_attr:
content = self.ADD_WINDOW.sub('window.\\1', content)
return content

View File

@ -162,7 +162,7 @@ class JSWombatProxyRewriterMixin(object):
local_init_func = '\nvar {0} = function(name) {{\
return (self._wb_wombat && self._wb_wombat.local_init &&\
self._wb_wombat.local_init(name)) || self[name]; }}\n\
self._wb_wombat.local_init(name)) || self[name]; }};\n\
if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
{{\n'
@ -191,7 +191,7 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
rules = rules + [
(r'(?<=\.)postMessage\b\(', self.add_prefix('__WB_pmw(self).'), 0),
#(r'Function\(["\']return this["\']\)', self.fixed(func_rw), 0),
(r'(?<!\.)\blocation\b[=]\s*(?![=])', self.add_prefix('WB_wombat_'), 0),
(r'\breturn\s+this\b\s*(?![.$])', self.replace_str(self.THIS_RW), 0),
(r'(?<=[\n])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(';' + self.THIS_RW), 0),
(r'(?<![$.])\s*this\b(?=(?:\.(?:{0})\b))'.format(prop_str), self.replace_str(self.THIS_RW), 0),
@ -205,10 +205,30 @@ if (!self.__WB_pmw) {{ self.__WB_pmw = function(obj) {{ return obj; }} }}\n\
self.first_buff = self.local_init_func.format(self.local_init_func_name) + local_declares
self.close_string = '\n\n}'
self.last_buff = '\n\n}'
def rewrite_complete(self, string, **kwargs):
    """Rewrite a complete script, special-casing inline attribute scripts.

    :param string: the full script text to rewrite
    :param kwargs: optional flags; only ``inline_attr`` is read here
    :return: the rewritten script text

    When ``inline_attr`` is not set, the work is delegated unchanged to the
    superclass implementation. When it is set:

    * if none of the names in ``self.local_objs`` occur in the text
      (plain substring test), the text is returned untouched;
    * otherwise the text is passed through ``self.rewrite`` and wrapped in
      ``self.first_buff`` / ``self.last_buff``. A leading ``javascript:``
      marker is kept in front of ``first_buff`` rather than wrapped;
    * all newline characters are then removed from the result.
    """
    if not kwargs.get('inline_attr'):
        return super(JSWombatProxyRewriterMixin, self).rewrite_complete(string)

    # check if any of the wrapped objects are used in the script
    # if not, don't rewrite
    if not any(obj in string for obj in self.local_objs):
        return string

    scheme = 'javascript:'
    prefix = string[:len(scheme)]

    # URL schemes are case-insensitive, so 'JavaScript:' / 'JAVASCRIPT:' must
    # also keep the marker ahead of the inserted wrapper. The original
    # spelling of the marker is preserved in the output.
    if prefix.lower() == scheme:
        string = prefix + self.first_buff + self.rewrite(string[len(scheme):])
    else:
        string = self.first_buff + self.rewrite(string)

    string += self.last_buff

    # NOTE(review): newlines are stripped from the whole result, including
    # the original script text -- presumably so the wrapped script stays on
    # a single line inside the attribute value; confirm against callers.
    string = string.replace('\n', '')

    return string
def final_read(self):
return self.close_string
return self.last_buff
# =================================================================

View File

@ -305,6 +305,17 @@ r"""
>>> parse('<HTML><A Href="">Text</a></hTmL>')
<html><a href="">Text</a></html>
# parse attr with js proxy, wrap script, prepend WB_wombat_ for location assignment
>>> parse('<html><a href="javascript:location=\'foo.html\'"></a></html>', js_proxy=True)
<html><a href="javascript:{ window.WB_wombat_location='foo.html' }"></a></html>
# parse attr with js proxy, wrap script, no WB_wombat_ needed
>>> parse('<html><a href="javascript:location.href=\'foo.html\'"></a></html>', js_proxy=True)
<html><a href="javascript:{ location.href='foo.html' }"></a></html>
# parse attr with js proxy, no rewrite needed
>>> parse('<html><a href="javascript:alert()"></a></html>', js_proxy=True)
<html><a href="javascript:alert()"></a></html>
@ -321,6 +332,7 @@ r"""
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.html_rewriter import HTMLRewriter
from pywb.rewrite.regex_rewriters import JSWombatProxyRewriter
import pprint
import six
@ -341,11 +353,23 @@ urlrewriter_pencode = new_rewriter(rewrite_opts=dict(punycode_links=True))
no_base_canon_rewriter = new_rewriter(rewrite_opts=dict(rewrite_rel_canon=False,
rewrite_base=False))
def parse(data, head_insert=None, urlrewriter=urlrewriter, parse_comments=False):
def parse(data, head_insert=None, urlrewriter=urlrewriter, parse_comments=False,
js_proxy=False):
if js_proxy:
js_rewriter_class = JSWombatProxyRewriter
else:
js_rewriter_class = None
parser = HTMLRewriter(urlrewriter, head_insert=head_insert,
url=ORIGINAL_URL,
js_rewriter_class=js_rewriter_class,
parse_comments=parse_comments)
if js_proxy:
parser.js_rewriter.first_buff = '{ '
parser.js_rewriter.last_buff = ' }'
if six.PY2 and isinstance(data, six.text_type):
data = data.encode('utf-8')

View File

@ -1308,6 +1308,12 @@ var _WBWombat = function($wbwindow, wbinfo) {
return false;
}
var text = elem.textContent.trim();
if (!text || text.indexOf("<") == 0) {
return false;
}
var override_props = ["window",
"self",
"document",
@ -1317,6 +1323,19 @@ var _WBWombat = function($wbwindow, wbinfo) {
"frames",
"opener"];
var contains_props = false;
for (var i = 0; i < override_props.length; i++) {
if (text.indexOf(override_props[i]) >= 0) {
contains_props = true;
break;
}
}
if (!contains_props) {
return false;
}
var insert_str =
'var _____WB$wombat$assign$function_____ = function(name) {return (self._wb_wombat && self._wb_wombat.local_init && self._wb_wombat.local_init(name)) || self[name]; }\n' +
'if (!self.__WB_pmw) { self.__WB_pmw = function(obj) { return obj; } }\n' +
@ -1324,7 +1343,7 @@ var _WBWombat = function($wbwindow, wbinfo) {
var prop;
for (var i = 0; i < override_props.length; i++) {
for (i = 0; i < override_props.length; i++) {
prop = override_props[i];
insert_str += 'let ' + prop + ' = _____WB$wombat$assign$function_____("' + prop + '");\n';
}