mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rewrite: module rewriting: fix module detection by peeking the buffer in streaming rewrite
bump wombat to latest
This commit is contained in:
parent
aad10d45fb
commit
f48a92af3e
@ -134,6 +134,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
|
super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
|
||||||
self.charset = charset
|
self.charset = charset
|
||||||
self._wb_parse_context = None
|
self._wb_parse_context = None
|
||||||
|
self._wb_parse_module = False
|
||||||
|
|
||||||
if js_rewriter:
|
if js_rewriter:
|
||||||
self.js_rewriter = js_rewriter
|
self.js_rewriter = js_rewriter
|
||||||
@ -308,7 +309,8 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
return ''
|
return ''
|
||||||
|
|
||||||
content = self.js_rewriter.rewrite_complete(script_content,
|
content = self.js_rewriter.rewrite_complete(script_content,
|
||||||
inline_attr=inline_attr)
|
inline_attr=inline_attr,
|
||||||
|
is_module=self._wb_parse_module)
|
||||||
if inline_attr:
|
if inline_attr:
|
||||||
content = self.ADD_WINDOW.sub('window.\\1', content)
|
content = self.ADD_WINDOW.sub('window.\\1', content)
|
||||||
|
|
||||||
@ -484,24 +486,30 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
self._wb_parse_context = 'style'
|
self._wb_parse_context = 'style'
|
||||||
|
|
||||||
elif tag == 'script':
|
elif tag == 'script':
|
||||||
if self._allow_js_type(tag_attrs):
|
result = self._allow_js_type(tag_attrs)
|
||||||
|
if result:
|
||||||
self._wb_parse_context = 'script'
|
self._wb_parse_context = 'script'
|
||||||
|
self._wb_parse_module = (result == 'script-module')
|
||||||
|
|
||||||
|
|
||||||
def _allow_js_type(self, tag_attrs):
|
def _allow_js_type(self, tag_attrs):
|
||||||
type_value = self.get_attr(tag_attrs, 'type')
|
type_value = self.get_attr(tag_attrs, 'type')
|
||||||
|
|
||||||
if not type_value:
|
if not type_value:
|
||||||
return True
|
return 'script'
|
||||||
|
|
||||||
type_value = type_value.lower()
|
type_value = type_value.lower()
|
||||||
|
|
||||||
if 'javascript' in type_value:
|
if 'javascript' in type_value:
|
||||||
return True
|
return 'script'
|
||||||
|
|
||||||
if 'ecmascript' in type_value:
|
if 'ecmascript' in type_value:
|
||||||
return True
|
return 'script'
|
||||||
|
|
||||||
return False
|
if type_value == 'module':
|
||||||
|
return 'script-module'
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
def _rewrite_head(self, start_end):
|
def _rewrite_head(self, start_end):
|
||||||
# special case: head tag
|
# special case: head tag
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import re
|
import re
|
||||||
from pywb.rewrite.content_rewriter import StreamingRewriter
|
from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||||
from pywb.utils.loaders import load_py_name
|
from pywb.utils.loaders import load_py_name
|
||||||
|
from pywb.utils.io import BUFF_SIZE
|
||||||
from six.moves.urllib.parse import unquote
|
from six.moves.urllib.parse import unquote
|
||||||
|
|
||||||
|
|
||||||
@ -283,32 +284,53 @@ class JSWombatProxyRewriter(RegexRewriter):
|
|||||||
self.last_buff = self.rules_factory.last_buff
|
self.last_buff = self.rules_factory.last_buff
|
||||||
self.local_objs = self.rules_factory.local_objs
|
self.local_objs = self.rules_factory.local_objs
|
||||||
|
|
||||||
|
self._is_module_check = None
|
||||||
|
|
||||||
|
def set_as_module(self):
    """Switch this rewriter into ES-module mode.

    Replaces ``first_buff`` with an ``import { ... }`` statement that pulls
    every name in ``self.local_objs`` from ``/static/__wb_module_decl.js``,
    clears ``last_buff``, and records that the module check has been
    resolved (as a module).
    """
    # join() accepts the iterable directly; no wrapping generator needed
    self.first_buff = "\nimport {{ {0} }} from '/static/__wb_module_decl.js';\n".format(
        ", ".join(self.local_objs)
    )
    self.last_buff = ""
    self._is_module_check = True
|
||||||
|
|
||||||
|
def __call__(self, rwinfo):
    """Rewrite the content described by ``rwinfo``, detecting ES modules first.

    While ``_is_module_check`` is still ``None`` (undecided), up to
    ``BUFF_SIZE * 4`` bytes are fetched via ``rwinfo.read_and_keep`` and
    tested with ``is_module``. On a match the rewriter is switched to
    module mode via ``set_as_module``; otherwise the check is recorded as
    ``False``. The call is then delegated to the parent class.

    :param rwinfo: rewrite-info object; must provide ``read_and_keep``
    :return: whatever the parent ``__call__`` returns
    """
    # identity test: None means "not decided yet", distinct from False
    if self._is_module_check is None:
        # presumably read_and_keep leaves the data available for the
        # subsequent rewrite (a peek) -- confirm against its definition
        buf = rwinfo.read_and_keep(BUFF_SIZE * 4)

        if self.is_module(buf):
            self.set_as_module()
        else:
            self._is_module_check = False

    return super(JSWombatProxyRewriter, self).__call__(rwinfo)
|
||||||
|
|
||||||
@staticmethod
def is_module(string):
    """Return boolean indicating whether import or export statement is found.

    :param string: script content as ``bytes`` or ``str`` (``str`` is
        encoded to UTF-8 before matching); may be empty or ``None``
    :return: ``True`` if a line starts with a static ``import`` followed by
        ``{`` or a quote, or with a recognised ``export`` form; otherwise
        ``False``
    """
    # Both patterns use re.M so that a statement is found at the start of
    # any line, not only at the very beginning of the buffer (e.g. after a
    # leading license comment).
    IMPORT_REGEX = re.compile(br"^\s*?import\s*?[{\"']", re.M)
    EXPORT_REGEX = re.compile(br"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+", re.M)

    if not string:
        return False

    if isinstance(string, str):
        string = string.encode("utf-8")

    # cheap substring test first, regex only when the keyword is present
    if b"import" in string and IMPORT_REGEX.search(string):
        return True

    if b"export" in string and EXPORT_REGEX.search(string):
        return True

    return False
|
||||||
|
|
||||||
def rewrite_complete(self, string, **kwargs):
|
def rewrite_complete(self, string, **kwargs):
|
||||||
if not kwargs.get('inline_attr'):
|
if not kwargs.get('inline_attr'):
|
||||||
if self.is_module(string):
|
if kwargs.get('is_module'):
|
||||||
first_buff = "\nimport {} from '/static/__wb_module_decl.js';\n".format(
|
self.set_as_module()
|
||||||
", ".join(obj for obj in self.local_objs)
|
|
||||||
)
|
|
||||||
super(JSWombatProxyRewriter, self).__init__(self.rewriter, extra_rules=self.extra_rules, first_buff=first_buff)
|
|
||||||
return super(JSWombatProxyRewriter, self).rewrite_complete(string)
|
|
||||||
return super(JSWombatProxyRewriter, self).rewrite_complete(string)
|
return super(JSWombatProxyRewriter, self).rewrite_complete(string)
|
||||||
|
|
||||||
# check if any of the wrapped objects are used in the script
|
# check if any of the wrapped objects are used in the script
|
||||||
# if not, don't rewrite
|
# if not, don't rewrite
|
||||||
if not any(obj in string for obj in self.local_objs):
|
if not any(obj in string for obj in self.local_objs):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user