1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

rewrite: module rewriting: fix module detection by peeking the buffer in streaming rewrite

bump wombat to latest
This commit is contained in:
Ilya Kreymer 2022-11-17 23:15:04 -08:00 committed by Tessa Walsh
parent aad10d45fb
commit f48a92af3e
2 changed files with 47 additions and 17 deletions

View File

@ -134,6 +134,7 @@ class HTMLRewriterMixin(StreamingRewriter):
super(HTMLRewriterMixin, self).__init__(url_rewriter, False) super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
self.charset = charset self.charset = charset
self._wb_parse_context = None self._wb_parse_context = None
self._wb_parse_module = False
if js_rewriter: if js_rewriter:
self.js_rewriter = js_rewriter self.js_rewriter = js_rewriter
@ -308,7 +309,8 @@ class HTMLRewriterMixin(StreamingRewriter):
return '' return ''
content = self.js_rewriter.rewrite_complete(script_content, content = self.js_rewriter.rewrite_complete(script_content,
inline_attr=inline_attr) inline_attr=inline_attr,
is_module=self._wb_parse_module)
if inline_attr: if inline_attr:
content = self.ADD_WINDOW.sub('window.\\1', content) content = self.ADD_WINDOW.sub('window.\\1', content)
@ -484,24 +486,30 @@ class HTMLRewriterMixin(StreamingRewriter):
self._wb_parse_context = 'style' self._wb_parse_context = 'style'
elif tag == 'script': elif tag == 'script':
if self._allow_js_type(tag_attrs): result = self._allow_js_type(tag_attrs)
if result:
self._wb_parse_context = 'script' self._wb_parse_context = 'script'
self._wb_parse_module = (result == 'script-module')
def _allow_js_type(self, tag_attrs): def _allow_js_type(self, tag_attrs):
type_value = self.get_attr(tag_attrs, 'type') type_value = self.get_attr(tag_attrs, 'type')
if not type_value: if not type_value:
return True return 'script'
type_value = type_value.lower() type_value = type_value.lower()
if 'javascript' in type_value: if 'javascript' in type_value:
return True return 'script'
if 'ecmascript' in type_value: if 'ecmascript' in type_value:
return True return 'script'
return False if type_value == 'module':
return 'script-module'
return None
def _rewrite_head(self, start_end): def _rewrite_head(self, start_end):
# special case: head tag # special case: head tag

View File

@ -1,6 +1,7 @@
import re import re
from pywb.rewrite.content_rewriter import StreamingRewriter from pywb.rewrite.content_rewriter import StreamingRewriter
from pywb.utils.loaders import load_py_name from pywb.utils.loaders import load_py_name
from pywb.utils.io import BUFF_SIZE
from six.moves.urllib.parse import unquote from six.moves.urllib.parse import unquote
@ -283,32 +284,53 @@ class JSWombatProxyRewriter(RegexRewriter):
self.last_buff = self.rules_factory.last_buff self.last_buff = self.rules_factory.last_buff
self.local_objs = self.rules_factory.local_objs self.local_objs = self.rules_factory.local_objs
self._is_module_check = None
def set_as_module(self):
    """Switch this rewriter to ES-module output.

    Replaces ``first_buff`` with an ``import`` statement that pulls every
    name in ``self.local_objs`` from ``/static/__wb_module_decl.js``,
    clears ``last_buff``, and records that the module check is resolved
    (``_is_module_check = True``).
    """
    # str.join accepts any iterable of strings, so local_objs is passed directly.
    self.first_buff = "\nimport {{ {0} }} from '/static/__wb_module_decl.js';\n".format(
        ", ".join(self.local_objs)
    )
    self.last_buff = ""
    self._is_module_check = True
def __call__(self, rwinfo):
    """Run the streaming rewrite, first deciding whether the content is a module.

    While ``_is_module_check`` is still ``None`` (undecided), a prefix of
    up to ``BUFF_SIZE * 4`` is requested via ``rwinfo.read_and_keep`` and
    passed to ``is_module``. A positive result calls ``set_as_module``;
    otherwise the check is recorded as ``False`` so it is not repeated.
    The call is then delegated to the parent class.
    """
    # Identity comparison: None is a singleton, and `is` cannot be
    # overridden by a custom __eq__.
    if self._is_module_check is None:
        # NOTE(review): read_and_keep presumably peeks without consuming
        # the stream -- confirm against the rwinfo implementation.
        buf = rwinfo.read_and_keep(BUFF_SIZE * 4)
        if self.is_module(buf):
            self.set_as_module()
        else:
            self._is_module_check = False

    return super(JSWombatProxyRewriter, self).__call__(rwinfo)
@staticmethod @staticmethod
def is_module(string): def is_module(string):
"""Return boolean indicating whether import or export statement is found.""" """Return boolean indicating whether import or export statement is found."""
IMPORT_REGEX = r"^\s*?import\s*?[{\"']" IMPORT_REGEX = re.compile(br"^\s*?import\s*?[{\"']")
EXPORT_REGEX = r"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+" EXPORT_REGEX = re.compile(br"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+", re.M)
if not string: if not string:
return False return False
if "import" in string and re.search(IMPORT_REGEX, string): if isinstance(string, str):
string = string.encode("utf-8")
if b"import" in string and re.search(IMPORT_REGEX, string):
return True return True
if "export" in string and re.search(EXPORT_REGEX, string): if b"export" in string and re.search(EXPORT_REGEX, string):
return True return True
return False return False
def rewrite_complete(self, string, **kwargs): def rewrite_complete(self, string, **kwargs):
if not kwargs.get('inline_attr'): if not kwargs.get('inline_attr'):
if self.is_module(string): if kwargs.get('is_module'):
first_buff = "\nimport {} from '/static/__wb_module_decl.js';\n".format( self.set_as_module()
", ".join(obj for obj in self.local_objs)
)
super(JSWombatProxyRewriter, self).__init__(self.rewriter, extra_rules=self.extra_rules, first_buff=first_buff)
return super(JSWombatProxyRewriter, self).rewrite_complete(string)
return super(JSWombatProxyRewriter, self).rewrite_complete(string) return super(JSWombatProxyRewriter, self).rewrite_complete(string)
# check if any of the wrapped objects are used in the script # check if any of the wrapped objects are used in the script
# if not, don't rewrite # if not, don't rewrite
if not any(obj in string for obj in self.local_objs): if not any(obj in string for obj in self.local_objs):