1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: module rewriting: fix module detection by peeking the buffer in streaming rewrite

bump wombat to latest
This commit is contained in:
Ilya Kreymer 2022-11-17 23:15:04 -08:00 committed by Tessa Walsh
parent aad10d45fb
commit f48a92af3e
2 changed files with 47 additions and 17 deletions

View File

@ -134,6 +134,7 @@ class HTMLRewriterMixin(StreamingRewriter):
super(HTMLRewriterMixin, self).__init__(url_rewriter, False)
self.charset = charset
self._wb_parse_context = None
self._wb_parse_module = False
if js_rewriter:
self.js_rewriter = js_rewriter
@ -308,7 +309,8 @@ class HTMLRewriterMixin(StreamingRewriter):
return ''
content = self.js_rewriter.rewrite_complete(script_content,
inline_attr=inline_attr)
inline_attr=inline_attr,
is_module=self._wb_parse_module)
if inline_attr:
content = self.ADD_WINDOW.sub('window.\\1', content)
@ -484,24 +486,30 @@ class HTMLRewriterMixin(StreamingRewriter):
self._wb_parse_context = 'style'
elif tag == 'script':
if self._allow_js_type(tag_attrs):
result = self._allow_js_type(tag_attrs)
if result:
self._wb_parse_context = 'script'
self._wb_parse_module = (result == 'script-module')
# Classify a <script> tag by its `type` attribute. The caller uses a truthy
# result to enter the 'script' parse context and compares the result against
# 'script-module' to decide whether the script is an ES module.
# NOTE(review): this listing is a rendered diff whose +/- markers and
# indentation were stripped. Each `return True` / `return False` appears to be
# the removed line that the return directly below it replaces -- confirm
# against the repository before treating this text as runnable code.
def _allow_js_type(self, tag_attrs):
type_value = self.get_attr(tag_attrs, 'type')
# No (or empty) type attribute: reported as a plain script.
if not type_value:
return True
return 'script'
# Matching below is case-insensitive.
type_value = type_value.lower()
# Substring tests, so values such as "text/javascript" also qualify.
if 'javascript' in type_value:
return True
return 'script'
if 'ecmascript' in type_value:
return True
return 'script'
return False
# Exact match only: type="module" is reported with its own marker value.
if type_value == 'module':
return 'script-module'
# Any other type value yields a falsy result.
return None
def _rewrite_head(self, start_end):
# special case: head tag

View File

@ -1,6 +1,7 @@
import re
from pywb.rewrite.content_rewriter import StreamingRewriter
from pywb.utils.loaders import load_py_name
from pywb.utils.io import BUFF_SIZE
from six.moves.urllib.parse import unquote
@ -283,32 +284,53 @@ class JSWombatProxyRewriter(RegexRewriter):
self.last_buff = self.rules_factory.last_buff
self.local_objs = self.rules_factory.local_objs
self._is_module_check = None
def set_as_module(self):
    """Configure this rewriter for ES-module content.

    Sets ``first_buff`` to an ``import { ... }`` statement that names every
    entry of ``self.local_objs`` and loads them from
    ``/static/__wb_module_decl.js``, sets ``last_buff`` to the empty string,
    and marks the module check as done with a positive result.
    """
    imported_names = ", ".join(self.local_objs)
    self.first_buff = (
        "\nimport {{ {0} }} from '/static/__wb_module_decl.js';\n".format(imported_names)
    )
    self.last_buff = ""
    self._is_module_check = True
def __call__(self, rwinfo):
    """Run the rewriter on ``rwinfo``, deciding once whether it is an ES module.

    While ``self._is_module_check`` is still ``None`` (not yet decided), a
    buffer is obtained from ``rwinfo.read_and_keep(BUFF_SIZE * 4)`` and tested
    with ``self.is_module``. A positive result switches the rewriter to module
    mode via ``set_as_module``; otherwise the check is recorded as ``False``
    so it is not repeated on later calls. The call is then delegated to the
    parent class.

    NOTE(review): ``read_and_keep`` presumably peeks without consuming the
    stream (the commit message speaks of "peeking the buffer") -- confirm
    against its definition.
    """
    # Identity test for the "undecided" sentinel: `is None` is the idiomatic
    # singleton comparison and cannot be affected by a custom __eq__.
    if self._is_module_check is None:
        peeked = rwinfo.read_and_keep(BUFF_SIZE * 4)
        if self.is_module(peeked):
            self.set_as_module()
        else:
            self._is_module_check = False

    return super(JSWombatProxyRewriter, self).__call__(rwinfo)
@staticmethod
def is_module(string):
"""Return boolean indicating whether import or export statement is found."""
# NOTE(review): this listing is a rendered diff whose +/- markers and
# indentation were stripped. The two plain-string pattern assignments and the
# `"import" in string` / `"export" in string` tests appear to be the removed
# lines; the compiled bytes patterns and the `b"..."` tests that follow them
# appear to be their replacements -- confirm against the repository.
IMPORT_REGEX = r"^\s*?import\s*?[{\"']"
EXPORT_REGEX = r"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+"
# Bytes patterns. IMPORT_REGEX is compiled without re.M, so `^` anchors only
# at the very start of the input; EXPORT_REGEX is compiled with re.M, so its
# `^` may match at the start of any line.
IMPORT_REGEX = re.compile(br"^\s*?import\s*?[{\"']")
EXPORT_REGEX = re.compile(br"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+", re.M)
# Empty or None input is never a module.
if not string:
return False
if "import" in string and re.search(IMPORT_REGEX, string):
# Text input is encoded to UTF-8 so the bytes patterns can be applied to it.
if isinstance(string, str):
string = string.encode("utf-8")
# The cheap substring test runs first; the regex search only runs when the
# keyword is present at all.
if b"import" in string and re.search(IMPORT_REGEX, string):
return True
if "export" in string and re.search(EXPORT_REGEX, string):
if b"export" in string and re.search(EXPORT_REGEX, string):
return True
return False
def rewrite_complete(self, string, **kwargs):
if not kwargs.get('inline_attr'):
if self.is_module(string):
first_buff = "\nimport {} from '/static/__wb_module_decl.js';\n".format(
", ".join(obj for obj in self.local_objs)
)
super(JSWombatProxyRewriter, self).__init__(self.rewriter, extra_rules=self.extra_rules, first_buff=first_buff)
return super(JSWombatProxyRewriter, self).rewrite_complete(string)
if kwargs.get('is_module'):
self.set_as_module()
return super(JSWombatProxyRewriter, self).rewrite_complete(string)
# check if any of the wrapped objects are used in the script
# if not, don't rewrite
if not any(obj in string for obj in self.local_objs):