diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f10d861a..aa3af667 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -134,6 +134,7 @@ class HTMLRewriterMixin(StreamingRewriter): super(HTMLRewriterMixin, self).__init__(url_rewriter, False) self.charset = charset self._wb_parse_context = None + self._wb_parse_module = False if js_rewriter: self.js_rewriter = js_rewriter @@ -308,7 +309,8 @@ class HTMLRewriterMixin(StreamingRewriter): return '' content = self.js_rewriter.rewrite_complete(script_content, - inline_attr=inline_attr) + inline_attr=inline_attr, + is_module=self._wb_parse_module) if inline_attr: content = self.ADD_WINDOW.sub('window.\\1', content) @@ -433,7 +435,7 @@ class HTMLRewriterMixin(StreamingRewriter): # URL not skipped, likely src='js/....', forcing abs to make sure, cause PHP MIME(JS) === HTML attr_value = self._rewrite_url(attr_value, rw_mod, True) self._write_attr('__wb_orig_src', ov, empty_attr=None) - + elif attr_name == 'target': target = attr_value if target in ('_blank', '_parent', '_top'): @@ -484,24 +486,30 @@ class HTMLRewriterMixin(StreamingRewriter): self._wb_parse_context = 'style' elif tag == 'script': - if self._allow_js_type(tag_attrs): + result = self._allow_js_type(tag_attrs) + if result: self._wb_parse_context = 'script' + self._wb_parse_module = (result == 'script-module') + def _allow_js_type(self, tag_attrs): type_value = self.get_attr(tag_attrs, 'type') if not type_value: - return True + return 'script' type_value = type_value.lower() if 'javascript' in type_value: - return True + return 'script' if 'ecmascript' in type_value: - return True + return 'script' - return False + if type_value == 'module': + return 'script-module' + + return None def _rewrite_head(self, start_end): # special case: head tag diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 4e331370..de158584 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -1,6 +1,7 @@ import re from pywb.rewrite.content_rewriter import StreamingRewriter from pywb.utils.loaders import load_py_name +from pywb.utils.io import BUFF_SIZE from six.moves.urllib.parse import unquote @@ -283,32 +284,53 @@ class JSWombatProxyRewriter(RegexRewriter): self.last_buff = self.rules_factory.last_buff self.local_objs = self.rules_factory.local_objs + self._is_module_check = None + + def set_as_module(self): + self.first_buff = "\nimport {{ {0} }} from '/static/__wb_module_decl.js';\n".format( + ", ".join(obj for obj in self.local_objs) + ) + self.last_buff = "" + self._is_module_check = True + + def __call__(self, rwinfo): + if self._is_module_check == None: + buf = rwinfo.read_and_keep(BUFF_SIZE * 4) + + if self.is_module(buf): + self.set_as_module() + else: + self._is_module_check = False + + return super(JSWombatProxyRewriter, self).__call__(rwinfo) + @staticmethod def is_module(string): """Return boolean indicating whether import or export statement is found.""" - IMPORT_REGEX = r"^\s*?import\s*?[{\"']" - EXPORT_REGEX = r"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+" + IMPORT_REGEX = re.compile(br"^\s*?import\s*?[{\"']") + EXPORT_REGEX = re.compile(br"^\s*?export\s*?({([\s\w,$\n]+?)}[\s;]*|default|class)\s+", re.M) if not string: return False - if "import" in string and re.search(IMPORT_REGEX, string): + if isinstance(string, str): + string = string.encode("utf-8") + + if b"import" in string and re.search(IMPORT_REGEX, string): return True - if "export" in string and re.search(EXPORT_REGEX, string): + if b"export" in string and re.search(EXPORT_REGEX, string): return True return False def rewrite_complete(self, string, **kwargs): if not kwargs.get('inline_attr'): - if self.is_module(string): - first_buff = "\nimport {} from '/static/__wb_module_decl.js';\n".format( - ", ".join(obj for obj in self.local_objs) - ) - super(JSWombatProxyRewriter, self).__init__(self.rewriter, extra_rules=self.extra_rules, first_buff=first_buff) - return super(JSWombatProxyRewriter, self).rewrite_complete(string) + if kwargs.get('is_module'): + self.set_as_module() + return super(JSWombatProxyRewriter, self).rewrite_complete(string) + # check if any of the wrapped objects are used in the script # if not, don't rewrite if not any(obj in string for obj in self.local_objs):