From 4cc8e69f2e7661645eaf7910d6e41d5d5cc4e78a Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 5 Aug 2017 17:20:07 -0700
Subject: [PATCH] Preload Rewrite Improvements (#226)

* html rewriter: better rewrite of link preload, set wburl modifier to
match preload type (js_ for js, cs_ for style, im_ for image, if_ for
iframe, oe_ as default)
* tests: add tests for improved preload rewrite
---
Review notes (placed after the "---" separator, so they are not part of
the commit message):

  * _rewrite_link_href() chooses the rewrite modifier for <link href> from
    the tag's rel value:
      - rel == 'canonical': rewritten with the incoming rw_mod, unless
        opts['rewrite_rel_canon'] is false, in which case the URL is only
        resolved against self.orig_url with urljoin() and returned as-is.
      - rel == 'preload': the 'as' value is looked up in PRELOAD_TYPES;
        a value that is not a key there keeps the incoming rw_mod.
      - rel == 'stylesheet': the modifier is forced to 'cs_'.
      - any other rel: rewritten with the incoming rw_mod.
  * PRELOAD_TYPES also maps 'fetch' to 'mp_', which the commit message
    above does not list.
  * NOTE(review): rel and 'as' are compared by exact string equality with
    lowercase literals. The removed code went through
    self.has_attr(tag_attrs, ('rel', 'canonical')); has_attr() is not
    shown in this patch -- confirm whether it matched case-insensitively,
    since values such as rel="Canonical" or a multi-token rel (for
    example "alternate stylesheet") will not match the new comparisons.
  * NOTE(review): in this copy the doctest inputs read parse('') and the
    expected-output lines are empty, and the From: header has no address.
    The markup looks stripped during extraction; restore it from the
    original commit before applying this patch.

 pywb/rewrite/html_rewriter.py           | 43 ++++++++++++++++++-------
 pywb/rewrite/test/test_html_rewriter.py | 21 ++++++++++++
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 9ad6a7d2..d8f60a16 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -73,6 +73,13 @@ class HTMLRewriterMixin(StreamingRewriter):
 
     DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
 
+    PRELOAD_TYPES = {'script': 'js_',
+                     'style': 'cs_',
+                     'image': 'im_',
+                     'document': 'if_',
+                     'fetch': 'mp_'
+                    }
+
     #===========================
     class AccumBuff:
         def __init__(self):
@@ -317,18 +324,7 @@ class HTMLRewriterMixin(StreamingRewriter):
             # don't rewrite rel=canonical
             elif tag == 'link' and attr_name == 'href':
                 rw_mod = handler.get(attr_name)
-
-                if self.has_attr(tag_attrs, ('rel', 'canonical')):
-                    if self.opts.get('rewrite_rel_canon', True):
-                        attr_value = self._rewrite_url(attr_value, rw_mod)
-                    else:
-                        # resolve relative rel=canonical URLs so that they
-                        # refer to the same absolute URL as on the original
-                        # page (see https://github.com/hypothesis/via/issues/65
-                        # for context)
-                        attr_value = urljoin(self.orig_url, attr_value)
-                else:
-                    attr_value = self._rewrite_url(attr_value, rw_mod)
+                attr_value = self._rewrite_link_href(attr_value, tag_attrs, rw_mod)
 
             # special case: meta tag
             elif (tag == 'meta') and (attr_name == 'content'):
@@ -370,6 +366,29 @@ class HTMLRewriterMixin(StreamingRewriter):
 
         return True
 
+    def _rewrite_link_href(self, attr_value, tag_attrs, rw_mod):
+        # rel="canonical"
+        rel = self.get_attr(tag_attrs, 'rel')
+        if rel == 'canonical':
+            if self.opts.get('rewrite_rel_canon', True):
+                return self._rewrite_url(attr_value, rw_mod)
+            else:
+                # resolve relative rel=canonical URLs so that they
+                # refer to the same absolute URL as on the original
+                # page (see https://github.com/hypothesis/via/issues/65
+                # for context)
+                return urljoin(self.orig_url, attr_value)
+
+        # find proper mod for preload
+        elif rel == 'preload':
+            preload = self.get_attr(tag_attrs, 'as')
+            rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod)
+
+        elif rel == 'stylesheet':
+            rw_mod = 'cs_'
+
+        return self._rewrite_url(attr_value, rw_mod)
+
     def _set_parse_context(self, tag, tag_attrs):
         # special case: script or style parse context
         if not self._wb_parse_context:
diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py
index de275002..8e202d57 100644
--- a/pywb/rewrite/test/test_html_rewriter.py
+++ b/pywb/rewrite/test/test_html_rewriter.py
@@ -248,6 +248,27 @@ r"""
 >>> parse('', urlrewriter=no_base_canon_rewriter)
 
 
+# Preload tests
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
+>>> parse('')
+
+
+# stylesheet
+>>> parse('')
+
+
+
 # doctype
 >>> parse('')
 