mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Preload Rewrite Improvements (#226)
* html rewriter: better rewrite of link preload: set the wburl modifier to match the preload type (js_ for script, cs_ for style, im_ for image, if_ for document, mp_ for fetch, oe_ as default); rel=stylesheet links now use cs_
* tests: add tests for improved preload and stylesheet rewrite
This commit is contained in:
parent
bcb5bef39d
commit
4cc8e69f2e
@ -73,6 +73,13 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
|
||||
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
|
||||
|
||||
PRELOAD_TYPES = {'script': 'js_',
|
||||
'style': 'cs_',
|
||||
'image': 'im_',
|
||||
'document': 'if_',
|
||||
'fetch': 'mp_'
|
||||
}
|
||||
|
||||
#===========================
|
||||
class AccumBuff:
|
||||
def __init__(self):
|
||||
@ -317,18 +324,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
# link href: rel=canonical is rewritten only if the 'rewrite_rel_canon' option is enabled (the default)
|
||||
elif tag == 'link' and attr_name == 'href':
|
||||
rw_mod = handler.get(attr_name)
|
||||
|
||||
if self.has_attr(tag_attrs, ('rel', 'canonical')):
|
||||
if self.opts.get('rewrite_rel_canon', True):
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
else:
|
||||
# resolve relative rel=canonical URLs so that they
|
||||
# refer to the same absolute URL as on the original
|
||||
# page (see https://github.com/hypothesis/via/issues/65
|
||||
# for context)
|
||||
attr_value = urljoin(self.orig_url, attr_value)
|
||||
else:
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
attr_value = self._rewrite_link_href(attr_value, tag_attrs, rw_mod)
|
||||
|
||||
# special case: meta tag
|
||||
elif (tag == 'meta') and (attr_name == 'content'):
|
||||
@ -370,6 +366,29 @@ class HTMLRewriterMixin(StreamingRewriter):
|
||||
|
||||
return True
|
||||
|
||||
def _rewrite_link_href(self, attr_value, tag_attrs, rw_mod):
|
||||
# rel="canonical"
|
||||
rel = self.get_attr(tag_attrs, 'rel')
|
||||
if rel == 'canonical':
|
||||
if self.opts.get('rewrite_rel_canon', True):
|
||||
return self._rewrite_url(attr_value, rw_mod)
|
||||
else:
|
||||
# resolve relative rel=canonical URLs so that they
|
||||
# refer to the same absolute URL as on the original
|
||||
# page (see https://github.com/hypothesis/via/issues/65
|
||||
# for context)
|
||||
return urljoin(self.orig_url, attr_value)
|
||||
|
||||
# find proper mod for preload
|
||||
elif rel == 'preload':
|
||||
preload = self.get_attr(tag_attrs, 'as')
|
||||
rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod)
|
||||
|
||||
elif rel == 'stylesheet':
|
||||
rw_mod = 'cs_'
|
||||
|
||||
return self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
def _set_parse_context(self, tag, tag_attrs):
|
||||
# special case: script or style parse context
|
||||
if not self._wb_parse_context:
|
||||
|
@ -248,6 +248,27 @@ r"""
|
||||
>>> parse('<link rel=canonical href="/relative/path">', urlrewriter=no_base_canon_rewriter)
|
||||
<link rel="canonical" href="http://example.com/relative/path">
|
||||
|
||||
# Preload tests
|
||||
>>> parse('<link rel="preload" as="script" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="script" href="/web/20131226101010js_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="style" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="style" href="/web/20131226101010cs_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="image" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="image" href="/web/20131226101010im_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="document" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="document" href="/web/20131226101010if_/http://example.com/some/other/path">
|
||||
|
||||
>>> parse('<link rel="preload" as="video" href="http://example.com/some/other/path">')
|
||||
<link rel="preload" as="video" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||
|
||||
# stylesheet
|
||||
>>> parse('<link rel="stylesheet" href="http://example.com/some/other/path">')
|
||||
<link rel="stylesheet" href="/web/20131226101010cs_/http://example.com/some/other/path">
|
||||
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html PUBLIC "public">')
|
||||
<!doctype html PUBLIC "public">
|
||||
|
Loading…
x
Reference in New Issue
Block a user