mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Preload Rewrite Improvements (#226)
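* html rewriter: improve rewriting of link preload hrefs by setting the wburl modifier to match the preload type (js_ for script, cs_ for style, im_ for image, if_ for document/iframe, mp_ for fetch; any other type keeps the default modifier, e.g. oe_); rel=stylesheet links are rewritten with cs_
* tests: add tests for the improved preload rewrite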
This commit is contained in:
parent
bcb5bef39d
commit
4cc8e69f2e
@ -73,6 +73,13 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
|
|
||||||
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
|
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
|
||||||
|
|
||||||
|
PRELOAD_TYPES = {'script': 'js_',
|
||||||
|
'style': 'cs_',
|
||||||
|
'image': 'im_',
|
||||||
|
'document': 'if_',
|
||||||
|
'fetch': 'mp_'
|
||||||
|
}
|
||||||
|
|
||||||
#===========================
|
#===========================
|
||||||
class AccumBuff:
|
class AccumBuff:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -317,18 +324,7 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
# don't rewrite rel=canonical
|
# don't rewrite rel=canonical
|
||||||
elif tag == 'link' and attr_name == 'href':
|
elif tag == 'link' and attr_name == 'href':
|
||||||
rw_mod = handler.get(attr_name)
|
rw_mod = handler.get(attr_name)
|
||||||
|
attr_value = self._rewrite_link_href(attr_value, tag_attrs, rw_mod)
|
||||||
if self.has_attr(tag_attrs, ('rel', 'canonical')):
|
|
||||||
if self.opts.get('rewrite_rel_canon', True):
|
|
||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
|
||||||
else:
|
|
||||||
# resolve relative rel=canonical URLs so that they
|
|
||||||
# refer to the same absolute URL as on the original
|
|
||||||
# page (see https://github.com/hypothesis/via/issues/65
|
|
||||||
# for context)
|
|
||||||
attr_value = urljoin(self.orig_url, attr_value)
|
|
||||||
else:
|
|
||||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
|
||||||
|
|
||||||
# special case: meta tag
|
# special case: meta tag
|
||||||
elif (tag == 'meta') and (attr_name == 'content'):
|
elif (tag == 'meta') and (attr_name == 'content'):
|
||||||
@ -370,6 +366,29 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _rewrite_link_href(self, attr_value, tag_attrs, rw_mod):
    """Rewrite the ``href`` value of a ``<link>`` tag based on its ``rel``.

    The ``rel`` attribute is treated as a whitespace-separated,
    case-insensitive list of link types, so values such as
    ``"alternate stylesheet"`` or ``"Preload"`` are recognised.

    :param attr_value: the original ``href`` value
    :param tag_attrs: attributes of the tag, passed through to
        ``self.get_attr`` (presumably a list of ``(name, value)``
        pairs -- confirm against the parser callback)
    :param rw_mod: default rewrite modifier, used when no more specific
        modifier applies
    :return: the rewritten (or, for a non-rewritten canonical link,
        the resolved absolute) URL
    """
    # NOTE(review): assumes get_attr returns None (or an empty value) when
    # the attribute is absent or has no value; either case yields no tokens.
    rel = self.get_attr(tag_attrs, 'rel')
    rel_types = (rel or '').lower().split()

    # rel="canonical"
    if 'canonical' in rel_types:
        if self.opts.get('rewrite_rel_canon', True):
            return self._rewrite_url(attr_value, rw_mod)

        # resolve relative rel=canonical URLs so that they
        # refer to the same absolute URL as on the original
        # page (see https://github.com/hypothesis/via/issues/65
        # for context)
        return urljoin(self.orig_url, attr_value)

    if 'preload' in rel_types:
        # find proper mod for preload: the 'as' value selects the modifier,
        # unknown or missing values keep the caller-supplied default
        preload = self.get_attr(tag_attrs, 'as')
        preload = (preload or '').strip().lower()
        rw_mod = self.PRELOAD_TYPES.get(preload, rw_mod)

    elif 'stylesheet' in rel_types:
        rw_mod = 'cs_'

    return self._rewrite_url(attr_value, rw_mod)
|
||||||
|
|
||||||
def _set_parse_context(self, tag, tag_attrs):
|
def _set_parse_context(self, tag, tag_attrs):
|
||||||
# special case: script or style parse context
|
# special case: script or style parse context
|
||||||
if not self._wb_parse_context:
|
if not self._wb_parse_context:
|
||||||
|
@ -248,6 +248,27 @@ r"""
|
|||||||
>>> parse('<link rel=canonical href="/relative/path">', urlrewriter=no_base_canon_rewriter)
|
>>> parse('<link rel=canonical href="/relative/path">', urlrewriter=no_base_canon_rewriter)
|
||||||
<link rel="canonical" href="http://example.com/relative/path">
|
<link rel="canonical" href="http://example.com/relative/path">
|
||||||
|
|
||||||
|
# Preload tests
|
||||||
|
>>> parse('<link rel="preload" as="script" href="http://example.com/some/other/path">')
|
||||||
|
<link rel="preload" as="script" href="/web/20131226101010js_/http://example.com/some/other/path">
|
||||||
|
|
||||||
|
>>> parse('<link rel="preload" as="style" href="http://example.com/some/other/path">')
|
||||||
|
<link rel="preload" as="style" href="/web/20131226101010cs_/http://example.com/some/other/path">
|
||||||
|
|
||||||
|
>>> parse('<link rel="preload" as="image" href="http://example.com/some/other/path">')
|
||||||
|
<link rel="preload" as="image" href="/web/20131226101010im_/http://example.com/some/other/path">
|
||||||
|
|
||||||
|
>>> parse('<link rel="preload" as="document" href="http://example.com/some/other/path">')
|
||||||
|
<link rel="preload" as="document" href="/web/20131226101010if_/http://example.com/some/other/path">
|
||||||
|
|
||||||
|
>>> parse('<link rel="preload" as="video" href="http://example.com/some/other/path">')
|
||||||
|
<link rel="preload" as="video" href="/web/20131226101010oe_/http://example.com/some/other/path">
|
||||||
|
|
||||||
|
# stylesheet
|
||||||
|
>>> parse('<link rel="stylesheet" href="http://example.com/some/other/path">')
|
||||||
|
<link rel="stylesheet" href="/web/20131226101010cs_/http://example.com/some/other/path">
|
||||||
|
|
||||||
|
|
||||||
# doctype
|
# doctype
|
||||||
>>> parse('<!doctype html PUBLIC "public">')
|
>>> parse('<!doctype html PUBLIC "public">')
|
||||||
<!doctype html PUBLIC "public">
|
<!doctype html PUBLIC "public">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user