1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Preload Rewrite Improvements (#226)

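* html rewriter: better rewriting of link preload: set the wburl modifier to match the preload `as` type (js_ for script, cs_ for style, im_ for image, if_ for document, mp_ for fetch, and the default modifier, oe_ in the tests, for anything else); rel=stylesheet links are rewritten with cs_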

* tests: add tests for improved preload rewrite
This commit is contained in:
Ilya Kreymer 2017-08-05 17:20:07 -07:00 committed by GitHub
parent bcb5bef39d
commit 4cc8e69f2e
2 changed files with 52 additions and 12 deletions

View File

@ -73,6 +73,13 @@ class HTMLRewriterMixin(StreamingRewriter):
DATA_RW_PROTOCOLS = ('http://', 'https://', '//')
# Maps the ``as`` attribute value of a <link rel="preload"> tag to the
# rewrite modifier passed to _rewrite_url() for the link's href.
# Values not listed here keep the modifier supplied by the caller.
# NOTE(review): per the commit description js_/cs_/im_/if_ stand for
# script/style/image/iframe content; the meaning of 'mp_' is not shown
# here -- confirm before relying on it.
PRELOAD_TYPES = {'script': 'js_',
'style': 'cs_',
'image': 'im_',
'document': 'if_',
'fetch': 'mp_'
}
#===========================
class AccumBuff:
def __init__(self):
@ -317,18 +324,7 @@ class HTMLRewriterMixin(StreamingRewriter):
# don't rewrite rel=canonical
elif tag == 'link' and attr_name == 'href':
rw_mod = handler.get(attr_name)
if self.has_attr(tag_attrs, ('rel', 'canonical')):
if self.opts.get('rewrite_rel_canon', True):
attr_value = self._rewrite_url(attr_value, rw_mod)
else:
# resolve relative rel=canonical URLs so that they
# refer to the same absolute URL as on the original
# page (see https://github.com/hypothesis/via/issues/65
# for context)
attr_value = urljoin(self.orig_url, attr_value)
else:
attr_value = self._rewrite_url(attr_value, rw_mod)
attr_value = self._rewrite_link_href(attr_value, tag_attrs, rw_mod)
# special case: meta tag
elif (tag == 'meta') and (attr_name == 'content'):
@ -370,6 +366,29 @@ class HTMLRewriterMixin(StreamingRewriter):
return True
def _rewrite_link_href(self, attr_value, tag_attrs, rw_mod):
    """Rewrite the ``href`` value of a ``<link>`` tag based on its ``rel``.

    :param attr_value: the original ``href`` value
    :param tag_attrs: the tag's attributes, as accepted by ``self.get_attr``
    :param rw_mod: default rewrite modifier to use when ``rel`` does not
        select a more specific one
    :return: the new ``href`` value

    Handling by ``rel`` keyword (matched case-insensitively, as one of
    possibly several whitespace-separated tokens):

    * ``canonical``: rewritten with ``rw_mod`` unless the
      ``rewrite_rel_canon`` option is false, in which case the value is
      only resolved against ``self.orig_url`` and returned unrewritten.
    * ``preload``: the modifier is looked up in ``PRELOAD_TYPES`` by the
      ``as`` attribute, falling back to ``rw_mod``.
    * ``stylesheet``: the modifier is forced to ``cs_``.
    """
    # HTML defines rel as a set of space-separated tokens compared
    # ASCII case-insensitively, so normalize before matching.
    # (a missing rel is treated as no tokens at all)
    rel = self.get_attr(tag_attrs, 'rel')
    rel_tokens = rel.lower().split() if rel else []

    # rel="canonical"
    if 'canonical' in rel_tokens:
        if self.opts.get('rewrite_rel_canon', True):
            return self._rewrite_url(attr_value, rw_mod)

        # resolve relative rel=canonical URLs so that they
        # refer to the same absolute URL as on the original
        # page (see https://github.com/hypothesis/via/issues/65
        # for context)
        return urljoin(self.orig_url, attr_value)

    # find proper mod for preload
    if 'preload' in rel_tokens:
        preload = self.get_attr(tag_attrs, 'as')
        if preload:
            # unknown 'as' values keep the caller-supplied modifier
            rw_mod = self.PRELOAD_TYPES.get(preload.strip().lower(), rw_mod)

    elif 'stylesheet' in rel_tokens:
        rw_mod = 'cs_'

    return self._rewrite_url(attr_value, rw_mod)
def _set_parse_context(self, tag, tag_attrs):
# special case: script or style parse context
if not self._wb_parse_context:

View File

@ -248,6 +248,27 @@ r"""
>>> parse('<link rel=canonical href="/relative/path">', urlrewriter=no_base_canon_rewriter)
<link rel="canonical" href="http://example.com/relative/path">
# Preload tests
>>> parse('<link rel="preload" as="script" href="http://example.com/some/other/path">')
<link rel="preload" as="script" href="/web/20131226101010js_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="style" href="http://example.com/some/other/path">')
<link rel="preload" as="style" href="/web/20131226101010cs_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="image" href="http://example.com/some/other/path">')
<link rel="preload" as="image" href="/web/20131226101010im_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="document" href="http://example.com/some/other/path">')
<link rel="preload" as="document" href="/web/20131226101010if_/http://example.com/some/other/path">
>>> parse('<link rel="preload" as="video" href="http://example.com/some/other/path">')
<link rel="preload" as="video" href="/web/20131226101010oe_/http://example.com/some/other/path">
# stylesheet
>>> parse('<link rel="stylesheet" href="http://example.com/some/other/path">')
<link rel="stylesheet" href="/web/20131226101010cs_/http://example.com/some/other/path">
# doctype
>>> parse('<!doctype html PUBLIC "public">')
<!doctype html PUBLIC "public">