mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Improved rewriting of srcset image urls; handle urls with commas (#269)
* rewrite improvement: better srcset parsing for comma-separated urls
* extensive server-side tests for srcset rewriting (with and without spaces and extra srcset modifiers)
* compile regex once for improved performance
* same regex for server and client side rewriting

Work by @rebeccacremona
This commit is contained in:
parent
777f55f201
commit
d3b379e788
@ -234,13 +234,15 @@ class HTMLRewriterMixin(StreamingRewriter):
|
|||||||
|
|
||||||
return new_value
|
return new_value
|
||||||
|
|
||||||
|
SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')
|
||||||
|
|
||||||
def _rewrite_srcset(self, value, mod=''):
|
def _rewrite_srcset(self, value, mod=''):
|
||||||
if not value:
|
if not value:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
values = value.split(',')
|
values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url)
|
||||||
values = [self._rewrite_url(v.strip()) for v in values]
|
values = [self._rewrite_url(v.strip()) for v in values]
|
||||||
return ','.join(values)
|
return ', '.join(values)
|
||||||
|
|
||||||
def _rewrite_css(self, css_content):
|
def _rewrite_css(self, css_content):
|
||||||
if css_content:
|
if css_content:
|
||||||
|
@ -140,9 +140,41 @@ r"""
|
|||||||
>>> parse('<param value="foo bar"/>')
|
>>> parse('<param value="foo bar"/>')
|
||||||
<param value="foo bar"/>
|
<param value="foo bar"/>
|
||||||
|
|
||||||
# srcset attrib
|
# srcset attrib: simple
|
||||||
>>> parse('<img srcset="//example.com/1x 1x, //example.com/foo 2x, https://example.com/bar 4x">')
|
>>> parse('<img srcset="http://example.com">')
|
||||||
<img srcset="/web/20131226101010///example.com/1x 1x,/web/20131226101010///example.com/foo 2x,/web/20131226101010/https://example.com/bar 4x">
|
<img srcset="/web/20131226101010/http://example.com">
|
||||||
|
|
||||||
|
# srcset attrib: single comma-containing
|
||||||
|
>>> parse('<img srcset="http://example.com/123,foo">')
|
||||||
|
<img srcset="/web/20131226101010/http://example.com/123,foo">
|
||||||
|
|
||||||
|
# srcset attrib: single comma-containing plus descriptor
|
||||||
|
>>> parse('<img srcset="http://example.com/123,foo 2w">')
|
||||||
|
<img srcset="/web/20131226101010/http://example.com/123,foo 2w">
|
||||||
|
|
||||||
|
# srcset attrib: comma-containing absolute url and relative url, separated by comma and space
|
||||||
|
>>> parse('<img srcset="http://example.com/123,foo, /bar,bar 2w">')
|
||||||
|
<img srcset="/web/20131226101010/http://example.com/123,foo, /web/20131226101010/http://example.com/bar,bar 2w">
|
||||||
|
|
||||||
|
# srcset attrib: comma-containing relative url and absolute url, separated by comma and space
|
||||||
|
>>> parse('<img srcset="/bar,bar 2w, http://example.com/123,foo">')
|
||||||
|
<img srcset="/web/20131226101010/http://example.com/bar,bar 2w, /web/20131226101010/http://example.com/123,foo">
|
||||||
|
|
||||||
|
# srcset attrib: absolute urls with descriptors, separated by comma (no space)
|
||||||
|
>>> parse('<img srcset="http://example.com/123 2w,http://example.com/ 4w">')
|
||||||
|
<img srcset="/web/20131226101010/http://example.com/123 2w, /web/20131226101010/http://example.com/ 4w">
|
||||||
|
|
||||||
|
# srcset attrib: absolute url with descriptor, separated by comma (no space) from absolute url without descriptor
|
||||||
|
>>> parse('<img srcset="http://example.com/123 2x,http://example.com/">')
|
||||||
|
<img srcset="/web/20131226101010/http://example.com/123 2x, /web/20131226101010/http://example.com/">
|
||||||
|
|
||||||
|
# srcset attrib: absolute url without descriptor, separated by comma (no space) from absolute url with descriptor
|
||||||
|
>>> parse('<img srcset="http://example.com/123,http://example.com/ 2x">')
|
||||||
|
<img srcset="/web/20131226101010/http://example.com/123, /web/20131226101010/http://example.com/ 2x">
|
||||||
|
|
||||||
|
# complex srcset attrib
|
||||||
|
>>> parse('<img srcset="//example.com/1x,1x 2w, //example1.com/foo 2x, http://example.com/bar,bar 4x">')
|
||||||
|
<img srcset="/web/20131226101010///example.com/1x,1x 2w, /web/20131226101010///example1.com/foo 2x, /web/20131226101010/http://example.com/bar,bar 4x">
|
||||||
|
|
||||||
# empty srcset attrib
|
# empty srcset attrib
|
||||||
>>> parse('<img srcset="">')
|
>>> parse('<img srcset="">')
|
||||||
|
@ -1373,13 +1373,14 @@ var _WBWombat = function($wbwindow, wbinfo) {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
values = value.split(',');
|
// Filter removes non-truthy values like null, undefined, and ""
|
||||||
|
values = value.split(/\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/).filter(Boolean);
|
||||||
|
|
||||||
for (var i = 0; i < values.length; i++) {
|
for (var i = 0; i < values.length; i++) {
|
||||||
values[i] = rewrite_url(values[i].trim());
|
values[i] = rewrite_url(values[i].trim());
|
||||||
}
|
}
|
||||||
|
|
||||||
return values.join(",");
|
return values.join(", ");
|
||||||
}
|
}
|
||||||
|
|
||||||
//============================================
|
//============================================
|
||||||
|
Loading…
x
Reference in New Issue
Block a user