1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Improved rewriting of srcset image urls; handle urls with commas (#269)

* rewrite improvement: better srcset parsing for comma-separated urls

* extensive server-side tests for srcset rewriting (with and without spaces and extra srcset modifiers)

* compile regex once for improved performance

* same regex for server and client side rewriting

Work by @rebeccacremona
This commit is contained in:
Rebecca Lynn Cremona 2018-01-05 15:24:52 -05:00 committed by Ilya Kreymer
parent 777f55f201
commit d3b379e788
3 changed files with 42 additions and 7 deletions

View File

@@ -234,13 +234,15 @@ class HTMLRewriterMixin(StreamingRewriter):
return new_value
# Separator pattern used to split a ``srcset`` value into image candidates.
# Compiled once so every call to ``_rewrite_srcset`` reuses it.
#   * first alternative: captures "<url><whitespace><number>w|x" when it is
#     directly followed by a comma, so the candidate itself is kept in the
#     split() result while the trailing comma is consumed;
#   * second alternative: a comma followed by whitespace, or by the start of
#     an absolute http(s) url (lookahead only, nothing extra is consumed).
# A comma that matches neither alternative (e.g. one inside a url) is left
# in place.  Raw string: ``\s``, ``\S``, ``\d`` and ``\.`` are regex escapes,
# not string escapes.
SRCSET_REGEX = re.compile(r'\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))')

def _rewrite_srcset(self, value, mod=''):
    """Rewrite every url candidate of a ``srcset`` attribute value.

    :param value: raw ``srcset`` attribute value; may be empty or ``None``
    :param mod: accepted for signature compatibility; not used by this method
    :return: the rewritten candidates joined with ``', '``, or ``''`` when
        ``value`` is falsy
    """
    if not value:
        return ''

    # split() on a pattern with a capture group returns the captured text
    # as well as None (group did not participate) and '' (adjacent
    # separators); drop those before stripping and rewriting.
    candidates = (part.strip()
                  for part in self.SRCSET_REGEX.split(value)
                  if part)

    return ', '.join(self._rewrite_url(candidate) for candidate in candidates)
def _rewrite_css(self, css_content):
if css_content:

View File

@@ -140,9 +140,41 @@ r"""
>>> parse('<param value="foo bar"/>')
<param value="foo bar"/>
# srcset attrib
>>> parse('<img srcset="//example.com/1x 1x, //example.com/foo 2x, https://example.com/bar 4x">')
<img srcset="/web/20131226101010///example.com/1x 1x,/web/20131226101010///example.com/foo 2x,/web/20131226101010/https://example.com/bar 4x">
# srcset attrib: simple
>>> parse('<img srcset="http://example.com">')
<img srcset="/web/20131226101010/http://example.com">
# srcset attrib: single comma-containing
>>> parse('<img srcset="http://example.com/123,foo">')
<img srcset="/web/20131226101010/http://example.com/123,foo">
# srcset attrib: single comma-containing plus descriptor
>>> parse('<img srcset="http://example.com/123,foo 2w">')
<img srcset="/web/20131226101010/http://example.com/123,foo 2w">
# srcset attrib: comma-containing absolute url and relative url, separated by comma and space
>>> parse('<img srcset="http://example.com/123,foo, /bar,bar 2w">')
<img srcset="/web/20131226101010/http://example.com/123,foo, /web/20131226101010/http://example.com/bar,bar 2w">
# srcset attrib: comma-containing relative url and absolute url, separated by comma and space
>>> parse('<img srcset="/bar,bar 2w, http://example.com/123,foo">')
<img srcset="/web/20131226101010/http://example.com/bar,bar 2w, /web/20131226101010/http://example.com/123,foo">
# srcset attrib: absolute urls with descriptors, separated by comma (no space)
>>> parse('<img srcset="http://example.com/123 2w,http://example.com/ 4w">')
<img srcset="/web/20131226101010/http://example.com/123 2w, /web/20131226101010/http://example.com/ 4w">
# srcset attrib: absolute url with descriptor, separated by comma (no space) from absolute url without descriptor
>>> parse('<img srcset="http://example.com/123 2x,http://example.com/">')
<img srcset="/web/20131226101010/http://example.com/123 2x, /web/20131226101010/http://example.com/">
# srcset attrib: absolute url without descriptor, separated by comma (no space) from absolute url with descriptor
>>> parse('<img srcset="http://example.com/123,http://example.com/ 2x">')
<img srcset="/web/20131226101010/http://example.com/123, /web/20131226101010/http://example.com/ 2x">
# complex srcset attrib
>>> parse('<img srcset="//example.com/1x,1x 2w, //example1.com/foo 2x, http://example.com/bar,bar 4x">')
<img srcset="/web/20131226101010///example.com/1x,1x 2w, /web/20131226101010///example1.com/foo 2x, /web/20131226101010/http://example.com/bar,bar 4x">
# empty srcset attrib
>>> parse('<img srcset="">')

View File

@@ -1373,13 +1373,14 @@ var _WBWombat = function($wbwindow, wbinfo) {
return "";
}
values = value.split(',');
// Filter removes non-truthy values like null, undefined, and ""
values = value.split(/\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/).filter(Boolean);
for (var i = 0; i < values.length; i++) {
values[i] = rewrite_url(values[i].trim());
}
return values.join(",");
return values.join(", ");
}
//============================================