From d3b379e7883c7c39c06fd7f2003b6e20a26c2e34 Mon Sep 17 00:00:00 2001 From: Rebecca Lynn Cremona Date: Fri, 5 Jan 2018 15:24:52 -0500 Subject: [PATCH] Improved rewriting of srcset image urls; handle urls with commas (#269) * rewrite improvement: better srcset parsing for comma-separated urls * extensive server-side tests for srcset rewriting (with and without spaces and extra srcset modifiers) * compile regex once for improved performance * same regex for server and client side rewriting Work by @rebeccacremona --- pywb/rewrite/html_rewriter.py | 6 ++-- pywb/rewrite/test/test_html_rewriter.py | 38 +++++++++++++++++++++++-- pywb/static/wombat.js | 5 ++-- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index d92e2ef9..ef98df0e 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -234,13 +234,15 @@ class HTMLRewriterMixin(StreamingRewriter): return new_value + SRCSET_REGEX = re.compile('\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))') + def _rewrite_srcset(self, value, mod=''): if not value: return '' - values = value.split(',') + values = (url.strip() for url in re.split(self.SRCSET_REGEX, value) if url) values = [self._rewrite_url(v.strip()) for v in values] - return ','.join(values) + return ', '.join(values) def _rewrite_css(self, css_content): if css_content: diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 71575d18..896b6e9b 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -140,9 +140,41 @@ r""" >>> parse('') -# srcset attrib ->>> parse('') - +# srcset attrib: simple +>>> parse('') + + +# srcset attrib: single comma-containing +>>> parse('') + + +# srcset attrib: single comma-containing plus descriptor +>>> parse('') + + +# srcset attrib: comma-containing absolute url and relative url, separated by comma and space +>>> parse('') + + +# srcset attrib: comma-containing relative url and absolute url, separated by comma and space +>>> parse('') + + +# srcset attrib: absolute urls with descriptors, separated by comma (no space) +>>> parse('') + + +# srcset attrib: absolute url with descriptor, separated by comma (no space) from absolute url without descriptor +>>> parse('') + + +# srcset attrib: absolute url without descriptor, separated by comma (no space) from absolute url with descriptor +>>> parse('') + + +# complex srcset attrib +>>> parse('') + # empty srcset attrib >>> parse('') diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js index 6e4c2f7b..9d7a14f6 100644 --- a/pywb/static/wombat.js +++ b/pywb/static/wombat.js @@ -1373,13 +1373,14 @@ var _WBWombat = function($wbwindow, wbinfo) { return ""; } - values = value.split(','); + // Filter removes non-truthy values like null, undefined, and "" + values = value.split(/\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/).filter(Boolean); for (var i = 0; i < values.length; i++) { values[i] = rewrite_url(values[i].trim()); } - return values.join(","); + return values.join(", "); } //============================================