@staticmethod
def urljoin(orig_url, url):
    """Join ``url`` onto ``orig_url`` and collapse leftover ``..`` segments.

    ``urlparse.urljoin`` may hand back a URL whose path still contains
    parent references (for example when ``url`` is root-relative, such as
    ``/static/styles/../images/logo.png``).  When that happens the path is
    normalised here: every ``..`` removes the segment before it, and a
    ``..`` that has no parent (it would climb above the root) is dropped.

    :param orig_url: base URL that ``url`` is resolved against
    :param url: absolute or relative URL reference
    :return: the joined URL; scheme, netloc, query and fragment are passed
        through unchanged, only the path is rewritten
    """
    new_url = urlparse.urljoin(orig_url, url)

    # Fast path: no parent reference survived the join.
    # NOTE: this is a substring test on the whole URL, so a path that merely
    # ends in '..' (without a trailing slash) is returned as-is.
    if '../' not in new_url:
        return new_url

    scheme, netloc, path, query, frag = urlparse.urlsplit(new_url)

    # Splitting an absolute path yields a leading '' that stands for the
    # root; it must never be popped, otherwise the leading '/' is lost.
    floor = 1 if path.startswith('/') else 0

    # Walk the segments front to back with a stack so that consecutive
    # '..' segments each remove a real directory rather than each other.
    resolved = []
    for segment in path.split('/'):
        if segment != '..':
            resolved.append(segment)
        elif len(resolved) > floor:
            resolved.pop()
        # else: '..' at root level has no parent to remove; discard it

    if resolved == ['']:
        path = '/'
    else:
        path = '/'.join(resolved)

    return urlparse.urlunsplit((scheme, netloc, path, query, frag))