From 93d49ae24bdf963d6aa0f4a3b42976feecf3eab7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 6 Jul 2015 18:19:01 -0700 Subject: [PATCH] rewrite deprefix: improve query deprefix to also test url-encoded params, closes #119 --- pywb/rewrite/test/test_url_rewriter.py | 8 ++++++++ pywb/rewrite/wburl.py | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pywb/rewrite/test/test_url_rewriter.py b/pywb/rewrite/test/test_url_rewriter.py index 17df29bf..54bd8666 100644 --- a/pywb/rewrite/test/test_url_rewriter.py +++ b/pywb/rewrite/test/test_url_rewriter.py @@ -114,6 +114,14 @@ >>> do_deprefix('http://example.com/file.html?param=http://localhost:8080/pywb/https%3A//example.com/filename.html&other=value&a=b&param2=http://localhost:8080/pywb/http://test.example.com', '/pywb/', 'http://localhost:8080/pywb/') 'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com' +# urlencoded +>>> do_deprefix('http://example.com/file.html?foo=bar&url=' + urllib.quote_plus('http://localhost:8080/pywb/http://example.com/filename.html') + '&foo2=bar2', '/pywb/', 'http://localhost:8080/pywb/') +'http://example.com/file.html?foo=bar&url=http://example.com/filename.html&foo2=bar2' + +# with extra path +>>> do_deprefix('http://example.com/file.html?foo=bar&url=' + urllib.quote_plus('http://localhost:8080/pywb/extra/path/http://example.com/filename.html') + '&foo2=bar2', '/pywb/', 'http://localhost:8080/pywb/') +'http://example.com/file.html?foo=bar&url=http://example.com/filename.html&foo2=bar2' + # HttpsUrlRewriter tests >>> httpsrewriter = HttpsUrlRewriter('http://example.com/', None) >>> httpsrewriter.rewrite('https://example.com/abc') diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index ec38bbbf..84b958a9 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -243,6 +243,7 @@ class WbUrl(BaseWbUrl): self.timestamp = res[0] self.mod = res[1] self.url = res[2] + if self.timestamp: self.type 
= self.REPLAY else: @@ -256,8 +257,11 @@ class WbUrl(BaseWbUrl): def deprefix_url(self, prefix): rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?' - new_url = re.sub(rex_query, '=', self.url) - self.url = new_url + self.url = re.sub(rex_query, '=', self.url) + + rex_query = '=(' + urllib.quote_plus(prefix) + '.*?)((?:https?%3A)?%2F%2F[^&]+)' + self.url = re.sub(rex_query, '=\\2', self.url) + return self.url def get_url(self, url=None):