1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: add 'deprefix' support to remove wburl prefix from any query

params
This commit is contained in:
Ilya Kreymer 2014-10-26 12:12:37 -07:00
parent 037cf35eb8
commit c9273ee5ed
4 changed files with 39 additions and 3 deletions

View File

@ -78,6 +78,8 @@ class WbRequest(object):
rel_prefix,
env.get('SCRIPT_NAME', '/'),
cookie_scope)
self.urlrewriter.deprefix_url()
else:
# no wb_url, just store blank wb_url
self.wb_url = None

View File

@ -74,6 +74,18 @@
>>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_new_url(timestamp='20131024')
'/123/20131024id_/http://example.com/file/path/blah.html'
# deprefix tests
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/20141226/http://example.com/', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file/path/blah.html?param=http://example.com/'
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/if_/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
>>> do_deprefix('http://example.com/file.html?param=http://localhost:8080/pywb/https%3A//example.com/filename.html&other=value&a=b&param2=http://localhost:8080/pywb/http://test.example.com', '/pywb/', 'http://localhost:8080/pywb/')
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com'
# HttpsUrlRewriter tests
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
@ -86,13 +98,22 @@
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
import urllib
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
    """Doctest helper: rewrite ``rel_url`` with a freshly built rewriter.

    A ``UrlRewriter`` is constructed from ``base_url``, ``prefix`` and the
    optional ``full_prefix``; the value returned is the result of its
    ``rewrite`` call for ``rel_url`` with the optional modifier ``mod``.
    """
    return UrlRewriter(base_url, prefix, full_prefix=full_prefix).rewrite(rel_url, mod)
def do_deprefix(url, rel_prefix, full_prefix):
    """Doctest helper: run ``UrlRewriter.deprefix_url`` on ``url``.

    Every occurrence of ``full_prefix`` in ``url`` is first replaced by its
    ``quote_plus``-encoded form, the resulting url is handed to a
    ``UrlRewriter`` whose ``deprefix_url`` is invoked, and the outcome is
    decoded again with ``unquote_plus`` before being returned.
    """
    quoted_prefix = urllib.quote_plus(full_prefix)
    quoted_url = url.replace(full_prefix, quoted_prefix)
    deprefixed = UrlRewriter(quoted_url, rel_prefix, full_prefix).deprefix_url()
    return urllib.unquote_plus(deprefixed)
# Execute this module's doctests when it is run directly as a script.
if __name__ == "__main__":
    from doctest import testmod
    testmod()

View File

@ -1,4 +1,3 @@
import copy
import urlparse
from wburl import WbUrl
@ -88,6 +87,9 @@ class UrlRewriter(object):
cls = get_cookie_rewriter(scope)
return cls(self)
def deprefix_url(self):
    """Delegate to the wrapped wburl's ``deprefix_url``.

    The rewriter's ``full_prefix`` is passed through as the prefix, and
    whatever the wburl returns is returned unchanged.
    """
    wburl = self.wburl
    return wburl.deprefix_url(self.full_prefix)
def __repr__(self):
    """Return a constructor-like string showing the wburl and the prefix."""
    template = "UrlRewriter('{0}', '{1}')"
    return template.format(self.wburl, self.prefix)
@ -150,3 +152,6 @@ class HttpsUrlRewriter(UrlRewriter):
def get_cookie_rewriter(self, scope=None):
    """Always return ``None``; ``scope`` is accepted but not used."""
    return None
def deprefix_url(self):
    """Return the wburl's ``url`` as is; no prefix stripping is done here."""
    wburl = self.wburl
    return wburl.url

View File

@ -39,7 +39,7 @@ wayback url format.
"""
import re
import urllib
#=================================================================
class BaseWbUrl(object):
@ -149,6 +149,14 @@ class WbUrl(BaseWbUrl):
self.timestamp = timestamp
self.type = self.REPLAY
def deprefix_url(self, prefix):
    """Strip the archive prefix from query parameter values in ``self.url``.

    ``prefix`` is percent-encoded with ``quote_plus``. Every occurrence of
    ``=<encoded prefix>`` in ``self.url`` is collapsed to ``=``, together
    with what may directly follow it: an optional run of digits, an
    optional two-character modifier ending in ``_``, and one optional
    separator. The separator is matched both as a literal ``/`` and in its
    percent-encoded form ``%2F`` so that fully encoded values do not keep
    a stray leading separator.

    ``self.url`` is updated in place and the new value is returned.
    """
    # quote_plus lives in ``urllib`` on Python 2 and in ``urllib.parse`` on
    # Python 3; resolve it locally so this method works on either.
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus

    quoted_prefix = quote_plus(prefix)

    # Raw string keeps ``\w`` a regex escape rather than a (deprecated)
    # string escape; the groups are non-capturing since nothing reads them.
    rex_query = ('=' + re.escape(quoted_prefix) +
                 r'[0-9]*(?:\w{2}_)?(?:/|%2[Ff])?')

    self.url = re.sub(rex_query, '=', self.url)
    return self.url
# Str Representation
# ====================
def to_str(self, **overrides):