1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge pull request #21 from ikreymer/wburl-drop-slash

refactor WbUrl and UrlRewriter to drop requirement for having a WbUrl start with /
This commit is contained in:
ikreymer 2014-02-01 19:47:18 -08:00
commit b6846c54e0
8 changed files with 99 additions and 73 deletions

View File

@ -48,12 +48,25 @@ class ArchivalRequestRouter:
# of request uri (excluding first '/')
#=================================================================
class Route:
"""
# route with relative path
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False)
{'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'}
# match up to next slash
SLASH_LOOKAHEAD ='(?=/|$|\?)'
# route with absolute path, running at script /my_pywb
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True)
{'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'}
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD):
# not matching route -- skipped
>>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False)
"""
# match up to the next '/' or '?' or end of string
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_QUERY_LOOKAHEAD):
self.path = regex
self.regex = re.compile(regex + lookahead)
self.handler = handler
@ -71,17 +84,17 @@ class Route:
if rel_prefix:
wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/'
wb_url = request_uri[len(rel_prefix) + 1:] # remove the '/' + rel_prefix part of uri
wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix + '/' part of uri
else:
wb_prefix = env['SCRIPT_NAME'] + '/'
wb_url = request_uri # the request_uri is the wb_url, since no coll
wb_url_str = request_uri[1:] # the request_uri minus its leading '/' is the wb_url, since no coll
coll = matcher.group(self.coll_group)
wbrequest = WbRequest(env,
request_uri = request_uri,
coll = coll,
wb_url = wb_url,
wb_url_str = wb_url_str,
wb_prefix = wb_prefix,
use_abs_prefix = use_abs_prefix,
wburl_class = self.handler.get_wburl_type())
@ -164,7 +177,7 @@ class ReferRedirect:
# No match on any exception
try:
rewriter = UrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0])
rewriter = UrlRewriter(ref_path[1], script_name + '/' + ref_path[0] + '/')
except Exception:
return None
@ -186,6 +199,8 @@ class ReferRedirect:
import utils
if __name__ == "__main__" or utils.enable_doctests():
import handlers
def test_redir(match_host, request_uri, referrer, script_name = ''):
env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name}

View File

@ -12,6 +12,8 @@ class BaseHandler:
def get_wburl_type():
return WbUrl
def __call__(self, wbrequest):
return wbrequest
#=================================================================
# Standard WB Handler

View File

@ -128,7 +128,7 @@ if __name__ == "__main__" or utils.enable_doctests():
import pprint
import url_rewriter
urlrewriter = url_rewriter.UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
urlrewriter = url_rewriter.UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
headerrewriter = HeaderRewriter()

View File

@ -310,7 +310,7 @@ class HTMLRewriter(HTMLParser):
import utils
if __name__ == "__main__" or utils.enable_doctests():
url_rewriter = UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/')
url_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, head_insert = None):
parser = HTMLRewriter(url_rewriter, head_insert = head_insert)

View File

@ -224,7 +224,7 @@ class CSSRewriter(RegexRewriter):
import utils
if __name__ == "__main__" or utils.enable_doctests():
arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/')
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
def test_js(string, extra = []):
return JSRewriter(arcrw, extra).rewrite(string)

View File

@ -6,46 +6,49 @@ from wburl import WbUrl
class UrlRewriter:
"""
>>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> test_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'https://web.archive.org/web/20131010/http://example.com/path/other.html'
>>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
>>> test_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_')
'https://web.archive.org/web/20131010js_/http://example.com/path/file.js'
>>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/')
>>> test_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/other.html'
>>> test_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/')
'/coll/20130907*/http://example.com/path/other.html'
>>> test_rewrite('../other.html', '/20131112im_/http://example.com/path/page.html', '/coll/')
>>> test_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/')
'/coll/20131112im_/http://example.com/other.html'
>>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
>>> test_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/')
>>> test_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/*/http://example.com/other.html'
>>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/')
>>> test_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
'localhost:8080/20101226101112/http://some-other-site.com'
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/')
>>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
'/2020/http://example.com/other.html'
>>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '')
'/2020/http://example.com/other.html'
>>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '')
'2020/http://example.com/other.html'
>>> test_rewrite('', '/20131010010203/http://example.com/file.html', '/web/')
>>> test_rewrite('', '20131010010203/http://example.com/file.html', '/web/')
'/web/20131010010203/http://example.com/file.html'
>>> test_rewrite('#anchor', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> test_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'#anchor'
>>> test_rewrite('mailto:example@example.com', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
>>> test_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/')
'mailto:example@example.com'
>>> UrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
>>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url()
'/abc/19960708im_/'
>>> UrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
>>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024')
'/123/20131024id_/http://example.com/file/path/blah.html'
>>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com')
@ -61,8 +64,8 @@ class UrlRewriter:
self.prefix = prefix
self.archivalurl_class = self.wburl.__class__
if self.prefix.endswith('/'):
self.prefix = self.prefix[:-1]
#if self.prefix.endswith('/'):
# self.prefix = self.prefix[:-1]
def rewrite(self, url, mod = None):
# if special protocol, no rewriting at all

View File

@ -7,24 +7,24 @@ import pprint
class WbRequest:
"""
>>> WbRequest.from_uri('/save/_embed/example.com/?a=b')
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', '/http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
{'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'}
>>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c')
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '/20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
{'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'}
>>> WbRequest.from_uri('/2010/example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
>>> WbRequest.from_uri('../example.com')
{'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'}
# Abs path
>>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'}
# No Scheme, so stick to relative
>>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True)
{'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
{'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'}
"""
@ -38,19 +38,19 @@ class WbRequest:
# Has coll prefix
if len(parts) == 3:
wb_prefix = '/' + parts[1] + '/'
wb_url = '/' + parts[2]
wb_url_str = parts[2]
coll = parts[1]
# No Coll Prefix
elif len(parts) == 2:
wb_prefix = '/'
wb_url = '/' + parts[1]
wb_url_str = parts[1]
coll = ''
else:
wb_prefix = '/'
wb_url = parts[0]
wb_url_str = parts[0]
coll = ''
return WbRequest(env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix)
return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix)
@staticmethod
@ -61,7 +61,7 @@ class WbRequest:
return rel_prefix
def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, wburl_class = WbUrl):
def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix = False, wburl_class = WbUrl):
self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
@ -69,9 +69,9 @@ class WbRequest:
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
# wb_url present and not root page
if wb_url != '/' and wb_url != '' and wburl_class:
self.wb_url_str = wb_url
self.wb_url = wburl_class(wb_url)
if wb_url_str != '/' and wb_url_str != '' and wburl_class:
self.wb_url_str = wb_url_str
self.wb_url = wburl_class(wb_url_str)
else:
# no wb_url, just store blank
self.wb_url_str = '/'

View File

@ -11,46 +11,52 @@ class WbUrl:
"""
# Replay Urls
# ======================
>>> repr(WbUrl('/20131010000506/example.com'))
"('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')"
>>> repr(WbUrl('20131010000506/example.com'))
"('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')"
>>> repr(WbUrl('/20130102im_/https://example.com'))
"('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')"
>>> repr(WbUrl('20130102im_/https://example.com'))
"('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')"
# Protocol agnostic convert to http
>>> repr(WbUrl('/20130102im_///example.com'))
"('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')"
>>> repr(WbUrl('20130102im_///example.com'))
"('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')"
>>> repr(WbUrl('/cs_/example.com'))
"('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')"
>>> repr(WbUrl('cs_/example.com'))
"('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')"
>>> repr(WbUrl('/https://example.com/xyz'))
"('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')"
>>> repr(WbUrl('https://example.com/xyz'))
"('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')"
>>> repr(WbUrl('/https://example.com/xyz?a=%2f&b=%2E'))
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', '/https://example.com/xyz?a=%2f&b=%2E')"
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
# Query Urls
# ======================
>>> repr(WbUrl('/*/http://example.com/abc?def=a'))
"('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')"
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
"('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')"
>>> repr(WbUrl('/*/http://example.com/abc?def=a*'))
"('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')"
>>> repr(WbUrl('*/http://example.com/abc?def=a*'))
"('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')"
>>> repr(WbUrl('/json/*/http://example.com/abc?def=a'))
"('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')"
>>> repr(WbUrl('json/*/http://example.com/abc?def=a'))
"('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')"
>>> repr(WbUrl('/timemap-link/2011*/http://example.com/abc?def=a'))
"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')"
>>> repr(WbUrl('timemap-link/2011*/http://example.com/abc?def=a'))
"('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')"
# strip off repeated, likely scheme-agnostic, slashes altogether
>>> repr(WbUrl('///example.com'))
"('latest_replay', '', '', 'http://example.com', 'http://example.com')"
>>> repr(WbUrl('//example.com/'))
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
>>> repr(WbUrl('/example.com/'))
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
# Error Urls
# ======================
>>> x = WbUrl('abc')
Traceback (most recent call last):
RequestParseException: Invalid WB Request Url: abc
>>> x = WbUrl('/#$%#/')
Traceback (most recent call last):
BadUrlException: Bad Request Url: http://#$%#/
@ -62,8 +68,8 @@ class WbUrl:
# Regexes
# ======================
QUERY_REGEX = re.compile('^/?([\w\-:]+)?/(\d*)\*/(.*)$')
REPLAY_REGEX = re.compile('^/(\d*)([a-z]+_)?/?(.*)$')
QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)\*/?(.*)$')
REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$')
QUERY = 'query'
URL_QUERY = 'url_query'
@ -88,10 +94,10 @@ class WbUrl:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
# protocol agnostic url -> http://
if self.url.startswith('//'):
self.url = self.DEFAULT_SCHEME + self.url[2:]
#if self.url.startswith('//'):
# self.url = self.DEFAULT_SCHEME + self.url[2:]
# no protocol -> http://
elif not '://' in self.url:
if not '://' in self.url:
self.url = self.DEFAULT_SCHEME + self.url
# BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding
@ -148,7 +154,7 @@ class WbUrl:
url = overrides['url'] if 'url' in overrides else self.url
if atype == self.QUERY or atype == self.URL_QUERY:
tsmod = "/"
tsmod = ''
if mod:
tsmod += mod + "/"
if timestamp:
@ -161,9 +167,9 @@ class WbUrl:
else:
tsmod = timestamp + mod
if len(tsmod) > 0:
return "/" + tsmod + "/" + url
return tsmod + "/" + url
else:
return "/" + url
return url
def __str__(self):
return self.to_str()