diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 017618f4..b16897e9 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -48,12 +48,25 @@ class ArchivalRequestRouter: # of request uri (excluding first '/') #================================================================= class Route: + """ + # route with relative path + >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/test.example.com', 'SCRIPT_NAME': ''}, False) + {'wb_url': ('latest_replay', '', '', 'http://test.example.com', 'http://test.example.com'), 'coll': 'web', 'wb_prefix': '/web/', 'request_uri': '/web/test.example.com'} - # match upto next slash - SLASH_LOOKAHEAD ='(?=/|$|\?)' + # route with absolute path, running at script /my_pywb + >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/web/2013im_/test.example.com', 'SCRIPT_NAME': '/my_pywb', 'HTTP_HOST': 'localhost:8081', 'wsgi.url_scheme': 'https'}, True) + {'wb_url': ('replay', '2013', 'im_', 'http://test.example.com', '2013im_/http://test.example.com'), 'coll': 'web', 'wb_prefix': 'https://localhost:8081/my_pywb/web/', 'request_uri': '/web/2013im_/test.example.com'} - def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD): + # not matching route -- skipped + >>> Route('web', handlers.BaseHandler())({'REL_REQUEST_URI': '/other/test.example.com', 'SCRIPT_NAME': ''}, False) + """ + + # match up to the next / or ? 
or end + SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)' + + + def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex self.regex = re.compile(regex + lookahead) self.handler = handler @@ -71,17 +84,17 @@ class Route: if rel_prefix: wb_prefix = env['SCRIPT_NAME'] + '/' + rel_prefix + '/' - wb_url = request_uri[len(rel_prefix) + 1:] # remove the '/' + rel_prefix part of uri + wb_url_str = request_uri[len(rel_prefix) + 2:] # remove the '/' + rel_prefix + '/' part of uri else: wb_prefix = env['SCRIPT_NAME'] + '/' - wb_url = request_uri # the request_uri is the wb_url, since no coll + wb_url_str = request_uri[1:] # no coll: the wb_url is the request_uri minus its leading '/' coll = matcher.group(self.coll_group) wbrequest = WbRequest(env, request_uri = request_uri, coll = coll, - wb_url = wb_url, + wb_url_str = wb_url_str, wb_prefix = wb_prefix, use_abs_prefix = use_abs_prefix, wburl_class = self.handler.get_wburl_type()) @@ -164,7 +177,7 @@ class ReferRedirect: # No match on any exception try: - rewriter = UrlRewriter('/' + ref_path[1], script_name + '/' + ref_path[0]) + rewriter = UrlRewriter(ref_path[1], script_name + '/' + ref_path[0] + '/') except Exception: return None @@ -186,6 +199,8 @@ class ReferRedirect: import utils if __name__ == "__main__" or utils.enable_doctests(): + import handlers + def test_redir(match_host, request_uri, referrer, script_name = ''): env = {'REL_REQUEST_URI': request_uri, 'HTTP_REFERER': referrer, 'SCRIPT_NAME': script_name} diff --git a/pywb/handlers.py b/pywb/handlers.py index 63d6fed8..3708d6e4 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -12,6 +12,8 @@ class BaseHandler: def get_wburl_type(): return WbUrl + def __call__(self, wbrequest): + return wbrequest #================================================================= # Standard WB Handler diff --git a/pywb/header_rewriter.py b/pywb/header_rewriter.py index 1a6b65b0..fe67f49f 100644 --- a/pywb/header_rewriter.py +++ b/pywb/header_rewriter.py @@ -128,7 
+128,7 @@ if __name__ == "__main__" or utils.enable_doctests(): import pprint import url_rewriter - urlrewriter = url_rewriter.UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/') + urlrewriter = url_rewriter.UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') headerrewriter = HeaderRewriter() diff --git a/pywb/html_rewriter.py b/pywb/html_rewriter.py index f5228bc5..25236acd 100644 --- a/pywb/html_rewriter.py +++ b/pywb/html_rewriter.py @@ -310,7 +310,7 @@ class HTMLRewriter(HTMLParser): import utils if __name__ == "__main__" or utils.enable_doctests(): - url_rewriter = UrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/') + url_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') def parse(data, head_insert = None): parser = HTMLRewriter(url_rewriter, head_insert = head_insert) diff --git a/pywb/regex_rewriters.py b/pywb/regex_rewriters.py index bf9d0361..300c248f 100644 --- a/pywb/regex_rewriters.py +++ b/pywb/regex_rewriters.py @@ -224,7 +224,7 @@ class CSSRewriter(RegexRewriter): import utils if __name__ == "__main__" or utils.enable_doctests(): - arcrw = UrlRewriter('/20131010im_/http://example.com/', '/web/') + arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/') def test_js(string, extra = []): return JSRewriter(arcrw, extra).rewrite(string) diff --git a/pywb/url_rewriter.py b/pywb/url_rewriter.py index 18f016eb..79199744 100644 --- a/pywb/url_rewriter.py +++ b/pywb/url_rewriter.py @@ -6,46 +6,49 @@ from wburl import WbUrl class UrlRewriter: """ - >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') + >>> test_rewrite('other.html', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'https://web.archive.org/web/20131010/http://example.com/path/other.html' - >>> test_rewrite('file.js', '/20131010/http://example.com/path/page.html', 
'https://web.archive.org/web/', 'js_') + >>> test_rewrite('file.js', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/', 'js_') 'https://web.archive.org/web/20131010js_/http://example.com/path/file.js' - >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/') + >>> test_rewrite('/other.html', '20130907*/http://example.com/path/page.html', '/coll/') + '/coll/20130907*/http://example.com/other.html' + + >>> test_rewrite('./other.html', '20130907*/http://example.com/path/page.html', '/coll/') '/coll/20130907*/http://example.com/path/other.html' - >>> test_rewrite('../other.html', '/20131112im_/http://example.com/path/page.html', '/coll/') + >>> test_rewrite('../other.html', '20131112im_/http://example.com/path/page.html', '/coll/') '/coll/20131112im_/http://example.com/other.html' - >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/') + >>> test_rewrite('../../other.html', '*/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/*/http://example.com/other.html' - >>> test_rewrite('path/../../other.html', '/*/http://example.com/index.html', 'localhost:8080/') + >>> test_rewrite('path/../../other.html', '*/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/*/http://example.com/other.html' - >>> test_rewrite('http://some-other-site.com', '/20101226101112/http://example.com/index.html', 'localhost:8080/') + >>> test_rewrite('http://some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/') 'localhost:8080/20101226101112/http://some-other-site.com' - >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/') + >>> test_rewrite('../../other.html', '2020/http://example.com/index.html', '/') '/2020/http://example.com/other.html' - >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '') - '/2020/http://example.com/other.html' + >>> test_rewrite('../../other.html', 
'2020/http://example.com/index.html', '') + '2020/http://example.com/other.html' - >>> test_rewrite('', '/20131010010203/http://example.com/file.html', '/web/') + >>> test_rewrite('', '20131010010203/http://example.com/file.html', '/web/') '/web/20131010010203/http://example.com/file.html' - >>> test_rewrite('#anchor', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') + >>> test_rewrite('#anchor', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') '#anchor' - >>> test_rewrite('mailto:example@example.com', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') + >>> test_rewrite('mailto:example@example.com', '20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') 'mailto:example@example.com' - >>> UrlRewriter('/19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url() + >>> UrlRewriter('19960708im_/http://domain.example.com/path.txt', '/abc/').get_abs_url() '/abc/19960708im_/' - >>> UrlRewriter('/2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024') + >>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_timestamp_url('20131024') '/123/20131024id_/http://example.com/file/path/blah.html' >>> UrlRewriter.strip_protocol('https://example.com') == UrlRewriter.strip_protocol('http://example.com') @@ -61,8 +64,8 @@ class UrlRewriter: self.prefix = prefix self.archivalurl_class = self.wburl.__class__ - if self.prefix.endswith('/'): - self.prefix = self.prefix[:-1] + #if self.prefix.endswith('/'): + # self.prefix = self.prefix[:-1] def rewrite(self, url, mod = None): # if special protocol, no rewriting at all diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index 43f46bfa..efa5fafb 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -7,24 +7,24 @@ import pprint class WbRequest: """ >>> WbRequest.from_uri('/save/_embed/example.com/?a=b') - {'wb_url': ('latest_replay', '', '', 
'http://_embed/example.com/?a=b', '/http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'} + {'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', 'http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'} >>> WbRequest.from_uri('/2345/20101024101112im_/example.com/?b=c') - {'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '/20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} + {'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} >>> WbRequest.from_uri('/2010/example.com') - {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} + {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} >>> WbRequest.from_uri('../example.com') - {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} + {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} # Abs path >>> WbRequest.from_uri('/2010/example.com', {'wsgi.url_scheme': 'https', 'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) - {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': 'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'} + {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': 
'https://localhost:8080/2010/', 'request_uri': '/2010/example.com'} # No Scheme, so stick to relative >>> WbRequest.from_uri('/2010/example.com', {'HTTP_HOST': 'localhost:8080'}, use_abs_prefix = True) - {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} + {'wb_url': ('latest_replay', '', '', 'http://example.com', 'http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} """ @@ -38,19 +38,19 @@ class WbRequest: # Has coll prefix if len(parts) == 3: wb_prefix = '/' + parts[1] + '/' - wb_url = '/' + parts[2] + wb_url_str = parts[2] coll = parts[1] # No Coll Prefix elif len(parts) == 2: wb_prefix = '/' - wb_url = '/' + parts[1] + wb_url_str = parts[1] coll = '' else: wb_prefix = '/' - wb_url = parts[0] + wb_url_str = parts[0] coll = '' - return WbRequest(env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix) + return WbRequest(env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix) @staticmethod @@ -61,7 +61,7 @@ class WbRequest: return rel_prefix - def __init__(self, env, request_uri, wb_prefix, wb_url, coll, use_abs_prefix = False, wburl_class = WbUrl): + def __init__(self, env, request_uri, wb_prefix, wb_url_str, coll, use_abs_prefix = False, wburl_class = WbUrl): self.env = env self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') @@ -69,9 +69,9 @@ class WbRequest: self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix) # wb_url present and not root page - if wb_url != '/' and wb_url != '' and wburl_class: - self.wb_url_str = wb_url - self.wb_url = wburl_class(wb_url) + if wb_url_str != '/' and wb_url_str != '' and wburl_class: + self.wb_url_str = wb_url_str + self.wb_url = wburl_class(wb_url_str) else: # no wb_url, just store blank self.wb_url_str = '/' diff --git a/pywb/wburl.py b/pywb/wburl.py index a980545f..70162b80 100644 --- 
a/pywb/wburl.py +++ b/pywb/wburl.py @@ -11,46 +11,52 @@ class WbUrl: """ # Replay Urls # ====================== - >>> repr(WbUrl('/20131010000506/example.com')) - "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')" + >>> repr(WbUrl('20131010000506/example.com')) + "('replay', '20131010000506', '', 'http://example.com', '20131010000506/http://example.com')" - >>> repr(WbUrl('/20130102im_/https://example.com')) - "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')" + >>> repr(WbUrl('20130102im_/https://example.com')) + "('replay', '20130102', 'im_', 'https://example.com', '20130102im_/https://example.com')" # Protocol agnostic convert to http - >>> repr(WbUrl('/20130102im_///example.com')) - "('replay', '20130102', 'im_', 'http://example.com', '/20130102im_/http://example.com')" + >>> repr(WbUrl('20130102im_///example.com')) + "('replay', '20130102', 'im_', 'http://example.com', '20130102im_/http://example.com')" - >>> repr(WbUrl('/cs_/example.com')) - "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')" + >>> repr(WbUrl('cs_/example.com')) + "('latest_replay', '', 'cs_', 'http://example.com', 'cs_/http://example.com')" - >>> repr(WbUrl('/https://example.com/xyz')) - "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')" + >>> repr(WbUrl('https://example.com/xyz')) + "('latest_replay', '', '', 'https://example.com/xyz', 'https://example.com/xyz')" - >>> repr(WbUrl('/https://example.com/xyz?a=%2f&b=%2E')) - "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', '/https://example.com/xyz?a=%2f&b=%2E')" + >>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E')) + "('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')" # Query Urls # ====================== - >>> repr(WbUrl('/*/http://example.com/abc?def=a')) - "('query', '', '', 'http://example.com/abc?def=a', 
'/*/http://example.com/abc?def=a')" + >>> repr(WbUrl('*/http://example.com/abc?def=a')) + "('query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a')" - >>> repr(WbUrl('/*/http://example.com/abc?def=a*')) - "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')" + >>> repr(WbUrl('*/http://example.com/abc?def=a*')) + "('url_query', '', '', 'http://example.com/abc?def=a', '*/http://example.com/abc?def=a*')" - >>> repr(WbUrl('/json/*/http://example.com/abc?def=a')) - "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')" + >>> repr(WbUrl('json/*/http://example.com/abc?def=a')) + "('query', '', 'json', 'http://example.com/abc?def=a', 'json/*/http://example.com/abc?def=a')" - >>> repr(WbUrl('/timemap-link/2011*/http://example.com/abc?def=a')) - "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')" + >>> repr(WbUrl('timemap-link/2011*/http://example.com/abc?def=a')) + "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', 'timemap-link/2011*/http://example.com/abc?def=a')" + + # strip off repeated, likely scheme-agnostic, slashes altogether + >>> repr(WbUrl('///example.com')) + "('latest_replay', '', '', 'http://example.com', 'http://example.com')" + + >>> repr(WbUrl('//example.com/')) + "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" + + >>> repr(WbUrl('/example.com/')) + "('latest_replay', '', '', 'http://example.com/', 'http://example.com/')" # Error Urls # ====================== - >>> x = WbUrl('abc') - Traceback (most recent call last): - RequestParseException: Invalid WB Request Url: abc - >>> x = WbUrl('/#$%#/') Traceback (most recent call last): BadUrlException: Bad Request Url: http://#$%#/ @@ -62,8 +68,8 @@ class WbUrl: # Regexs # ====================== - QUERY_REGEX = re.compile('^/?([\w\-:]+)?/(\d*)\*/(.*)$') - REPLAY_REGEX = 
re.compile('^/(\d*)([a-z]+_)?/?(.*)$') + QUERY_REGEX = re.compile('^(?:([\w\-:]+)/)?(\d*)\*/?(.*)$') + REPLAY_REGEX = re.compile('^(\d*)([a-z]+_)?/{0,3}(.*)$') QUERY = 'query' URL_QUERY = 'url_query' @@ -88,10 +94,10 @@ class WbUrl: raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url) # protocol agnostic url -> http:// - if self.url.startswith('//'): - self.url = self.DEFAULT_SCHEME + self.url[2:] + #if self.url.startswith('//'): + # self.url = self.DEFAULT_SCHEME + self.url[2:] # no protocol -> http:// - elif not '://' in self.url: + if not '://' in self.url: self.url = self.DEFAULT_SCHEME + self.url # BUG?: adding upper() because rfc3987 lib rejects lower case %-encoding @@ -148,7 +154,7 @@ class WbUrl: url = overrides['url'] if 'url' in overrides else self.url if atype == self.QUERY or atype == self.URL_QUERY: - tsmod = "/" + tsmod = '' if mod: tsmod += mod + "/" if timestamp: @@ -161,9 +167,9 @@ class WbUrl: else: tsmod = timestamp + mod if len(tsmod) > 0: - return "/" + tsmod + "/" + url + return tsmod + "/" + url else: - return "/" + url + return url def __str__(self): return self.to_str()