diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 4edaa9c3..1e3afe57 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -1,22 +1,122 @@ -from refer_redirect import ReferRedirect -from wbrequestresponse import WbRequest, WbResponse +import urlparse +from wbrequestresponse import WbRequest, WbResponse +from wburlrewriter import ArchivalUrlRewriter + +#================================================================= +# ArchivalRequestRouter -- route WB requests in archival mode +#================================================================= class ArchivalRequestRouter: - def __init__(self, mappings, hostpaths=None): + def __init__(self, mappings, hostpaths = None, abs_path = True): self.mappings = mappings self.fallback = ReferRedirect(hostpaths) + self.abs_path = abs_path - def parse_request(self, env): + def _parseRequest(self, env): request_uri = env['REQUEST_URI'] - for key, value in self.mappings.iteritems(): - if request_uri.startswith(key): - return value, WbRequest.prefix_request(env, key, request_uri) + for coll, handler in self.mappings.iteritems(): + rel_prefix = '/' + coll + '/' + if request_uri.startswith(rel_prefix): + #return value, ArchivalRequestRouter._prefix_request(env, key, request_uri) + req = WbRequest(env, + request_uri = request_uri, + coll = coll, + wb_url = request_uri[len(coll) + 1:], + wb_prefix = self.getPrefix(env, rel_prefix)) + + return handler, req return self.fallback, WbRequest(env) - def handle_request(self, env): - handler, wbrequest = self.parse_request(env) + def handleRequest(self, env): + handler, wbrequest = self._parseRequest(env) return handler.run(wbrequest) + def getPrefix(self, env, rel_prefix): + if self.abs_path: + try: + return env['wsgi.url_scheme'] + '://' + env['HTTP_HOST'] + rel_prefix + except KeyError: + return rel_prefix + else: + return rel_prefix + + +#================================================================= +# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings +#================================================================= +class ReferRedirect: + + """ + >>> ReferRedirect('http://localhost:8080/').matchPrefixs + ['http://localhost:8080/'] + + >>> ReferRedirect(['http://example:9090/']).matchPrefixs + ['http://example:9090/'] + + >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') + 'http://localhost:8080/coll/20131010/http://example.com/path/other.html' + + >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') + 'http://localhost:8080/coll/20131010/http://example.com/other.html' + + >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') + 'http://localhost:8080/coll/20131010/http://example.com/other.html' + + >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') + False + """ + + def __init__(self, matchPrefixs): + if isinstance(matchPrefixs, list): + self.matchPrefixs = matchPrefixs + else: + self.matchPrefixs = [matchPrefixs] + + + def run(self, wbrequest): + if wbrequest.referrer is None: + return None + + if not any (wbrequest.referrer.startswith(i) for i in self.matchPrefixs): + return None + + try: + ref_split = urlparse.urlsplit(wbrequest.referrer) + ref_path = ref_split.path[1:].split('/', 1) + + rewriter = ArchivalUrlRewriter('/' + ref_path[1], '/' + ref_path[0]) + + rel_request_uri = wbrequest.request_uri[1:] + + #ref_wb_url = archiveurl('/' + ref_path[1]) + #ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:]) + #ref_wb_url.url = ref_wb_url.url.replace('../', '') + + #final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', '')) + final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', '')) + + except Exception as e: + raise e + + return WbResponse.redir_response(final_url) + +if __name__ == "__main__": + import doctest + + def test_redir(matchHost, request_uri, referrer): + env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer} + + redir = ReferRedirect(matchHost) + req = WbRequest.parse(env) + rep = redir.run(req) + if not rep: + return False + + return rep.get_header('Location') + + + doctest.testmod() + diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 17489a3b..1a4f8709 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -2,6 +2,8 @@ import urllib import urllib2 import wbexceptions +from wbarchivalurl import ArchivalUrl + class RemoteCDXServer: """ >>> x = cdxserver.load('example.com', parse_cdx = True, limit = '2') @@ -45,6 +47,27 @@ class RemoteCDXServer: else: return response + @staticmethod + def getQueryParams(wburl): + return { + + ArchivalUrl.QUERY: + {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'}, + + ArchivalUrl.URL_QUERY: + {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100', + 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', + }, + + ArchivalUrl.REPLAY: + {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True}, + + ArchivalUrl.LATEST_REPLAY: + {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True} + + }[wburl.type] + + class CDXCaptureResult: CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]] diff --git a/pywb/refer_redirect.py b/pywb/refer_redirect.py deleted file mode 100644 index 59f08054..00000000 --- a/pywb/refer_redirect.py +++ /dev/null @@ -1,76 +0,0 @@ -import urlparse -from wbrequestresponse import WbRequest, WbResponse -from archiveurl import archiveurl - - -# Redirect urls that have 'fallen through' based on the referrer -# settings -class ReferRedirect: - - """ - >>> ReferRedirect('http://localhost:8080/').matchPrefixs - ['http://localhost:8080/'] - - >>> ReferRedirect(['http://example:9090/']).matchPrefixs - ['http://example:9090/'] - - >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') - 'http://localhost:8080/coll/20131010/http://example.com/path/other.html' - - >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') - 'http://localhost:8080/coll/20131010/http://example.com/other.html' - - >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html') - 'http://localhost:8080/coll/20131010/http://example.com/other.html' - - >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html') - False - """ - - def __init__(self, matchPrefixs): - if isinstance(matchPrefixs, list): - self.matchPrefixs = matchPrefixs - else: - self.matchPrefixs = [matchPrefixs] - - def run(self, wbrequest): - if wbrequest.referrer is None: - return None - - if not any (wbrequest.referrer.startswith(i) for i in self.matchPrefixs): - return None - - try: - ref_split = urlparse.urlsplit(wbrequest.referrer) - ref_path = ref_split.path[1:].split('/', 1) - - ref_wb_url = archiveurl('/' + ref_path[1]) - - ref_wb_url.url = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:]) - ref_wb_url.url = ref_wb_url.url.replace('../', '') - - final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', '')) - - except Exception as e: - return None - - return WbResponse.redir_response(final_url) - -if __name__ == "__main__": - import doctest - - def test_redir(matchHost, request_uri, referrer): - env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer} - - redir = ReferRedirect(matchHost) - req = WbRequest(env) - rep = redir.run(req) - if not rep: - return False - - return rep.get_header('Location') - - - doctest.testmod() - - diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 687c41b1..125a820d 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -1,47 +1,28 @@ -from wbrequestresponse import WbResponse -from archiveurl import archiveurl -from archivalrouter import ArchivalRequestRouter import indexreader import json import wbexceptions import utils +from wbrequestresponse import WbResponse +from archivalrouter import ArchivalRequestRouter + +class EchoEnv: + def run(self, wbrequest): + return WbResponse.text_response(str(wbrequest.env)) + class WBHandler: def run(self, wbrequest): - wburl = archiveurl(wbrequest.wb_url) - wbrequest.parsed_url = wburl - return WbResponse.text_stream(str(vars(wburl))) + return WbResponse.text_response(str(wbrequest)) class QueryHandler: def __init__(self): self.cdxserver = indexreader.RemoteCDXServer('http://web.archive.org/cdx/search/cdx') - @staticmethod - def get_query_params(wburl): - return { - - archiveurl.QUERY: - {'collapseTime': '10', 'filter': '!statuscode:(500|502|504)', 'limit': '150000'}, - - archiveurl.URL_QUERY: - {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': '100', - 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', - }, - - archiveurl.REPLAY: - {'sort': 'closest', 'filter': '!statuscode:(500|502|504)', 'limit': '10', 'closest': wburl.timestamp, 'resolveRevisits': True}, - - archiveurl.LATEST_REPLAY: - {'sort': 'reverse', 'filter': 'statuscode:[23]..', 'limit': '1', 'resolveRevisits': True} - - }[wburl.type] - def run(self, wbrequest): - wburl = archiveurl(wbrequest.wb_url) - #wburl = wbresponse.body.parsed_url + wburl = wbrequest.wb_url - params = QueryHandler.get_query_params(wburl) + params = self.cdxserver.getQueryParams(wburl) cdxlines = self.cdxserver.load(wburl.url, params) @@ -56,8 +37,10 @@ class QueryHandler: ## =========== parser = ArchivalRequestRouter( - {'/t1/' : WBHandler(), - '/t2/' : QueryHandler() + { + 't0' : EchoEnv(), + 't1' : WBHandler(), + 't2' : QueryHandler() }, hostpaths = ['http://localhost:9090/']) ## =========== @@ -67,7 +50,7 @@ def application(env, start_response): response = None try: - response = parser.handle_request(env) + response = parser.handleRequest(env) if not response: raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found') @@ -76,11 +59,11 @@ def application(env, start_response): last_exc = e import traceback traceback.print_exc() - response = handle_exception(env, e) + response = handleException(env, e) return response(env, start_response) -def handle_exception(env, exc): +def handleException(env, exc): if hasattr(exc, 'status'): status = exc.status() else: diff --git a/pywb/archiveurl.py b/pywb/wbarchivalurl.py similarity index 74% rename from pywb/archiveurl.py rename to pywb/wbarchivalurl.py index 5cdd1fe9..85bb4cbe 100644 --- a/pywb/archiveurl.py +++ b/pywb/wbarchivalurl.py @@ -5,51 +5,51 @@ import rfc3987 import wbexceptions -# archiveurl : archivalurl representation for WB +# ArchivalUrl : archivalurl representation for WB -class archiveurl: +class ArchivalUrl: """ # Replay Urls # ====================== - >>> repr(archiveurl('/20131010000506/example.com')) + >>> repr(ArchivalUrl('/20131010000506/example.com')) "('replay', '20131010000506', '', 'http://example.com', '/20131010000506/http://example.com')" - >>> repr(archiveurl('/20130102im_/https://example.com')) + >>> repr(ArchivalUrl('/20130102im_/https://example.com')) "('replay', '20130102', 'im_', 'https://example.com', '/20130102im_/https://example.com')" - >>> repr(archiveurl('/cs_/example.com')) + >>> repr(ArchivalUrl('/cs_/example.com')) "('latest_replay', '', 'cs_', 'http://example.com', '/cs_/http://example.com')" - >>> repr(archiveurl('/https://example.com/xyz')) + >>> repr(ArchivalUrl('/https://example.com/xyz')) "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')" # Query Urls # ====================== - >>> repr(archiveurl('/*/http://example.com/abc?def=a')) + >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a')) "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')" - >>> repr(archiveurl('/*/http://example.com/abc?def=a*')) + >>> repr(ArchivalUrl('/*/http://example.com/abc?def=a*')) "('url_query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a*')" - >>> repr(archiveurl('/json/*/http://example.com/abc?def=a')) + >>> repr(ArchivalUrl('/json/*/http://example.com/abc?def=a')) "('query', '', 'json', 'http://example.com/abc?def=a', '/json/*/http://example.com/abc?def=a')" - >>> repr(archiveurl('/timemap-link/2011*/http://example.com/abc?def=a')) + >>> repr(ArchivalUrl('/timemap-link/2011*/http://example.com/abc?def=a')) "('query', '2011', 'timemap-link', 'http://example.com/abc?def=a', '/timemap-link/2011*/http://example.com/abc?def=a')" # Error Urls # ====================== - >>> x = archiveurl('abc') + >>> x = ArchivalUrl('abc') Traceback (most recent call last): RequestParseException: Invalid WB Request Url: abc - >>> x = archiveurl('/#$%#/') + >>> x = ArchivalUrl('/#$%#/') Traceback (most recent call last): BadUrlException: Bad Request Url: http://#$%#/ - >>> x = archiveurl('/http://example.com:abc/') + >>> x = ArchivalUrl('/http://example.com:abc/') Traceback (most recent call last): BadUrlException: Bad Request Url: http://example.com:abc/ """ @@ -75,14 +75,14 @@ class archiveurl: self.timestamp = '' self.mod = '' - if not any (f(self, url) for f in [archiveurl._init_query, archiveurl._init_replay]): + if not any (f(self, url) for f in [ArchivalUrl._init_query, ArchivalUrl._init_replay]): raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) if len(self.url) == 0: raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) if not self.url.startswith('//') and not '://' in self.url: - self.url = archiveurl.DEFAULT_SCHEME + self.url + self.url = ArchivalUrl.DEFAULT_SCHEME + self.url matcher = rfc3987.match(self.url, 'IRI') @@ -92,7 +92,7 @@ class archiveurl: # Match query regex # ====================== def _init_query(self, url): - query = archiveurl.QUERY_REGEX.match(url) + query = ArchivalUrl.QUERY_REGEX.match(url) if not query: return None @@ -102,16 +102,16 @@ class archiveurl: self.timestamp = res[1] self.url = res[2] if self.url.endswith('*'): - self.type = archiveurl.URL_QUERY + self.type = ArchivalUrl.URL_QUERY self.url = self.url[:-1] else: - self.type = archiveurl.QUERY + self.type = ArchivalUrl.QUERY return True # Match replay regex # ====================== def _init_replay(self, url): - replay = archiveurl.REPLAY_REGEX.match(url) + replay = ArchivalUrl.REPLAY_REGEX.match(url) if not replay: return None @@ -121,16 +121,16 @@ class archiveurl: self.mod = res[1] self.url = res[2] if self.timestamp: - self.type = archiveurl.REPLAY + self.type = ArchivalUrl.REPLAY else: - self.type = archiveurl.LATEST_REPLAY + self.type = ArchivalUrl.LATEST_REPLAY return True # Str Representation # ==================== def __str__(self): - if self.type == archiveurl.QUERY or self.type == archiveurl.URL_QUERY: + if self.type == ArchivalUrl.QUERY or self.type == ArchivalUrl.URL_QUERY: tsmod = "/" if self.mod: tsmod += self.mod + "/" @@ -138,7 +138,7 @@ class archiveurl: tsmod += self.timestamp tsmod += "*/" + self.url - if self.type == archiveurl.URL_QUERY: + if self.type == ArchivalUrl.URL_QUERY: tsmod += "*" return tsmod else: diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index 3e914aeb..51f81b63 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -1,32 +1,79 @@ +from wbarchivalurl import ArchivalUrl #WB Request and Response class WbRequest: """ - >>> WbRequest.prefix_request({'REQUEST_URI': '/save/_embed/example.com/?a=b'}, '/save/') - WbRequest(env, '/_embed/example.com/?a=b', 'save') + >>> WbRequest.parse({'REQUEST_URI': '/save/_embed/example.com/?a=b'}) + {'wb_url': ('latest_replay', '', '', 'http://_embed/example.com/?a=b', '/http://_embed/example.com/?a=b'), 'coll': 'save', 'wb_prefix': '/save/', 'request_uri': '/save/_embed/example.com/?a=b'} + + >>> WbRequest.parse({'REQUEST_URI': '/2345/20101024101112im_/example.com/?b=c'}) + {'wb_url': ('replay', '20101024101112', 'im_', 'http://example.com/?b=c', '/20101024101112im_/http://example.com/?b=c'), 'coll': '2345', 'wb_prefix': '/2345/', 'request_uri': '/2345/20101024101112im_/example.com/?b=c'} + + >>> WbRequest.parse({'REQUEST_URI': '/2010/example.com'}) + {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '2010', 'wb_prefix': '/2010/', 'request_uri': '/2010/example.com'} + + >>> WbRequest.parse({'REQUEST_URI': '../example.com'}) + {'wb_url': ('latest_replay', '', '', 'http://example.com', '/http://example.com'), 'coll': '', 'wb_prefix': '/', 'request_uri': '../example.com'} """ - def __init__(self, env, request_uri = '', wb_url = '', coll = ''): - self.env = env - - # if len(wb_url) == 0: - # wb_url = request_uri - - setattr(self, 'wb_url', wb_url) - setattr(self, 'coll', coll) - - setattr(self, 'request_uri', request_uri) - setattr(self, 'referrer', env.get('HTTP_REFERER')) - - @staticmethod - def prefix_request(env, prefix, request_uri = ''): + def parse(env, request_uri = ''): if not request_uri: request_uri = env.get('REQUEST_URI') - return WbRequest(env, request_uri, request_uri[len(prefix)-1:], coll = prefix[1:-1]) + + parts = request_uri.split('/', 2) + + # Has coll prefix + if len(parts) == 3: + wb_prefix = '/' + parts[1] + '/' + wb_url = '/' + parts[2] + coll = parts[1] + # No Coll Prefix + elif len(parts) == 2: + wb_prefix = '/' + wb_url = '/' + parts[1] + coll = '' + else: + wb_prefix = '/' + wb_url = parts[0] + coll = '' + + return WbRequest(env, request_uri, wb_prefix, wb_url, coll) + + def __init__(self, env, request_uri, wb_prefix, wb_url, coll): + self.env = env + + self.request_uri = request_uri if request_uri else env.get('REQUEST_URI') + + self.wb_prefix = wb_prefix + + self.wb_url = ArchivalUrl(wb_url) + + self.coll = coll + + self.referrer = env.get('HTTP_REFERER') + + self.is_ajax = self._is_ajax() + + + def _is_ajax(self): + value = self.env.get('HTTP_X_REQUESTED_WITH') + if not value: + return False + + if value.lower() == 'xmlhttprequest': + return True + + if self.referrer and ('ajaxpipe' in self.env.get('QUERY_STRING')): + return True + return False + def __repr__(self): - return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')" + #return "WbRequest(env, '" + (self.wb_url) + "', '" + (self.coll) + "')" + #return str(vars(self)) + varlist = vars(self) + return str({k: varlist[k] for k in ('request_uri', 'wb_prefix', 'wb_url', 'coll')}) class WbResponse: diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py new file mode 100644 index 00000000..8e587404 --- /dev/null +++ b/pywb/wburlrewriter.py @@ -0,0 +1,52 @@ +import copy +import urlparse + +from wbarchivalurl import ArchivalUrl + +class ArchivalUrlRewriter: + """ + >>> test_rewrite('other.html', '/20131010/http://example.com/path/page.html', 'https://web.archive.org/web/') + 'https://web.archive.org/web/20131010/http://example.com/path/other.html' + + >>> test_rewrite('./other.html', '/20130907*/http://example.com/path/page.html', '/coll/') + '/coll/20130907*/http://example.com/path/other.html' + + >>> test_rewrite('../other.html', '/20131112im_/http://example.com/path/page.html', '/coll/') + '/coll/20131112im_/http://example.com/other.html' + + >>> test_rewrite('../../other.html', '/*/http://example.com/index.html', 'localhost:8080/') + 'localhost:8080/*/http://example.com/other.html' + + >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '/') + '/2020/http://example.com/other.html' + + >>> test_rewrite('../../other.html', '/2020/http://example.com/index.html', '') + '/2020/http://example.com/other.html' + """ + + def __init__(self, wburl_str, prefix): + self.wburl_str = wburl_str + self.prefix = prefix + if self.prefix.endswith('/'): + self.prefix = self.prefix[:-1] + + def rewrite(self, rel_url): + if '../' in rel_url: + wburl = ArchivalUrl(self.wburl_str) + wburl.url = urlparse.urljoin(wburl.url, rel_url) + wburl.url = wburl.url.replace('../', '') + + final_url = self.prefix + str(wburl) + else: + final_url = urlparse.urljoin(self.prefix + self.wburl_str, rel_url) + + return final_url + +if __name__ == "__main__": + import doctest + + def test_rewrite(rel_url, base_url, prefix): + rewriter = ArchivalUrlRewriter(base_url, prefix) + return rewriter.rewrite(rel_url) + + doctest.testmod()