diff --git a/pywb/__init__.py b/pywb/__init__.py new file mode 100644 index 00000000..769c3cc7 --- /dev/null +++ b/pywb/__init__.py @@ -0,0 +1,2 @@ +#Allow importing + diff --git a/pywb/aurl.py b/pywb/aurl.py index 36314351..f2b863ee 100644 --- a/pywb/aurl.py +++ b/pywb/aurl.py @@ -3,26 +3,31 @@ import re import rfc3987 +import wbexceptions + # aurl : ArchivalUrl representation for WB class aurl: """ # Replay Urls # ====================== - >>> print_test(aurl('/20131010000506/example.com')) - ('replay', '20131010000506', None, 'example.com') + >>> repr(aurl('/20131010000506/example.com')) + "('replay', '20131010000506', '', 'example.com', '/20131010000506/example.com')" - >>> print_test(aurl('/20130102im_/example.com')) - ('replay', '20130102', 'im_', 'example.com') + >>> repr(aurl('/20130102im_/example.com')) + "('replay', '20130102', 'im_', 'example.com', '/20130102im_/example.com')" - >>> print_test(aurl('/https://example.com/xyz')) - ('latest_replay', None, None, 'https://example.com/xyz') + >>> repr(aurl('/cs_/example.com')) + "('latest_replay', '', 'cs_', 'example.com', '/cs_/example.com')" + + >>> repr(aurl('/https://example.com/xyz')) + "('latest_replay', '', '', 'https://example.com/xyz', '/https://example.com/xyz')" # Query Urls # ====================== - >>> print_test(aurl('/*/http://example.com/abc?def=a')) - ('query', None, None, 'http://example.com/abc?def=a') + >>> repr(aurl('/*/http://example.com/abc?def=a')) + "('query', '', '', 'http://example.com/abc?def=a', '/*/http://example.com/abc?def=a')" # Error Urls @@ -33,34 +38,34 @@ class aurl: >>> x = aurl('/#$%#/') Traceback (most recent call last): - RequestParseException: Bad Request Url: #$%#/ + BadUrlException: Bad Request Url: #$%#/ >>> x = aurl('/http://example.com:abc/') Traceback (most recent call last): - RequestParseException: Bad Request Url: http://example.com:abc/ + BadUrlException: Bad Request Url: http://example.com:abc/ """ # Regexs # ====================== QUERY_REGEX = 
re.compile('^/(\d{1,14})?\*/(.*)$') - REPLAY_REGEX = re.compile('^(/(\d{1,14})([a-z]{2}_)?)?/(.*)$') + REPLAY_REGEX = re.compile('^/(\d{1,14})?([a-z]{2}_)?/?(.*)$') # ====================== def __init__(self, url): self.original_url = url self.type = None - self.url = None - self.timestamp = None - self.mod = None + self.url = '' + self.timestamp = '' + self.mod = '' if not any (f(self, url) for f in [aurl._init_query, aurl._init_replay]): - raise RequestParseException('Invalid WB Request Url: ' + url) + raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) matcher = rfc3987.match(self.url, 'IRI_reference') if not matcher: - raise RequestParseException('Bad Request Url: ' + self.url) + raise wbexceptions.BadUrlException('Bad Request Url: ' + self.url) # Match query regex # ====================== @@ -69,8 +74,10 @@ class aurl: if not query: return None - self.timestamp = query.group(1) - self.url = query.group(2) + res = query.groups('') + + self.timestamp = res[0] + self.url = res[1] self.type = 'query' return True @@ -81,9 +88,11 @@ class aurl: if not replay: return None - self.timestamp = replay.group(2) - self.mod = replay.group(3) - self.url = replay.group(4) + res = replay.groups('') + + self.timestamp = res[0] + self.mod = res[1] + self.url = res[2] if self.timestamp: self.type = 'replay' else: @@ -91,16 +100,25 @@ class aurl: return True + # Str Representation + # ==================== + def __str__(self): + if self.type == 'query': + return "/*/" + self.url + else: + tsmod = self.timestamp + self.mod + if len(tsmod) > 0: + return "/" + tsmod + "/" + self.url + else: + return "/" + self.url -class RequestParseException(Exception): - pass - - + def __repr__(self): + return str((self.type, self.timestamp, self.mod, self.url, str(self))) if __name__ == "__main__": import doctest - def print_test(self): - return self.type, self.timestamp, self.mod, self.url + #def print_test(self): + # return self.type, self.timestamp, self.mod, self.url, 
# Redirect urls that have 'fallen through' based on the referrer
# settings
class ReferRedirect:

    """
    Fallback handler for requests that matched no collection prefix.

    When such a request carries a referrer that starts with one of the
    configured prefixes, the archival url embedded in the referrer is used
    as the base for resolving the request uri, and a redirect to the
    resulting archival url is returned.

    >>> ReferRedirect('http://localhost:8080/').matchPrefixs
    ['http://localhost:8080/']

    >>> ReferRedirect(['http://example:9090/']).matchPrefixs
    ['http://example:9090/']

    >>> ReferRedirect(None).matchPrefixs
    []

    >>> test_redir('http://localhost:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/path/other.html'

    >>> test_redir('http://localhost:8080/', '/../other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://localhost:8080/', '/../../other.html', 'http://localhost:8080/coll/20131010/http://example.com/index.html')
    'http://localhost:8080/coll/20131010/http://example.com/other.html'

    >>> test_redir('http://example:8080/', '/other.html', 'http://localhost:8080/coll/20131010/http://example.com/path/page.html')
    False
    """

    def __init__(self, matchPrefixs):
        """
        Store the referrer prefixes that are eligible for redirection.

        matchPrefixs may be a single prefix, a list or tuple of prefixes,
        or None. None means 'no prefixes configured', so run() never
        redirects; previously it was stored as [None] and made run() raise
        TypeError on the first request that carried a referrer.
        """
        if matchPrefixs is None:
            self.matchPrefixs = []
        elif isinstance(matchPrefixs, list):
            self.matchPrefixs = matchPrefixs
        elif isinstance(matchPrefixs, tuple):
            self.matchPrefixs = list(matchPrefixs)
        else:
            self.matchPrefixs = [matchPrefixs]

    @staticmethod
    def _strip_parent_refs(url):
        """
        Return url with every '..' segment removed from its path.

        Only the path component is touched, so a query string or a segment
        that merely ends in '..' (e.g. 'a../b') is left intact, unlike a
        blanket replace('../', '') over the whole url.
        """
        parts = urlparse.urlsplit(url)
        path = '/'.join(seg for seg in parts.path.split('/') if seg != '..')
        return urlparse.urlunsplit((parts.scheme, parts.netloc, path,
                                    parts.query, parts.fragment))

    def run(self, wbrequest):
        """
        Build a redirect response for wbrequest, or return None.

        None is returned when the request has no referrer, when the
        referrer matches none of the configured prefixes, or when no
        archival url can be derived from the referrer.
        """
        referrer = wbrequest.referrer
        if referrer is None:
            return None

        # str.startswith accepts a tuple of candidates; an empty tuple
        # never matches, so an unconfigured instance simply declines.
        if not referrer.startswith(tuple(self.matchPrefixs)):
            return None

        try:
            ref_split = urlparse.urlsplit(referrer)

            # '/coll/20131010/http://...' -> ['coll', '20131010/http://...']
            ref_path = ref_split.path[1:].split('/', 1)
            if len(ref_path) < 2:
                # the referrer has no archival url after the first segment
                return None

            ref_wb_url = aurl.aurl('/' + ref_path[1])

            # Resolve the requested path against the url that was being
            # replayed; any '..' segments left over from climbing above
            # the root are then dropped.
            joined = urlparse.urljoin(ref_wb_url.url, wbrequest.request_uri[1:])
            ref_wb_url.url = self._strip_parent_refs(joined)

            final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, ref_path[0] + str(ref_wb_url), '', ''))

        except Exception:
            # Deliberate best-effort: this is a fallback handler, so a
            # referrer that cannot be parsed as an archival url means
            # 'no redirect' rather than an error response.
            return None

        return WbResponse.redir_response(final_url)

if __name__ == "__main__":
    import doctest

    def test_redir(matchHost, request_uri, referrer):
        env = {'REQUEST_URI': request_uri, 'HTTP_REFERER': referrer}

        redir = ReferRedirect(matchHost)
        req = WbRequest(env)
        rep = redir.run(req)
        if not rep:
            return False

        return rep.get_header('Location')


    doctest.testmod()
# WbRequest
class WbRequest:
    """
    Per-request view over the WSGI environ.

    Attributes (each is None when the corresponding key is absent):
      wb_url      -- env['WB_URL']
      coll        -- env['WB_COLL']
      request_uri -- env['REQUEST_URI']
      referrer    -- env['HTTP_REFERER']
    """

    def __init__(self, env):
        self.env = env
        self.wb_url = env.get('WB_URL')
        self.coll = env.get('WB_COLL')

        self.request_uri = env.get('REQUEST_URI')
        self.referrer = env.get('HTTP_REFERER')

    def __repr__(self):
        # %-formatting instead of '+' so that a request with no WB_COLL or
        # WB_URL (both None) can still be printed instead of raising
        # TypeError.
        return '%s %s' % (self.coll, self.wb_url)


class WbResponse:
    """
    Minimal response object that is itself callable as a WSGI response.

    status      -- HTTP status line, e.g. '200 OK'
    body        -- iterable of body chunks returned from __call__
    headersList -- list of (name, value) header tuples
    """

    def __init__(self, status, value = None, headersList = None):
        """
        value and headersList default to a fresh empty list per instance.

        They used to default to shared [] literals, so mutating one
        response's body or headers leaked into every other response that
        was created with the defaults.
        """
        self.status = status
        self.body = [] if value is None else value
        self.headersList = [] if headersList is None else headersList

    @staticmethod
    def text_response(text, status = '200 OK'):
        """Return a text/plain response whose body is the single chunk text."""
        return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])

    @staticmethod
    def redir_response(location):
        """Return an empty-bodied redirect response pointing at location."""
        return WbResponse('302 Redirect', headersList = [('Location', location)])

    def get_header(self, name):
        """
        Return the value of the first header whose name matches name
        case-insensitively, or None when there is no such header.
        """
        name_upp = name.upper()
        for header_name, header_value in self.headersList:
            if header_name.upper() == name_upp:
                return header_value

        return None

    def __call__(self, env, start_response):
        """Report status and headers via start_response, then return the body."""
        start_response(self.status, self.headersList)
        return self.body