diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 7f885f66..4edaa9c3 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -19,10 +19,4 @@ class ArchivalRequestRouter: handler, wbrequest = self.parse_request(env) return handler.run(wbrequest) - def handle_exception(self, env, exc): - return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request') - - def handle_not_found(self, env): - return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found') - diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 4ad8acb5..17489a3b 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -1,5 +1,6 @@ import urllib import urllib2 +import wbexceptions class RemoteCDXServer: """ @@ -27,17 +28,23 @@ class RemoteCDXServer: params.update(**kwvalues) urlparams = urllib.urlencode(params) - request = urllib2.Request(self.serverUrl, urlparams) - response = urllib2.urlopen(request) + + try: + request = urllib2.Request(self.serverUrl, urlparams) + response = urllib2.urlopen(request) + except urllib2.HTTPError, e: + if e.code == 403: + exc_msg = e.read() + msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded' + raise wbexceptions.AccessException(msg) + else: + raise e if parse_cdx: return map(CDXCaptureResult, response) else: return response -class InvalidCDXException(Exception): - pass - class CDXCaptureResult: CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]] diff --git a/pywb/utils.py b/pywb/utils.py new file mode 100644 index 00000000..ee70be6a --- /dev/null +++ b/pywb/utils.py @@ -0,0 +1,9 @@ +import itertools + +def peek_iter(iterable): + try: + first = next(iterable) + except StopIteration: + return None + + return itertools.chain([first], iterable) diff --git a/pywb/wbapp.py b/pywb/wbapp.py index b5139585..687c41b1 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -3,11 +3,14 @@ from archiveurl import archiveurl from archivalrouter import ArchivalRequestRouter import indexreader import json +import wbexceptions +import utils class WBHandler: def run(self, wbrequest): wburl = archiveurl(wbrequest.wb_url) - return WbResponse.text_response(repr(wburl)) + wbrequest.parsed_url = wburl + return WbResponse.text_stream(str(vars(wburl))) class QueryHandler: def __init__(self): @@ -15,7 +18,6 @@ class QueryHandler: @staticmethod def get_query_params(wburl): - print wburl.type return { archiveurl.QUERY: @@ -37,23 +39,27 @@ class QueryHandler: def run(self, wbrequest): wburl = archiveurl(wbrequest.wb_url) + #wburl = wbresponse.body.parsed_url params = QueryHandler.get_query_params(wburl) - #parse_cdx = (wburl.mod == 'json') cdxlines = self.cdxserver.load(wburl.url, params) - return WbResponse.text_stream(cdxlines) + cdxlines = utils.peek_iter(cdxlines) + + if cdxlines is not None: + return WbResponse.text_stream(cdxlines) + + raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url) - #if parse_cdx: - # text = str("\n".join(map(str, cdxlines))) - # text = json.dumps(cdxlines, default=lambda o: o.__dict__) - #else: - # text = cdxlines ## =========== -parser = ArchivalRequestRouter({'/web/': QueryHandler()}, hostpaths = ['http://localhost:9090/']) +parser = ArchivalRequestRouter( + {'/t1/' : WBHandler(), + '/t2/' : QueryHandler() + }, + hostpaths = ['http://localhost:9090/']) ## =========== @@ -63,13 +69,26 @@ def application(env, start_response): try: response = parser.handle_request(env) + if not response: + raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found') + except Exception as e: last_exc = e import traceback traceback.print_exc() - response = parser.handle_exception(env, e) - - if not response: - response = parser.handle_not_found(env) + response = handle_exception(env, e) return response(env, start_response) + +def handle_exception(env, exc): + if hasattr(exc, 'status'): + status = exc.status() + else: + status = '400 Bad Request' + + return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) + +#def handle_not_found(env): +# return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found') + + diff --git a/pywb/wbexceptions.py b/pywb/wbexceptions.py index 11c83c1b..e0724fa2 100644 --- a/pywb/wbexceptions.py +++ b/pywb/wbexceptions.py @@ -1,8 +1,20 @@ class RequestParseException(Exception): - pass + def status(_): + return '400' class BadUrlException(Exception): - pass + def status(_): + return '400' +class AccessException(Exception): + def status(_): + return '403' +class InvalidCDXException(Exception): + def status(_): + return '500' + +class NotFoundException(Exception): + def status(_): + return '404' diff --git a/pywb/wbrequestresponse.py b/pywb/wbrequestresponse.py index d9189dd8..3e914aeb 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/wbrequestresponse.py @@ -1,4 +1,3 @@ - #WB Request and Response class WbRequest: @@ -57,8 +56,8 @@ class WbResponse: return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')]) @staticmethod - def redir_response(location): - return WbResponse('302 Redirect', headersList = [('Location', location)]) + def redir_response(location, status = '302 Redirect'): + return WbResponse(status, headersList = [('Location', location)]) def get_header(self, name): name_upp = name.upper() @@ -72,7 +71,12 @@ class WbResponse: # headersList.append((key, value)) start_response(self.status, self.headersList) - return self.body + + if hasattr(self.body, '__iter__'): + return self.body + else: + return [str(self.body)] + def __repr__(self): return str(vars(self))