1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

better exception handling, specific status codes for exceptions,

detect access control and not found exceptions more consistently
This commit is contained in:
Ilya Kreymer 2013-12-19 12:06:47 -08:00
parent ebc76c0791
commit 0a2b16407d
6 changed files with 76 additions and 31 deletions

View File

@ -19,10 +19,4 @@ class ArchivalRequestRouter:
handler, wbrequest = self.parse_request(env) handler, wbrequest = self.parse_request(env)
return handler.run(wbrequest) return handler.run(wbrequest)
def handle_exception(self, env, exc):
return WbResponse.text_response('Error: ' + str(exc), status = '400 Bad Request')
def handle_not_found(self, env):
return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')

View File

@ -1,5 +1,6 @@
import urllib import urllib
import urllib2 import urllib2
import wbexceptions
class RemoteCDXServer: class RemoteCDXServer:
""" """
@ -27,17 +28,23 @@ class RemoteCDXServer:
params.update(**kwvalues) params.update(**kwvalues)
urlparams = urllib.urlencode(params) urlparams = urllib.urlencode(params)
request = urllib2.Request(self.serverUrl, urlparams)
response = urllib2.urlopen(request) try:
request = urllib2.Request(self.serverUrl, urlparams)
response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
if e.code == 403:
exc_msg = e.read()
msg = 'Blocked By Robots' if 'Blocked By Robots' in exc_msg else 'Excluded'
raise wbexceptions.AccessException(msg)
else:
raise e
if parse_cdx: if parse_cdx:
return map(CDXCaptureResult, response) return map(CDXCaptureResult, response)
else: else:
return response return response
class InvalidCDXException(Exception):
pass
class CDXCaptureResult: class CDXCaptureResult:
CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], CDX_FORMATS = [["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]] ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","offset","filename"]]

9
pywb/utils.py Normal file
View File

@ -0,0 +1,9 @@
import itertools
def peek_iter(iterable):
    """Return an iterator over *iterable*, or None if it yields nothing.

    The first item is pulled eagerly to find out whether the input is
    empty, then chained back in front of the remaining items so the
    caller still sees the complete sequence.

    iterable -- any iterable; plain sequences (lists, strings) are
                accepted as well as iterators/generators.

    Returns None when there are no items, otherwise an iterator that
    yields every item of the input in order.
    """
    # next() only works on iterators; a list (e.g. the result of map()
    # on Python 2) would raise TypeError without this normalisation.
    # iter() on something that is already an iterator returns it unchanged.
    iterator = iter(iterable)

    try:
        first = next(iterator)
    except StopIteration:
        return None

    return itertools.chain([first], iterator)

View File

@ -3,11 +3,14 @@ from archiveurl import archiveurl
from archivalrouter import ArchivalRequestRouter from archivalrouter import ArchivalRequestRouter
import indexreader import indexreader
import json import json
import wbexceptions
import utils
class WBHandler: class WBHandler:
def run(self, wbrequest): def run(self, wbrequest):
wburl = archiveurl(wbrequest.wb_url) wburl = archiveurl(wbrequest.wb_url)
return WbResponse.text_response(repr(wburl)) wbrequest.parsed_url = wburl
return WbResponse.text_stream(str(vars(wburl)))
class QueryHandler: class QueryHandler:
def __init__(self): def __init__(self):
@ -15,7 +18,6 @@ class QueryHandler:
@staticmethod @staticmethod
def get_query_params(wburl): def get_query_params(wburl):
print wburl.type
return { return {
archiveurl.QUERY: archiveurl.QUERY:
@ -37,23 +39,27 @@ class QueryHandler:
def run(self, wbrequest): def run(self, wbrequest):
wburl = archiveurl(wbrequest.wb_url) wburl = archiveurl(wbrequest.wb_url)
#wburl = wbresponse.body.parsed_url
params = QueryHandler.get_query_params(wburl) params = QueryHandler.get_query_params(wburl)
#parse_cdx = (wburl.mod == 'json')
cdxlines = self.cdxserver.load(wburl.url, params) cdxlines = self.cdxserver.load(wburl.url, params)
return WbResponse.text_stream(cdxlines) cdxlines = utils.peek_iter(cdxlines)
if cdxlines is not None:
return WbResponse.text_stream(cdxlines)
raise wbexceptions.NotFoundException('WB Does Not Have Url: ' + wburl.url)
#if parse_cdx:
# text = str("\n".join(map(str, cdxlines)))
# text = json.dumps(cdxlines, default=lambda o: o.__dict__)
#else:
# text = cdxlines
## =========== ## ===========
parser = ArchivalRequestRouter({'/web/': QueryHandler()}, hostpaths = ['http://localhost:9090/']) parser = ArchivalRequestRouter(
{'/t1/' : WBHandler(),
'/t2/' : QueryHandler()
},
hostpaths = ['http://localhost:9090/'])
## =========== ## ===========
@ -63,13 +69,26 @@ def application(env, start_response):
try: try:
response = parser.handle_request(env) response = parser.handle_request(env)
if not response:
raise wbexceptions.NotFoundException(env['REQUEST_URI'] + ' was not found')
except Exception as e: except Exception as e:
last_exc = e last_exc = e
import traceback import traceback
traceback.print_exc() traceback.print_exc()
response = parser.handle_exception(env, e) response = handle_exception(env, e)
if not response:
response = parser.handle_not_found(env)
return response(env, start_response) return response(env, start_response)
def handle_exception(env, exc):
    """Build a plain-text error response for an exception raised while
    handling a request.

    env -- the WSGI environ of the failed request (not used here).
    exc -- the exception; if it has a status() method, its return value
           is used as the HTTP status, otherwise '400 Bad Request'.

    Returns the WbResponse produced by WbResponse.text_response, whose
    body is '<status> Error: <str(exc)>'.
    """
    # Function-scope import: the reason-phrase table lives in httplib on
    # Python 2 and in http.client on Python 3.
    try:
        from httplib import responses
    except ImportError:
        from http.client import responses

    if hasattr(exc, 'status'):
        status = exc.status()
    else:
        status = '400 Bad Request'

    # WSGI expects a status line of the form '<code> <reason>'. Expand a
    # bare numeric code such as '404' into '404 Not Found'; leave codes
    # without a known reason phrase untouched.
    if status.isdigit():
        reason = responses.get(int(status))
        if reason:
            status = status + ' ' + reason

    return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
#def handle_not_found(env):
# return WbResponse.text_response('Not Found: ' + env['REQUEST_URI'], status = '404 Not Found')

View File

@ -1,8 +1,20 @@
class RequestParseException(Exception): class RequestParseException(Exception):
pass def status(_):
return '400'
class BadUrlException(Exception): class BadUrlException(Exception):
pass def status(_):
return '400'
class AccessException(Exception):
    """Raised when access to the requested content is blocked or excluded."""

    def status(self):
        """Return the HTTP status line reported for this error.

        The reason phrase is included because the value is handed to the
        WSGI start_response callable, which expects '<code> <reason>'.
        """
        return '403 Forbidden'
class InvalidCDXException(Exception):
    """Error reported to the client as an internal server error.

    NOTE(review): presumably raised for malformed CDX index data -- the
    raising site is not visible here; confirm against the CDX parser.
    """

    def status(self):
        """Return the HTTP status line reported for this error.

        The reason phrase is included because the value is handed to the
        WSGI start_response callable, which expects '<code> <reason>'.
        """
        return '500 Internal Server Error'
class NotFoundException(Exception):
    """Raised when the requested url or resource cannot be found."""

    def status(self):
        """Return the HTTP status line reported for this error.

        The reason phrase is included because the value is handed to the
        WSGI start_response callable, which expects '<code> <reason>'.
        """
        return '404 Not Found'

View File

@ -1,4 +1,3 @@
#WB Request and Response #WB Request and Response
class WbRequest: class WbRequest:
@ -57,8 +56,8 @@ class WbResponse:
return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')]) return WbResponse(status, value = [text], headersList = [('Content-Type', 'text/plain')])
@staticmethod @staticmethod
def redir_response(location): def redir_response(location, status = '302 Redirect'):
return WbResponse('302 Redirect', headersList = [('Location', location)]) return WbResponse(status, headersList = [('Location', location)])
def get_header(self, name): def get_header(self, name):
name_upp = name.upper() name_upp = name.upper()
@ -72,7 +71,12 @@ class WbResponse:
# headersList.append((key, value)) # headersList.append((key, value))
start_response(self.status, self.headersList) start_response(self.status, self.headersList)
return self.body
if hasattr(self.body, '__iter__'):
return self.body
else:
return [str(self.body)]
def __repr__(self): def __repr__(self):
return str(vars(self)) return str(vars(self))