From be3c3c877876f9ab9f475e07e6af9ba3e4321e5b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 9 May 2018 12:00:03 -0700 Subject: [PATCH] seq handler: option to filter out bad responses (not 1xx, 2xx or 3xx) and proceed to next handler. If at last handler, return error response. dockerfile: remove volume, port from base image. error page: add cors headers for error page --- Dockerfile | 4 ++-- pywb/apps/rewriterapp.py | 4 +++- pywb/apps/wbrequestresponse.py | 13 +++++++++---- pywb/warcserver/basewarcserver.py | 7 ++++++- pywb/warcserver/handlers.py | 19 +++++++++++++++++-- pywb/warcserver/resource/responseloader.py | 5 +++++ 6 files changed, 42 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index e1122b3e..4d100ffb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,11 +19,11 @@ RUN python setup.py install RUN mkdir /webarchive COPY config.yaml /webarchive/ -VOLUME /webarchive +#VOLUME /webarchive WORKDIR /webarchive -EXPOSE 8080 +#EXPOSE 8080 CMD ["uwsgi", "/uwsgi/uwsgi.ini"] diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index cfa6e3e9..e7e86b03 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -467,7 +467,9 @@ class RewriterApp(object): def _not_found_response(self, environ, url): resp = self.not_found_view.render_to_string(environ, url=url) - return WbResponse.text_response(resp, status='404 Not Found', content_type='text/html') + return WbResponse.text_response(resp, status='404 Not Found', + content_type='text/html', + headers=[('Access-Control-Allow-Origin', '*')]) def _error_response(self, environ, msg='', details='', status='404 Not Found'): resp = self.error_view.render_to_string(environ, diff --git a/pywb/apps/wbrequestresponse.py b/pywb/apps/wbrequestresponse.py index 66aa248f..b2d7644f 100644 --- a/pywb/apps/wbrequestresponse.py +++ b/pywb/apps/wbrequestresponse.py @@ -43,11 +43,16 @@ class WbResponse(object): return WbResponse(status_headers, value=stream) @staticmethod - def text_response(text, 
status='200 OK', content_type='text/plain; charset=utf-8'): + def text_response(text, status='200 OK', content_type='text/plain; charset=utf-8', headers=None): encoded_text = text.encode('utf-8') - status_headers = StatusAndHeaders(status, - [('Content-Type', content_type), - ('Content-Length', str(len(encoded_text)))]) + + def_headers = [('Content-Type', content_type), + ('Content-Length', str(len(encoded_text)))] + + if headers: + def_headers += headers + + status_headers = StatusAndHeaders(status, def_headers) return WbResponse(status_headers, value=[encoded_text]) diff --git a/pywb/warcserver/basewarcserver.py b/pywb/warcserver/basewarcserver.py index a5082e29..5cfeea61 100644 --- a/pywb/warcserver/basewarcserver.py +++ b/pywb/warcserver/basewarcserver.py @@ -7,11 +7,13 @@ from werkzeug.exceptions import HTTPException import requests import traceback import json - +import logging import six JSON_CT = 'application/json; charset=utf-8' +logger = logging.getLogger('warcserver') + #============================================================================= class BaseWarcServer(object): @@ -19,6 +21,9 @@ class BaseWarcServer(object): self.route_dict = {} self.debug = kwargs.get('debug', False) + if self.debug: + logger.setLevel(logging.DEBUG) + self.url_map = Map() def list_routes(environ): diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index cf1bdf4a..5ab0e2d0 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -84,7 +84,7 @@ class IndexHandler(object): return None, None, errs cdx_iter, errs = self._load_index_source(params) - if not cdx_iter: + if not cdx_iter or errs: return None, None, errs content_type, res = handler(cdx_iter, fields) @@ -152,8 +152,9 @@ class DefaultResourceHandler(ResourceHandler): #============================================================================= class HandlerSeq(object): - def __init__(self, handlers): + def __init__(self, handlers, filter_errors=True): self.handlers = handlers 
+ self.filter_errors = filter_errors def get_supported_modes(self): if self.handlers: @@ -163,12 +164,26 @@ class HandlerSeq(object): def __call__(self, params): all_errs = {} + err_res = None + err_out_headers = None for handler in self.handlers: out_headers, res, errs = handler(params) + if out_headers and self.filter_errors: + status = out_headers.get('Warcserver-Status') + if status and not status.startswith(('1', '2', '3')): + errs = {'status_error': status} + err_res = res + err_out_headers = out_headers + res = None + all_errs.update(errs) + if res is not None: return out_headers, res, all_errs + if err_res and err_out_headers: + return err_out_headers, err_res, all_errs + return None, None, all_errs diff --git a/pywb/warcserver/resource/responseloader.py b/pywb/warcserver/resource/responseloader.py index ea64e081..82b1a8de 100644 --- a/pywb/warcserver/resource/responseloader.py +++ b/pywb/warcserver/resource/responseloader.py @@ -56,6 +56,9 @@ class BaseLoader(object): out_headers['Warcserver-Cdx'] = to_native_str(cdx.to_cdxj().rstrip()) out_headers['Warcserver-Source-Coll'] = to_native_str(source) + status = cdx.get('status') + if status: + out_headers['Warcserver-Status'] = str(status) if not warc_headers: if other_headers: @@ -318,6 +321,8 @@ class LiveWebLoader(BaseLoader): status=upstream_res.status, reason=upstream_res.reason) + cdx['status'] = upstream_res.status + http_headers_buff = status orig_resp = upstream_res._original_response