1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

wsgi wrapper reorg!

support pluggable wsgi apps
utils: BlockLoader() supports loading from package
exceptions: base WbException moved to utils
This commit is contained in:
Ilya Kreymer 2014-03-02 19:26:06 -08:00
parent 47271bbfab
commit f1acad53fc
19 changed files with 217 additions and 151 deletions

0
pywb/apps/__init__.py Normal file
View File

10
pywb/apps/wayback.py Normal file
View File

@ -0,0 +1,10 @@
from pywb.bootstrap.wsgi_wrappers import init_app, start_wsgi_server
from pywb.bootstrap.pywb_init import create_wb_router
#=================================================================
# init pywb app
#=================================================================
application = init_app(create_wb_router, load_yaml=True)
if __name__ == "__main__":
start_wsgi_server(application)

View File

@ -1,56 +0,0 @@
import logging
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.core.views import J2TemplateView, J2HtmlCapturesView
from pywb.core.handlers import WBHandler
from pywb.core.replay_views import ReplayView
#=================================================================
# Config Loading
#=================================================================
def load_template_file(file, desc = None, view_class = J2TemplateView):
if file:
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file)
return file
#=================================================================
def create_wb_handler(cdx_server, config, ds_rules_file=None):
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
paths = config.get('archive_paths')
resolving_loader = ResolvingLoader(paths=paths,
cdx_server=cdx_server,
record_loader=record_loader)
replayer = ReplayView(
content_loader = resolving_loader,
content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
buffer_response = config.get('buffer_response', True),
redir_to_exact = config.get('redir_to_exact', True),
reporter = config.get('reporter')
)
wb_handler = WBHandler(
cdx_server,
replayer,
html_view = load_template_file(config.get('query_html'), 'Captures Page', J2HtmlCapturesView),
search_view = load_template_file(config.get('search_html'), 'Search Page'),
)
return wb_handler

View File

@ -1,10 +1,20 @@
from pywb.core.handlers import CDXHandler, StaticHandler
from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.dispatch.archivalrouter import ArchivalRouter, Route from pywb.dispatch.archivalrouter import ArchivalRouter, Route
from pywb.dispatch.proxy import ProxyArchivalRouter from pywb.dispatch.proxy import ProxyArchivalRouter
from pywb.core.indexreader import IndexReader
import config_utils from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.core.indexreader import IndexReader
from pywb.core.views import J2TemplateView, J2HtmlCapturesView
from pywb.core.handlers import WBHandler
from pywb.core.replay_views import ReplayView
from pywb.core.handlers import CDXHandler, StaticHandler
from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.utils.loaders import BlockLoader
import os import os
import yaml import yaml
@ -27,6 +37,7 @@ DEFAULTS = {
'domain_specific_rules': 'rules.yaml', 'domain_specific_rules': 'rules.yaml',
} }
#=================================================================
class DictChain: class DictChain:
def __init__(self, *dicts): def __init__(self, *dicts):
self.dicts = dicts self.dicts = dicts
@ -40,9 +51,63 @@ class DictChain:
#================================================================= #=================================================================
## Reference non-YAML config def load_template_file(file, desc=None, view_class=J2TemplateView):
if file:
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file)
return file
#================================================================= #=================================================================
def pywb_config_manual(passed_config = {}): def create_wb_handler(cdx_server, config, ds_rules_file=None):
cookie_maker=config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
paths = config.get('archive_paths')
resolving_loader = ResolvingLoader(paths=paths,
cdx_server=cdx_server,
record_loader=record_loader)
head_insert_view = load_template_file(config.get('head_insert_html'),
'Head Insert')
replayer = ReplayView(
content_loader=resolving_loader,
content_rewriter=RewriteContent(ds_rules_file=ds_rules_file),
head_insert_view=head_insert_view,
buffer_response=config.get('buffer_response', True),
redir_to_exact=config.get('redir_to_exact', True),
reporter=config.get('reporter')
)
html_view = load_template_file(config.get('query_html'),
'Captures Page',
J2HtmlCapturesView)
search_view = load_template_file(config.get('search_html'),
'Search Page')
wb_handler = WBHandler(
cdx_server,
replayer,
html_view=html_view,
search_view=search_view,
)
return wb_handler
#=================================================================
def create_wb_router(passed_config = {}):
config = DictChain(passed_config, DEFAULTS) config = DictChain(passed_config, DEFAULTS)
@ -62,7 +127,7 @@ def pywb_config_manual(passed_config = {}):
ds_rules_file = route_config.get('domain_specific_rules', None) ds_rules_file = route_config.get('domain_specific_rules', None)
cdx_server = IndexReader(route_config, ds_rules_file) cdx_server = IndexReader(route_config, ds_rules_file)
wb_handler = config_utils.create_wb_handler( wb_handler = create_wb_handler(
cdx_server=cdx_server, cdx_server=cdx_server,
config=route_config, config=route_config,
ds_rules_file=ds_rules_file, ds_rules_file=ds_rules_file,
@ -107,24 +172,6 @@ def pywb_config_manual(passed_config = {}):
abs_path = config.get('absolute_paths', True), abs_path = config.get('absolute_paths', True),
home_view = config_utils.load_template_file(config.get('home_html'), 'Home Page'), home_view = load_template_file(config.get('home_html'), 'Home Page'),
error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page') error_view = load_template_file(config.get('error_html'), 'Error Page')
) )
#=================================================================
# YAML config loader
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
def pywb_config(config_file = None):
if not config_file:
config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)
with open(config_file) as fh:
config = yaml.load(fh)
return pywb_config_manual(config)

View File

@ -1,20 +1,19 @@
from pywb.core.wbexceptions import WbException, NotFoundException, InternalRedirect from pywb.utils.wbexception import WbException
from pywb.core.wbexceptions import NotFoundException, InternalRedirect
from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.cdx.cdxserver import CDXException from pywb.utils.loaders import BlockLoader
from pywb.utils.canonicalize import UrlCanonicalizeException
from pywb.warc.recordloader import ArchiveLoadFailed
import os import os
import importlib import importlib
import logging import logging
#================================================================= #=================================================================
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters # adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 # allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links # explained here:
# http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1): def rel_request_uri(environ, include_query=1):
""" """
Return the requested path, optionally including the query string Return the requested path, optionally including the query string
@ -35,9 +34,9 @@ def rel_request_uri(environ, include_query=1):
return url return url
#================================================================= #=================================================================
def create_wb_app(wb_router): def create_wb_app(wb_router):
# Top-level wsgi application # Top-level wsgi application
def application(env, start_response): def application(env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
@ -56,8 +55,7 @@ def create_wb_app(wb_router):
except InternalRedirect as ir: except InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except (WbException, CDXException, except WbException as e:
UrlCanonicalizeException, ArchiveLoadFailed) as e:
response = handle_exception(env, wb_router.error_view, e, False) response = handle_exception(env, wb_router.error_view, e, False)
except Exception as e: except Exception as e:
@ -69,6 +67,7 @@ def create_wb_app(wb_router):
return application return application
#=================================================================
def handle_exception(env, error_view, exc, print_trace): def handle_exception(env, error_view, exc, print_trace):
if hasattr(exc, 'status'): if hasattr(exc, 'status'):
status = exc.status() status = exc.status()
@ -85,44 +84,82 @@ def handle_exception(env, error_view, exc, print_trace):
if error_view: if error_view:
import traceback import traceback
return error_view.render_response(err_msg = str(exc), err_details = err_details, status = status) return error_view.render_response(err_msg=str(exc),
err_details=err_details,
status=status)
else: else:
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) return WbResponse.text_response(status + ' Error: ' + str(exc),
status=status)
#================================================================= #=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml' DEFAULT_CONFIG_FILE = 'config.yaml'
DEFAULT_INIT_MODULE = 'pywb.bootstrap.pywb_init' def load_yaml_config(config_file=None):
import yaml
if not config_file:
config_file = DEFAULT_CONFIG_FILE
configdata = BlockLoader().load(config_file)
config = yaml.load(configdata)
return config
#================================================================= #=================================================================
def main(): def init_app(init_func, load_yaml=True, config_file=None):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG)
logging.info('')
if load_yaml:
if not config_file:
config_file = os.environ.get('PYWB_CONFIG_FILE')
config = load_yaml_config(config_file)
try: try:
logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) if load_yaml:
wb_router = init_func(config)
# see if there's a custom init module else:
config_name = os.environ.get('PYWB_CONFIG_MODULE') wb_router = init_func()
except:
if not config_name: msg = '*** pywb app init FAILED config from "%s"!\n'
# use default module logging.exception(msg, init_func.__name__)
config_name = DEFAULT_INIT_MODULE
logging.info('Loading from default config module "{0}"'.format(config_name))
logging.info('')
module = importlib.import_module(config_name)
app = create_wb_app(module.pywb_config())
logging.info('')
logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name))
return app
except Exception:
logging.exception('*** pywb could not init with settings from {0}.pywb_config()!\n'.format(config_name))
raise raise
else:
msg = '*** pywb app inited with config from "%s"!\n'
logging.info(msg, init_func.__name__)
return create_wb_app(wb_router)
#================================================================= #=================================================================
if __name__ == "__main__": DEFAULT_PORT = 8080
pass
else: def start_wsgi_server(the_app):
application = main() from wsgiref.simple_server import make_server
from optparse import OptionParser
opt = OptionParser('%prog [OPTIONS]')
opt.add_option('-p', '--port', type='int', default=None)
options, args = opt.parse_args()
port = options.port
if port is None:
try:
config = load_default_config()
port = config.get('port', DEFAULT_PORT)
except:
port = DEFAULT_PORT
logging.debug('Starting CDX Server on port %s', port)
try:
httpd = make_server('', port, the_app)
httpd.serve_forever()
except KeyboardInterrupt as ex:
pass
logging.debug('Stopping CDX Server')

View File

@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from query import CDXQuery from query import CDXQuery
#================================================================= #=================================================================
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
""" """

View File

@ -4,9 +4,11 @@ import itertools
from urllib import urlencode from urllib import urlencode
from urlparse import parse_qs from urlparse import parse_qs
from pywb.utils.wbexception import WbException
#================================================================= #=================================================================
class CDXException(Exception): class CDXException(WbException):
def status(self): def status(self):
return '400 Bad Request' return '400 Bad Request'

View File

@ -33,6 +33,7 @@ def cdx_load(sources, query, perms_checker=None, process=True):
return cdx_iter return cdx_iter
#================================================================= #=================================================================
def restrict_cdx(cdx_iter, query, perms_checker): def restrict_cdx(cdx_iter, query, perms_checker):
""" """
@ -56,6 +57,7 @@ def restrict_cdx(cdx_iter, query, perms_checker):
yield cdx yield cdx
#================================================================= #=================================================================
def process_cdx(cdx_iter, query): def process_cdx(cdx_iter, query):
if query.resolve_revisits: if query.resolve_revisits:
@ -255,7 +257,6 @@ def cdx_resolve_revisits(cdx_iter):
originals = {} originals = {}
for cdx in cdx_iter: for cdx in cdx_iter:
is_revisit = cdx.is_revisit() is_revisit = cdx.is_revisit()
digest = cdx['digest'] digest = cdx['digest']

View File

@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer):
logging.warn('No CDX Sources configured from paths=%s', paths) logging.warn('No CDX Sources configured from paths=%s', paths)
def _add_cdx_source(self, source): def _add_cdx_source(self, source):
if source is None: return if source is None:
return
logging.debug('Adding CDX Source: %s', source) logging.debug('Adding CDX Source: %s', source)
self.sources.append(source) self.sources.append(source)
def add_cdx_source(self, source, config): def add_cdx_source(self, source, config):
if source is None: return if source is None:
return
if isinstance(source, CDXSource): if isinstance(source, CDXSource):
self._add_cdx_source(source) self._add_cdx_source(source)
elif isinstance(source, str): elif isinstance(source, str):
if os.path.isdir(source): if os.path.isdir(source):
for fn in os.listdir(source): for fn in os.listdir(source):
@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None):
surt_ordered=surt_ordered, surt_ordered=surt_ordered,
ds_rules_file=ds_rules_file, ds_rules_file=ds_rules_file,
perms_checker=perms_checker) perms_checker=perms_checker)

View File

@ -8,6 +8,7 @@ import urllib
import urllib2 import urllib2
import itertools import itertools
#================================================================= #=================================================================
class CDXSource(object): class CDXSource(object):
""" """
@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource):
if config: if config:
self.key_prefix = config.get('redis_key_prefix', self.key_prefix) self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, query): def load_cdx(self, query):
""" """
Load cdx from redis cache, from an ordered list Load cdx from redis cache, from an ordered list

View File

@ -1,8 +1,6 @@
from pywb.utils.wbexception import WbException
class WbException(Exception):
pass
class NotFoundException(WbException): class NotFoundException(WbException):
def status(self): def status(self):
return '404 Not Found' return '404 Not Found'

View File

@ -4,6 +4,9 @@
import surt import surt
import urlparse import urlparse
from wbexception import WbException
#================================================================= #=================================================================
class UrlCanonicalizer(object): class UrlCanonicalizer(object):
def __init__(self, surt_ordered=True): def __init__(self, surt_ordered=True):
@ -14,7 +17,7 @@ class UrlCanonicalizer(object):
#================================================================= #=================================================================
class UrlCanonicalizeException(Exception): class UrlCanonicalizeException(WbException):
def status(self): def status(self):
return '400 Bad Request' return '400 Bad Request'
@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
elif match_type == 'domain': elif match_type == 'domain':
if not surt_ordered: if not surt_ordered:
raise UrlCanonicalizeException('matchType=domain unsupported for non-surt') msg = 'matchType=domain unsupported for non-surt'
raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0] host = start_key.split(')/')[0]

View File

@ -7,6 +7,7 @@ import os
import hmac import hmac
import urllib2 import urllib2
import time import time
from pkg_resources import resource_stream
#================================================================= #=================================================================
@ -24,16 +25,16 @@ class BlockLoader(object):
def __init__(self, cookie_maker=None): def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker self.cookie_maker = cookie_maker
def load(self, url, offset, length): def load(self, url, offset=0, length=-1):
""" """
Determine loading method based on uri Determine loading method based on uri
""" """
if is_http(url): if is_http(url):
return self.load_http(url, offset, length) return self.load_http(url, offset, length)
else: else:
return self.load_file(url, offset, length) return self.load_file_or_resource(url, offset, length)
def load_file(self, url, offset, length): def load_file_or_resource(self, url, offset, length):
""" """
Load a file-like reader from the local file system Load a file-like reader from the local file system
""" """
@ -41,10 +42,18 @@ class BlockLoader(object):
if url.startswith('file://'): if url.startswith('file://'):
url = url[len('file://'):] url = url[len('file://'):]
afile = open(url, 'rb') try:
afile.seek(offset) # first, try as file
afile = open(url, 'rb')
except IOError as file_err:
# then, try as package.path/file
pkg_split = url.split('/', 1)
afile = resource_stream(pkg_split[0], pkg_split[1])
if length > 0: if offset > 0:
afile.seek(offset)
if length >= 0:
return LimitReader(afile, length) return LimitReader(afile, length)
else: else:
return afile return afile

View File

@ -171,7 +171,6 @@ def timestamp_to_datetime(string):
# pad to 6 digits # pad to 6 digits
string = _pad_timestamp(string, PAD_6) string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_): def clamp(val, min_, max_):
try: try:
val = int(val) val = int(val)

View File

@ -0,0 +1,3 @@
class WbException(Exception):
def status(self):
return '500 Internal Server Error'

View File

@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
#================================================================= #=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord', ArcWarcRecord = collections.namedtuple('ArchiveRecord',
'type, rec_headers, ' + 'type, rec_headers, ' +
@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord',
#================================================================= #=================================================================
class ArchiveLoadFailed(Exception): class ArchiveLoadFailed(WbException):
def __init__(self, reason, filename=''): def __init__(self, reason, filename=''):
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
#self.filename = filename #self.filename = filename
@ -62,9 +65,9 @@ class ArcWarcRecordLoader:
decomp_type = 'gzip' decomp_type = 'gzip'
# Create decompressing stream # Create decompressing stream
stream = DecompressingBufferedReader(stream = raw, stream = DecompressingBufferedReader(stream=raw,
decomp_type = decomp_type, decomp_type=decomp_type,
block_size = self.block_size) block_size=self.block_size)
(the_format, rec_headers) = self._detect_type_load_headers(stream) (the_format, rec_headers) = self._detect_type_load_headers(stream)

2
run.sh
View File

@ -10,7 +10,7 @@ mypath=$(cd `dirname $0` && pwd)
# ex: my_pywb.pywb_config() # ex: my_pywb.pywb_config()
#export 'PYWB_CONFIG=my_pywb' #export 'PYWB_CONFIG=my_pywb'
app="pywb.bootstrap.wbapp" app="pywb.apps.wayback"
params="--http-socket :8080 -b 65536" params="--http-socket :8080 -b 65536"
#params="--static-map /static=$mypath/static --http-socket :8080 -b 65536" #params="--static-map /static=$mypath/static --http-socket :8080 -b 65536"

View File

@ -22,6 +22,7 @@ setup(
'pywb.core', 'pywb.core',
'pywb.dispatch', 'pywb.dispatch',
'pywb.bootstrap' 'pywb.bootstrap'
'pywb.apps'
], ],
package_data={ package_data={
'pywb': ['ui/*', 'static/*', '*.yaml'], 'pywb': ['ui/*', 'static/*', '*.yaml'],
@ -41,7 +42,6 @@ setup(
'pyyaml', 'pyyaml',
'WebTest', 'WebTest',
'pytest', 'pytest',
'werkzeug>=0.9.4',
], ],
# tests_require=['WebTest', 'pytest'], # tests_require=['WebTest', 'pytest'],
zip_safe=False zip_safe=False

View File

@ -1,6 +1,6 @@
import webtest import webtest
from pywb.bootstrap.pywb_init import pywb_config from pywb.bootstrap.pywb_init import create_wb_router
from pywb.bootstrap.wbapp import create_wb_app from pywb.bootstrap.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
from fixture import TestExclusionPerms from fixture import TestExclusionPerms
@ -11,8 +11,13 @@ class TestWb:
def setup(self): def setup(self):
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
# save it in self - useful for debugging # save it in self - useful for debugging
self.router = pywb_config(self.TEST_CONFIG) self.app = init_app(create_wb_router,
self.app = create_wb_app(self.router) load_yaml=True,
config_file=self.TEST_CONFIG)
#self.router = pywb_config(self.TEST_CONFIG)
#self.app = create_wb_app(self.router)
self.testapp = webtest.TestApp(self.app) self.testapp = webtest.TestApp(self.app)
def _assert_basic_html(self, resp): def _assert_basic_html(self, resp):