1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

wsgi wrapper reorg!

support pluggable wsgi apps
utils: BlockLoader() supports loading from package
exceptions: base WbException moved to utils
This commit is contained in:
Ilya Kreymer 2014-03-02 19:26:06 -08:00
parent 47271bbfab
commit f1acad53fc
19 changed files with 217 additions and 151 deletions

0
pywb/apps/__init__.py Normal file
View File

10
pywb/apps/wayback.py Normal file
View File

@ -0,0 +1,10 @@
from pywb.bootstrap.wsgi_wrappers import init_app, start_wsgi_server
from pywb.bootstrap.pywb_init import create_wb_router
#=================================================================
# init pywb app
#=================================================================
# Module-level WSGI callable, created at import time by passing the
# router factory to init_app with YAML config loading switched on.
application = init_app(create_wb_router, load_yaml=True)

# When executed directly as a script, hand the app to start_wsgi_server.
if __name__ == "__main__":
    start_wsgi_server(application)

View File

@ -1,56 +0,0 @@
import logging
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.core.views import J2TemplateView, J2HtmlCapturesView
from pywb.core.handlers import WBHandler
from pywb.core.replay_views import ReplayView
#=================================================================
# Config Loading
#=================================================================
def load_template_file(file, desc=None, view_class=J2TemplateView):
    """
    Wrap a template path in a view object.

    :param file: template path; a falsy value (None, '') is returned
        unchanged and no view is created
    :param desc: optional label, used only in the debug log line
    :param view_class: callable invoked with the path to build the view
    :return: ``view_class(file)`` when file is set, otherwise file as passed
    """
    if not file:
        return file

    # desc is optional, so fall back to a generic label for the log line
    # (referencing any other name here would fail when desc is omitted).
    logging.debug('Adding {0}: {1}'.format(desc if desc else 'Template', file))
    return view_class(file)
#=================================================================
def create_wb_handler(cdx_server, config, ds_rules_file=None):
    """
    Assemble a WBHandler from per-route settings.

    :param cdx_server: passed to both ResolvingLoader and WBHandler
    :param config: object queried with ``.get()`` for each setting
    :param ds_rules_file: forwarded to RewriteContent
    :return: the constructed WBHandler
    """
    cookie_maker = config.get('cookie_maker')
    loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)

    archive_paths = config.get('archive_paths')
    content_loader = ResolvingLoader(paths=archive_paths,
                                     cdx_server=cdx_server,
                                     record_loader=loader)

    # built in the same order the arguments were originally evaluated
    rewriter = RewriteContent(ds_rules_file=ds_rules_file)
    head_insert = load_template_file(config.get('head_insert_html'),
                                     'Head Insert')

    replay_view = ReplayView(
        content_loader=content_loader,
        content_rewriter=rewriter,
        head_insert_view=head_insert,
        buffer_response=config.get('buffer_response', True),
        redir_to_exact=config.get('redir_to_exact', True),
        reporter=config.get('reporter'),
    )

    captures_view = load_template_file(config.get('query_html'),
                                       'Captures Page',
                                       J2HtmlCapturesView)
    search_page = load_template_file(config.get('search_html'),
                                     'Search Page')

    return WBHandler(cdx_server,
                     replay_view,
                     html_view=captures_view,
                     search_view=search_page)

View File

@ -1,10 +1,20 @@
from pywb.core.handlers import CDXHandler, StaticHandler
from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.dispatch.archivalrouter import ArchivalRouter, Route
from pywb.dispatch.proxy import ProxyArchivalRouter
from pywb.core.indexreader import IndexReader
import config_utils
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.core.indexreader import IndexReader
from pywb.core.views import J2TemplateView, J2HtmlCapturesView
from pywb.core.handlers import WBHandler
from pywb.core.replay_views import ReplayView
from pywb.core.handlers import CDXHandler, StaticHandler
from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.utils.loaders import BlockLoader
import os
import yaml
@ -27,6 +37,7 @@ DEFAULTS = {
'domain_specific_rules': 'rules.yaml',
}
#=================================================================
class DictChain:
def __init__(self, *dicts):
self.dicts = dicts
@ -40,9 +51,63 @@ class DictChain:
#=================================================================
## Reference non-YAML config
def load_template_file(file, desc=None, view_class=J2TemplateView):
    """
    Wrap a template path in a view object.

    :param file: template path; a falsy value (None, '') is returned
        unchanged and no view is created
    :param desc: optional label, used only in the debug log line
    :param view_class: callable invoked with the path to build the view
    :return: ``view_class(file)`` when file is set, otherwise file as passed
    """
    if not file:
        return file

    # desc is optional, so fall back to a generic label for the log line
    # (referencing any other name here would fail when desc is omitted).
    logging.debug('Adding {0}: {1}'.format(desc if desc else 'Template', file))
    return view_class(file)
#=================================================================
def pywb_config_manual(passed_config = {}):
def create_wb_handler(cdx_server, config, ds_rules_file=None):
    """
    Assemble a WBHandler from per-route settings.

    :param cdx_server: passed to both ResolvingLoader and WBHandler
    :param config: object queried with ``.get()`` for each setting
    :param ds_rules_file: forwarded to RewriteContent
    :return: the constructed WBHandler
    """
    record_loader = ArcWarcRecordLoader(
        cookie_maker=config.get('cookie_maker'))

    content_loader = ResolvingLoader(paths=config.get('archive_paths'),
                                     cdx_server=cdx_server,
                                     record_loader=record_loader)

    head_insert = load_template_file(config.get('head_insert_html'),
                                     'Head Insert')

    # keyword arguments for ReplayView, collected in evaluation order
    replay_options = {
        'content_loader': content_loader,
        'content_rewriter': RewriteContent(ds_rules_file=ds_rules_file),
        'head_insert_view': head_insert,
        'buffer_response': config.get('buffer_response', True),
        'redir_to_exact': config.get('redir_to_exact', True),
        'reporter': config.get('reporter'),
    }
    replay_view = ReplayView(**replay_options)

    captures_view = load_template_file(config.get('query_html'),
                                       'Captures Page',
                                       J2HtmlCapturesView)
    search_page = load_template_file(config.get('search_html'),
                                     'Search Page')

    return WBHandler(cdx_server,
                     replay_view,
                     html_view=captures_view,
                     search_view=search_page)
#=================================================================
def create_wb_router(passed_config = {}):
config = DictChain(passed_config, DEFAULTS)
@ -62,7 +127,7 @@ def pywb_config_manual(passed_config = {}):
ds_rules_file = route_config.get('domain_specific_rules', None)
cdx_server = IndexReader(route_config, ds_rules_file)
wb_handler = config_utils.create_wb_handler(
wb_handler = create_wb_handler(
cdx_server=cdx_server,
config=route_config,
ds_rules_file=ds_rules_file,
@ -107,24 +172,6 @@ def pywb_config_manual(passed_config = {}):
abs_path = config.get('absolute_paths', True),
home_view = config_utils.load_template_file(config.get('home_html'), 'Home Page'),
error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page')
home_view = load_template_file(config.get('home_html'), 'Home Page'),
error_view = load_template_file(config.get('error_html'), 'Error Page')
)
#=================================================================
# YAML config loader
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
def pywb_config(config_file=None):
    """
    Load a YAML config file and pass the parsed result to
    pywb_config_manual.

    The path is config_file when given; otherwise the PYWB_CONFIG
    environment variable, falling back to DEFAULT_CONFIG_FILE.
    """
    path = config_file or os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)

    with open(path) as stream:
        settings = yaml.load(stream)

    return pywb_config_manual(settings)

View File

@ -1,20 +1,19 @@
from pywb.core.wbexceptions import WbException, NotFoundException, InternalRedirect
from pywb.utils.wbexception import WbException
from pywb.core.wbexceptions import NotFoundException, InternalRedirect
from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.cdx.cdxserver import CDXException
from pywb.utils.canonicalize import UrlCanonicalizeException
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import BlockLoader
import os
import importlib
import logging
#=================================================================
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
# explained here:
# http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
@ -35,9 +34,9 @@ def rel_request_uri(environ, include_query=1):
return url
#=================================================================
def create_wb_app(wb_router):
# Top-level wsgi application
def application(env, start_response):
if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
@ -56,8 +55,7 @@ def create_wb_app(wb_router):
except InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except (WbException, CDXException,
UrlCanonicalizeException, ArchiveLoadFailed) as e:
except WbException as e:
response = handle_exception(env, wb_router.error_view, e, False)
except Exception as e:
@ -69,6 +67,7 @@ def create_wb_app(wb_router):
return application
#=================================================================
def handle_exception(env, error_view, exc, print_trace):
if hasattr(exc, 'status'):
status = exc.status()
@ -85,44 +84,82 @@ def handle_exception(env, error_view, exc, print_trace):
if error_view:
import traceback
return error_view.render_response(err_msg = str(exc), err_details = err_details, status = status)
return error_view.render_response(err_msg=str(exc),
err_details=err_details,
status=status)
else:
return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
return WbResponse.text_response(status + ' Error: ' + str(exc),
status=status)
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
DEFAULT_INIT_MODULE = 'pywb.bootstrap.pywb_init'
def load_yaml_config(config_file=None):
    """
    Read and parse a YAML config.

    :param config_file: location handed to ``BlockLoader().load()``;
        when falsy, DEFAULT_CONFIG_FILE is used instead
    :return: whatever ``yaml.load()`` produces for the loaded contents
    """
    import yaml
    from contextlib import closing

    if not config_file:
        config_file = DEFAULT_CONFIG_FILE

    # close the reader once parsing is done so the handle is not left open
    with closing(BlockLoader().load(config_file)) as configdata:
        # NOTE(review): yaml.load can construct arbitrary python objects,
        # so this must only be pointed at trusted config. Not replaced with
        # safe_load here because existing configs may rely on
        # python-specific tags -- confirm before tightening.
        config = yaml.load(configdata)

    return config
#=================================================================
def main():
def init_app(init_func, load_yaml=True, config_file=None):
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
level=logging.DEBUG)
logging.info('')
if load_yaml:
if not config_file:
config_file = os.environ.get('PYWB_CONFIG_FILE')
config = load_yaml_config(config_file)
try:
logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)
# see if there's a custom init module
config_name = os.environ.get('PYWB_CONFIG_MODULE')
if not config_name:
# use default module
config_name = DEFAULT_INIT_MODULE
logging.info('Loading from default config module "{0}"'.format(config_name))
logging.info('')
module = importlib.import_module(config_name)
app = create_wb_app(module.pywb_config())
logging.info('')
logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name))
return app
except Exception:
logging.exception('*** pywb could not init with settings from {0}.pywb_config()!\n'.format(config_name))
if load_yaml:
wb_router = init_func(config)
else:
wb_router = init_func()
except:
msg = '*** pywb app init FAILED config from "%s"!\n'
logging.exception(msg, init_func.__name__)
raise
else:
msg = '*** pywb app inited with config from "%s"!\n'
logging.info(msg, init_func.__name__)
return create_wb_app(wb_router)
#=================================================================
if __name__ == "__main__":
pass
else:
application = main()
DEFAULT_PORT = 8080
def start_wsgi_server(the_app):
    """
    Serve the_app with wsgiref's simple_server until interrupted.

    The port is taken from -p/--port on the command line; if that is not
    given, from the 'port' entry of the YAML config; otherwise DEFAULT_PORT.

    :param the_app: WSGI application passed to make_server
    """
    from wsgiref.simple_server import make_server
    from optparse import OptionParser

    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)

    options, _ = opt.parse_args()

    port = options.port

    if port is None:
        try:
            # same config lookup as init_app: PYWB_CONFIG_FILE, else the
            # default file chosen by load_yaml_config
            config = load_yaml_config(os.environ.get('PYWB_CONFIG_FILE'))
            port = config.get('port', DEFAULT_PORT)
        except Exception:
            # best effort: a missing or malformed config only means the
            # default port is used
            port = DEFAULT_PORT

    logging.debug('Starting CDX Server on port %s', port)

    try:
        httpd = make_server('', port, the_app)
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the server
        pass

    logging.debug('Stopping CDX Server')

View File

@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from query import CDXQuery
#=================================================================
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
"""

View File

@ -4,9 +4,11 @@ import itertools
from urllib import urlencode
from urlparse import parse_qs
from pywb.utils.wbexception import WbException
#=================================================================
class CDXException(Exception):
class CDXException(WbException):
def status(self):
return '400 Bad Request'

View File

@ -33,6 +33,7 @@ def cdx_load(sources, query, perms_checker=None, process=True):
return cdx_iter
#=================================================================
def restrict_cdx(cdx_iter, query, perms_checker):
"""
@ -56,6 +57,7 @@ def restrict_cdx(cdx_iter, query, perms_checker):
yield cdx
#=================================================================
def process_cdx(cdx_iter, query):
if query.resolve_revisits:
@ -255,7 +257,6 @@ def cdx_resolve_revisits(cdx_iter):
originals = {}
for cdx in cdx_iter:
is_revisit = cdx.is_revisit()
digest = cdx['digest']

View File

@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer):
logging.warn('No CDX Sources configured from paths=%s', paths)
def _add_cdx_source(self, source):
if source is None: return
if source is None:
return
logging.debug('Adding CDX Source: %s', source)
self.sources.append(source)
def add_cdx_source(self, source, config):
if source is None: return
if source is None:
return
if isinstance(source, CDXSource):
self._add_cdx_source(source)
elif isinstance(source, str):
if os.path.isdir(source):
for fn in os.listdir(source):
@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None):
surt_ordered=surt_ordered,
ds_rules_file=ds_rules_file,
perms_checker=perms_checker)

View File

@ -8,6 +8,7 @@ import urllib
import urllib2
import itertools
#=================================================================
class CDXSource(object):
"""
@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource):
if config:
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, query):
"""
Load cdx from redis cache, from an ordered list

View File

@ -1,8 +1,6 @@
from pywb.utils.wbexception import WbException
class WbException(Exception):
pass
class NotFoundException(WbException):
def status(self):
return '404 Not Found'

View File

@ -4,6 +4,9 @@
import surt
import urlparse
from wbexception import WbException
#=================================================================
class UrlCanonicalizer(object):
def __init__(self, surt_ordered=True):
@ -14,7 +17,7 @@ class UrlCanonicalizer(object):
#=================================================================
class UrlCanonicalizeException(Exception):
class UrlCanonicalizeException(WbException):
def status(self):
return '400 Bad Request'
@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
elif match_type == 'domain':
if not surt_ordered:
raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
msg = 'matchType=domain unsupported for non-surt'
raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0]

View File

@ -7,6 +7,7 @@ import os
import hmac
import urllib2
import time
from pkg_resources import resource_stream
#=================================================================
@ -24,16 +25,16 @@ class BlockLoader(object):
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
def load(self, url, offset, length):
def load(self, url, offset=0, length=-1):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
return self.load_file_or_resource(url, offset, length)
def load_file(self, url, offset, length):
def load_file_or_resource(self, url, offset, length):
"""
Load a file-like reader from the local file system
"""
@ -41,10 +42,18 @@ class BlockLoader(object):
if url.startswith('file://'):
url = url[len('file://'):]
afile = open(url, 'rb')
afile.seek(offset)
try:
# first, try as file
afile = open(url, 'rb')
except IOError as file_err:
# then, try as package.path/file
pkg_split = url.split('/', 1)
afile = resource_stream(pkg_split[0], pkg_split[1])
if length > 0:
if offset > 0:
afile.seek(offset)
if length >= 0:
return LimitReader(afile, length)
else:
return afile

View File

@ -171,7 +171,6 @@ def timestamp_to_datetime(string):
# pad to 6 digits
string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_):
try:
val = int(val)

View File

@ -0,0 +1,3 @@
class WbException(Exception):
    """Base exception whose status() reports a 500 status line."""
    def status(self):
        # default status string; subclasses can override to return another
        return '500 Internal Server Error'

View File

@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
'type, rec_headers, ' +
@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord',
#=================================================================
class ArchiveLoadFailed(Exception):
class ArchiveLoadFailed(WbException):
def __init__(self, reason, filename=''):
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
#self.filename = filename
@ -62,9 +65,9 @@ class ArcWarcRecordLoader:
decomp_type = 'gzip'
# Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
stream = DecompressingBufferedReader(stream=raw,
decomp_type=decomp_type,
block_size=self.block_size)
(the_format, rec_headers) = self._detect_type_load_headers(stream)

2
run.sh
View File

@ -10,7 +10,7 @@ mypath=$(cd `dirname $0` && pwd)
# ex: my_pywb.pywb_config()
#export 'PYWB_CONFIG=my_pywb'
app="pywb.bootstrap.wbapp"
app="pywb.apps.wayback"
params="--http-socket :8080 -b 65536"
#params="--static-map /static=$mypath/static --http-socket :8080 -b 65536"

View File

@ -22,6 +22,7 @@ setup(
'pywb.core',
'pywb.dispatch',
# the separating comma is required: adjacent string literals are
# concatenated, which would yield a single bogus package name
'pywb.bootstrap',
'pywb.apps'
],
package_data={
'pywb': ['ui/*', 'static/*', '*.yaml'],
@ -41,7 +42,6 @@ setup(
'pyyaml',
'WebTest',
'pytest',
'werkzeug>=0.9.4',
],
# tests_require=['WebTest', 'pytest'],
zip_safe=False

View File

@ -1,6 +1,6 @@
import webtest
from pywb.bootstrap.pywb_init import pywb_config
from pywb.bootstrap.wbapp import create_wb_app
from pywb.bootstrap.pywb_init import create_wb_router
from pywb.bootstrap.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
from fixture import TestExclusionPerms
@ -11,8 +11,13 @@ class TestWb:
def setup(self):
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
# save it in self - useful for debugging
self.router = pywb_config(self.TEST_CONFIG)
self.app = create_wb_app(self.router)
self.app = init_app(create_wb_router,
load_yaml=True,
config_file=self.TEST_CONFIG)
#self.router = pywb_config(self.TEST_CONFIG)
#self.app = create_wb_app(self.router)
self.testapp = webtest.TestApp(self.app)
def _assert_basic_html(self, resp):