diff --git a/pywb/apps/__init__.py b/pywb/apps/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/apps/wayback.py b/pywb/apps/wayback.py new file mode 100644 index 00000000..beaf0b0c --- /dev/null +++ b/pywb/apps/wayback.py @@ -0,0 +1,10 @@ +from pywb.bootstrap.wsgi_wrappers import init_app, start_wsgi_server +from pywb.bootstrap.pywb_init import create_wb_router + +#================================================================= +# init pywb app +#================================================================= +application = init_app(create_wb_router, load_yaml=True) + +if __name__ == "__main__": + start_wsgi_server(application) diff --git a/pywb/bootstrap/config_utils.py b/pywb/bootstrap/config_utils.py deleted file mode 100644 index 686a6bbb..00000000 --- a/pywb/bootstrap/config_utils.py +++ /dev/null @@ -1,56 +0,0 @@ -import logging - -from pywb.warc.recordloader import ArcWarcRecordLoader -from pywb.warc.resolvingloader import ResolvingLoader -from pywb.rewrite.rewrite_content import RewriteContent -from pywb.core.views import J2TemplateView, J2HtmlCapturesView -from pywb.core.handlers import WBHandler -from pywb.core.replay_views import ReplayView - -#================================================================= -# Config Loading -#================================================================= -def load_template_file(file, desc = None, view_class = J2TemplateView): - if file: - logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) - file = view_class(file) - - return file - -#================================================================= -def create_wb_handler(cdx_server, config, ds_rules_file=None): - - record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) - paths = config.get('archive_paths') - - resolving_loader = ResolvingLoader(paths=paths, - cdx_server=cdx_server, - record_loader=record_loader) - - replayer = ReplayView( - content_loader = resolving_loader, - - 
content_rewriter = RewriteContent(ds_rules_file=ds_rules_file), - - head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), - - buffer_response = config.get('buffer_response', True), - - redir_to_exact = config.get('redir_to_exact', True), - - reporter = config.get('reporter') - ) - - - wb_handler = WBHandler( - cdx_server, - - replayer, - - html_view = load_template_file(config.get('query_html'), 'Captures Page', J2HtmlCapturesView), - - search_view = load_template_file(config.get('search_html'), 'Search Page'), - ) - - return wb_handler - diff --git a/pywb/bootstrap/pywb_init.py b/pywb/bootstrap/pywb_init.py index 1fe33ddc..d4382204 100644 --- a/pywb/bootstrap/pywb_init.py +++ b/pywb/bootstrap/pywb_init.py @@ -1,10 +1,20 @@ -from pywb.core.handlers import CDXHandler, StaticHandler -from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler from pywb.dispatch.archivalrouter import ArchivalRouter, Route from pywb.dispatch.proxy import ProxyArchivalRouter -from pywb.core.indexreader import IndexReader -import config_utils +from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.resolvingloader import ResolvingLoader + +from pywb.rewrite.rewrite_content import RewriteContent + +from pywb.core.indexreader import IndexReader +from pywb.core.views import J2TemplateView, J2HtmlCapturesView +from pywb.core.handlers import WBHandler +from pywb.core.replay_views import ReplayView + +from pywb.core.handlers import CDXHandler, StaticHandler +from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler + +from pywb.utils.loaders import BlockLoader import os import yaml @@ -27,6 +37,7 @@ DEFAULTS = { 'domain_specific_rules': 'rules.yaml', } +#================================================================= class DictChain: def __init__(self, *dicts): self.dicts = dicts @@ -40,9 +51,63 @@ class DictChain: #================================================================= -## Reference non-YAML config +def 
load_template_file(file, desc=None, view_class=J2TemplateView): + if file: + logging.debug('Adding {0}: {1}'.format(desc if desc else 'Template', file)) + file = view_class(file) + + return file + + #================================================================= -def pywb_config_manual(passed_config = {}): +def create_wb_handler(cdx_server, config, ds_rules_file=None): + + cookie_maker=config.get('cookie_maker') + record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) + + paths = config.get('archive_paths') + + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) + + head_insert_view = load_template_file(config.get('head_insert_html'), + 'Head Insert') + + replayer = ReplayView( + content_loader=resolving_loader, + + content_rewriter=RewriteContent(ds_rules_file=ds_rules_file), + + head_insert_view=head_insert_view, + + buffer_response=config.get('buffer_response', True), + + redir_to_exact=config.get('redir_to_exact', True), + + reporter=config.get('reporter') + ) + + html_view = load_template_file(config.get('query_html'), + 'Captures Page', + J2HtmlCapturesView) + + + search_view = load_template_file(config.get('search_html'), + 'Search Page') + + wb_handler = WBHandler( + cdx_server, + replayer, + html_view=html_view, + search_view=search_view, + ) + + return wb_handler + + +#================================================================= +def create_wb_router(passed_config = {}): config = DictChain(passed_config, DEFAULTS) @@ -62,7 +127,7 @@ def pywb_config_manual(passed_config = {}): ds_rules_file = route_config.get('domain_specific_rules', None) cdx_server = IndexReader(route_config, ds_rules_file) - wb_handler = config_utils.create_wb_handler( + wb_handler = create_wb_handler( cdx_server=cdx_server, config=route_config, ds_rules_file=ds_rules_file, @@ -107,24 +172,6 @@ def pywb_config_manual(passed_config = {}): abs_path = config.get('absolute_paths', True), - home_view = 
config_utils.load_template_file(config.get('home_html'), 'Home Page'), - error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page') + home_view = load_template_file(config.get('home_html'), 'Home Page'), + error_view = load_template_file(config.get('error_html'), 'Error Page') ) - - - -#================================================================= -# YAML config loader -#================================================================= -DEFAULT_CONFIG_FILE = 'config.yaml' - - -def pywb_config(config_file = None): - if not config_file: - config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE) - - with open(config_file) as fh: - config = yaml.load(fh) - - return pywb_config_manual(config) - diff --git a/pywb/bootstrap/wbapp.py b/pywb/bootstrap/wsgi_wrappers.py similarity index 52% rename from pywb/bootstrap/wbapp.py rename to pywb/bootstrap/wsgi_wrappers.py index e7ea0c82..4dd04115 100644 --- a/pywb/bootstrap/wbapp.py +++ b/pywb/bootstrap/wsgi_wrappers.py @@ -1,20 +1,19 @@ -from pywb.core.wbexceptions import WbException, NotFoundException, InternalRedirect +from pywb.utils.wbexception import WbException +from pywb.core.wbexceptions import NotFoundException, InternalRedirect from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders -from pywb.cdx.cdxserver import CDXException -from pywb.utils.canonicalize import UrlCanonicalizeException -from pywb.warc.recordloader import ArchiveLoadFailed +from pywb.utils.loaders import BlockLoader import os import importlib import logging - #================================================================= -# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters +# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters # allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 -# explained here: 
http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links +# explained here: +# http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links def rel_request_uri(environ, include_query=1): """ Return the requested path, optionally including the query string @@ -35,9 +34,9 @@ def rel_request_uri(environ, include_query=1): return url + #================================================================= def create_wb_app(wb_router): - # Top-level wsgi application def application(env, start_response): if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): @@ -56,8 +55,7 @@ def create_wb_app(wb_router): except InternalRedirect as ir: response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) - except (WbException, CDXException, - UrlCanonicalizeException, ArchiveLoadFailed) as e: + except WbException as e: response = handle_exception(env, wb_router.error_view, e, False) except Exception as e: @@ -69,6 +67,7 @@ def create_wb_app(wb_router): return application +#================================================================= def handle_exception(env, error_view, exc, print_trace): if hasattr(exc, 'status'): status = exc.status() @@ -85,44 +84,82 @@ def handle_exception(env, error_view, exc, print_trace): if error_view: import traceback - return error_view.render_response(err_msg = str(exc), err_details = err_details, status = status) + return error_view.render_response(err_msg=str(exc), + err_details=err_details, + status=status) else: - return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) - + return WbResponse.text_response(status + ' Error: ' + str(exc), + status=status) #================================================================= DEFAULT_CONFIG_FILE = 'config.yaml' -DEFAULT_INIT_MODULE = 'pywb.bootstrap.pywb_init' +def load_yaml_config(config_file=None): + import yaml + + if not config_file: + config_file = DEFAULT_CONFIG_FILE + + 
configdata = BlockLoader().load(config_file) + config = yaml.load(configdata) + return config #================================================================= -def main(): +def init_app(init_func, load_yaml=True, config_file=None): + logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', + level=logging.DEBUG) + logging.info('') + + if load_yaml: + if not config_file: + config_file = os.environ.get('PYWB_CONFIG_FILE') + config = load_yaml_config(config_file) + try: - logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) - - # see if there's a custom init module - config_name = os.environ.get('PYWB_CONFIG_MODULE') - - if not config_name: - # use default module - config_name = DEFAULT_INIT_MODULE - logging.info('Loading from default config module "{0}"'.format(config_name)) - logging.info('') - - module = importlib.import_module(config_name) - - app = create_wb_app(module.pywb_config()) - logging.info('') - logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name)) - return app - - except Exception: - logging.exception('*** pywb could not init with settings from {0}.pywb_config()!\n'.format(config_name)) + if load_yaml: + wb_router = init_func(config) + else: + wb_router = init_func() + except: + msg = '*** pywb app init FAILED config from "%s"!\n' + logging.exception(msg, init_func.__name__) raise + else: + msg = '*** pywb app inited with config from "%s"!\n' + logging.info(msg, init_func.__name__) + + return create_wb_app(wb_router) + #================================================================= -if __name__ == "__main__": - pass -else: - application = main() +DEFAULT_PORT = 8080 + +def start_wsgi_server(the_app): + from wsgiref.simple_server import make_server + from optparse import OptionParser + + opt = OptionParser('%prog [OPTIONS]') + opt.add_option('-p', '--port', type='int', default=None) + + options, args = opt.parse_args() + + port = options.port + + if port 
is None: + try: + config = load_yaml_config(os.environ.get('PYWB_CONFIG_FILE')) + port = config.get('port', DEFAULT_PORT) + except Exception: + port = DEFAULT_PORT + + + logging.debug('Starting CDX Server on port %s', port) + + try: + httpd = make_server('', port, the_app) + httpd.serve_forever() + except KeyboardInterrupt as ex: + pass + + logging.debug('Stopping CDX Server') diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2e8a3855..e77c4666 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer from query import CDXQuery + #================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): """ diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 3915f169..49cd74c5 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -4,9 +4,11 @@ import itertools from urllib import urlencode from urlparse import parse_qs +from pywb.utils.wbexception import WbException + #================================================================= -class CDXException(Exception): +class CDXException(WbException): def status(self): return '400 Bad Request' diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index c4f865c2..e3a1a13b 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -33,6 +33,7 @@ def cdx_load(sources, query, perms_checker=None, process=True): return cdx_iter + #================================================================= def restrict_cdx(cdx_iter, query, perms_checker): """ @@ -56,6 +57,7 @@ def restrict_cdx(cdx_iter, query, perms_checker): yield cdx + #================================================================= def process_cdx(cdx_iter, query): if query.resolve_revisits: @@ -255,7 +257,6 @@ def cdx_resolve_revisits(cdx_iter): originals = {} for cdx in cdx_iter: - is_revisit = cdx.is_revisit() digest = cdx['digest'] diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py 
index 54d46f4b..2e5ec8ad 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer): logging.warn('No CDX Sources configured from paths=%s', paths) def _add_cdx_source(self, source): - if source is None: return + if source is None: + return + logging.debug('Adding CDX Source: %s', source) self.sources.append(source) def add_cdx_source(self, source, config): - if source is None: return + if source is None: + return + if isinstance(source, CDXSource): self._add_cdx_source(source) + elif isinstance(source, str): if os.path.isdir(source): for fn in os.listdir(source): @@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None): surt_ordered=surt_ordered, ds_rules_file=ds_rules_file, perms_checker=perms_checker) - - diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 0923fba9..dfab0f25 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -8,6 +8,7 @@ import urllib import urllib2 import itertools + #================================================================= class CDXSource(object): """ @@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource): if config: self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list diff --git a/pywb/core/wbexceptions.py b/pywb/core/wbexceptions.py index afacc325..e9b07ad3 100644 --- a/pywb/core/wbexceptions.py +++ b/pywb/core/wbexceptions.py @@ -1,8 +1,6 @@ +from pywb.utils.wbexception import WbException -class WbException(Exception): - pass - class NotFoundException(WbException): def status(self): return '404 Not Found' diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index 73555ca6..6979a323 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -4,6 +4,9 @@ import surt import urlparse +from wbexception import WbException + + #================================================================= class 
UrlCanonicalizer(object): def __init__(self, surt_ordered=True): @@ -14,7 +17,7 @@ class UrlCanonicalizer(object): #================================================================= -class UrlCanonicalizeException(Exception): +class UrlCanonicalizeException(WbException): def status(self): return '400 Bad Request' @@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): elif match_type == 'domain': if not surt_ordered: - raise UrlCanonicalizeException('matchType=domain unsupported for non-surt') + msg = 'matchType=domain unsupported for non-surt' + raise UrlCanonicalizeException(msg) host = start_key.split(')/')[0] diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 7813ded8..6f2fa6c9 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -7,6 +7,7 @@ import os import hmac import urllib2 import time +from pkg_resources import resource_stream #================================================================= @@ -24,16 +25,16 @@ class BlockLoader(object): def __init__(self, cookie_maker=None): self.cookie_maker = cookie_maker - def load(self, url, offset, length): + def load(self, url, offset=0, length=-1): """ Determine loading method based on uri """ if is_http(url): return self.load_http(url, offset, length) else: - return self.load_file(url, offset, length) + return self.load_file_or_resource(url, offset, length) - def load_file(self, url, offset, length): + def load_file_or_resource(self, url, offset, length): """ Load a file-like reader from the local file system """ @@ -41,10 +42,18 @@ class BlockLoader(object): if url.startswith('file://'): url = url[len('file://'):] - afile = open(url, 'rb') - afile.seek(offset) + try: + # first, try as file + afile = open(url, 'rb') + except IOError as file_err: + # then, try as package.path/file + pkg_split = url.split('/', 1) + afile = resource_stream(pkg_split[0], pkg_split[1]) - if length > 0: + if offset > 0: + afile.seek(offset) + + if length >= 0: return 
LimitReader(afile, length) else: return afile diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index f93f324d..a89424aa 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -171,7 +171,6 @@ def timestamp_to_datetime(string): # pad to 6 digits string = _pad_timestamp(string, PAD_6) - def clamp(val, min_, max_): try: val = int(val) diff --git a/pywb/utils/wbexception.py b/pywb/utils/wbexception.py new file mode 100644 index 00000000..a8757935 --- /dev/null +++ b/pywb/utils/wbexception.py @@ -0,0 +1,3 @@ +class WbException(Exception): + def status(self): + return '500 Internal Server Error' diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 446e0da3..fb3af38c 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.loaders import BlockLoader from pywb.utils.bufferedreaders import DecompressingBufferedReader +from pywb.utils.wbexception import WbException + + #================================================================= ArcWarcRecord = collections.namedtuple('ArchiveRecord', 'type, rec_headers, ' + @@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord', #================================================================= -class ArchiveLoadFailed(Exception): +class ArchiveLoadFailed(WbException): def __init__(self, reason, filename=''): super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) #self.filename = filename @@ -62,9 +65,9 @@ class ArcWarcRecordLoader: decomp_type = 'gzip' # Create decompressing stream - stream = DecompressingBufferedReader(stream = raw, - decomp_type = decomp_type, - block_size = self.block_size) + stream = DecompressingBufferedReader(stream=raw, + decomp_type=decomp_type, + block_size=self.block_size) (the_format, rec_headers) = self._detect_type_load_headers(stream) diff --git a/run.sh b/run.sh index 6232c030..77964b32 100755 --- a/run.sh 
+++ b/run.sh @@ -10,7 +10,7 @@ mypath=$(cd `dirname $0` && pwd) # ex: my_pywb.pywb_config() #export 'PYWB_CONFIG=my_pywb' -app="pywb.bootstrap.wbapp" +app="pywb.apps.wayback" params="--http-socket :8080 -b 65536" #params="--static-map /static=$mypath/static --http-socket :8080 -b 65536" diff --git a/setup.py b/setup.py index 4c2cad20..889fe2a8 100755 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ setup( 'pywb.core', 'pywb.dispatch', - 'pywb.bootstrap' + 'pywb.bootstrap', + 'pywb.apps' ], package_data={ 'pywb': ['ui/*', 'static/*', '*.yaml'], @@ -41,7 +42,6 @@ setup( 'pyyaml', 'WebTest', 'pytest', - 'werkzeug>=0.9.4', ], # tests_require=['WebTest', 'pytest'], zip_safe=False diff --git a/tests/test_integration.py b/tests/test_integration.py index b9b20e06..b71e8574 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,6 @@ import webtest -from pywb.bootstrap.pywb_init import pywb_config -from pywb.bootstrap.wbapp import create_wb_app +from pywb.bootstrap.pywb_init import create_wb_router +from pywb.bootstrap.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject from fixture import TestExclusionPerms @@ -11,8 +11,13 @@ class TestWb: def setup(self): #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) # save it in self - useful for debugging - self.router = pywb_config(self.TEST_CONFIG) - self.app = create_wb_app(self.router) + self.app = init_app(create_wb_router, + load_yaml=True, + config_file=self.TEST_CONFIG) + + #self.router = pywb_config(self.TEST_CONFIG) + #self.app = create_wb_app(self.router) + self.testapp = webtest.TestApp(self.app) def _assert_basic_html(self, resp):