diff --git a/pywb/apps/__init__.py b/pywb/apps/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/apps/cdx_server.py b/pywb/apps/cdx_server.py new file mode 100644 index 00000000..a16df1fe --- /dev/null +++ b/pywb/apps/cdx_server.py @@ -0,0 +1,17 @@ +from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server + +from pywb.core.cdx_handler import create_cdx_server_app + +#================================================================= +# init cdx server app +#================================================================= + +# cdx-server only config +DEFAULT_CONFIG = 'pywb/cdx/config.yaml' + +application = init_app(create_cdx_server_app, + load_yaml=True, + config_file=DEFAULT_CONFIG) + +if __name__ == "__main__": + start_wsgi_server(application) diff --git a/pywb/apps/wayback.py b/pywb/apps/wayback.py new file mode 100644 index 00000000..0cda072b --- /dev/null +++ b/pywb/apps/wayback.py @@ -0,0 +1,10 @@ +from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server +from pywb.core.pywb_init import create_wb_router + +#================================================================= +# init pywb app +#================================================================= +application = init_app(create_wb_router, load_yaml=True) + +if __name__ == "__main__": + start_wsgi_server(application) diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2e8a3855..e77c4666 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer from query import CDXQuery + #================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): """ diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 3915f169..9ea4a92e 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -4,9 +4,11 @@ import itertools from urllib import urlencode from urlparse import parse_qs +from pywb.utils.wbexception import WbException + #================================================================= -class CDXException(Exception): +class CDXException(WbException): def status(self): return '400 Bad Request' @@ -61,7 +63,7 @@ class CDXObject(OrderedDict): cdxformat = i if not cdxformat: - raise Exception('unknown {0}-field cdx format'.format(len(fields))) + raise CDXException('unknown {0}-field cdx format'.format(len(fields))) for header, field in itertools.izip(cdxformat, fields): self[header] = field @@ -85,8 +87,15 @@ class CDXObject(OrderedDict): """ if fields is None: return str(self) + '\n' - else: - return ' '.join(self[x] for x in fields) + '\n' + + try: + result = ' '.join(self[x] for x in fields) + '\n' + except KeyError as ke: + msg = 'Invalid field "{0}" found in fields= argument' + msg = msg.format(ke.message) + raise CDXException(msg) + + return result def __str__(self): if self.cdxline: @@ -109,7 +118,7 @@ class IDXObject(OrderedDict): if len(fields) < self.NUM_REQ_FIELDS: msg = 'invalid idx format: {0} fields found, {1} required' - raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS)) + raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS)) for header, field in itertools.izip(self.FORMAT, fields): self[header] = field diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index c4f865c2..6963b28c 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -31,8 +31,18 @@ def cdx_load(sources, query, perms_checker=None, process=True): if perms_checker: cdx_iter = restrict_cdx(cdx_iter, query, perms_checker) + if query.output == 'text': + cdx_iter = cdx_to_text(cdx_iter, query.fields) + return cdx_iter + +#================================================================= +def cdx_to_text(cdx_iter, fields): + for cdx in cdx_iter: + yield cdx.to_text(fields) + + #================================================================= def restrict_cdx(cdx_iter, query, perms_checker): """ @@ -56,6 +66,7 @@ def restrict_cdx(cdx_iter, query, perms_checker): yield cdx + #================================================================= def process_cdx(cdx_iter, query): if query.resolve_revisits: @@ -255,7 +266,6 @@ def cdx_resolve_revisits(cdx_iter): originals = {} for cdx in cdx_iter: - is_revisit = cdx.is_revisit() digest = cdx['digest'] diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 54d46f4b..2e5ec8ad 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer): logging.warn('No CDX Sources configured from paths=%s', paths) def _add_cdx_source(self, source): - if source is None: return + if source is None: + return + logging.debug('Adding CDX Source: %s', source) self.sources.append(source) def add_cdx_source(self, source, config): - if source is None: return + if source is None: + return + if isinstance(source, CDXSource): self._add_cdx_source(source) + elif isinstance(source, str): if os.path.isdir(source): for fn in os.listdir(source): @@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None): surt_ordered=surt_ordered, ds_rules_file=ds_rules_file, perms_checker=perms_checker) - - diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 0923fba9..dfab0f25 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -8,6 +8,7 @@ import urllib import urllib2 import itertools + #================================================================= class CDXSource(object): """ @@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource): if config: self.key_prefix = config.get('redis_key_prefix', self.key_prefix) - def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index dc480836..6449223a 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -1,5 +1,6 @@ from urllib import urlencode from urlparse import parse_qs +from cdxobject import CDXException #================================================================= @@ -62,6 +63,9 @@ class CDXQuery(object): @property def fields(self): v = self.params.get('fields') + # check old param name + if not v: + v = self.params.get('fl') return v.split(',') if v else None @property @@ -105,9 +109,6 @@ class CDXQuery(object): """ params = parse_qs(env['QUERY_STRING']) - if not 'output' in params: - params['output'] = 'text' - # parse_qs produces arrays for single values # cdx processing expects singleton params for all params, # except filters, so convert here @@ -116,4 +117,8 @@ class CDXQuery(object): if name != 'filter': params[name] = val[0] + if not 'output' in params: + params['output'] = 'text' + + return params diff --git a/pywb/cdx/test/cdxserver_test.py b/pywb/cdx/test/cdxserver_test.py index e261ead4..f0a3398d 100644 --- a/pywb/cdx/test/cdxserver_test.py +++ b/pywb/cdx/test/cdxserver_test.py @@ -187,6 +187,7 @@ import pytest def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams): kwparams['url'] = url + kwparams['output'] = 'cdxobject' fields = kwparams.get('fields') if fields: fields = fields.split(',') diff --git a/pywb/cdx/test/wsgi_cdxserver_test.py b/pywb/cdx/test/wsgi_cdxserver_test.py deleted file mode 100644 index a7d1ecdb..00000000 --- a/pywb/cdx/test/wsgi_cdxserver_test.py +++ /dev/null @@ -1,15 +0,0 @@ -import webtest -from pywb.cdx.wsgi_cdxserver import create_app -from pywb import get_test_dir - -class TestCdx: - def setup(self): - self.app = create_app(get_test_dir() + 'cdx/') - self.testapp = webtest.TestApp(self.app) - - def test_cdx(self): - resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css') - assert resp.content_type == 'text/plain' - assert resp.content_length > 0 - - diff --git a/pywb/cdx/wsgi_cdxserver.py b/pywb/cdx/wsgi_cdxserver.py deleted file mode 100644 index c9fe11d7..00000000 --- a/pywb/cdx/wsgi_cdxserver.py +++ /dev/null @@ -1,103 +0,0 @@ -from werkzeug.wrappers import BaseResponse -from cdxserver import create_cdx_server -from pywb import get_test_dir -from query import CDXQuery - -import logging -import os -import yaml -import pkg_resources - -#================================================================= -CONFIG_FILE = 'config.yaml' - -RULES_FILE = 'rules.yaml' - -DEFAULT_PORT = 8080 - -#================================================================= - -class CDXQueryRequest(object): - def __init__(self, environ): - self.query = CDXQuery.from_wsgi_env(environ) - - -class WSGICDXServer(object): - def __init__(self, config, rules_file): - self.cdxserver = create_cdx_server(config, rules_file) - - def __call__(self, environ, start_response): - request = CDXQueryRequest(environ) - try: - logging.debug('request.args=%s', request.query) - result = self.cdxserver.load_cdx_query(request.query) - - # TODO: select response type by "output" parameter - response = PlainTextResponse(result, request.query.fields) - return response(environ, start_response) - except Exception as exc: - logging.error('load_cdx failed', exc_info=1) - # TODO: error response should be different for each response - # type - start_response('400 Error', [('Content-Type', 'text/plain')]) - return [str(exc)] - -def cdx_text_out(cdx, fields): - if not fields: - return str(cdx) + '\n' - else: - logging.info('cdx fields=%s', cdx.keys) - # TODO: this will results in an exception if fields contain - # non-existent field name. - return ' '.join(cdx[x] for x in fields) + '\n' - -class PlainTextResponse(BaseResponse): - def __init__(self, cdxitr, fields, status=200, content_type='text/plain'): - super(PlainTextResponse, self).__init__( - response=( - cdx.to_text(fields) for cdx in cdxitr - ), - status=status, content_type=content_type) - -# class JsonResponse(Response): -# pass -# class MementoResponse(Response): -# pass - -def create_app(config=None): - logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', - level=logging.DEBUG) - - if not config: - index_paths = get_test_dir() + 'cdx/' - config = dict(index_paths=index_paths) - - return WSGICDXServer(config, RULES_FILE) - -if __name__ == "__main__": - from optparse import OptionParser - from werkzeug.serving import run_simple - - opt = OptionParser('%prog [OPTIONS]') - opt.add_option('-p', '--port', type='int', default=None) - - options, args = opt.parse_args() - - configdata = pkg_resources.resource_string(__name__, CONFIG_FILE) - config = yaml.load(configdata) - - port = options.port - if port is None: - port = (config and config.get('port')) or DEFAULT_PORT - - app = create_app(config) - - logging.debug('Starting CDX Server on port %s', port) - try: - run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True) - except KeyboardInterrupt as ex: - pass - logging.debug('Stopping CDX Server') -else: - # XXX pass production config - application = create_app() diff --git a/pywb/config_utils.py b/pywb/config_utils.py deleted file mode 100644 index 05844a2e..00000000 --- a/pywb/config_utils.py +++ /dev/null @@ -1,56 +0,0 @@ -import views -import handlers -import replay_views -import logging - -from pywb.warc.recordloader import ArcWarcRecordLoader -from pywb.warc.resolvingloader import ResolvingLoader -from pywb.rewrite.rewrite_content import RewriteContent - -#================================================================= -# Config Loading -#================================================================= -def load_template_file(file, desc = None, view_class = views.J2TemplateView): - if file: - logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) - file = view_class(file) - - return file - -#================================================================= -def create_wb_handler(cdx_server, config, ds_rules_file=None): - - record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) - paths = config.get('archive_paths') - - resolving_loader = ResolvingLoader(paths=paths, - cdx_server=cdx_server, - record_loader=record_loader) - - replayer = replay_views.ReplayView( - content_loader = resolving_loader, - - content_rewriter = RewriteContent(ds_rules_file=ds_rules_file), - - head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), - - buffer_response = config.get('buffer_response', True), - - redir_to_exact = config.get('redir_to_exact', True), - - reporter = config.get('reporter') - ) - - - wb_handler = handlers.WBHandler( - cdx_server, - - replayer, - - html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView), - - search_view = load_template_file(config.get('search_html'), 'Search Page'), - ) - - return wb_handler - diff --git a/pywb/core/__init__.py b/pywb/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/core/cdx_handler.py b/pywb/core/cdx_handler.py new file mode 100644 index 00000000..3f5bb2a8 --- /dev/null +++ b/pywb/core/cdx_handler.py @@ -0,0 +1,43 @@ +from pywb.cdx.query import CDXQuery +from pywb.cdx.cdxserver import create_cdx_server + +from pywb.framework.archivalrouter import ArchivalRouter, Route +from pywb.framework.basehandlers import BaseHandler + +from views import TextCapturesView + + +#================================================================= +class CDXHandler(BaseHandler): + """ + Handler which passes wsgi request to cdx server and + returns a text-based cdx response + """ + def __init__(self, index_reader, view=None): + self.index_reader = index_reader + self.view = view if view else TextCapturesView() + + def __call__(self, wbrequest): + params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) + cdx_lines = self.index_reader.load_cdx(**params) + + return self.view.render_response(wbrequest, cdx_lines) + + def __str__(self): + return 'CDX Handler: ' + str(self.index_reader) + + +#================================================================= +DEFAULT_RULES = 'pywb/rules.yaml' + +#================================================================= +def create_cdx_server_app(config): + """ + Create a cdx server config to be wrapped in a wsgi app + Currently using single access point '/cdx' + TODO: more complex example with multiple collections? + """ + cdx_server = create_cdx_server(config, DEFAULT_RULES) + port = config.get('port') + routes = [Route('cdx', CDXHandler(cdx_server))] + return ArchivalRouter(routes, port=port) diff --git a/pywb/handlers.py b/pywb/core/handlers.py similarity index 68% rename from pywb/handlers.py rename to pywb/core/handlers.py index 0d9500f4..049888df 100644 --- a/pywb/handlers.py +++ b/pywb/core/handlers.py @@ -1,30 +1,13 @@ -import urlparse import pkgutil import mimetypes import time -from pywb.rewrite.wburl import WbUrl -from pywb.cdx.query import CDXQuery -from wbrequestresponse import WbResponse -from wbexceptions import WbException, NotFoundException +from pywb.framework.basehandlers import BaseHandler, WbUrlHandler +from pywb.framework.wbrequestresponse import WbResponse +from pywb.framework.wbexceptions import WbException, NotFoundException from views import TextCapturesView -#================================================================= -class BaseHandler(object): - def __call__(self, wbrequest): - return wbrequest - - def get_wburl_type(self): - return None - - -#================================================================= -class WbUrlHandler(BaseHandler): - def get_wburl_type(self): - return WbUrl - - #================================================================= # Standard WB Handler #================================================================= @@ -33,11 +16,15 @@ class WBHandler(WbUrlHandler): html_view=None, search_view=None): self.index_reader = index_reader + self.replay = replay - self.text_view = TextCapturesView() + self.text_query_view = TextCapturesView() + + self.query_view = html_view + if not self.query_view: + self.query_view = text_query_view - self.html_view = html_view self.search_view = search_view def __call__(self, wbrequest): @@ -49,11 +36,10 @@ class WBHandler(WbUrlHandler): # new special modifier to always show cdx index if wbrequest.wb_url.mod == 'cdx_': - return self.text_view.render_response(wbrequest, cdx_lines) + return self.text_query_view.render_response(wbrequest, cdx_lines) if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): - query_view = self.html_view if self.html_view else self.text_view - return query_view.render_response(wbrequest, cdx_lines) + return self.query_view.render_response(wbrequest, cdx_lines) with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: return self.replay(wbrequest, cdx_lines) @@ -70,29 +56,11 @@ class WBHandler(WbUrlHandler): return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay) -#================================================================= -# CDX-Server Handler -- pass all params to cdx server -#================================================================= -class CDXHandler(BaseHandler): - def __init__(self, index_reader, view = None): - self.index_reader = index_reader - self.view = view if view else TextCapturesView() - - def __call__(self, wbrequest): - params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env) - cdx_lines = self.index_reader.load_cdx(**params) - - return self.view.render_response(wbrequest, cdx_lines) - - def __str__(self): - return 'Index Reader: ' + str(self.index_reader) - - #================================================================= # Static Content Handler #================================================================= class StaticHandler(BaseHandler): - def __init__(self, static_path, pkg = __package__): + def __init__(self, static_path, pkg = 'pywb'): mimetypes.init() self.static_path = static_path diff --git a/pywb/indexreader.py b/pywb/core/indexreader.py similarity index 98% rename from pywb/indexreader.py rename to pywb/core/indexreader.py index a422d0b4..b77f8590 100644 --- a/pywb/indexreader.py +++ b/pywb/core/indexreader.py @@ -29,6 +29,7 @@ class IndexReader(object): params.update(wbrequest.custom_params) params['allowFuzzy'] = True + params['output'] = 'cdxobject' cdxlines = self.load_cdx(url=wburl.url, **params) diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py new file mode 100644 index 00000000..10c7b999 --- /dev/null +++ b/pywb/core/pywb_init.py @@ -0,0 +1,181 @@ +from pywb.framework.archivalrouter import ArchivalRouter, Route +from pywb.framework.proxy import ProxyArchivalRouter + +from pywb.warc.recordloader import ArcWarcRecordLoader +from pywb.warc.resolvingloader import ResolvingLoader + +from pywb.rewrite.rewrite_content import RewriteContent + +from indexreader import IndexReader +from views import J2TemplateView, J2HtmlCapturesView +from replay_views import ReplayView + +from handlers import WBHandler +from handlers import StaticHandler +from cdx_handler import CDXHandler +from handlers import DebugEchoHandler, DebugEchoEnvHandler + + +import os +import yaml +import logging + + +#================================================================= +DEFAULTS = { + 'hostpaths': ['http://localhost:8080'], + 'collections': {'pywb': './sample_archive/cdx/'}, + 'archive_paths': './sample_archive/warcs/', + + 'head_insert_html': 'ui/head_insert.html', + 'query_html': 'ui/query.html', + 'search_html': 'ui/search.html', + 'home_html': 'ui/index.html', + 'error_html': 'ui/error.html', + + 'static_routes': {'static/default': 'static/'}, + + 'domain_specific_rules': 'pywb/rules.yaml', +} + +#================================================================= +class DictChain: + def __init__(self, *dicts): + self.dicts = dicts + + def get(self, key, default_val=None): + for d in self.dicts: + val = d.get(key) + if val is not None: + return val + return default_val + + +#================================================================= +def load_template_file(file, desc=None, view_class=J2TemplateView): + if file: + logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) + file = view_class(file) + + return file + + +#================================================================= +def create_wb_handler(cdx_server, config, ds_rules_file=None): + + cookie_maker=config.get('cookie_maker') + record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) + + paths = config.get('archive_paths') + + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) + + head_insert_view = load_template_file(config.get('head_insert_html'), + 'Head Insert') + + replayer = ReplayView( + content_loader=resolving_loader, + + content_rewriter=RewriteContent(ds_rules_file=ds_rules_file), + + head_insert_view=head_insert_view, + + buffer_response=config.get('buffer_response', True), + + redir_to_exact=config.get('redir_to_exact', True), + + reporter=config.get('reporter') + ) + + html_view = load_template_file(config.get('query_html'), + 'Captures Page', + J2HtmlCapturesView) + + + search_view = load_template_file(config.get('search_html'), + 'Search Page') + + wb_handler = WBHandler( + cdx_server, + replayer, + html_view=html_view, + search_view=search_view, + ) + + return wb_handler + + +#================================================================= +def create_wb_router(passed_config = {}): + + config = DictChain(passed_config, DEFAULTS) + + routes = [] + + hostpaths = config.get('hostpaths') + + port = config.get('port') + + # collections based on cdx source + collections = config.get('collections') + + for name, value in collections.iteritems(): + if isinstance(value, str): + value = {'index_paths': value} + + route_config = DictChain(value, config) + + ds_rules_file = route_config.get('domain_specific_rules', None) + cdx_server = IndexReader(route_config, ds_rules_file) + + wb_handler = create_wb_handler( + cdx_server=cdx_server, + config=route_config, + ds_rules_file=ds_rules_file, + ) + + logging.debug('Adding Collection: ' + name) + + route_class = route_config.get('route_class', Route) + + routes.append(route_class(name, wb_handler, config = route_config)) + + # cdx query handler + if route_config.get('enable_cdx_api', False): + routes.append(Route(name + '-cdx', CDXHandler(cdx_server))) + + + if config.get('debug_echo_env', False): + routes.append(Route('echo_env', DebugEchoEnvHandler())) + + if config.get('debug_echo_req', False): + routes.append(Route('echo_req', DebugEchoHandler())) + + + static_routes = config.get('static_routes') + + for static_name, static_path in static_routes.iteritems(): + routes.append(Route(static_name, StaticHandler(static_path))) + + # Check for new proxy mode! + if config.get('enable_http_proxy', False): + router = ProxyArchivalRouter + else: + router = ArchivalRouter + + # Finally, create wb router + return router( + routes, + # Specify hostnames that pywb will be running on + # This will help catch occasionally missed rewrites that fall-through to the host + # (See archivalrouter.ReferRedirect) + hostpaths = hostpaths, + port = port, + + abs_path = config.get('absolute_paths', True), + + home_view = load_template_file(config.get('home_html'), 'Home Page'), + error_view = load_template_file(config.get('error_html'), 'Error Page') + ) diff --git a/pywb/replay_views.py b/pywb/core/replay_views.py similarity index 85% rename from pywb/replay_views.py rename to pywb/core/replay_views.py index 31e7af9a..07997396 100644 --- a/pywb/replay_views.py +++ b/pywb/core/replay_views.py @@ -2,9 +2,9 @@ import StringIO from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.bufferedreaders import ChunkedDataReader -from wbrequestresponse import WbResponse +from pywb.framework.wbrequestresponse import WbResponse -from wbexceptions import CaptureException, InternalRedirect +from pywb.framework.wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed from pywb.utils.loaders import LimitReader @@ -51,7 +51,7 @@ class ReplayView: self._redirect_if_needed(wbrequest, cdx) # one more check for referrer-based self-redirect - self._reject_referrer_self_redirect(wbrequest, status_headers) + self._reject_referrer_self_redirect(wbrequest) response = None @@ -177,25 +177,30 @@ class ReplayView: def _reject_self_redirect(self, wbrequest, cdx, status_headers): - # self-redirect via location + """ + Check if response is a 3xx redirect to the same url + If so, reject this capture to avoid causing redirect loop + """ if status_headers.statusline.startswith('3'): request_url = wbrequest.wb_url.url.lower() location_url = status_headers.get_header('Location').lower() - #TODO: canonicalize before testing? if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): raise CaptureException('Self Redirect: ' + str(cdx)) - def _reject_referrer_self_redirect(self, wbrequest, status_headers): - # at correct timestamp now, but must check for referrer redirect - # indirect self-redirect, via meta-refresh, if referrer is same as current url - if status_headers.statusline.startswith('2'): - # build full url even if using relative-rewriting - request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url) - referrer_url = wbrequest.referrer - if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)): - raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) - - + def _reject_referrer_self_redirect(self, wbrequest): + """ + Perform final check for referrer based self-redirect. + This method should be called after verifying request timestamp matches capture. + if referrer is same as current url, reject this response and try another capture + """ + if not wbrequest.referrer: + return + # build full url even if using relative-rewriting + request_url = (wbrequest.host_prefix + + wbrequest.rel_prefix + str(wbrequest.wb_url)) + if (UrlRewriter.strip_protocol(request_url) == + UrlRewriter.strip_protocol(wbrequest.referrer)): + raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url)) diff --git a/pywb/views.py b/pywb/core/views.py similarity index 85% rename from pywb/views.py rename to pywb/core/views.py index 67f928d6..3be55eae 100644 --- a/pywb/views.py +++ b/pywb/core/views.py @@ -1,6 +1,6 @@ -import pywb.utils.timeutils as timeutils +from pywb.utils.timeutils import timestamp_to_datetime +from pywb.framework.wbrequestresponse import WbResponse -import wbrequestresponse import urlparse import time @@ -18,7 +18,7 @@ class StaticTextView: return self.text def render_response(self, **kwargs): - return wbrequestresponse.WbResponse.text_stream(self.text) + return WbResponse.text_stream(self.text) #================================================================= class J2TemplateView: @@ -34,7 +34,7 @@ class J2TemplateView: if template_dir.startswith('.') or template_dir.startswith('file://'): loader = FileSystemLoader(template_dir) else: - loader = PackageLoader(__package__, template_dir) + loader = PackageLoader('pywb', template_dir) jinja_env = Environment(loader = loader, trim_blocks = True) jinja_env.filters['format_ts'] = J2TemplateView.format_ts @@ -51,13 +51,13 @@ class J2TemplateView: def render_response(self, **kwargs): template_result = self.render_to_string(**kwargs) status = kwargs.get('status', '200 OK') - return wbrequestresponse.WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8') + return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8') # Filters @staticmethod def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'): - value = timeutils.timestamp_to_datetime(value) + value = timestamp_to_datetime(value) return value.strftime(format_) @staticmethod @@ -90,7 +90,7 @@ class TextCapturesView: cdx += '\n' return cdx cdx_lines = imap(to_str, cdx_lines) - return wbrequestresponse.WbResponse.text_stream(cdx_lines) + return WbResponse.text_stream(cdx_lines) diff --git a/pywb/framework/__init__.py b/pywb/framework/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pywb/archivalrouter.py b/pywb/framework/archivalrouter.py similarity index 74% rename from pywb/archivalrouter.py rename to pywb/framework/archivalrouter.py index 5d3dc9f4..6c901fac 100644 --- a/pywb/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -1,17 +1,31 @@ import urlparse import re -from wbrequestresponse import WbRequest, WbResponse from pywb.rewrite.url_rewriter import UrlRewriter +from wbrequestresponse import WbRequest, WbResponse #================================================================= # ArchivalRouter -- route WB requests in archival mode #================================================================= -class ArchivalRouter: - def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None): +class ArchivalRouter(object): + def __init__(self, routes, + hostpaths=None, + port=None, + abs_path=True, + home_view=None, + error_view=None): + self.routes = routes - self.fallback = ReferRedirect(hostpaths) + + # optional port setting may be ignored by wsgi container + self.port = port + + if hostpaths: + self.fallback = ReferRedirect(hostpaths) + else: + self.fallback = None + self.abs_path = abs_path self.home_view = home_view @@ -29,26 +43,27 @@ class ArchivalRouter: return self.fallback(env, self.routes) if self.fallback else None - def render_home_page(self): # render the homepage! if self.home_view: - return self.home_view.render_response(routes = self.routes) + return self.home_view.render_response(routes=self.routes) else: # default home page template text = '\n'.join(map(str, self.routes)) return WbResponse.text_response(text) + #================================================================= # Route by matching regex (or fixed prefix) # of request uri (excluding first '/') #================================================================= -class Route: +class Route(object): # match upto next / or ? or end - SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)' + SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)' + def __init__(self, regex, handler, coll_group=0, config={}, + lookahead=SLASH_QUERY_LOOKAHEAD): - def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex if regex: self.regex = re.compile(regex + lookahead) @@ -59,12 +74,11 @@ class Route: self.coll_group = coll_group self._custom_init(config) - def __call__(self, env, use_abs_prefix): wbrequest = self.parse_request(env, use_abs_prefix) return self.handler(wbrequest) if wbrequest else None - def parse_request(self, env, use_abs_prefix, request_uri = None): + def parse_request(self, env, use_abs_prefix, request_uri=None): if not request_uri: request_uri = env['REL_REQUEST_URI'] @@ -75,10 +89,12 @@ class Route: matched_str = matcher.group(0) if matched_str: rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/' - wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri + # remove the '/' + rel_prefix part of uri + wb_url_str = request_uri[len(matched_str) + 2:] else: rel_prefix = env['SCRIPT_NAME'] + '/' - wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll + # the request_uri is the wb_url, since no coll + wb_url_str = request_uri[1:] coll = matcher.group(self.coll_group) @@ -88,20 +104,19 @@ class Route: rel_prefix=rel_prefix, coll=coll, use_abs_prefix=use_abs_prefix, - wburl_class = self.handler.get_wburl_type(), + wburl_class=self.handler.get_wburl_type(), urlrewriter_class=UrlRewriter) - # Allow for applying of additional filters self._apply_filters(wbrequest, matcher) return wbrequest - def _apply_filters(self, wbrequest, matcher): for filter in self.filters: last_grp = len(matcher.groups()) - wbrequest.query_filter.append(filter.format(matcher.group(last_grp))) + filter_str = filter.format(matcher.group(last_grp)) + wbrequest.query_filter.append(filter_str) def _custom_init(self, config): self.filters = config.get('filters', []) @@ -112,7 +127,8 @@ class Route: #================================================================= -# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings +# ReferRedirect -- redirect urls that have 'fallen through' +# based on the referrer settings #================================================================= class ReferRedirect: def __init__(self, match_prefixs): @@ -121,7 +137,6 @@ class ReferRedirect: else: self.match_prefixs = [match_prefixs] - def __call__(self, env, routes): referrer = env.get('HTTP_REFERER') @@ -133,7 +148,7 @@ class ReferRedirect: ref_split = urlparse.urlsplit(referrer) # ensure referrer starts with one of allowed hosts - if not any (referrer.startswith(i) for i in self.match_prefixs): + if not any(referrer.startswith(i) for i in self.match_prefixs): if ref_split.netloc != env.get('HTTP_HOST'): return None @@ -144,13 +159,12 @@ class ReferRedirect: if app_path: # must start with current app name, if not root if not path.startswith(app_path): - return None + return None path = path[len(app_path):] - for route in routes: - ref_request = route.parse_request(env, False, request_uri = path) + ref_request = route.parse_request(env, False, request_uri=path) if ref_request: break @@ -174,6 +188,10 @@ class ReferRedirect: # 2013/path.html -> /path.html rel_request_uri = rel_request_uri[len(timestamp_path) - 1:] - final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', '')) + final_url = urlparse.urlunsplit((ref_split.scheme, + ref_split.netloc, + rewriter.rewrite(rel_request_uri), + '', + '')) return WbResponse.redir_response(final_url) diff --git a/pywb/framework/basehandlers.py b/pywb/framework/basehandlers.py new file mode 100644 index 00000000..8ae4d662 --- /dev/null +++ b/pywb/framework/basehandlers.py @@ -0,0 +1,23 @@ +from pywb.rewrite.wburl import WbUrl + + +#================================================================= +class BaseHandler(object): + """ + Represents a base handler class that handles any request + """ + def __call__(self, wbrequest): + return wbrequest + + def get_wburl_type(self): + return None + + +#================================================================= +class WbUrlHandler(BaseHandler): + """ + Represents a handler which assumes the request contains a WbUrl + Ensure that the WbUrl is parsed in the request + """ + def get_wburl_type(self): + return WbUrl diff --git a/pywb/proxy.py b/pywb/framework/proxy.py similarity index 68% rename from pywb/proxy.py rename to pywb/framework/proxy.py index fc14d1e5..d27b922e 100644 --- a/pywb/proxy.py +++ b/pywb/framework/proxy.py @@ -2,23 +2,37 @@ from wbrequestresponse import WbResponse, WbRequest from archivalrouter import ArchivalRouter import urlparse + #================================================================= # An experimental router which combines both archival and proxy modes -# http proxy mode support is very simple: only latest capture is available currently +# http proxy mode support is very simple so far: +# only latest capture is available currently #================================================================= +class ProxyArchivalRouter(ArchivalRouter): + def __init__(self, routes, + hostpaths=None, + port=None, + abs_path=True, + home_view=None, + error_view=None): + + (super(ProxyArchivalRouter, self). + __init__(routes, + hostpaths=hostpaths, + port=port, + abs_path=abs_path, + home_view=home_view, + error_view=error_view)) -class ProxyArchivalRouter: - def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None): - self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view) self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view) - self.error_view = error_view + #self.error_view = error_view def __call__(self, env): - response = self.archival(env) + response = self.proxy(env) if response: return response - response = self.proxy(env) + response = super(ProxyArchivalRouter, self).__call__(env) if response: return response @@ -29,7 +43,7 @@ class ProxyArchivalRouter: # Only supports latest capture replay at the moment #================================================================= class ProxyRouter: - def __init__(self, handler, hostpaths = None, error_view = None): + def __init__(self, handler, hostpaths=None, error_view=None): self.handler = handler self.hostpaths = hostpaths @@ -56,27 +70,26 @@ class ProxyRouter: return self.handler(wbrequest) - # Proxy Auto-Config (PAC) script for the proxy def make_pac_response(self, env): server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT'] buff = 'function FindProxyForURL (url, host) {\n' - direct_cond =' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n' + direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n' for hostpath in self.hostpaths: parts = urlparse.urlsplit(hostpath).netloc.split(':') - buff += direct_cond.format(parts[0]) + buff += direct.format(parts[0]) - buff += direct_cond.format(env['SERVER_NAME']) + buff += direct.format(env['SERVER_NAME']) #buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0]) buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport) - return WbResponse.text_response(buff, content_type = 'application/x-ns-proxy-autoconfig') - + content_type = 'application/x-ns-proxy-autoconfig' + return WbResponse.text_response(buff, content_type=content_type) #================================================================= @@ -85,10 +98,11 @@ class ProxyRouter: class ProxyHttpsUrlRewriter: HTTP = 'http://' HTTPS = 'https://' + def __init__(self, wbrequest, prefix): pass - def rewrite(self, url, mod = None): + def rewrite(self, url, mod=None): if url.startswith(self.HTTPS): return self.HTTP + url[len(self.HTTPS):] else: @@ -97,6 +111,5 @@ class ProxyHttpsUrlRewriter: def get_timestamp_url(self, timestamp, url): return url - def get_abs_url(self, url = ''): + def get_abs_url(self, url=''): return url - diff --git a/pywb/test/test_archivalrouter.py b/pywb/framework/test/test_archivalrouter.py similarity index 97% rename from pywb/test/test_archivalrouter.py rename to pywb/framework/test/test_archivalrouter.py index 229fafb6..706027ba 100644 --- a/pywb/test/test_archivalrouter.py +++ b/pywb/framework/test/test_archivalrouter.py @@ -84,8 +84,8 @@ False """ -from pywb.archivalrouter import Route, ReferRedirect -from pywb.handlers import BaseHandler, WbUrlHandler +from pywb.framework.archivalrouter import Route, ReferRedirect +from pywb.framework.basehandlers import BaseHandler, WbUrlHandler import pprint def print_req(req): diff --git a/pywb/test/test_wbrequestresponse.py b/pywb/framework/test/test_wbrequestresponse.py similarity index 98% rename from pywb/test/test_wbrequestresponse.py rename to pywb/framework/test/test_wbrequestresponse.py index 600ec926..977a8863 100644 --- a/pywb/test/test_wbrequestresponse.py +++ b/pywb/framework/test/test_wbrequestresponse.py @@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl from pywb.rewrite.url_rewriter import UrlRewriter from pywb.utils.statusandheaders import StatusAndHeaders -from pywb.wbrequestresponse import WbRequest, WbResponse +from pywb.framework.wbrequestresponse import WbRequest, WbResponse def print_req_from_uri(request_uri, env={}, use_abs_prefix=False): diff --git a/pywb/wbexceptions.py b/pywb/framework/wbexceptions.py similarity index 81% rename from pywb/wbexceptions.py rename to pywb/framework/wbexceptions.py index afacc325..6d437a4e 100644 --- a/pywb/wbexceptions.py +++ b/pywb/framework/wbexceptions.py @@ -1,23 +1,22 @@ +from pywb.utils.wbexception import WbException -class WbException(Exception): - pass - class NotFoundException(WbException): def status(self): return '404 Not Found' + # Exceptions that effect a specific capture and result in a retry class CaptureException(WbException): def status(self): return '500 Internal Server Error' + class InternalRedirect(WbException): - def __init__(self, location, status = '302 Internal Redirect'): + def __init__(self, location, status='302 Internal Redirect'): WbException.__init__(self, 'Redirecting -> ' + location) self.status = status self.httpHeaders = [('Location', location)] def status(self): return self.status - diff --git a/pywb/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py similarity index 76% rename from pywb/wbrequestresponse.py rename to pywb/framework/wbrequestresponse.py index 4a459c4b..3ef091d9 100644 --- a/pywb/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -26,7 +26,6 @@ class WbRequest: except KeyError: return '' - def __init__(self, env, request_uri=None, rel_prefix='', @@ -40,7 +39,10 @@ class WbRequest: self.env = env - self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI') + if request_uri: + self.request_uri = request_uri + else: + self.request_uri = env.get('REL_REQUEST_URI') self.coll = coll @@ -55,7 +57,6 @@ class WbRequest: else: self.wb_prefix = rel_prefix - if not wb_url_str: wb_url_str = '/' @@ -83,7 +84,6 @@ class WbRequest: # PERF env['X_PERF'] = {} - def _is_ajax(self): value = self.env.get('HTTP_X_REQUESTED_WITH') if not value: @@ -96,7 +96,6 @@ class WbRequest: return True return False - def __repr__(self): varlist = vars(self) varstr = pprint.pformat(varlist) @@ -111,32 +110,39 @@ class WbResponse: Holds a status_headers object and a response iter, to be returned to wsgi container. """ - def __init__(self, status_headers, value = []): + def __init__(self, status_headers, value=[]): self.status_headers = status_headers self.body = value @staticmethod - def text_stream(text, status = '200 OK', content_type = 'text/plain'): - return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = text) + def text_stream(stream, status='200 OK', content_type='text/plain'): + status_headers = StatusAndHeaders(status, + [('Content-Type', content_type)]) + + return WbResponse(status_headers, value=stream) @staticmethod - def text_response(text, status = '200 OK', content_type = 'text/plain'): - return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = [text]) + def text_response(text, status='200 OK', content_type='text/plain'): + status_headers = StatusAndHeaders(status, + [('Content-Type', content_type)]) + + return WbResponse(status_headers, value=[text]) @staticmethod - def redir_response(location, status = '302 Redirect'): - return WbResponse(StatusAndHeaders(status, [('Location', location)])) - + def redir_response(location, status='302 Redirect'): + return WbResponse(StatusAndHeaders(status, + [('Location', location)])) def __call__(self, env, start_response): # PERF perfstats = env.get('X_PERF') if perfstats: - self.status_headers.headers.append(('X-Archive-Perf-Stats', str(perfstats))) + self.status_headers.headers.append(('X-Archive-Perf-Stats', + str(perfstats))) - - start_response(self.status_headers.statusline, self.status_headers.headers) + start_response(self.status_headers.statusline, + self.status_headers.headers) if env['REQUEST_METHOD'] == 'HEAD': if hasattr(self.body, 'close'): @@ -148,6 +154,5 @@ class WbResponse: else: return [str(self.body)] - def __repr__(self): return str(vars(self)) diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py new file mode 100644 index 00000000..1dd433de --- /dev/null +++ b/pywb/framework/wsgi_wrappers.py @@ -0,0 +1,165 @@ +from pywb.utils.wbexception import WbException +from pywb.utils.loaders import load_yaml_config + +from wbexceptions import NotFoundException, InternalRedirect +from wbrequestresponse import WbResponse, StatusAndHeaders + + +import os +import importlib +import logging + + +DEFAULT_PORT = 8080 + +#================================================================= +# adapted from wsgiref.request_uri, but doesn't include domain name +# and allows all characters which are allowed in the path segment +# according to: http://tools.ietf.org/html/rfc3986#section-3.3 +# explained here: +# http://stackoverflow.com/questions/4669692/ +# valid-characters-for-directory-part-of-a-url-for-short-links + + +def rel_request_uri(environ, include_query=1): + """ + Return the requested path, optionally including the query string + + # Simple test: + >>> rel_request_uri({'PATH_INFO': '/web/example.com'}) + '/web/example.com' + + # Test all unecoded special chars and double-quote + # (double-quote must be encoded but not single quote) + >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""}) + "/web/example.com/0~!+$&'()*+,;=:%22" + """ + from urllib import quote + url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@') + if include_query and environ.get('QUERY_STRING'): + url += '?' + environ['QUERY_STRING'] + + return url + + +#================================================================= +class WSGIApp(object): + def __init__(self, wb_router): + self.wb_router = wb_router + self.port = DEFAULT_PORT + if hasattr(wb_router, 'port'): + self.port = wb_router.port + + # Top-level wsgi application + def __call__(self, env, start_response): + if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): + env['REL_REQUEST_URI'] = rel_request_uri(env) + else: + env['REL_REQUEST_URI'] = env['REQUEST_URI'] + + wb_router = self.wb_router + response = None + + try: + response = wb_router(env) + + if not response: + msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI']) + raise NotFoundException(msg) + + except InternalRedirect as ir: + response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) + + except WbException as e: + response = handle_exception(env, wb_router.error_view, e, False) + + except Exception as e: + response = handle_exception(env, wb_router.error_view, e, True) + + return response(env, start_response) + + +#================================================================= +def handle_exception(env, error_view, exc, print_trace): + if hasattr(exc, 'status'): + status = exc.status() + else: + status = '400 Bad Request' + + if print_trace: + import traceback + err_details = traceback.format_exc(exc) + print err_details + else: + logging.info(str(exc)) + err_details = None + + if error_view: + import traceback + return error_view.render_response(err_msg=str(exc), + err_details=err_details, + status=status) + else: + return WbResponse.text_response(status + ' Error: ' + str(exc), + status=status) + +#================================================================= +DEFAULT_CONFIG_FILE = 'config.yaml' + + +#================================================================= +def init_app(init_func, load_yaml=True, config_file=None): + logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s', + level=logging.DEBUG) + logging.info('') + + if load_yaml: + if not config_file: + config_file = os.environ.get('PYWB_CONFIG_FILE') + if not config_file: + config_file = DEFAULT_CONFIG_FILE + + config = load_yaml_config(config_file) + + try: + if load_yaml: + wb_router = init_func(config) + else: + wb_router = init_func() + except: + msg = '*** pywb app init FAILED config from "%s"!\n' + logging.exception(msg, init_func.__name__) + raise + else: + msg = '*** pywb app inited with config from "%s"!\n' + logging.info(msg, init_func.__name__) + + return WSGIApp(wb_router) + + +#================================================================= +def start_wsgi_server(the_app): + from wsgiref.simple_server import make_server + from optparse import OptionParser + + opt = OptionParser('%prog [OPTIONS]') + opt.add_option('-p', '--port', type='int', default=None) + + options, args = opt.parse_args() + + port = options.port + + port = the_app.port + + if not port: + port = DEFAULT_PORT + + logging.debug('Starting CDX Server on port %s', port) + + try: + httpd = make_server('', port, the_app) + httpd.serve_forever() + except KeyboardInterrupt as ex: + pass + + logging.debug('Stopping CDX Server') diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py deleted file mode 100644 index bd63bfd5..00000000 --- a/pywb/pywb_init.py +++ /dev/null @@ -1,128 +0,0 @@ -import handlers -import archivalrouter -import config_utils -import proxy -from indexreader import IndexReader - -import os -import yaml -import logging - -#================================================================= -DEFAULTS = { - 'hostpaths': ['http://localhost:8080'], - 'collections': {'pywb': './sample_archive/cdx/'}, - 'archive_paths': './sample_archive/warcs/', - - 'head_insert_html': 'ui/head_insert.html', - 'query_html': 'ui/query.html', - 'search_html': 'ui/search.html', - 'home_html': 'ui/index.html', - 'error_html': 'ui/error.html', - - 'static_routes': {'static/default': 'static/'}, - - 'domain_specific_rules': 'rules.yaml', -} - -class DictChain: - def __init__(self, *dicts): - self.dicts = dicts - - def get(self, key, default_val=None): - for d in self.dicts: - val = d.get(key) - if val is not None: - return val - return default_val - - -#================================================================= -## Reference non-YAML config -#================================================================= -def pywb_config_manual(passed_config = {}): - - config = DictChain(passed_config, DEFAULTS) - - routes = [] - - hostpaths = config.get('hostpaths') - - # collections based on cdx source - collections = config.get('collections') - - for name, value in collections.iteritems(): - if isinstance(value, str): - value = {'index_paths': value} - - route_config = DictChain(value, config) - - ds_rules_file = route_config.get('domain_specific_rules', None) - cdx_server = IndexReader(route_config, ds_rules_file) - - wb_handler = config_utils.create_wb_handler( - cdx_server=cdx_server, - config=route_config, - ds_rules_file=ds_rules_file, - ) - - logging.debug('Adding Collection: ' + name) - - route_class = route_config.get('route_class', archivalrouter.Route) - - routes.append(route_class(name, wb_handler, config = route_config)) - - # cdx query handler - if route_config.get('enable_cdx_api', False): - routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server))) - - - if config.get('debug_echo_env', False): - routes.append(archivalrouter.Route('echo_env', handlers.DebugEchoEnvHandler())) - - if config.get('debug_echo_req', False): - routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler())) - - - static_routes = config.get('static_routes') - - for static_name, static_path in static_routes.iteritems(): - routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path))) - - # Check for new proxy mode! - if config.get('enable_http_proxy', False): - router = proxy.ProxyArchivalRouter - else: - router = archivalrouter.ArchivalRouter - - # Finally, create wb router - return router( - routes, - # Specify hostnames that pywb will be running on - # This will help catch occasionally missed rewrites that fall-through to the host - # (See archivalrouter.ReferRedirect) - hostpaths = hostpaths, - - abs_path = config.get('absolute_paths', True), - - home_view = config_utils.load_template_file(config.get('home_html'), 'Home Page'), - error_view = config_utils.load_template_file(config.get('error_html'), 'Error Page') - ) - - - -#================================================================= -# YAML config loader -#================================================================= -DEFAULT_CONFIG_FILE = 'config.yaml' - - -def pywb_config(config_file = None): - if not config_file: - config_file = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE) - - with open(config_file) as fh: - config = yaml.load(fh) - - return pywb_config_manual(config) - diff --git a/pywb/utils/canonicalize.py b/pywb/utils/canonicalize.py index 73555ca6..6979a323 100644 --- a/pywb/utils/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -4,6 +4,9 @@ import surt import urlparse +from wbexception import WbException + + #================================================================= class UrlCanonicalizer(object): def __init__(self, surt_ordered=True): @@ -14,7 +17,7 @@ class UrlCanonicalizer(object): #================================================================= -class UrlCanonicalizeException(Exception): +class UrlCanonicalizeException(WbException): def status(self): return '400 Bad Request' @@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None): elif match_type == 'domain': if not surt_ordered: - raise UrlCanonicalizeException('matchType=domain unsupported for non-surt') + msg = 'matchType=domain unsupported for non-surt' + raise UrlCanonicalizeException(msg) host = start_key.split(')/')[0] diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py index 2e6f9626..672ce738 100644 --- a/pywb/utils/dsrules.py +++ b/pywb/utils/dsrules.py @@ -1,10 +1,9 @@ -import yaml import pkgutil +from loaders import load_yaml_config + #================================================================= - -DEFAULT_RULES_FILE = 'rules.yaml' -DEFAULT_RULES_PKG = 'pywb' +DEFAULT_RULES_FILE = 'pywb/rules.yaml' #================================================================= @@ -23,10 +22,14 @@ class RuleSet(object): self.rules = [] - ds_rules_file = kwargs.get('ds_rules_file') default_rule_config = kwargs.get('default_rule_config') - config = self.load_default_rules(ds_rules_file) + ds_rules_file = kwargs.get('ds_rules_file') + + if not ds_rules_file: + ds_rules_file = DEFAULT_RULES_FILE + + config = load_yaml_config(ds_rules_file) rulesmap = config.get('rules') if config else None @@ -53,22 +56,6 @@ class RuleSet(object): if not def_key_found and default_rule_config is not None: self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config)) - @staticmethod - def load_default_rules(filename=None, pkg=None): - config = None - - if not filename: - filename = DEFAULT_RULES_FILE - - if not pkg: - pkg = DEFAULT_RULES_PKG - - if filename: - yaml_str = pkgutil.get_data(pkg, filename) - config = yaml.load(yaml_str) - - return config - def iter_matching(self, urlkey): """ Iterate over all matching rules for given urlkey diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 7813ded8..d2ca827f 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -7,11 +7,20 @@ import os import hmac import urllib2 import time +import pkg_resources #================================================================= def is_http(filename): - return any(filename.startswith(x) for x in ['http://', 'https://']) + return filename.startswith(('http://', 'https://')) + + +#================================================================= +def load_yaml_config(config_file): + import yaml + configdata = BlockLoader().load(config_file) + config = yaml.load(configdata) + return config #================================================================= @@ -24,27 +33,46 @@ class BlockLoader(object): def __init__(self, cookie_maker=None): self.cookie_maker = cookie_maker - def load(self, url, offset, length): + def load(self, url, offset=0, length=-1): """ Determine loading method based on uri """ if is_http(url): return self.load_http(url, offset, length) else: - return self.load_file(url, offset, length) + return self.load_file_or_resource(url, offset, length) - def load_file(self, url, offset, length): + def load_file_or_resource(self, url, offset, length): """ Load a file-like reader from the local file system """ + file_only = False + if url.startswith('file://'): url = url[len('file://'):] + file_only = True - afile = open(url, 'rb') - afile.seek(offset) + try: + # first, try as file + afile = open(url, 'rb') - if length > 0: + except IOError: + if file_only: + raise + + # then, try as package.path/file + pkg_split = url.split('/', 1) + if len(pkg_split) == 1: + raise + + afile = pkg_resources.resource_stream(pkg_split[0], + pkg_split[1]) + + if offset > 0: + afile.seek(offset) + + if length >= 0: return LimitReader(afile, length) else: return afile diff --git a/pywb/utils/test/loaders_test.py b/pywb/utils/test/loaders_test.py index 7dc42d83..a8454816 100644 --- a/pywb/utils/test/loaders_test.py +++ b/pywb/utils/test/loaders_test.py @@ -30,9 +30,9 @@ >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline() ' CDX N b a m s k r M S V g\\n' -#DecompressingBufferedReader readline() with decompression ->>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() -' CDX N b a m s k r M S V g\\n' +#DecompressingBufferedReader readline() with decompression (zipnum file, no header) +>>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline() +'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\\n' >>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() 'Example Domain' @@ -60,7 +60,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb import get_test_dir #test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/' test_cdx_dir = get_test_dir() + 'cdx/' - +test_zip_dir = get_test_dir() + 'zipcdx/' def read_multiple(reader, inc_reads): result = None diff --git a/pywb/utils/timeutils.py b/pywb/utils/timeutils.py index f93f324d..a89424aa 100644 --- a/pywb/utils/timeutils.py +++ b/pywb/utils/timeutils.py @@ -171,7 +171,6 @@ def timestamp_to_datetime(string): # pad to 6 digits string = _pad_timestamp(string, PAD_6) - def clamp(val, min_, max_): try: val = int(val) diff --git a/pywb/utils/wbexception.py b/pywb/utils/wbexception.py new file mode 100644 index 00000000..a8757935 --- /dev/null +++ b/pywb/utils/wbexception.py @@ -0,0 +1,3 @@ +class WbException(Exception): + def status(self): + return '500 Internal Server Error' diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 446e0da3..fb3af38c 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException from pywb.utils.loaders import BlockLoader from pywb.utils.bufferedreaders import DecompressingBufferedReader +from pywb.utils.wbexception import WbException + + #================================================================= ArcWarcRecord = collections.namedtuple('ArchiveRecord', 'type, rec_headers, ' + @@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord', #================================================================= -class ArchiveLoadFailed(Exception): +class ArchiveLoadFailed(WbException): def __init__(self, reason, filename=''): super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason)) #self.filename = filename @@ -62,9 +65,9 @@ class ArcWarcRecordLoader: decomp_type = 'gzip' # Create decompressing stream - stream = DecompressingBufferedReader(stream = raw, - decomp_type = decomp_type, - block_size = self.block_size) + stream = DecompressingBufferedReader(stream=raw, + decomp_type=decomp_type, + block_size=self.block_size) (the_format, rec_headers) = self._detect_type_load_headers(stream) diff --git a/pywb/warc/resolvingloader.py b/pywb/warc/resolvingloader.py index 041024e7..6a44739d 100644 --- a/pywb/warc/resolvingloader.py +++ b/pywb/warc/resolvingloader.py @@ -176,6 +176,6 @@ class ResolvingLoader: params = {'url': url, 'closest': timestamp, 'filter': 'digest:' + digest, - 'output': 'raw'} + 'output': 'cdxobject'} return self.cdx_server.load_cdx(**params) diff --git a/pywb/wbapp.py b/pywb/wbapp.py deleted file mode 100644 index ac51ba9d..00000000 --- a/pywb/wbapp.py +++ /dev/null @@ -1,124 +0,0 @@ -from wbexceptions import WbException, NotFoundException, InternalRedirect -from wbrequestresponse import WbResponse, StatusAndHeaders - -from pywb.cdx.cdxserver import CDXException -from pywb.utils.canonicalize import UrlCanonicalizeException -from pywb.warc.recordloader import ArchiveLoadFailed - -import os -import importlib -import logging - - - -#================================================================= -# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters -# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3 -# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links -def rel_request_uri(environ, include_query=1): - """ - Return the requested path, optionally including the query string - - # Simple test: - >>> rel_request_uri({'PATH_INFO': '/web/example.com'}) - '/web/example.com' - - # Test all unecoded special chars and double-quote - # (double-quote must be encoded but not single quote) - >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""}) - "/web/example.com/0~!+$&'()*+,;=:%22" - """ - from urllib import quote - url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@') - if include_query and environ.get('QUERY_STRING'): - url += '?' + environ['QUERY_STRING'] - - return url - -#================================================================= -def create_wb_app(wb_router): - - # Top-level wsgi application - def application(env, start_response): - if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'): - env['REL_REQUEST_URI'] = rel_request_uri(env) - else: - env['REL_REQUEST_URI'] = env['REQUEST_URI'] - - response = None - - try: - response = wb_router(env) - - if not response: - raise NotFoundException('No handler for "{0}"'.format(env['REL_REQUEST_URI'])) - - except InternalRedirect as ir: - response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) - - except (WbException, CDXException, - UrlCanonicalizeException, ArchiveLoadFailed) as e: - response = handle_exception(env, wb_router.error_view, e, False) - - except Exception as e: - response = handle_exception(env, wb_router.error_view, e, True) - - return response(env, start_response) - - - return application - - -def handle_exception(env, error_view, exc, print_trace): - if hasattr(exc, 'status'): - status = exc.status() - else: - status = '400 Bad Request' - - if print_trace: - import traceback - err_details = traceback.format_exc(exc) - print err_details - else: - logging.info(str(exc)) - err_details = None - - if error_view: - import traceback - return error_view.render_response(err_msg = str(exc), err_details = err_details, status = status) - else: - return WbResponse.text_response(status + ' Error: ' + str(exc), status = status) - - -#================================================================= -DEFAULT_CONFIG_FILE = 'config.yaml' - -def main(): - try: - logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) - - # see if there's a custom init module - config_name = os.environ.get('PYWB_CONFIG_MODULE') - - if not config_name: - # use default module - config_name = 'pywb.pywb_init' - logging.info('Loading from default config module "{0}"'.format(config_name)) - logging.info('') - - module = importlib.import_module(config_name) - - app = create_wb_app(module.pywb_config()) - logging.info('') - logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name)) - return app - - except Exception: - logging.exception('*** pywb could not init with settings from {0}.pywb_config()!\n'.format(config_name)) - raise - -#================================================================= -if __name__ == "__main__": - pass -else: - application = main() diff --git a/run.sh b/run.sh index d6e484b9..77964b32 100755 --- a/run.sh +++ b/run.sh @@ -10,14 +10,14 @@ mypath=$(cd `dirname $0` && pwd) # ex: my_pywb.pywb_config() #export 'PYWB_CONFIG=my_pywb' -app="pywb.wbapp" +app="pywb.apps.wayback" params="--http-socket :8080 -b 65536" #params="--static-map /static=$mypath/static --http-socket :8080 -b 65536" if [ -z "$1" ]; then # Standard root config - params="$params --wsgi pywb.wbapp" + params="$params --wsgi $app" else # run with --mount # requires a file not a package, so creating a mount_run.py to load the package diff --git a/sample_archive/cdx/iana.cdx.gz b/sample_archive/cdx/iana.cdx.gz deleted file mode 100644 index 11499ca5..00000000 Binary files a/sample_archive/cdx/iana.cdx.gz and /dev/null differ diff --git a/setup.py b/setup.py index c9ff86bd..54f136b4 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,14 @@ setup( license='GPL', packages=find_packages(), provides=[ - 'pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite' + 'pywb', + 'pywb.utils', + 'pywb.cdx', + 'pywb.warc', + 'pywb.rewrite', + 'pywb.framework' + 'pywb.core', + 'pywb.apps' ], package_data={ 'pywb': ['ui/*', 'static/*', '*.yaml'], @@ -34,7 +41,6 @@ setup( 'pyyaml', 'WebTest', 'pytest', - 'werkzeug>=0.9.4', ], # tests_require=['WebTest', 'pytest'], zip_safe=False diff --git a/test_config.yaml b/test_config.yaml index 20e52933..d6c75650 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -90,6 +90,9 @@ enable_http_proxy: true # enable cdx server api for querying cdx directly (experimental) enable_cdx_api: true +# test different port +port: 9000 + # optional reporter callback func # if set, called with request and cdx object reporter: !!python/object/new:tests.fixture.PrintReporter [] diff --git a/tests/test_wsgi_cdxserver.py b/tests/test_cdx_server_app.py similarity index 73% rename from tests/test_wsgi_cdxserver.py rename to tests/test_cdx_server_app.py index 8eee2484..613273b5 100644 --- a/tests/test_wsgi_cdxserver.py +++ b/tests/test_cdx_server_app.py @@ -1,32 +1,26 @@ -import os import re +import webtest -import pytest from urllib import urlencode -from werkzeug.test import Client -from werkzeug.wrappers import BaseResponse, Response - -import yaml - from pywb.cdx.cdxobject import CDXObject -from pywb.cdx.wsgi_cdxserver import create_app +from pywb.apps.cdx_server import application -from tests.fixture import testconfig +import pytest +#================================================================ @pytest.fixture -def client(testconfig): - app = create_app(testconfig) - return Client(app, Response) +def client(): + return webtest.TestApp(application) -# ================================================================ -def query(client, url, **params): +#================================================================ +def query(client, url, is_error=False, **params): params['url'] = url - return client.get('/cdx?' + urlencode(params, doseq=1)) + return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error) -# ================================================================ +#================================================================ def test_exact_url(client): """ basic exact match, no filters, etc. @@ -34,48 +28,54 @@ def test_exact_url(client): resp = query(client, 'http://www.iana.org/') assert resp.status_code == 200 - print resp.data + print resp.body + +#================================================================ def test_prefix_match(client): """ prefix match test """ resp = query(client, 'http://www.iana.org/', matchType='prefix') - print resp.data.splitlines() + print resp.body.splitlines() assert resp.status_code == 200 suburls = 0 - for l in resp.data.splitlines(): + for l in resp.body.splitlines(): fields = l.split(' ') if len(fields[0]) > len('org,iana)/'): suburls += 1 assert suburls > 0 - + + +#================================================================ def test_filters(client): """ filter cdxes by mimetype and filename field, exact match. """ resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz')) - - assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' - for l in resp.data.splitlines(): + assert resp.status_code == 200 + assert resp.content_type == 'text/plain' + + for l in resp.body.splitlines(): fields = l.split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' assert fields[3] == 'warc/revisit' assert fields[10] == 'dupes.warc.gz' + +#================================================================ def test_limit(client): resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css', limit='1') assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() assert len(cdxes) == 1 fields = cdxes[0].split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' @@ -86,15 +86,17 @@ def test_limit(client): limit='1', reverse='1') assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() assert len(cdxes) == 1 fields = cdxes[0].split(' ') assert fields[0] == 'org,iana)/_css/2013.1/screen.css' assert fields[1] == '20140127171239' assert fields[3] == 'warc/revisit' + +#================================================================ def test_fields(client): """ retrieve subset of fields with ``fields`` parameter. @@ -104,7 +106,7 @@ def test_fields(client): assert resp.status_code == 200 - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() for cdx in cdxes: fields = cdx.split(' ') @@ -113,16 +115,21 @@ def test_fields(client): assert re.match(r'\d{14}$', fields[1]) assert re.match(r'\d{3}|-', fields[2]) + +#================================================================ def test_fields_undefined(client): """ - server shall respond with Bad Request (TODO: with proper explanation), + server shall respond with Bad Request and name of undefined when ``fields`` parameter contains undefined name(s). """ resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', + is_error=True, fields='urlkey,nosuchfield') resp.status_code == 400 - + + +#================================================================ def test_resolveRevisits(client): """ with ``resolveRevisits=true``, server adds three fields pointing to @@ -132,9 +139,9 @@ def test_resolveRevisits(client): resolveRevisits='true' ) assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() originals = {} for cdx in cdxes: fields = cdx.split(' ') @@ -151,6 +158,8 @@ def test_resolveRevisits(client): orig = originals.get(sha) assert orig == (int(orig_size), int(orig_offset), orig_fn) + +#================================================================ def test_resolveRevisits_orig_fields(client): """ when resolveRevisits=true, extra three fields are named @@ -162,9 +171,9 @@ def test_resolveRevisits_orig_fields(client): fields='urlkey,orig.length,orig.offset,orig.filename' ) assert resp.status_code == 200 - assert resp.mimetype == 'text/plain' + assert resp.content_type == 'text/plain' - cdxes = resp.data.splitlines() + cdxes = resp.body.splitlines() for cdx in cdxes: fields = cdx.split(' ') assert len(fields) == 4 @@ -172,6 +181,8 @@ def test_resolveRevisits_orig_fields(client): assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or (int(orig_len), int(orig_offset), orig_fn)) + +#================================================================ def test_collapseTime_resolveRevisits_reverse(client): resp = query(client, 'http://www.iana.org/_css/2013.1/print.css', collapseTime='11', @@ -179,11 +190,10 @@ def test_collapseTime_resolveRevisits_reverse(client): reverse='true' ) - cdxes = [CDXObject(l) for l in resp.data.splitlines()] - + cdxes = [CDXObject(l) for l in resp.body.splitlines()] + assert len(cdxes) == 3 # timestamp is in descending order for i in range(len(cdxes) - 1): assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp'] - diff --git a/tests/test_integration.py b/tests/test_integration.py index 6e24ec6a..6e539c31 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,6 @@ import webtest -from pywb.pywb_init import pywb_config -from pywb.wbapp import create_wb_app +from pywb.core.pywb_init import create_wb_router +from pywb.framework.wsgi_wrappers import init_app from pywb.cdx.cdxobject import CDXObject from fixture import TestExclusionPerms @@ -11,8 +11,13 @@ class TestWb: def setup(self): #self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config()) # save it in self - useful for debugging - self.router = pywb_config(self.TEST_CONFIG) - self.app = create_wb_app(self.router) + self.app = init_app(create_wb_router, + load_yaml=True, + config_file=self.TEST_CONFIG) + + #self.router = pywb_config(self.TEST_CONFIG) + #self.app = create_wb_app(self.router) + self.testapp = webtest.TestApp(self.app) def _assert_basic_html(self, resp):