From ac0bf5a415b01755d31851393e099a2bd2c78413 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 23 Mar 2014 12:44:28 -0700 Subject: [PATCH] refactor: IndexReader -> QueryHandler, move query output support to QueryHandler. allow for multiple query views in QueryHandler --- pywb/core/cdx_handler.py | 33 +++++------ pywb/core/handlers.py | 24 +++----- pywb/core/indexreader.py | 105 ++++++++++++++++++++++++++++------ pywb/core/pywb_init.py | 70 ++++++++++------------- pywb/core/views.py | 50 ++++++++-------- pywb/perms/test/test_perms.py | 15 +++-- pywb/rewrite/wburl.py | 8 +++ 7 files changed, 178 insertions(+), 127 deletions(-) diff --git a/pywb/core/cdx_handler.py b/pywb/core/cdx_handler.py index 1a549857..ca5f317c 100644 --- a/pywb/core/cdx_handler.py +++ b/pywb/core/cdx_handler.py @@ -2,32 +2,31 @@ from pywb.cdx.cdxserver import create_cdx_server from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.basehandlers import BaseHandler +from pywb.framework.wbrequestresponse import WbResponse -from indexreader import IndexReader -from views import TextCapturesView +from query_handler import QueryHandler from urlparse import parse_qs #================================================================= -class CDXHandler(BaseHandler): +class CDXAPIHandler(BaseHandler): """ Handler which passes wsgi request to cdx server and - returns a text-based cdx response + returns a text-based cdx api """ - def __init__(self, index_reader, view=None): - self.index_reader = index_reader - self.view = view if view else TextCapturesView() + def __init__(self, index_handler): + self.index_handler = index_handler def __call__(self, wbrequest): params = self.extract_params_from_wsgi_env(wbrequest.env) - cdx_iter = self.index_reader.load_cdx(wbrequest, params) + cdx_iter = self.index_handler.load_cdx(wbrequest, params) - return self.view.render_response(wbrequest, cdx_iter) + return WbResponse.text_stream(cdx_iter) def __str__(self): - return 'CDX Handler: ' 
+ str(self.index_reader) + return 'CDX Handler: ' + str(self.index_handler) @staticmethod def extract_params_from_wsgi_env(env): @@ -46,25 +45,21 @@ class CDXHandler(BaseHandler): if not 'output' in params: params['output'] = 'text' + elif params['output'] not in ('text'): + params['output'] = 'text' return params -#================================================================= -DEFAULT_RULES = 'pywb/rules.yaml' - - #================================================================= def create_cdx_server_app(config): """ Create a cdx server config to be wrapped in a wsgi app - Currently using single access point '/cdx' + Currently using single access point '/cdx' to expose the api TODO: more complex example with multiple collections? """ - cdx_server = create_cdx_server(config, DEFAULT_RULES) - perms_policy = config.get('perms_policy') - cdx_server = IndexReader(cdx_server, perms_policy) + query_handler = QueryHandler.init_from_config(config) port = config.get('port') - routes = [Route('cdx', CDXHandler(cdx_server))] + routes = [Route('cdx', CDXAPIHandler(query_handler))] return ArchivalRouter(routes, port=port) diff --git a/pywb/core/handlers.py b/pywb/core/handlers.py index 6fa726a3..5e85b1dc 100644 --- a/pywb/core/handlers.py +++ b/pywb/core/handlers.py @@ -8,26 +8,18 @@ from pywb.utils.loaders import BlockLoader from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.wbrequestresponse import WbResponse -from views import TextCapturesView - #================================================================= # Standard WB Handler #================================================================= class WBHandler(WbUrlHandler): def __init__(self, index_reader, replay, - html_view=None, search_view=None): + search_view=None): self.index_reader = index_reader self.replay = replay - self.text_query_view = TextCapturesView() - - self.query_view = html_view - if not self.query_view: - self.query_view = self.text_query_view - 
self.search_view = search_view def __call__(self, wbrequest): @@ -35,20 +27,18 @@ class WBHandler(WbUrlHandler): return self.render_search_page(wbrequest) with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: - cdx_lines = self.index_reader.load_for_request(wbrequest) + response = self.index_reader.load_for_request(wbrequest) - # new special modifier to always show cdx index - if wbrequest.wb_url.mod == 'cdx_': - return self.text_query_view.render_response(wbrequest, cdx_lines) + if isinstance(response, WbResponse): + return response - if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or - (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)): - return self.query_view.render_response(wbrequest, cdx_lines) + cdx_lines = response[0] + cdx_callback = response[1] with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: return self.replay(wbrequest, cdx_lines, - self.index_reader.cdx_load_callback(wbrequest)) + cdx_callback) def render_search_page(self, wbrequest): if self.search_view: diff --git a/pywb/core/indexreader.py b/pywb/core/indexreader.py index a605f479..dbf404a0 100644 --- a/pywb/core/indexreader.py +++ b/pywb/core/indexreader.py @@ -2,25 +2,59 @@ import urllib import urllib2 from pywb.perms.perms_filter import make_perms_cdx_filter +from pywb.framework.wbrequestresponse import WbResponse +from pywb.cdx.cdxserver import create_cdx_server + #================================================================= -class IndexReader(object): +DEFAULT_RULES = 'pywb/rules.yaml' + + +#================================================================= +class QueryHandler(object): """ - Main interface for reading index (currently only CDX) from a + Main interface for querying the index (currently only CDX) from a source server (currently a cdx server) - Creates an appropriate query based on wbrequest type info + Creates an appropriate query based on wbrequest type info and + returns a view for the cdx, either a raw cdx iter, an html view, + etc...
""" - def __init__(self, cdx_server, perms_policy=None): + def __init__(self, cdx_server, html_query_view=None, perms_policy=None): self.cdx_server = cdx_server self.perms_policy = perms_policy + self.views = {} + if html_query_view: + self.views['html'] = html_query_view + + @staticmethod + def init_from_config(config, + ds_rules_file=DEFAULT_RULES, + html_view=None): + + perms_policy = config.get('perms_policy') + + cdx_server = create_cdx_server(config, ds_rules_file) + + return QueryHandler(cdx_server, html_view, perms_policy) + def load_for_request(self, wbrequest): - wburl = wbrequest.wb_url + wb_url = wbrequest.wb_url + + # cdx server only supports text and cdxobject for now + if wb_url.mod == 'cdx_': + output = 'text' + elif wb_url.mod == 'timemap_': + output = 'timemap' + elif wb_url.is_query(): + output = 'html' + else: + output = 'cdxobject' # init standard params - params = self.get_query_params(wburl) + params = self.get_query_params(wb_url) # add any custom filter from the request if wbrequest.query_filter: @@ -30,12 +64,15 @@ class IndexReader(object): params.update(wbrequest.custom_params) params['allowFuzzy'] = True - params['url'] = wburl.url - params['output'] = 'cdxobject' + params['url'] = wb_url.url + params['output'] = output - cdxlines = self.load_cdx(wbrequest, params) + cdx_iter = self.load_cdx(wbrequest, params) - return cdxlines + if wb_url.is_replay(): + return (cdx_iter, self.cdx_load_callback(wbrequest)) + + return self.make_cdx_response(wbrequest, params, cdx_iter) def load_cdx(self, wbrequest, params): if self.perms_policy: @@ -43,7 +80,19 @@ class IndexReader(object): if perms_op: params['custom_ops'] = [perms_op] - return self.cdx_server.load_cdx(**params) + cdx_iter = self.cdx_server.load_cdx(**params) + return cdx_iter + + def make_cdx_response(self, wbrequest, params, cdx_iter): + output = params['output'] + + # if not text, the iterator is assumed to be CDXObjects + if output and output != 'text': + view = 
self.views.get(output) + if view: + return view.render_response(wbrequest, cdx_iter) + + return WbResponse.text_stream(cdx_iter) def cdx_load_callback(self, wbrequest): def load_cdx(params): @@ -52,23 +101,45 @@ class IndexReader(object): return load_cdx - def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100): + def get_query_params(self, + wburl, limit=150000, + collapse_time=None, + replay_closest=100): + if wburl.type == wburl.URL_QUERY: raise NotImplementedError('Url Query Not Yet Supported') return { wburl.QUERY: - {'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit}, + {'collapseTime': collapse_time, + 'filter': ['!statuscode:(500|502|504)'], + 'limit': limit, + }, wburl.URL_QUERY: - {'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit, - 'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', + {'collapse': 'urlkey', + 'matchType': 'prefix', + 'showGroupCount': True, + 'showUniqCount': True, + 'lastSkipTimestamp': True, + 'limit': limit, + 'fl': ('urlkey,original,timestamp,' + + 'endtimestamp,groupcount,uniqcount'), }, wburl.REPLAY: - {'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True}, + {'sort': 'closest', + 'filter': ['!statuscode:(500|502|504)'], + 'limit': replay_closest, + 'closest': wburl.timestamp, + 'resolveRevisits': True, + }, wburl.LATEST_REPLAY: - {'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True} + {'sort': 'reverse', + 'filter': ['statuscode:[23]..'], + 'limit': '1', + 'resolveRevisits': True, + } }[wburl.type] diff --git a/pywb/core/pywb_init.py b/pywb/core/pywb_init.py index 1b3c28d4..c46c06c2 100644 --- a/pywb/core/pywb_init.py +++ b/pywb/core/pywb_init.py @@ -9,16 +9,14 @@ from pywb.warc.resolvingloader import ResolvingLoader from 
pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewriterules import use_lxml_parser -from pywb.cdx.cdxserver import create_cdx_server - -from indexreader import IndexReader -from views import J2TemplateView, J2HtmlCapturesView +from views import load_template_file, load_query_template from replay_views import ReplayView +from query_handler import QueryHandler from handlers import WBHandler from handlers import StaticHandler -from cdx_handler import CDXHandler from handlers import DebugEchoHandler, DebugEchoEnvHandler +from cdx_handler import CDXAPIHandler import os @@ -46,6 +44,7 @@ DEFAULTS = { 'use_lxml_parser': True, } + #================================================================= class DictChain: def __init__(self, *dicts): @@ -60,18 +59,9 @@ class DictChain: #================================================================= -def load_template_file(file, desc=None, view_class=J2TemplateView): - if file: - logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) - file = view_class(file) +def create_wb_handler(query_handler, config, ds_rules_file=None): - return file - - -#================================================================= -def create_wb_handler(cdx_server, config, ds_rules_file=None): - - cookie_maker=config.get('cookie_maker') + cookie_maker = config.get('cookie_maker') record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) paths = config.get('archive_paths') @@ -98,20 +88,15 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None): reporter=config.get('reporter') ) - html_view = load_template_file(config.get('query_html'), - 'Captures Page', - J2HtmlCapturesView) - - search_view = load_template_file(config.get('search_html'), 'Search Page') wb_handler_class = config.get('wb_handler_class', WBHandler) wb_handler = wb_handler_class( - cdx_server, + query_handler, replayer, - html_view=html_view, + #html_view=html_view, search_view=search_view, ) @@ -119,7 +104,7 @@ def 
create_wb_handler(cdx_server, config, ds_rules_file=None): #================================================================= -def create_wb_router(passed_config = {}): +def create_wb_router(passed_config={}): config = DictChain(passed_config, DEFAULTS) @@ -153,15 +138,20 @@ def create_wb_router(passed_config = {}): ds_rules_file = route_config.get('domain_specific_rules', None) - perms_policy = route_config.get('perms_policy', None) + #perms_policy = route_config.get('perms_policy', None) + # + #cdx_server = create_cdx_server(route_config, + # ds_rules_file) + # + html_view = load_query_template(config.get('query_html'), + 'Captures Page') - cdx_server = create_cdx_server(route_config, - ds_rules_file) - - cdx_server = IndexReader(cdx_server, perms_policy) + query_handler = QueryHandler.init_from_config(route_config, + ds_rules_file, + html_view) wb_handler = create_wb_handler( - cdx_server=cdx_server, + query_handler=query_handler, config=route_config, ds_rules_file=ds_rules_file, ) @@ -176,8 +166,7 @@ def create_wb_router(passed_config = {}): # cdx query handler if route_config.get('enable_cdx_api', False): - routes.append(Route(name + '-cdx', CDXHandler(cdx_server))) - + routes.append(Route(name + '-cdx', CDXAPIHandler(query_handler))) if config.get('debug_echo_env', False): routes.append(Route('echo_env', DebugEchoEnvHandler())) @@ -185,7 +174,6 @@ def create_wb_router(passed_config = {}): if config.get('debug_echo_req', False): routes.append(Route('echo_req', DebugEchoHandler())) - static_routes = config.get('static_routes') for static_name, static_path in static_routes.iteritems(): @@ -201,13 +189,17 @@ def create_wb_router(passed_config = {}): return router( routes, # Specify hostnames that pywb will be running on - # This will help catch occasionally missed rewrites that fall-through to the host + # This will help catch occasionally missed rewrites that + # fall-through to the host # (See archivalrouter.ReferRedirect) - hostpaths = hostpaths, - port = 
port, + hostpaths=hostpaths, + port=port, - abs_path = config.get('absolute_paths', True), + abs_path=config.get('absolute_paths', True), - home_view = load_template_file(config.get('home_html'), 'Home Page'), - error_view = load_template_file(config.get('error_html'), 'Error Page') + home_view=load_template_file(config.get('home_html'), + 'Home Page'), + + error_view=load_template_file(config.get('error_html'), + 'Error Page') ) diff --git a/pywb/core/views.py b/pywb/core/views.py index 319207fe..760c07ef 100644 --- a/pywb/core/views.py +++ b/pywb/core/views.py @@ -2,7 +2,7 @@ from pywb.utils.timeutils import timestamp_to_datetime from pywb.framework.wbrequestresponse import WbResponse import urlparse -import time +import logging from os import path from itertools import imap @@ -20,6 +20,7 @@ class StaticTextView: def render_response(self, **kwargs): return WbResponse.text_stream(self.text) + #================================================================= class J2TemplateView: def __init__(self, filename): @@ -29,14 +30,13 @@ class J2TemplateView: self.jinja_env = self.make_jinja_env(template_dir) - def make_jinja_env(self, template_dir): if template_dir.startswith('.') or template_dir.startswith('file://'): loader = FileSystemLoader(template_dir) else: loader = PackageLoader('pywb', template_dir) - jinja_env = Environment(loader = loader, trim_blocks = True) + jinja_env = Environment(loader=loader, trim_blocks=True) jinja_env.filters['format_ts'] = J2TemplateView.format_ts jinja_env.filters['host'] = J2TemplateView.get_host jinja_env.filters['request_hostname'] = J2TemplateView.request_hostname @@ -52,8 +52,10 @@ class J2TemplateView: def render_response(self, **kwargs): template_result = self.render_to_string(**kwargs) status = kwargs.get('status', '200 OK') - return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8') - + content_type = 'text/html; charset=utf-8' + return 
WbResponse.text_response(str(template_result), + status=status, + content_type=content_type) # Filters @staticmethod @@ -65,13 +67,24 @@ class J2TemplateView: def get_host(url): return urlparse.urlsplit(url).netloc - @staticmethod def request_hostname(env): return env.get('HTTP_HOST', 'localhost') -# cdx index view +#================================================================= +def load_template_file(file, desc=None, view_class=J2TemplateView): + if file: + logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) + file = view_class(file) + + return file + + +#================================================================= +def load_query_template(file, desc=None): + return load_template_file(file, desc, J2HtmlCapturesView) + #================================================================= # html captures 'calendar' view @@ -79,23 +92,6 @@ class J2TemplateView: class J2HtmlCapturesView(J2TemplateView): def render_response(self, wbrequest, cdx_lines): return J2TemplateView.render_response(self, - cdx_lines = list(cdx_lines), - url = wbrequest.wb_url.url, - prefix = wbrequest.wb_prefix) - - -#================================================================= -# stream raw cdx text -#================================================================= -class TextCapturesView: - def render_response(self, wbrequest, cdx_lines): - def to_str(cdx): - cdx = str(cdx) - if not cdx.endswith('\n'): - cdx += '\n' - return cdx - cdx_lines = imap(to_str, cdx_lines) - return WbResponse.text_stream(cdx_lines) - - - + cdx_lines=list(cdx_lines), + url=wbrequest.wb_url.url, + prefix=wbrequest.wb_prefix) diff --git a/pywb/perms/test/test_perms.py b/pywb/perms/test/test_perms.py index 3e86835e..9c1e5bff 100644 --- a/pywb/perms/test/test_perms.py +++ b/pywb/perms/test/test_perms.py @@ -2,9 +2,7 @@ from pywb.cdx.cdxops import cdx_load from pywb.cdx.query import CDXQuery from pywb.cdx.cdxserver import CDXServer from pywb.utils.wbexception import AccessException -from 
pywb.core.indexreader import IndexReader - -#from pywb.perms.perms_filter import AllowAllPerms +from pywb.core.query_handler import QueryHandler from pytest import raises @@ -13,16 +11,17 @@ from tests.fixture import testconfig #================================================================ def test_excluded(testconfig): - sources = testconfig.get('index_paths') - perms_policy = testconfig.get('perms_policy') + #sources = testconfig.get('index_paths') + #perms_policy = testconfig.get('perms_policy') - cdx_server = CDXServer(sources) - index_reader = IndexReader(cdx_server, perms_policy) + #cdx_server = CDXServer(sources) + #index_handler = IndexHandler(cdx_server, perms_policy=perms_policy) + query_handler = QueryHandler.init_from_config(testconfig) url = 'http://www.iana.org/_img/bookmark_icon.ico' params = dict(url=url) with raises(AccessException): - cdxobjs = list(index_reader.load_cdx(None, params)) + cdxobjs = list(query_handler.load_cdx(None, params)) print cdxobjs diff --git a/pywb/rewrite/wburl.py b/pywb/rewrite/wburl.py index affea2d3..0cad8ed2 100644 --- a/pywb/rewrite/wburl.py +++ b/pywb/rewrite/wburl.py @@ -58,6 +58,14 @@ class BaseWbUrl(object): self.mod = mod self.type = type + def is_replay(self): + return (self.type == self.REPLAY or + self.type == self.LATEST_REPLAY) + + def is_query(self): + return (self.type == self.QUERY or + self.type == self.URL_QUERY) + #================================================================= class WbUrl(BaseWbUrl):