1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

refactor: IndexReader -> QueryHandler, move query output support

to QueryHandler. allow for multiple query views in QueryHandler
This commit is contained in:
Ilya Kreymer 2014-03-23 12:44:28 -07:00
parent 79da12348f
commit ac0bf5a415
7 changed files with 178 additions and 127 deletions

View File

@ -2,32 +2,31 @@ from pywb.cdx.cdxserver import create_cdx_server
from pywb.framework.archivalrouter import ArchivalRouter, Route from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.basehandlers import BaseHandler from pywb.framework.basehandlers import BaseHandler
from pywb.framework.wbrequestresponse import WbResponse
from indexreader import IndexReader from query_handler import QueryHandler
from views import TextCapturesView
from urlparse import parse_qs from urlparse import parse_qs
#================================================================= #=================================================================
class CDXHandler(BaseHandler): class CDXAPIHandler(BaseHandler):
""" """
Handler which passes wsgi request to cdx server and Handler which passes wsgi request to cdx server and
returns a text-based cdx response returns a text-based cdx api
""" """
def __init__(self, index_reader, view=None): def __init__(self, index_handler):
self.index_reader = index_reader self.index_handler = index_handler
self.view = view if view else TextCapturesView()
def __call__(self, wbrequest): def __call__(self, wbrequest):
params = self.extract_params_from_wsgi_env(wbrequest.env) params = self.extract_params_from_wsgi_env(wbrequest.env)
cdx_iter = self.index_reader.load_cdx(wbrequest, params) cdx_iter = self.index_handler.load_cdx(wbrequest, params)
return self.view.render_response(wbrequest, cdx_iter) return WbResponse.text_stream(cdx_iter)
def __str__(self): def __str__(self):
return 'CDX Handler: ' + str(self.index_reader) return 'CDX Handler: ' + str(self.index_handler)
@staticmethod @staticmethod
def extract_params_from_wsgi_env(env): def extract_params_from_wsgi_env(env):
@ -46,25 +45,21 @@ class CDXHandler(BaseHandler):
if not 'output' in params: if not 'output' in params:
params['output'] = 'text' params['output'] = 'text'
elif params['output'] not in ('text'):
params['output'] = 'text'
return params return params
#=================================================================
DEFAULT_RULES = 'pywb/rules.yaml'
#================================================================= #=================================================================
def create_cdx_server_app(config): def create_cdx_server_app(config):
""" """
Create a cdx server config to be wrapped in a wsgi app Create a cdx server config to be wrapped in a wsgi app
Currently using single access point '/cdx' Currently using single access point '/cdx' to expose the api
TODO: more complex example with multiple collections? TODO: more complex example with multiple collections?
""" """
cdx_server = create_cdx_server(config, DEFAULT_RULES) query_handler = QueryHandler.init_from_config(config)
perms_policy = config.get('perms_policy')
cdx_server = IndexReader(cdx_server, perms_policy)
port = config.get('port') port = config.get('port')
routes = [Route('cdx', CDXHandler(cdx_server))] routes = [Route('cdx', CDXAPIHandler(query_handler))]
return ArchivalRouter(routes, port=port) return ArchivalRouter(routes, port=port)

View File

@ -8,26 +8,18 @@ from pywb.utils.loaders import BlockLoader
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
from views import TextCapturesView
#================================================================= #=================================================================
# Standard WB Handler # Standard WB Handler
#================================================================= #=================================================================
class WBHandler(WbUrlHandler): class WBHandler(WbUrlHandler):
def __init__(self, index_reader, replay, def __init__(self, index_reader, replay,
html_view=None, search_view=None): search_view=None):
self.index_reader = index_reader self.index_reader = index_reader
self.replay = replay self.replay = replay
self.text_query_view = TextCapturesView()
self.query_view = html_view
if not self.query_view:
self.query_view = self.text_query_view
self.search_view = search_view self.search_view = search_view
def __call__(self, wbrequest): def __call__(self, wbrequest):
@ -35,20 +27,18 @@ class WBHandler(WbUrlHandler):
return self.render_search_page(wbrequest) return self.render_search_page(wbrequest)
with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
cdx_lines = self.index_reader.load_for_request(wbrequest) response = self.index_reader.load_for_request(wbrequest)
# new special modifier to always show cdx index if isinstance(response, WbResponse):
if wbrequest.wb_url.mod == 'cdx_': return response
return self.text_query_view.render_response(wbrequest, cdx_lines)
if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or cdx_lines = response[0]
(wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)): cdx_callback = response[1]
return self.query_view.render_response(wbrequest, cdx_lines)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, return self.replay(wbrequest,
cdx_lines, cdx_lines,
self.index_reader.cdx_load_callback(wbrequest)) cdx_callback)
def render_search_page(self, wbrequest): def render_search_page(self, wbrequest):
if self.search_view: if self.search_view:

View File

@ -2,25 +2,59 @@ import urllib
import urllib2 import urllib2
from pywb.perms.perms_filter import make_perms_cdx_filter from pywb.perms.perms_filter import make_perms_cdx_filter
from pywb.framework.wbrequestresponse import WbResponse
from pywb.cdx.cdxserver import create_cdx_server
#================================================================= #=================================================================
class IndexReader(object): DEFAULT_RULES = 'pywb/rules.yaml'
#=================================================================
class QueryHandler(object):
""" """
Main interface for reading index (currently only CDX) from a Main interface for querying the index (currently only CDX) from a
source server (currently a cdx server) source server (currently a cdx server)
Creates an appropriate query based on wbrequest type info Creates an appropriate query based on wbrequest type info and outputs
a view for the cdx, either a raw cdx iter, an html view,
etc...
""" """
def __init__(self, cdx_server, perms_policy=None): def __init__(self, cdx_server, html_query_view=None, perms_policy=None):
self.cdx_server = cdx_server self.cdx_server = cdx_server
self.perms_policy = perms_policy self.perms_policy = perms_policy
self.views = {}
if html_query_view:
self.views['html'] = html_query_view
@staticmethod
def init_from_config(config,
                     ds_rules_file=DEFAULT_RULES,
                     html_view=None):
    """
    Build a QueryHandler from a config mapping.

    Reads 'perms_policy' from config, creates the cdx server from
    config and ds_rules_file, and passes html_view through as the
    handler's html query view.
    """
    # same order as before: read the policy, then build the cdx server
    policy = config.get('perms_policy')
    server = create_cdx_server(config, ds_rules_file)

    return QueryHandler(cdx_server=server,
                        html_query_view=html_view,
                        perms_policy=policy)
def load_for_request(self, wbrequest): def load_for_request(self, wbrequest):
wburl = wbrequest.wb_url wb_url = wbrequest.wb_url
# cdx server only supports text and cdxobject for now
if wb_url.mod == 'cdx_':
output = 'text'
elif wb_url.mod == 'timemap_':
output = 'timemap'
elif wb_url.is_query():
output = 'html'
else:
output = 'cdxobject'
# init standard params # init standard params
params = self.get_query_params(wburl) params = self.get_query_params(wb_url)
# add any custom filter from the request # add any custom filter from the request
if wbrequest.query_filter: if wbrequest.query_filter:
@ -30,12 +64,15 @@ class IndexReader(object):
params.update(wbrequest.custom_params) params.update(wbrequest.custom_params)
params['allowFuzzy'] = True params['allowFuzzy'] = True
params['url'] = wburl.url params['url'] = wb_url.url
params['output'] = 'cdxobject' params['output'] = output
cdxlines = self.load_cdx(wbrequest, params) cdx_iter = self.load_cdx(wbrequest, params)
return cdxlines if wb_url.is_replay():
return (cdx_iter, self.cdx_load_callback(wbrequest))
return self.make_cdx_response(wbrequest, params, cdx_iter)
def load_cdx(self, wbrequest, params): def load_cdx(self, wbrequest, params):
if self.perms_policy: if self.perms_policy:
@ -43,7 +80,19 @@ class IndexReader(object):
if perms_op: if perms_op:
params['custom_ops'] = [perms_op] params['custom_ops'] = [perms_op]
return self.cdx_server.load_cdx(**params) cdx_iter = self.cdx_server.load_cdx(**params)
return cdx_iter
def make_cdx_response(self, wbrequest, params, cdx_iter):
    """
    Turn a cdx iterator into a response for the requested output.

    params['output'] selects the view: any non-empty value other than
    'text' is looked up in self.views and, when a view is registered,
    rendered through its render_response(). Every other case is
    streamed with WbResponse.text_stream().
    """
    requested = params['output']

    # if not text, the iterator is assumed to be CDXObjects
    wants_view = bool(requested) and requested != 'text'
    renderer = self.views.get(requested) if wants_view else None

    if not renderer:
        return WbResponse.text_stream(cdx_iter)

    return renderer.render_response(wbrequest, cdx_iter)
def cdx_load_callback(self, wbrequest): def cdx_load_callback(self, wbrequest):
def load_cdx(params): def load_cdx(params):
@ -52,23 +101,45 @@ class IndexReader(object):
return load_cdx return load_cdx
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100): def get_query_params(self,
wburl, limit=150000,
collapse_time=None,
replay_closest=100):
if wburl.type == wburl.URL_QUERY: if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported') raise NotImplementedError('Url Query Not Yet Supported')
return { return {
wburl.QUERY: wburl.QUERY:
{'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit}, {'collapseTime': collapse_time,
'filter': ['!statuscode:(500|502|504)'],
'limit': limit,
},
wburl.URL_QUERY: wburl.URL_QUERY:
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit, {'collapse': 'urlkey',
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount', 'matchType': 'prefix',
'showGroupCount': True,
'showUniqCount': True,
'lastSkipTimestamp': True,
'limit': limit,
'fl': ('urlkey,original,timestamp,' +
'endtimestamp,groupcount,uniqcount'),
}, },
wburl.REPLAY: wburl.REPLAY:
{'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True}, {'sort': 'closest',
'filter': ['!statuscode:(500|502|504)'],
'limit': replay_closest,
'closest': wburl.timestamp,
'resolveRevisits': True,
},
wburl.LATEST_REPLAY: wburl.LATEST_REPLAY:
{'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True} {'sort': 'reverse',
'filter': ['statuscode:[23]..'],
'limit': '1',
'resolveRevisits': True,
}
}[wburl.type] }[wburl.type]

View File

@ -9,16 +9,14 @@ from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewriterules import use_lxml_parser from pywb.rewrite.rewriterules import use_lxml_parser
from pywb.cdx.cdxserver import create_cdx_server from views import load_template_file, load_query_template
from indexreader import IndexReader
from views import J2TemplateView, J2HtmlCapturesView
from replay_views import ReplayView from replay_views import ReplayView
from query_handler import QueryHandler
from handlers import WBHandler from handlers import WBHandler
from handlers import StaticHandler from handlers import StaticHandler
from cdx_handler import CDXHandler
from handlers import DebugEchoHandler, DebugEchoEnvHandler from handlers import DebugEchoHandler, DebugEchoEnvHandler
from cdx_handler import CDXAPIHandler
import os import os
@ -46,6 +44,7 @@ DEFAULTS = {
'use_lxml_parser': True, 'use_lxml_parser': True,
} }
#================================================================= #=================================================================
class DictChain: class DictChain:
def __init__(self, *dicts): def __init__(self, *dicts):
@ -60,18 +59,9 @@ class DictChain:
#================================================================= #=================================================================
def load_template_file(file, desc=None, view_class=J2TemplateView): def create_wb_handler(query_handler, config, ds_rules_file=None):
if file:
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file)
return file cookie_maker = config.get('cookie_maker')
#=================================================================
def create_wb_handler(cdx_server, config, ds_rules_file=None):
cookie_maker=config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker) record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
paths = config.get('archive_paths') paths = config.get('archive_paths')
@ -98,20 +88,15 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
reporter=config.get('reporter') reporter=config.get('reporter')
) )
html_view = load_template_file(config.get('query_html'),
'Captures Page',
J2HtmlCapturesView)
search_view = load_template_file(config.get('search_html'), search_view = load_template_file(config.get('search_html'),
'Search Page') 'Search Page')
wb_handler_class = config.get('wb_handler_class', WBHandler) wb_handler_class = config.get('wb_handler_class', WBHandler)
wb_handler = wb_handler_class( wb_handler = wb_handler_class(
cdx_server, query_handler,
replayer, replayer,
html_view=html_view, #html_view=html_view,
search_view=search_view, search_view=search_view,
) )
@ -119,7 +104,7 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
#================================================================= #=================================================================
def create_wb_router(passed_config = {}): def create_wb_router(passed_config={}):
config = DictChain(passed_config, DEFAULTS) config = DictChain(passed_config, DEFAULTS)
@ -153,15 +138,20 @@ def create_wb_router(passed_config = {}):
ds_rules_file = route_config.get('domain_specific_rules', None) ds_rules_file = route_config.get('domain_specific_rules', None)
perms_policy = route_config.get('perms_policy', None) #perms_policy = route_config.get('perms_policy', None)
#
#cdx_server = create_cdx_server(route_config,
# ds_rules_file)
#
html_view = load_query_template(config.get('query_html'),
'Captures Page')
cdx_server = create_cdx_server(route_config, query_handler = QueryHandler.init_from_config(route_config,
ds_rules_file) ds_rules_file,
html_view)
cdx_server = IndexReader(cdx_server, perms_policy)
wb_handler = create_wb_handler( wb_handler = create_wb_handler(
cdx_server=cdx_server, query_handler=query_handler,
config=route_config, config=route_config,
ds_rules_file=ds_rules_file, ds_rules_file=ds_rules_file,
) )
@ -176,8 +166,7 @@ def create_wb_router(passed_config = {}):
# cdx query handler # cdx query handler
if route_config.get('enable_cdx_api', False): if route_config.get('enable_cdx_api', False):
routes.append(Route(name + '-cdx', CDXHandler(cdx_server))) routes.append(Route(name + '-cdx', CDXAPIHandler(query_handler)))
if config.get('debug_echo_env', False): if config.get('debug_echo_env', False):
routes.append(Route('echo_env', DebugEchoEnvHandler())) routes.append(Route('echo_env', DebugEchoEnvHandler()))
@ -185,7 +174,6 @@ def create_wb_router(passed_config = {}):
if config.get('debug_echo_req', False): if config.get('debug_echo_req', False):
routes.append(Route('echo_req', DebugEchoHandler())) routes.append(Route('echo_req', DebugEchoHandler()))
static_routes = config.get('static_routes') static_routes = config.get('static_routes')
for static_name, static_path in static_routes.iteritems(): for static_name, static_path in static_routes.iteritems():
@ -201,13 +189,17 @@ def create_wb_router(passed_config = {}):
return router( return router(
routes, routes,
# Specify hostnames that pywb will be running on # Specify hostnames that pywb will be running on
# This will help catch occasionally missed rewrites that fall-through to the host # This will help catch occasionally missed rewrites that
# fall-through to the host
# (See archivalrouter.ReferRedirect) # (See archivalrouter.ReferRedirect)
hostpaths = hostpaths, hostpaths=hostpaths,
port = port, port=port,
abs_path = config.get('absolute_paths', True), abs_path=config.get('absolute_paths', True),
home_view = load_template_file(config.get('home_html'), 'Home Page'), home_view=load_template_file(config.get('home_html'),
error_view = load_template_file(config.get('error_html'), 'Error Page') 'Home Page'),
error_view=load_template_file(config.get('error_html'),
'Error Page')
) )

View File

@ -2,7 +2,7 @@ from pywb.utils.timeutils import timestamp_to_datetime
from pywb.framework.wbrequestresponse import WbResponse from pywb.framework.wbrequestresponse import WbResponse
import urlparse import urlparse
import time import logging
from os import path from os import path
from itertools import imap from itertools import imap
@ -20,6 +20,7 @@ class StaticTextView:
def render_response(self, **kwargs): def render_response(self, **kwargs):
return WbResponse.text_stream(self.text) return WbResponse.text_stream(self.text)
#================================================================= #=================================================================
class J2TemplateView: class J2TemplateView:
def __init__(self, filename): def __init__(self, filename):
@ -29,14 +30,13 @@ class J2TemplateView:
self.jinja_env = self.make_jinja_env(template_dir) self.jinja_env = self.make_jinja_env(template_dir)
def make_jinja_env(self, template_dir): def make_jinja_env(self, template_dir):
if template_dir.startswith('.') or template_dir.startswith('file://'): if template_dir.startswith('.') or template_dir.startswith('file://'):
loader = FileSystemLoader(template_dir) loader = FileSystemLoader(template_dir)
else: else:
loader = PackageLoader('pywb', template_dir) loader = PackageLoader('pywb', template_dir)
jinja_env = Environment(loader = loader, trim_blocks = True) jinja_env = Environment(loader=loader, trim_blocks=True)
jinja_env.filters['format_ts'] = J2TemplateView.format_ts jinja_env.filters['format_ts'] = J2TemplateView.format_ts
jinja_env.filters['host'] = J2TemplateView.get_host jinja_env.filters['host'] = J2TemplateView.get_host
jinja_env.filters['request_hostname'] = J2TemplateView.request_hostname jinja_env.filters['request_hostname'] = J2TemplateView.request_hostname
@ -52,8 +52,10 @@ class J2TemplateView:
def render_response(self, **kwargs): def render_response(self, **kwargs):
template_result = self.render_to_string(**kwargs) template_result = self.render_to_string(**kwargs)
status = kwargs.get('status', '200 OK') status = kwargs.get('status', '200 OK')
return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8') content_type = 'text/html; charset=utf-8'
return WbResponse.text_response(str(template_result),
status=status,
content_type=content_type)
# Filters # Filters
@staticmethod @staticmethod
@ -65,13 +67,24 @@ class J2TemplateView:
def get_host(url): def get_host(url):
return urlparse.urlsplit(url).netloc return urlparse.urlsplit(url).netloc
@staticmethod @staticmethod
def request_hostname(env): def request_hostname(env):
return env.get('HTTP_HOST', 'localhost') return env.get('HTTP_HOST', 'localhost')
# cdx index view #=================================================================
def load_template_file(file, desc=None, view_class=J2TemplateView):
    """
    Wrap a template file in a view object.

    If file is truthy, log a debug message and return view_class(file).
    Otherwise return file unchanged (e.g. None when no template is
    configured).

    desc is only used as the label in the debug message. When it is
    not given, the view class name is used instead.
    """
    if file:
        # The original fell back to an undefined `name` here, which would
        # raise NameError whenever desc was omitted; label the message
        # with the view class instead.
        label = desc if desc else getattr(view_class, '__name__', 'template')
        logging.debug('Adding {0}: {1}'.format(label, file))
        file = view_class(file)

    return file
#=================================================================
def load_query_template(file, desc=None):
    """
    Load a query (captures) template: same as load_template_file(),
    but with J2HtmlCapturesView as the view class.
    """
    return load_template_file(file,
                              desc=desc,
                              view_class=J2HtmlCapturesView)
#================================================================= #=================================================================
# html captures 'calendar' view # html captures 'calendar' view
@ -79,23 +92,6 @@ class J2TemplateView:
class J2HtmlCapturesView(J2TemplateView): class J2HtmlCapturesView(J2TemplateView):
def render_response(self, wbrequest, cdx_lines): def render_response(self, wbrequest, cdx_lines):
return J2TemplateView.render_response(self, return J2TemplateView.render_response(self,
cdx_lines = list(cdx_lines), cdx_lines=list(cdx_lines),
url = wbrequest.wb_url.url, url=wbrequest.wb_url.url,
prefix = wbrequest.wb_prefix) prefix=wbrequest.wb_prefix)
#=================================================================
# stream raw cdx text
#=================================================================
class TextCapturesView:
def render_response(self, wbrequest, cdx_lines):
def to_str(cdx):
cdx = str(cdx)
if not cdx.endswith('\n'):
cdx += '\n'
return cdx
cdx_lines = imap(to_str, cdx_lines)
return WbResponse.text_stream(cdx_lines)

View File

@ -2,9 +2,7 @@ from pywb.cdx.cdxops import cdx_load
from pywb.cdx.query import CDXQuery from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxserver import CDXServer from pywb.cdx.cdxserver import CDXServer
from pywb.utils.wbexception import AccessException from pywb.utils.wbexception import AccessException
from pywb.core.indexreader import IndexReader from pywb.core.query_handler import QueryHandler
#from pywb.perms.perms_filter import AllowAllPerms
from pytest import raises from pytest import raises
@ -13,16 +11,17 @@ from tests.fixture import testconfig
#================================================================ #================================================================
def test_excluded(testconfig): def test_excluded(testconfig):
sources = testconfig.get('index_paths') #sources = testconfig.get('index_paths')
perms_policy = testconfig.get('perms_policy') #perms_policy = testconfig.get('perms_policy')
cdx_server = CDXServer(sources) #cdx_server = CDXServer(sources)
index_reader = IndexReader(cdx_server, perms_policy) #index_handler = IndexHandler(cdx_server, perms_policy=perms_policy)
query_handler = QueryHandler.init_from_config(testconfig)
url = 'http://www.iana.org/_img/bookmark_icon.ico' url = 'http://www.iana.org/_img/bookmark_icon.ico'
params = dict(url=url) params = dict(url=url)
with raises(AccessException): with raises(AccessException):
cdxobjs = list(index_reader.load_cdx(None, params)) cdxobjs = list(query_handler.load_cdx(None, params))
print cdxobjs print cdxobjs

View File

@ -58,6 +58,14 @@ class BaseWbUrl(object):
self.mod = mod self.mod = mod
self.type = type self.type = type
def is_replay(self):
    """Return True if self.type is REPLAY or LATEST_REPLAY."""
    if self.type == self.REPLAY:
        return True

    return self.type == self.LATEST_REPLAY
def is_query(self):
    """Return True if self.type is QUERY or URL_QUERY."""
    if self.type == self.QUERY:
        return True

    return self.type == self.URL_QUERY
#================================================================= #=================================================================
class WbUrl(BaseWbUrl): class WbUrl(BaseWbUrl):