1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

refactor: IndexReader -> QueryHandler, move query output support

to QueryHandler. allow for multiple query views in QueryHandler
This commit is contained in:
Ilya Kreymer 2014-03-23 12:44:28 -07:00
parent 79da12348f
commit ac0bf5a415
7 changed files with 178 additions and 127 deletions

View File

@ -2,32 +2,31 @@ from pywb.cdx.cdxserver import create_cdx_server
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.basehandlers import BaseHandler
from pywb.framework.wbrequestresponse import WbResponse
from indexreader import IndexReader
from views import TextCapturesView
from query_handler import QueryHandler
from urlparse import parse_qs
#=================================================================
class CDXHandler(BaseHandler):
class CDXAPIHandler(BaseHandler):
"""
Handler which passes wsgi request to cdx server and
returns a text-based cdx response
returns a text-based cdx api
"""
def __init__(self, index_reader, view=None):
self.index_reader = index_reader
self.view = view if view else TextCapturesView()
def __init__(self, index_handler):
self.index_handler = index_handler
def __call__(self, wbrequest):
params = self.extract_params_from_wsgi_env(wbrequest.env)
cdx_iter = self.index_reader.load_cdx(wbrequest, params)
cdx_iter = self.index_handler.load_cdx(wbrequest, params)
return self.view.render_response(wbrequest, cdx_iter)
return WbResponse.text_stream(cdx_iter)
def __str__(self):
return 'CDX Handler: ' + str(self.index_reader)
return 'CDX Handler: ' + str(self.index_handler)
@staticmethod
def extract_params_from_wsgi_env(env):
@ -46,25 +45,21 @@ class CDXHandler(BaseHandler):
if not 'output' in params:
params['output'] = 'text'
elif params['output'] not in ('text'):
params['output'] = 'text'
return params
#=================================================================
DEFAULT_RULES = 'pywb/rules.yaml'
#=================================================================
def create_cdx_server_app(config):
"""
Create a cdx server config to be wrapped in a wsgi app
Currently using single access point '/cdx'
Currently using single access point '/cdx' to expose the api
TODO: more complex example with multiple collections?
"""
cdx_server = create_cdx_server(config, DEFAULT_RULES)
perms_policy = config.get('perms_policy')
cdx_server = IndexReader(cdx_server, perms_policy)
query_handler = QueryHandler.init_from_config(config)
port = config.get('port')
routes = [Route('cdx', CDXHandler(cdx_server))]
routes = [Route('cdx', CDXAPIHandler(query_handler))]
return ArchivalRouter(routes, port=port)

View File

@ -8,26 +8,18 @@ from pywb.utils.loaders import BlockLoader
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from views import TextCapturesView
#=================================================================
# Standard WB Handler
#=================================================================
class WBHandler(WbUrlHandler):
def __init__(self, index_reader, replay,
html_view=None, search_view=None):
search_view=None):
self.index_reader = index_reader
self.replay = replay
self.text_query_view = TextCapturesView()
self.query_view = html_view
if not self.query_view:
self.query_view = self.text_query_view
self.search_view = search_view
def __call__(self, wbrequest):
@ -35,20 +27,18 @@ class WBHandler(WbUrlHandler):
return self.render_search_page(wbrequest)
with PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
cdx_lines = self.index_reader.load_for_request(wbrequest)
response = self.index_reader.load_for_request(wbrequest)
# new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_':
return self.text_query_view.render_response(wbrequest, cdx_lines)
if isinstance(response, WbResponse):
return response
if ((wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or
(wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY)):
return self.query_view.render_response(wbrequest, cdx_lines)
cdx_lines = response[0]
cdx_callback = response[1]
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest,
cdx_lines,
self.index_reader.cdx_load_callback(wbrequest))
cdx_callback)
def render_search_page(self, wbrequest):
if self.search_view:

View File

@ -2,25 +2,59 @@ import urllib
import urllib2
from pywb.perms.perms_filter import make_perms_cdx_filter
from pywb.framework.wbrequestresponse import WbResponse
from pywb.cdx.cdxserver import create_cdx_server
#=================================================================
class IndexReader(object):
DEFAULT_RULES = 'pywb/rules.yaml'
#=================================================================
class QueryHandler(object):
"""
Main interface for reading index (currently only CDX) from a
Main interface for querying the index (currently only CDX) from a
source server (currently a cdx server)
Creates an appropriate query based on wbrequest type info
Creates an appropriate query based on wbrequest type info and
returns a view for the cdx: either a raw cdx iter, an html view,
etc...
"""
def __init__(self, cdx_server, perms_policy=None):
def __init__(self, cdx_server, html_query_view=None, perms_policy=None):
self.cdx_server = cdx_server
self.perms_policy = perms_policy
self.views = {}
if html_query_view:
self.views['html'] = html_query_view
@staticmethod
def init_from_config(config,
                     ds_rules_file=DEFAULT_RULES,
                     html_view=None):
    """
    Alternate constructor: build a QueryHandler from a config mapping.

    Reads 'perms_policy' from config, creates the cdx server via
    create_cdx_server(config, ds_rules_file) and passes both, together
    with the optional html_view, to the QueryHandler constructor.
    """
    # read the policy first, then build the server (same order as before)
    policy = config.get('perms_policy')
    server = create_cdx_server(config, ds_rules_file)

    return QueryHandler(server,
                        html_query_view=html_view,
                        perms_policy=policy)
def load_for_request(self, wbrequest):
wburl = wbrequest.wb_url
wb_url = wbrequest.wb_url
# cdx server only supports text and cdxobject for now
if wb_url.mod == 'cdx_':
output = 'text'
elif wb_url.mod == 'timemap_':
output = 'timemap'
elif wb_url.is_query():
output = 'html'
else:
output = 'cdxobject'
# init standard params
params = self.get_query_params(wburl)
params = self.get_query_params(wb_url)
# add any custom filter from the request
if wbrequest.query_filter:
@ -30,12 +64,15 @@ class IndexReader(object):
params.update(wbrequest.custom_params)
params['allowFuzzy'] = True
params['url'] = wburl.url
params['output'] = 'cdxobject'
params['url'] = wb_url.url
params['output'] = output
cdxlines = self.load_cdx(wbrequest, params)
cdx_iter = self.load_cdx(wbrequest, params)
return cdxlines
if wb_url.is_replay():
return (cdx_iter, self.cdx_load_callback(wbrequest))
return self.make_cdx_response(wbrequest, params, cdx_iter)
def load_cdx(self, wbrequest, params):
if self.perms_policy:
@ -43,7 +80,19 @@ class IndexReader(object):
if perms_op:
params['custom_ops'] = [perms_op]
return self.cdx_server.load_cdx(**params)
cdx_iter = self.cdx_server.load_cdx(**params)
return cdx_iter
def make_cdx_response(self, wbrequest, params, cdx_iter):
    """
    Turn a cdx iterator into a response for the requested output.

    params['output'] selects the renderer: if it is set, is not 'text'
    and a view is registered under that name in self.views, that view
    renders the response. In every other case the iterator is streamed
    back as text.
    """
    requested = params['output']

    # if not text, the iterator is assumed to be CDXObjects
    renderer = None
    if requested and requested != 'text':
        renderer = self.views.get(requested)

    if not renderer:
        return WbResponse.text_stream(cdx_iter)

    return renderer.render_response(wbrequest, cdx_iter)
def cdx_load_callback(self, wbrequest):
def load_cdx(params):
@ -52,23 +101,45 @@ class IndexReader(object):
return load_cdx
def get_query_params(self, wburl, limit = 150000, collapse_time = None, replay_closest = 100):
def get_query_params(self,
wburl, limit=150000,
collapse_time=None,
replay_closest=100):
if wburl.type == wburl.URL_QUERY:
raise NotImplementedError('Url Query Not Yet Supported')
return {
wburl.QUERY:
{'collapseTime': collapse_time, 'filter': ['!statuscode:(500|502|504)'], 'limit': limit},
{'collapseTime': collapse_time,
'filter': ['!statuscode:(500|502|504)'],
'limit': limit,
},
wburl.URL_QUERY:
{'collapse': 'urlkey', 'matchType': 'prefix', 'showGroupCount': True, 'showUniqCount': True, 'lastSkipTimestamp': True, 'limit': limit,
'fl': 'urlkey,original,timestamp,endtimestamp,groupcount,uniqcount',
{'collapse': 'urlkey',
'matchType': 'prefix',
'showGroupCount': True,
'showUniqCount': True,
'lastSkipTimestamp': True,
'limit': limit,
'fl': ('urlkey,original,timestamp,' +
'endtimestamp,groupcount,uniqcount'),
},
wburl.REPLAY:
{'sort': 'closest', 'filter': ['!statuscode:(500|502|504)'], 'limit': replay_closest, 'closest': wburl.timestamp, 'resolveRevisits': True},
{'sort': 'closest',
'filter': ['!statuscode:(500|502|504)'],
'limit': replay_closest,
'closest': wburl.timestamp,
'resolveRevisits': True,
},
wburl.LATEST_REPLAY:
{'sort': 'reverse', 'filter': ['statuscode:[23]..'], 'limit': '1', 'resolveRevisits': True}
{'sort': 'reverse',
'filter': ['statuscode:[23]..'],
'limit': '1',
'resolveRevisits': True,
}
}[wburl.type]

View File

@ -9,16 +9,14 @@ from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.rewrite.rewriterules import use_lxml_parser
from pywb.cdx.cdxserver import create_cdx_server
from indexreader import IndexReader
from views import J2TemplateView, J2HtmlCapturesView
from views import load_template_file, load_query_template
from replay_views import ReplayView
from query_handler import QueryHandler
from handlers import WBHandler
from handlers import StaticHandler
from cdx_handler import CDXHandler
from handlers import DebugEchoHandler, DebugEchoEnvHandler
from cdx_handler import CDXAPIHandler
import os
@ -46,6 +44,7 @@ DEFAULTS = {
'use_lxml_parser': True,
}
#=================================================================
class DictChain:
def __init__(self, *dicts):
@ -60,18 +59,9 @@ class DictChain:
#=================================================================
def load_template_file(file, desc=None, view_class=J2TemplateView):
if file:
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file)
def create_wb_handler(query_handler, config, ds_rules_file=None):
return file
#=================================================================
def create_wb_handler(cdx_server, config, ds_rules_file=None):
cookie_maker=config.get('cookie_maker')
cookie_maker = config.get('cookie_maker')
record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)
paths = config.get('archive_paths')
@ -98,20 +88,15 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
reporter=config.get('reporter')
)
html_view = load_template_file(config.get('query_html'),
'Captures Page',
J2HtmlCapturesView)
search_view = load_template_file(config.get('search_html'),
'Search Page')
wb_handler_class = config.get('wb_handler_class', WBHandler)
wb_handler = wb_handler_class(
cdx_server,
query_handler,
replayer,
html_view=html_view,
#html_view=html_view,
search_view=search_view,
)
@ -119,7 +104,7 @@ def create_wb_handler(cdx_server, config, ds_rules_file=None):
#=================================================================
def create_wb_router(passed_config = {}):
def create_wb_router(passed_config={}):
config = DictChain(passed_config, DEFAULTS)
@ -153,15 +138,20 @@ def create_wb_router(passed_config = {}):
ds_rules_file = route_config.get('domain_specific_rules', None)
perms_policy = route_config.get('perms_policy', None)
#perms_policy = route_config.get('perms_policy', None)
#
#cdx_server = create_cdx_server(route_config,
# ds_rules_file)
#
html_view = load_query_template(config.get('query_html'),
'Captures Page')
cdx_server = create_cdx_server(route_config,
ds_rules_file)
cdx_server = IndexReader(cdx_server, perms_policy)
query_handler = QueryHandler.init_from_config(route_config,
ds_rules_file,
html_view)
wb_handler = create_wb_handler(
cdx_server=cdx_server,
query_handler=query_handler,
config=route_config,
ds_rules_file=ds_rules_file,
)
@ -176,8 +166,7 @@ def create_wb_router(passed_config = {}):
# cdx query handler
if route_config.get('enable_cdx_api', False):
routes.append(Route(name + '-cdx', CDXHandler(cdx_server)))
routes.append(Route(name + '-cdx', CDXAPIHandler(query_handler)))
if config.get('debug_echo_env', False):
routes.append(Route('echo_env', DebugEchoEnvHandler()))
@ -185,7 +174,6 @@ def create_wb_router(passed_config = {}):
if config.get('debug_echo_req', False):
routes.append(Route('echo_req', DebugEchoHandler()))
static_routes = config.get('static_routes')
for static_name, static_path in static_routes.iteritems():
@ -201,13 +189,17 @@ def create_wb_router(passed_config = {}):
return router(
routes,
# Specify hostnames that pywb will be running on
# This will help catch occasionally missed rewrites that fall-through to the host
# This will help catch occasionally missed rewrites that
# fall-through to the host
# (See archivalrouter.ReferRedirect)
hostpaths = hostpaths,
port = port,
hostpaths=hostpaths,
port=port,
abs_path = config.get('absolute_paths', True),
abs_path=config.get('absolute_paths', True),
home_view = load_template_file(config.get('home_html'), 'Home Page'),
error_view = load_template_file(config.get('error_html'), 'Error Page')
home_view=load_template_file(config.get('home_html'),
'Home Page'),
error_view=load_template_file(config.get('error_html'),
'Error Page')
)

View File

@ -2,7 +2,7 @@ from pywb.utils.timeutils import timestamp_to_datetime
from pywb.framework.wbrequestresponse import WbResponse
import urlparse
import time
import logging
from os import path
from itertools import imap
@ -20,6 +20,7 @@ class StaticTextView:
def render_response(self, **kwargs):
    """Stream self.text back as a text response; kwargs are ignored."""
    body = self.text
    return WbResponse.text_stream(body)
#=================================================================
class J2TemplateView:
def __init__(self, filename):
@ -29,14 +30,13 @@ class J2TemplateView:
self.jinja_env = self.make_jinja_env(template_dir)
def make_jinja_env(self, template_dir):
if template_dir.startswith('.') or template_dir.startswith('file://'):
loader = FileSystemLoader(template_dir)
else:
loader = PackageLoader('pywb', template_dir)
jinja_env = Environment(loader = loader, trim_blocks = True)
jinja_env = Environment(loader=loader, trim_blocks=True)
jinja_env.filters['format_ts'] = J2TemplateView.format_ts
jinja_env.filters['host'] = J2TemplateView.get_host
jinja_env.filters['request_hostname'] = J2TemplateView.request_hostname
@ -52,8 +52,10 @@ class J2TemplateView:
def render_response(self, **kwargs):
template_result = self.render_to_string(**kwargs)
status = kwargs.get('status', '200 OK')
return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
content_type = 'text/html; charset=utf-8'
return WbResponse.text_response(str(template_result),
status=status,
content_type=content_type)
# Filters
@staticmethod
@ -65,13 +67,24 @@ class J2TemplateView:
def get_host(url):
return urlparse.urlsplit(url).netloc
@staticmethod
def request_hostname(env):
return env.get('HTTP_HOST', 'localhost')
# cdx index view
#=================================================================
def load_template_file(file, desc=None, view_class=J2TemplateView):
    """
    Wrap a template filename in a view object.

    :param file: template filename; a falsy value is returned unchanged
    :param desc: optional label, used only in the debug log message
    :param view_class: callable invoked with ``file`` to build the view
    :return: ``view_class(file)`` when ``file`` is truthy, else ``file``
    """
    if not file:
        return file

    # The previous fallback referenced an undefined ``name`` and raised
    # NameError whenever no description was passed; use the view class
    # name as the label instead.
    label = desc if desc else getattr(view_class, '__name__', 'template')
    logging.debug('Adding {0}: {1}'.format(label, file))

    return view_class(file)
#=================================================================
def load_query_template(file, desc=None):
    """
    Load the captures/query template: same as load_template_file,
    but the view is built with J2HtmlCapturesView.
    """
    return load_template_file(file, desc, view_class=J2HtmlCapturesView)
#=================================================================
# html captures 'calendar' view
@ -79,23 +92,6 @@ class J2TemplateView:
class J2HtmlCapturesView(J2TemplateView):
def render_response(self, wbrequest, cdx_lines):
return J2TemplateView.render_response(self,
cdx_lines = list(cdx_lines),
url = wbrequest.wb_url.url,
prefix = wbrequest.wb_prefix)
#=================================================================
# stream raw cdx text
#=================================================================
class TextCapturesView:
def render_response(self, wbrequest, cdx_lines):
def to_str(cdx):
cdx = str(cdx)
if not cdx.endswith('\n'):
cdx += '\n'
return cdx
cdx_lines = imap(to_str, cdx_lines)
return WbResponse.text_stream(cdx_lines)
cdx_lines=list(cdx_lines),
url=wbrequest.wb_url.url,
prefix=wbrequest.wb_prefix)

View File

@ -2,9 +2,7 @@ from pywb.cdx.cdxops import cdx_load
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxserver import CDXServer
from pywb.utils.wbexception import AccessException
from pywb.core.indexreader import IndexReader
#from pywb.perms.perms_filter import AllowAllPerms
from pywb.core.query_handler import QueryHandler
from pytest import raises
@ -13,16 +11,17 @@ from tests.fixture import testconfig
#================================================================
def test_excluded(testconfig):
sources = testconfig.get('index_paths')
perms_policy = testconfig.get('perms_policy')
#sources = testconfig.get('index_paths')
#perms_policy = testconfig.get('perms_policy')
cdx_server = CDXServer(sources)
index_reader = IndexReader(cdx_server, perms_policy)
#cdx_server = CDXServer(sources)
#index_handler = IndexHandler(cdx_server, perms_policy=perms_policy)
query_handler = QueryHandler.init_from_config(testconfig)
url = 'http://www.iana.org/_img/bookmark_icon.ico'
params = dict(url=url)
with raises(AccessException):
cdxobjs = list(index_reader.load_cdx(None, params))
cdxobjs = list(query_handler.load_cdx(None, params))
print cdxobjs

View File

@ -58,6 +58,14 @@ class BaseWbUrl(object):
self.mod = mod
self.type = type
def is_replay(self):
    """Return True when self.type is REPLAY or LATEST_REPLAY."""
    if self.type == self.REPLAY:
        return True

    return self.type == self.LATEST_REPLAY
def is_query(self):
    """Return True when self.type is QUERY or URL_QUERY."""
    if self.type == self.QUERY:
        return True

    return self.type == self.URL_QUERY
#=================================================================
class WbUrl(BaseWbUrl):