1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Support for new UI, as per #16

* Refactor views class to support more Jinja2 views (J2Template)
* Add a home page, collection search page, and error pages, all optional
* all exceptions appear on error page
* wbrequest supports a request with an empty or / wb_url
This commit is contained in:
Ilya Kreymer 2014-01-31 10:04:21 -08:00
parent 57fe9515db
commit 304ddbec84
19 changed files with 281 additions and 148 deletions

View File

@ -27,6 +27,8 @@ Ex: The [Internet Archive Wayback Machine](https://archive.org/web/) has urls of
A listing of archived content, often in calendar form, is available when a `*` is used instead of timestamp. A listing of archived content, often in calendar form, is available when a `*` is used instead of timestamp.
The Wayback Machine uses an html parser to rewrite relative and absolute links, as well as absolute links found in javascript, css and some xml.
pywb uses this interface as a starting point. pywb uses this interface as a starting point.
@ -36,7 +38,7 @@ pywb currently works best with 2.7.x
It should run in a standard WSGI container, although currently It should run in a standard WSGI container, although currently
tested primarily with uWSGI 1.9 and 2.0 tested primarily with uWSGI 1.9 and 2.0
Support for other versions of Python 3 is planned. Support for Python 3 is planned.
### Installation ### Installation

View File

@ -52,6 +52,9 @@ routes:
# if omitted, the capture listing lists raw index # if omitted, the capture listing lists raw index
calendar_html_template: ./ui/query.html calendar_html_template: ./ui/query.html
# ui: optional Jinja2 template to use for 'search' page
# this page is displayed when no search url is entered
search_html_template: ./ui/search.html
# list of host names that pywb will be running from to detect # list of host names that pywb will be running from to detect
# 'fallthrough' requests based on referrer # 'fallthrough' requests based on referrer
@ -63,6 +66,13 @@ routes:
hostpaths: ['http://localhost:8080/'] hostpaths: ['http://localhost:8080/']
# ui: optional Jinja2 template for home page
# if no other route is set to home page, this template will
# be rendered at /, /index.htm and /index.html
home_html_template: ./ui/index.html
# ui: optional Jinja2 template for rendering any errors
# the error page may print a detailed error message
error_html_template: ./ui/error.html

View File

@ -1,5 +1,6 @@
import urlparse import urlparse
import re import re
import wbexceptions
from wbrequestresponse import WbRequest, WbResponse from wbrequestresponse import WbRequest, WbResponse
from url_rewriter import UrlRewriter from url_rewriter import UrlRewriter
@ -9,25 +10,39 @@ from wburl import WbUrl
# ArchivalRequestRouter -- route WB requests in archival mode # ArchivalRequestRouter -- route WB requests in archival mode
#================================================================= #=================================================================
class ArchivalRequestRouter: class ArchivalRequestRouter:
def __init__(self, handlers, hostpaths = None, abs_path = True, archivalurl_class = WbUrl): def __init__(self, routes, hostpaths = None, abs_path = True, archivalurl_class = WbUrl, homepage = None, errorpage = None):
self.handlers = handlers self.routes = routes
self.fallback = ReferRedirect(hostpaths) self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path self.abs_path = abs_path
self.archivalurl_class = archivalurl_class self.archivalurl_class = archivalurl_class
self.homepage = homepage
self.errorpage = errorpage
def __call__(self, env): def __call__(self, env):
for handler in self.handlers: for route in self.routes:
result = handler(env, self.abs_path, self.archivalurl_class) result = route(env, self.abs_path, self.archivalurl_class)
if result: if result:
return result return result
# Home Page
if env['REL_REQUEST_URI'] in ['/', '/index.html', '/index.htm']:
return self.render_homepage()
if not self.fallback: if not self.fallback:
return None return None
return self.fallback(WbRequest.from_uri(None, env)) return self.fallback(WbRequest.from_uri(None, env))
def render_homepage(self):
    """Build the response for the home page.

    Uses the configured home page view when there is one, handing it the
    list of routes. Otherwise returns a plain-text response with one
    ``str(route)`` per line.
    """
    if not self.homepage:
        # default home page: plain listing of the configured routes
        listing = '\n'.join([str(route) for route in self.routes])
        return WbResponse.text_response(listing)

    return self.homepage.render_response(routes = self.routes)
#================================================================= #=================================================================
# Route by matching regex (or fixed prefix) # Route by matching regex (or fixed prefix)
@ -36,10 +51,11 @@ class ArchivalRequestRouter:
class Route: class Route:
# match upto next slash # match upto next slash
SLASH_LOOKAHEAD ='(?=/)' SLASH_LOOKAHEAD ='(?=/|$)'
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD): def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_LOOKAHEAD):
self.path = regex
self.regex = re.compile(regex + lookahead) self.regex = re.compile(regex + lookahead)
self.handler = handler self.handler = handler
# collection id from regex group (default 0) # collection id from regex group (default 0)
@ -83,6 +99,10 @@ class Route:
def _handle_request(self, wbrequest): def _handle_request(self, wbrequest):
return self.handler(wbrequest) return self.handler(wbrequest)
def __str__(self):
    """Describe this route by the string form of its handler."""
    handler_desc = str(self.handler)
    return handler_desc
#================================================================= #=================================================================
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings # ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings

View File

@ -114,6 +114,9 @@ def cdx_reverse(cdx_iter, limit):
>>> test_cdx('org,iana)/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1) >>> test_cdx('org,iana)/_js/2013.1/jquery.js', reverse = True, resolve_revisits = True, limit = 1)
org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jquery.js application/x-javascript 200 AAW2RS7JB7HTF666XNZDQYJFA6PDQBPO - - 543 778507 iana.warc.gz 33449 7311 iana.warc.gz
# no match, single result
>>> test_cdx('org,iana)/dont_have_this', reverse = True, resolve_revisits = True, limit = 1)
""" """
# optimize for single last # optimize for single last
@ -123,7 +126,7 @@ def cdx_reverse(cdx_iter, limit):
for cdx in cdx_iter: for cdx in cdx_iter:
last = cdx last = cdx
return [last] return [last] if last else []
reverse_cdxs = deque(maxlen = limit) reverse_cdxs = deque(maxlen = limit)

View File

@ -2,34 +2,52 @@ import views
import utils import utils
import urlparse import urlparse
from wbrequestresponse import WbResponse
#================================================================= #=================================================================
# Standard WB Handler # Standard WB Handler
#================================================================= #=================================================================
class WBHandler: class WBHandler:
def __init__(self, cdx_reader, replay, html_view = None): def __init__(self, cdx_reader, replay, capturespage = None, searchpage = None):
self.cdx_reader = cdx_reader self.cdx_reader = cdx_reader
self.replay = replay self.replay = replay
self.html_view = html_view
self.text_view = views.TextQueryView() self.text_view = views.TextCapturesView()
self.html_view = capturespage
self.searchpage = searchpage
def __call__(self, wbrequest): def __call__(self, wbrequest):
if wbrequest.wb_url_str == '/':
return self.render_searchpage(wbrequest)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'query') as t:
cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True) cdx_lines = self.cdx_reader.load_for_request(wbrequest, parsed_cdx = True)
# new special modifier to always show cdx index # new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_': if wbrequest.wb_url.mod == 'cdx_':
return self.text_view(wbrequest, cdx_lines) return self.text_view.render_response(wbrequest, cdx_lines)
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
if not self.html_view: query_view = self.html_view if self.html_view else self.text_view
return self.text_view(wbrequest, cdx_lines) return query_view.render_response(wbrequest, cdx_lines)
else:
return self.html_view(wbrequest, cdx_lines)
with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with utils.PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines, self.cdx_reader) return self.replay(wbrequest, cdx_lines, self.cdx_reader)
def render_searchpage(self, wbrequest):
    """Respond to a request that carries no lookup url.

    Renders the configured search page view with ``wbrequest`` in its
    context, or falls back to a fixed plain-text message when no search
    page view is set.
    """
    search_view = self.searchpage
    if not search_view:
        return WbResponse.text_response('No Lookup Url Specified')

    return search_view.render_response(wbrequest = wbrequest)
def __str__(self):
    """Describe the handler by its index reader and replay view."""
    parts = [str(self.cdx_reader), str(self.replay)]
    return 'WBHandler: ' + ', '.join(parts)
#================================================================= #=================================================================
# CDX-Server Handler -- pass all params to cdx server # CDX-Server Handler -- pass all params to cdx server
@ -37,7 +55,7 @@ class WBHandler:
class CDXHandler: class CDXHandler:
def __init__(self, cdx_reader, view = None): def __init__(self, cdx_reader, view = None):
self.cdx_reader = cdx_reader self.cdx_reader = cdx_reader
self.view = view if view else views.TextQueryView() self.view = view if view else views.TextCapturesView()
def __call__(self, wbrequest): def __call__(self, wbrequest):
url = wbrequest.wb_url.url url = wbrequest.wb_url.url

View File

@ -83,7 +83,10 @@ class LocalCDXServer(IndexReader):
def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues): def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
# canonicalize to surt (canonicalization is part of surt conversion) # canonicalize to surt (canonicalization is part of surt conversion)
key = surt.surt(url) try:
key = surt.surt(url)
except Exception as e:
raise wbexceptions.BadUrlException('Bad Request Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url # if not surt, unsurt the surt to get canonicalized non-surt url
if not self.surt_ordered: if not self.surt_ordered:
@ -123,6 +126,10 @@ class LocalCDXServer(IndexReader):
}[wburl.type] }[wburl.type]
def __str__(self):
    """Describe this reader by the cdx sources it loads from."""
    label = 'load cdx indexes from '
    return label + str(self.sources)
#================================================================= #=================================================================
class RemoteCDXServer(IndexReader): class RemoteCDXServer(IndexReader):
@ -196,6 +203,10 @@ class RemoteCDXServer(IndexReader):
}[wburl.type] }[wburl.type]
def __str__(self):
    """Describe this reader by the cdx server url it queries."""
    description = 'server cdx from '
    description = description + self.server_url
    return description
#================================================================= #=================================================================
class CDXCaptureResult(OrderedDict): class CDXCaptureResult(OrderedDict):
CDX_FORMATS = [ CDX_FORMATS = [

View File

@ -39,13 +39,13 @@ def pywb_config_manual():
prefixes = [replay_resolvers.PrefixResolver(test_dir + 'warcs/')] prefixes = [replay_resolvers.PrefixResolver(test_dir + 'warcs/')]
# Jinja2 head insert # Jinja2 head insert
head_insert = views.J2HeadInsertView('./ui/head_insert.html') head_insert = views.J2TemplateView('./ui/head_insert.html')
# Create rewriting replay handler to rewrite records # Create rewriting replay handler to rewrite records
replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, head_insert = head_insert, buffer_response = True) replayer = replay_views.RewritingReplayView(resolvers = prefixes, archiveloader = aloader, head_insert_view = head_insert, buffer_response = True)
# Create Jinja2 based html query view # Create Jinja2 based html query view
html_view = views.J2QueryView('./ui/query.html') html_view = views.J2HtmlCapturesView('./ui/query.html')
# WB handler which uses the index reader, replayer, and html_view # WB handler which uses the index reader, replayer, and html_view
wb_handler = handlers.WBHandler(indexs, replayer, html_view) wb_handler = handlers.WBHandler(indexs, replayer, html_view)
@ -81,11 +81,21 @@ def pywb_config(config_file = None):
routes = map(yaml_parse_route, config['routes']) routes = map(yaml_parse_route, config['routes'])
homepage = yaml_load_template(config, 'home_html_template', 'Home Page Template')
errorpage = yaml_load_template(config, 'error_html_template', 'Error Page Template')
hostpaths = config.get('hostpaths', ['http://localhost:8080/']) hostpaths = config.get('hostpaths', ['http://localhost:8080/'])
return ArchivalRequestRouter(routes, hostpaths) return ArchivalRequestRouter(routes, hostpaths, homepage = homepage, errorpage = errorpage)
def yaml_load_template(config, name, desc = None):
    """Load an optional Jinja2 template view named by a config key.

    config -- mapping of settings (anything with ``.get``)
    name   -- key whose value is the template file path
    desc   -- label used in the log message; defaults to ``name``

    Returns a ``views.J2TemplateView`` for the configured path. If the key
    is missing or its value is falsy, that value is returned unchanged.
    """
    template_path = config.get(name)
    if not template_path:
        # nothing configured: pass the falsy value straight through
        return template_path

    label = desc if desc else name
    logging.info('Adding {0}: {1}'.format(label, template_path))
    return views.J2TemplateView(template_path)
def yaml_parse_index_loader(config): def yaml_parse_index_loader(config):
@ -113,17 +123,19 @@ def yaml_parse_index_loader(config):
return indexreader.LocalCDXServer([uri]) return indexreader.LocalCDXServer([uri])
def yaml_parse_head_insert(config): def yaml_parse_head_insert(config):
# First, try a template file # First, try a template file
head_insert_file = config.get('head_insert_html_template') head_insert_file = config.get('head_insert_html_template')
if head_insert_file: if head_insert_file:
logging.info('Adding Head-Insert Template: ' + head_insert_file) logging.info('Adding Head-Insert Template: ' + head_insert_file)
return views.J2HeadInsertView(head_insert_file) return views.J2TemplateView(head_insert_file)
# Then, static head_insert text # Then, static head_insert text
head_insert_text = config.get('head_insert_text', '') head_insert_text = config.get('head_insert_text', '')
logging.info('Adding Head-Insert Text: ' + head_insert_text) logging.info('Adding Head-Insert Text: ' + head_insert_text)
return head_insert_text return views.StaticTextView(head_insert_text)
def yaml_parse_calendar_view(config): def yaml_parse_calendar_view(config):
@ -133,7 +145,7 @@ def yaml_parse_calendar_view(config):
else: else:
logging.info('No HTML Calendar View Present') logging.info('No HTML Calendar View Present')
return views.J2QueryView(html_view_file) if html_view_file else None return views.J2HtmlCapturesView(html_view_file) if html_view_file else None
@ -150,12 +162,14 @@ def yaml_parse_route(config):
replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers, replayer = replay_views.RewritingReplayView(resolvers = archive_resolvers,
archiveloader = archive_loader, archiveloader = archive_loader,
head_insert = head_insert, head_insert_view = head_insert,
buffer_response = config.get('buffer_response', False)) buffer_response = config.get('buffer_response', False))
html_view = yaml_parse_calendar_view(config) html_view = yaml_parse_calendar_view(config)
wb_handler = handlers.WBHandler(index_loader, replayer, html_view) searchpage = yaml_load_template(config, 'search_html_template', 'Search Page Template')
wb_handler = handlers.WBHandler(index_loader, replayer, html_view, searchpage = searchpage)
return Route(name, wb_handler) return Route(name, wb_handler)

View File

@ -16,6 +16,13 @@ class PrefixResolver:
def __call__(self, filename): def __call__(self, filename):
return [self.prefix + filename] if (self.contains in filename) else [] return [self.prefix + filename] if (self.contains in filename) else []
def __repr__(self):
    """Show the prefix, plus the ``contains`` filter when one is set."""
    if not self.contains:
        return "PrefixResolver('{0}')".format(self.prefix)

    return "PrefixResolver('{0}', contains = '{1}')".format(self.prefix, self.contains)
#====================================== #======================================
class RedisResolver: class RedisResolver:
def __init__(self, redis_url, key_prefix = 'w:'): def __init__(self, redis_url, key_prefix = 'w:'):
@ -31,9 +38,14 @@ class RedisResolver:
print e print e
return None return None
def __repr__(self):
    """Show the redis url this resolver was created with."""
    template = "RedisResolver('{0}')"
    return template.format(self.redis_url)
#====================================== #======================================
class PathIndexResolver: class PathIndexResolver:
def __init__(self, pathindex_file): def __init__(self, pathindex_file):
self.pathindex_file = pathindex_file
self.reader = binsearch.FileReader(pathindex_file) self.reader = binsearch.FileReader(pathindex_file)
def __call__(self, filename): def __call__(self, filename):
@ -47,27 +59,32 @@ class PathIndexResolver:
return gen_list(result) return gen_list(result)
def __repr__(self):
    """Show the path index file this resolver reads."""
    template = "PathIndexResolver('{0}')"
    return template.format(self.pathindex_file)
#TODO: more options (remote files, contains param, etc..) #TODO: more options (remote files, contains param, etc..)
# find best resolver given the path # find best resolver given the path
def make_best_resolver(path): def make_best_resolver(path):
""" """
# http path # http path
>>> class_name(make_best_resolver('http://myhost.example.com/warcs/')) >>> make_best_resolver('http://myhost.example.com/warcs/')
'PrefixResolver' PrefixResolver('http://myhost.example.com/warcs/')
# redis path # redis path
>>> class_name(make_best_resolver('redis://myhost.example.com:1234/1')) >>> make_best_resolver('redis://myhost.example.com:1234/1')
'RedisResolver' RedisResolver('redis://myhost.example.com:1234/1')
# a file # a file
>>> class_name(make_best_resolver('file://' + os.path.realpath(__file__))) >>> make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__)) + '/replay_resolvers.py')
'PathIndexResolver' PathIndexResolver('/home/ilya/workspace/pywb/pywb/replay_resolvers.py')
# a dir # a dir
>>> class_name(make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__)))) >>> make_best_resolver('file://' + os.path.dirname(os.path.realpath(__file__)))
'PrefixResolver' PrefixResolver('/home/ilya/workspace/pywb/pywb')
""" """
url_parts = urlparse.urlsplit(path) url_parts = urlparse.urlsplit(path)
if url_parts.scheme == 'redis': if url_parts.scheme == 'redis':
@ -90,9 +107,6 @@ import utils
#================================================================= #=================================================================
if __name__ == "__main__" or utils.enable_doctests(): if __name__ == "__main__" or utils.enable_doctests():
def class_name(obj):
return obj.__class__.__name__
import doctest import doctest
doctest.testmod() doctest.testmod()

View File

@ -210,12 +210,15 @@ class ReplayView:
stream.close() stream.close()
def __str__(self):
    """Describe this view by the resolvers it uses to locate archive files."""
    label = 'find archive files from '
    return label + str(self.resolvers)
#================================================================= #=================================================================
class RewritingReplayView(ReplayView): class RewritingReplayView(ReplayView):
def __init__(self, resolvers, archiveloader, head_insert = None, header_rewriter = None, redir_to_exact = True, buffer_response = False): def __init__(self, resolvers, archiveloader, head_insert_view = None, header_rewriter = None, redir_to_exact = True, buffer_response = False):
ReplayView.__init__(self, resolvers, archiveloader) ReplayView.__init__(self, resolvers, archiveloader)
self.head_insert = head_insert self.head_insert_view = head_insert_view
self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter() self.header_rewriter = header_rewriter if header_rewriter else HeaderRewriter()
self.redir_to_exact = redir_to_exact self.redir_to_exact = redir_to_exact
@ -300,12 +303,7 @@ class RewritingReplayView(ReplayView):
status_headers = rewritten_headers.status_headers status_headers = rewritten_headers.status_headers
if text_type == 'html': if text_type == 'html':
# Support head_insert func head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = response.cdx) if self.head_insert_view else None
if hasattr(self.head_insert, '__call__'):
head_insert_str = self.head_insert(wbrequest, response.cdx)
else:
head_insert_str = str(self.head_insert)
rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str) rewriter = html_rewriter.HTMLRewriter(urlrewriter, outstream = None, head_insert = head_insert_str)
elif text_type == 'css': elif text_type == 'css':
rewriter = regex_rewriters.CSSRewriter(urlrewriter) rewriter = regex_rewriters.CSSRewriter(urlrewriter)

View File

@ -10,63 +10,72 @@ from jinja2 import Environment, FileSystemLoader
#================================================================= #=================================================================
class TextQueryView: class StaticTextView:
def __call__(self, wbrequest, cdx_lines): def __init__(self, text):
self.text = text
def render_to_string(self, **kwargs):
return self.text
def render_response(self, **kwargs):
return wbrequestresponse.WbResponse.text_stream(self.text)
#=================================================================
class J2TemplateView:
    """Render one Jinja2 template file, either to a string or to a WbResponse."""

    def __init__(self, filename):
        # the directory feeds the template loader, the file name is the lookup key
        template_dir, template_file = path.split(filename)

        self.template_file = template_file
        self.jinja_env = self.make_jinja_env(template_dir)

    def make_jinja_env(self, template_dir):
        """Create the Jinja2 environment for ``template_dir`` and register the filters."""
        jinja_env = Environment(loader = FileSystemLoader(template_dir), trim_blocks = True)
        jinja_env.filters['format_ts'] = J2TemplateView.format_ts
        return jinja_env

    def render_to_string(self, **kwargs):
        """Render the template with ``kwargs`` as its context and return the text."""
        template = self.jinja_env.get_template(self.template_file)
        return template.render(**kwargs)

    def render_response(self, **kwargs):
        """Render the template and wrap the result in a text/html WbResponse."""
        template_result = self.render_to_string(**kwargs)

        # The response is declared as utf-8, so encode explicitly. A bare str()
        # on Python 2 uses the ascii codec and raises UnicodeEncodeError as soon
        # as the rendered page contains a non-ASCII character.
        if not isinstance(template_result, bytes):
            template_result = template_result.encode('utf-8')

        return wbrequestresponse.WbResponse.text_response(template_result, content_type = 'text/html; charset=utf-8')

    # Filters
    @staticmethod
    def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
        """Jinja2 filter: convert a timestamp via utils.timestamp_to_datetime and format it."""
        value = utils.timestamp_to_datetime(value)
        return time.strftime(format, value)
# cdx index view
#=================================================================
# html captures 'calendar' view
#=================================================================
class J2HtmlCapturesView(J2TemplateView):
    """Jinja2 view for the html captures ('calendar') listing."""

    def render_response(self, wbrequest, cdx_lines):
        """Render the captures for the url requested in ``wbrequest``."""
        template_args = {
            # materialize the captures into a list before handing them to the template
            'cdx_lines': list(cdx_lines),
            'url': wbrequest.wb_url.url,
            'prefix': wbrequest.wb_prefix,
        }
        return J2TemplateView.render_response(self, **template_args)
#=================================================================
# stream raw cdx text
#=================================================================
class TextCapturesView:
def render_response(self, wbrequest, cdx_lines):
cdx_lines = imap(lambda x: str(x) + '\n', cdx_lines) cdx_lines = imap(lambda x: str(x) + '\n', cdx_lines)
return wbrequestresponse.WbResponse.text_stream(cdx_lines) return wbrequestresponse.WbResponse.text_stream(cdx_lines)
#=================================================================
class J2QueryView:
def __init__(self, filename, buffer_index = True):
template_dir, template_file = path.split(filename)
self.template_file = template_file
self.buffer_index = buffer_index
self.jinja_env = make_jinja_env(template_dir)
def __call__(self, wbrequest, cdx_lines):
template = self.jinja_env.get_template(self.template_file)
# buffer/convert to list so we have length available for template
if self.buffer_index:
cdx_lines = list(cdx_lines)
response = template.render(cdx_lines = cdx_lines,
url = wbrequest.wb_url.url,
prefix = wbrequest.wb_prefix)
return wbrequestresponse.WbResponse.text_response(str(response), content_type = 'text/html')
#=================================================================
# Render the head insert (eg. banner)
#=================================================================
class J2HeadInsertView:
def __init__(self, filename, buffer_index = True):
template_dir, template_file = path.split(filename)
self.template_file = template_file
self.jinja_env = make_jinja_env(template_dir)
def __call__(self, wbrequest, cdx):
template = self.jinja_env.get_template(self.template_file)
return template.render(wbrequest = wbrequest,cdx = cdx)
#=================================================================
# Jinja funcs
def make_jinja_env(template_dir):
jinja_env = Environment(loader = FileSystemLoader(template_dir), trim_blocks = True)
jinja_env.filters['format_ts'] = format_ts
return jinja_env
# Filters
def format_ts(value, format='%H:%M / %d-%m-%Y'):
value = utils.timestamp_to_datetime(value)
return time.strftime(format, value)

View File

@ -8,29 +8,6 @@ import importlib
import logging import logging
## ===========
'''
To declare Wayback with one collection, `mycoll`
and will be accessed by user at:
`http://mywb.example.com:8080/mycoll/`
and will load cdx from cdx server running at:
`http://cdx.example.com/cdx`
and look for warcs at paths:
`http://warcs.example.com/servewarc/` and
`http://warcs.example.com/anotherpath/`,
one could declare a `sample_wb_settings()` method as follows
'''
def create_wb_app(wb_router): def create_wb_app(wb_router):
# Top-level wsgi application # Top-level wsgi application
@ -52,14 +29,13 @@ def create_wb_app(wb_router):
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e: except (wbexceptions.NotFoundException, wbexceptions.AccessException) as e:
logging.info(str(e)) response = handle_exception(env, wb_router.errorpage, e, False)
response = handle_exception(env, e)
except wbexceptions.WbException as wbe:
response = handle_exception(env, wb_router.errorpage, wbe, False)
except Exception as e: except Exception as e:
last_exc = e response = handle_exception(env, wb_router.errorpage, e, True)
import traceback
traceback.print_exc()
response = handle_exception(env, e)
return response(env, start_response) return response(env, start_response)
@ -67,13 +43,25 @@ def create_wb_app(wb_router):
return application return application
def handle_exception(env, errorpage, exc, print_trace):
    """Turn an exception raised while handling a request into a response.

    env         -- WSGI environ of the failed request (not used here)
    errorpage   -- optional view with ``render_response(err_msg, err_details)``
    exc         -- the exception being handled
    print_trace -- if true, print the traceback and pass it to the error
                   page; otherwise only log the message at info level

    Must be called from inside the ``except`` block that caught ``exc``,
    since the traceback is taken from the exception currently being handled.
    """
    import traceback

    # exceptions may expose status as a method (most wbexceptions) or as a
    # plain string attribute; accept both, and default when there is none
    status = getattr(exc, 'status', None)
    if callable(status):
        status = status()
    if not status:
        status = '400 Bad Request'

    if print_trace:
        # format_exc() takes an optional frame limit, not the exception
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(str(exc))
        err_details = None

    if errorpage:
        # NOTE(review): status is not passed to the error page here, so the
        # rendered page presumably goes out with the view's default status -- confirm
        return errorpage.render_response(err_msg = str(exc), err_details = err_details)

    return WbResponse.text_response(status + ' Error: ' + str(exc), status = status)
#================================================================= #=================================================================

View File

@ -1,26 +1,33 @@
class RequestParseException(Exception): class WbException(Exception):
pass
class RequestParseException(WbException):
def __init__(self, string, to_parse):
WbException.__init__(self, string + to_parse)
self.to_parse = to_parse
def status(_): def status(_):
return '400 Bad Request' return '400 Bad Request'
class BadUrlException(Exception): class BadUrlException(WbException):
def status(_): def status(_):
return '400 Bad Request' return '400 Bad Request'
class AccessException(Exception): class AccessException(WbException):
def status(_): def status(_):
return '403 Forbidden' return '403 Forbidden'
class InvalidCDXException(Exception): class InvalidCDXException(WbException):
def status(_): def status(_):
return '500 Internal Server Error' return '500 Internal Server Error'
class NotFoundException(Exception): class NotFoundException(WbException):
def status(_): def status(_):
return '404 Not Found' return '404 Not Found'
# Exceptions that affect a specific capture and result in a retry # Exceptions that affect a specific capture and result in a retry
class CaptureException(Exception): class CaptureException(WbException):
def status(_): def status(_):
return '500 Internal Server Error' return '500 Internal Server Error'
@ -47,9 +54,9 @@ class ArchiveLoadFailed(CaptureException):
def status(_): def status(_):
return '503 Service Unavailable' return '503 Service Unavailable'
class InternalRedirect(Exception): class InternalRedirect(WbException):
def __init__(self, location, status = '302 Internal Redirect'): def __init__(self, location, status = '302 Internal Redirect'):
Exception.__init__(self, 'Redirecting -> ' + location) WbException.__init__(self, 'Redirecting -> ' + location)
self.status = status self.status = status
self.httpHeaders = [('Location', location)] self.httpHeaders = [('Location', location)]

View File

@ -68,7 +68,14 @@ class WbRequest:
self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix) self.wb_prefix = wb_prefix if not use_abs_prefix else WbRequest.make_abs_prefix(env, wb_prefix)
self.wb_url = archivalurl_class(wb_url) # wb_url present and not root page
if wb_url != '/' and wb_url != '' and archivalurl_class:
self.wb_url_str = wb_url
self.wb_url = archivalurl_class(wb_url)
else:
# no wb_url, just store blank
self.wb_url_str = '/'
self.wb_url = None
self.coll = coll self.coll = coll

View File

@ -82,10 +82,10 @@ class WbUrl:
self.mod = '' self.mod = ''
if not any (f(url) for f in [self._init_query, self._init_replay]): if not any (f(url) for f in [self._init_query, self._init_replay]):
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
if len(self.url) == 0: if len(self.url) == 0:
raise wbexceptions.RequestParseException('Invalid WB Request Url: ' + url) raise wbexceptions.RequestParseException('Invalid WB Request Url: ', url)
# protocol agnostic url -> http:// # protocol agnostic url -> http://
if self.url.startswith('//'): if self.url.startswith('//'):

11
ui/error.html Normal file
View File

@ -0,0 +1,11 @@
{# Error page: expects err_msg, plus optional err_details (a traceback).
   Both are escaped explicitly because the message can contain the requested
   url and the template environment is created without autoescaping. #}
<h2>Pywb Error</h2>
<b>{{ err_msg | e }}</b>
{% if err_details %}
<p>Error Details:</p>
<pre>
{{ err_details | e }}
</pre>
{% endif %}

View File

@ -1,7 +1,7 @@
<!-- WB Insert --> <!-- WB Insert -->
<script> <script>
wbinfo = {} wbinfo = {}
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts('%a, %b %d %Y %H:%M:%S') }}"; wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
</script> </script>
<script src='/static/wb.js'> </script> <script src='/static/wb.js'> </script>
<link rel='stylesheet' href='/static/wb.css'/> <link rel='stylesheet' href='/static/wb.css'/>

9
ui/index.html Normal file
View File

@ -0,0 +1,9 @@
<!-- Home page: expects 'routes' in the template context; lists each route as a link to '/' + route.path followed by the route's string form -->
<h2>pywb Sample Home Page</h2>
The following archive collections are available:
<ul>
{% for route in routes %}
<li><a href="{{ '/' + route.path }}">{{ '/' + route.path }}</a>: {{ route | string }}</li>
{% endfor %}
</ul>

View File

@ -4,14 +4,20 @@
<table id="captures" style="border-spacing: 10px;"> <table id="captures" style="border-spacing: 10px;">
<tr> <tr>
<th>Capture</th> <th>Capture</th>
<th>Status</th>
<th>Original Url</th>
<th>Archive File</th> <th>Archive File</th>
</tr> </tr>
{% for cdx in cdx_lines %} {% for cdx in cdx_lines %}
<tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}"> <tr style="{{ 'font-weight: bold' if cdx['mimetype'] != 'warc/revisit' else '' }}">
<td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ url }}">{{ cdx['timestamp'] | format_ts('%a, %b %d %Y %H:%M:%S') }}</a></td> <td><a href="{{ prefix }}{{ cdx.timestamp }}/{{ url }}">{{ cdx['timestamp'] | format_ts}}</a></td>
<td>{{ cdx['filename'] }}</td> <td>{{ cdx['filename'] }}</td>
<td>{{ cdx['statuscode'] }}</td>
<td>{{ cdx['originalurl'] }}</td>
</tr> </tr>
{% endfor %} {% endfor %}
</table> </table>
<i><b>* Unique captures are bold.</b><br/>* Other captures are duplicates of a previous capture.</i> <p>
<i><b>* Unique captures are bold.</b> Other captures are duplicates of a previous capture.</i>
</p>
</body> </body>

6
ui/search.html Normal file
View File

@ -0,0 +1,6 @@
<!-- Search page: expects 'wbrequest' in the template context; on submit, a non-empty input sends the browser to wb_prefix + '*/' + the entered url -->
<h2>pywb Search Page</h2>
Search Archived Content:
<form onsubmit="url = document.getElementById('search').value; if (url != '') { document.location.href = '{{ wbrequest.wb_prefix }}' + '*/' + url; } return false;">
<input id="search" name="search" placeholder="Enter url to search"/>
<button type="submit">Search</button>
</form>