mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Merge pull request #28 from ikreymer/pkg-reorg
pywb pkg refactoring: create pywb.framework, pywb.core and pywb.apps
This commit is contained in:
commit
5a28bc6992
0
pywb/apps/__init__.py
Normal file
0
pywb/apps/__init__.py
Normal file
17
pywb/apps/cdx_server.py
Normal file
17
pywb/apps/cdx_server.py
Normal file
@ -0,0 +1,17 @@
|
||||
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||
|
||||
from pywb.core.cdx_handler import create_cdx_server_app
|
||||
|
||||
#=================================================================
|
||||
# init cdx server app
|
||||
#=================================================================
|
||||
|
||||
# cdx-server only config
DEFAULT_CONFIG = 'pywb/cdx/config.yaml'

# Module-level object produced by init_app() from the cdx server app
# factory; the yaml config named above is passed along as config_file.
application = init_app(
    create_cdx_server_app,
    load_yaml=True,
    config_file=DEFAULT_CONFIG,
)

# When executed directly (not imported), hand the app to the bundled server.
if __name__ == "__main__":
    start_wsgi_server(application)
|
10
pywb/apps/wayback.py
Normal file
10
pywb/apps/wayback.py
Normal file
@ -0,0 +1,10 @@
|
||||
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||
from pywb.core.pywb_init import create_wb_router
|
||||
|
||||
#=================================================================
|
||||
# init pywb app
|
||||
#=================================================================
|
||||
# Module-level object produced by init_app() from the wayback router factory.
application = init_app(
    create_wb_router,
    load_yaml=True,
)

# When executed directly (not imported), hand the app to the bundled server.
if __name__ == "__main__":
    start_wsgi_server(application)
|
@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
|
||||
|
||||
from query import CDXQuery
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||
"""
|
||||
|
@ -4,9 +4,11 @@ import itertools
|
||||
from urllib import urlencode
|
||||
from urlparse import parse_qs
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXException(Exception):
|
||||
class CDXException(WbException):
|
||||
def status(self):
|
||||
return '400 Bad Request'
|
||||
|
||||
@ -61,7 +63,7 @@ class CDXObject(OrderedDict):
|
||||
cdxformat = i
|
||||
|
||||
if not cdxformat:
|
||||
raise Exception('unknown {0}-field cdx format'.format(len(fields)))
|
||||
raise CDXException('unknown {0}-field cdx format'.format(len(fields)))
|
||||
|
||||
for header, field in itertools.izip(cdxformat, fields):
|
||||
self[header] = field
|
||||
@ -85,8 +87,15 @@ class CDXObject(OrderedDict):
|
||||
"""
|
||||
if fields is None:
|
||||
return str(self) + '\n'
|
||||
else:
|
||||
return ' '.join(self[x] for x in fields) + '\n'
|
||||
|
||||
try:
|
||||
result = ' '.join(self[x] for x in fields) + '\n'
|
||||
except KeyError as ke:
|
||||
msg = 'Invalid field "{0}" found in fields= argument'
|
||||
msg = msg.format(ke.message)
|
||||
raise CDXException(msg)
|
||||
|
||||
return result
|
||||
|
||||
def __str__(self):
|
||||
if self.cdxline:
|
||||
@ -109,7 +118,7 @@ class IDXObject(OrderedDict):
|
||||
|
||||
if len(fields) < self.NUM_REQ_FIELDS:
|
||||
msg = 'invalid idx format: {0} fields found, {1} required'
|
||||
raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))
|
||||
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
|
||||
|
||||
for header, field in itertools.izip(self.FORMAT, fields):
|
||||
self[header] = field
|
||||
|
@ -31,8 +31,18 @@ def cdx_load(sources, query, perms_checker=None, process=True):
|
||||
if perms_checker:
|
||||
cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
|
||||
|
||||
if query.output == 'text':
|
||||
cdx_iter = cdx_to_text(cdx_iter, query.fields)
|
||||
|
||||
return cdx_iter
|
||||
|
||||
|
||||
#=================================================================
|
||||
def cdx_to_text(cdx_iter, fields):
    """
    Lazily convert each item of cdx_iter to its text form.

    Yields ``item.to_text(fields)`` for every item, in iteration order.
    """
    for entry in cdx_iter:
        text_line = entry.to_text(fields)
        yield text_line
|
||||
|
||||
|
||||
#=================================================================
|
||||
def restrict_cdx(cdx_iter, query, perms_checker):
|
||||
"""
|
||||
@ -56,6 +66,7 @@ def restrict_cdx(cdx_iter, query, perms_checker):
|
||||
|
||||
yield cdx
|
||||
|
||||
|
||||
#=================================================================
|
||||
def process_cdx(cdx_iter, query):
|
||||
if query.resolve_revisits:
|
||||
@ -255,7 +266,6 @@ def cdx_resolve_revisits(cdx_iter):
|
||||
originals = {}
|
||||
|
||||
for cdx in cdx_iter:
|
||||
|
||||
is_revisit = cdx.is_revisit()
|
||||
|
||||
digest = cdx['digest']
|
||||
|
@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer):
|
||||
logging.warn('No CDX Sources configured from paths=%s', paths)
|
||||
|
||||
def _add_cdx_source(self, source):
|
||||
if source is None: return
|
||||
if source is None:
|
||||
return
|
||||
|
||||
logging.debug('Adding CDX Source: %s', source)
|
||||
self.sources.append(source)
|
||||
|
||||
def add_cdx_source(self, source, config):
|
||||
if source is None: return
|
||||
if source is None:
|
||||
return
|
||||
|
||||
if isinstance(source, CDXSource):
|
||||
self._add_cdx_source(source)
|
||||
|
||||
elif isinstance(source, str):
|
||||
if os.path.isdir(source):
|
||||
for fn in os.listdir(source):
|
||||
@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None):
|
||||
surt_ordered=surt_ordered,
|
||||
ds_rules_file=ds_rules_file,
|
||||
perms_checker=perms_checker)
|
||||
|
||||
|
||||
|
@ -8,6 +8,7 @@ import urllib
|
||||
import urllib2
|
||||
import itertools
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXSource(object):
|
||||
"""
|
||||
@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource):
|
||||
if config:
|
||||
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
|
||||
|
||||
|
||||
def load_cdx(self, query):
|
||||
"""
|
||||
Load cdx from redis cache, from an ordered list
|
||||
|
@ -1,5 +1,6 @@
|
||||
from urllib import urlencode
|
||||
from urlparse import parse_qs
|
||||
from cdxobject import CDXException
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -62,6 +63,9 @@ class CDXQuery(object):
|
||||
@property
def fields(self):
    """
    Return the requested output fields as a list, or None if not set.

    Reads the comma-separated 'fields' param, falling back to the
    old param name 'fl' when 'fields' is missing or empty.
    """
    raw = self.params.get('fields') or self.params.get('fl')
    if not raw:
        return None
    return raw.split(',')
|
||||
|
||||
@property
|
||||
@ -105,9 +109,6 @@ class CDXQuery(object):
|
||||
"""
|
||||
params = parse_qs(env['QUERY_STRING'])
|
||||
|
||||
if not 'output' in params:
|
||||
params['output'] = 'text'
|
||||
|
||||
# parse_qs produces arrays for single values
|
||||
# cdx processing expects singleton params for all params,
|
||||
# except filters, so convert here
|
||||
@ -116,4 +117,8 @@ class CDXQuery(object):
|
||||
if name != 'filter':
|
||||
params[name] = val[0]
|
||||
|
||||
if not 'output' in params:
|
||||
params['output'] = 'text'
|
||||
|
||||
|
||||
return params
|
||||
|
@ -187,6 +187,7 @@ import pytest
|
||||
|
||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||
kwparams['url'] = url
|
||||
kwparams['output'] = 'cdxobject'
|
||||
fields = kwparams.get('fields')
|
||||
if fields:
|
||||
fields = fields.split(',')
|
||||
|
@ -1,15 +0,0 @@
|
||||
import webtest
|
||||
from pywb.cdx.wsgi_cdxserver import create_app
|
||||
from pywb import get_test_dir
|
||||
|
||||
class TestCdx:
    """Smoke test: query the cdx WSGI app through a webtest TestApp."""

    def setup(self):
        # the app is created over the 'cdx/' subdirectory of the test dir
        cdx_dir = get_test_dir() + 'cdx/'
        self.app = create_app(cdx_dir)
        self.testapp = webtest.TestApp(self.app)

    def test_cdx(self):
        query_url = '/cdx?url=http://www.iana.org/_css/2013.1/screen.css'
        resp = self.testapp.get(query_url)

        # a plain-text, non-empty body is all this test checks
        assert resp.content_type == 'text/plain'
        assert resp.content_length > 0
|
||||
|
||||
|
@ -1,103 +0,0 @@
|
||||
from werkzeug.wrappers import BaseResponse
|
||||
from cdxserver import create_cdx_server
|
||||
from pywb import get_test_dir
|
||||
from query import CDXQuery
|
||||
|
||||
import logging
|
||||
import os
|
||||
import yaml
|
||||
import pkg_resources
|
||||
|
||||
#=================================================================
|
||||
CONFIG_FILE = 'config.yaml'
|
||||
|
||||
RULES_FILE = 'rules.yaml'
|
||||
|
||||
DEFAULT_PORT = 8080
|
||||
|
||||
#=================================================================
|
||||
|
||||
class CDXQueryRequest(object):
    """Request wrapper exposing the CDXQuery built from a WSGI environ."""

    def __init__(self, environ):
        parsed_query = CDXQuery.from_wsgi_env(environ)
        self.query = parsed_query
|
||||
|
||||
|
||||
class WSGICDXServer(object):
    """
    WSGI callable which runs a cdx query and returns the result
    as a plain text response.
    """

    def __init__(self, config, rules_file):
        """Create the underlying cdx server from config and rules_file."""
        self.cdxserver = create_cdx_server(config, rules_file)

    def __call__(self, environ, start_response):
        """
        Handle one WSGI request.

        Any exception, including one raised while parsing the query
        from environ, is logged and reported as a '400 Error'
        plain-text response containing the exception message.
        """
        try:
            # parse inside the try so that a malformed query is reported
            # through the same error response as a failed lookup
            request = CDXQueryRequest(environ)

            logging.debug('request.args=%s', request.query)
            result = self.cdxserver.load_cdx_query(request.query)

            # TODO: select response type by "output" parameter
            response = PlainTextResponse(result, request.query.fields)
            return response(environ, start_response)
        except Exception as exc:
            logging.error('load_cdx failed', exc_info=1)
            # TODO: error response should be different for each response
            # type
            start_response('400 Error', [('Content-Type', 'text/plain')])
            return [str(exc)]
|
||||
|
||||
def cdx_text_out(cdx, fields):
    """
    Format a single cdx entry as one line of text.

    :param cdx: cdx entry; must support str() and, when fields is given,
        keys() and lookup by field name
    :param fields: sequence of field names to output, or a false value
        to output str(cdx)
    :return: the text line, terminated by a newline
    :raises KeyError: if fields names a field not present in cdx
    """
    if not fields:
        return str(cdx) + '\n'

    # keys must be called: logging the bound method only shows its repr
    logging.info('cdx fields=%s', list(cdx.keys()))

    try:
        return ' '.join(cdx[x] for x in fields) + '\n'
    except KeyError as ke:
        # still a KeyError for existing callers, but naming the bad field
        msg = 'Invalid field "{0}" found in fields= argument'
        raise KeyError(msg.format(ke.args[0]))
|
||||
|
||||
class PlainTextResponse(BaseResponse):
    """Response whose body is the text form of each cdx entry."""

    def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
        # lazy conversion: each entry is rendered via to_text(fields)
        # only when the response body is consumed
        text_lines = (cdx.to_text(fields) for cdx in cdxitr)

        super(PlainTextResponse, self).__init__(
            response=text_lines,
            status=status,
            content_type=content_type)
|
||||
|
||||
# class JsonResponse(Response):
|
||||
# pass
|
||||
# class MementoResponse(Response):
|
||||
# pass
|
||||
|
||||
def create_app(config=None):
    """
    Configure DEBUG-level logging and return a WSGICDXServer.

    When config is not given (or is empty), a config whose 'index_paths'
    points at the 'cdx/' subdirectory of the test dir is used instead.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s: [%(levelname)s]: %(message)s')

    if not config:
        config = {'index_paths': get_test_dir() + 'cdx/'}

    return WSGICDXServer(config, RULES_FILE)
|
||||
|
||||
if __name__ != "__main__":
    # imported (e.g. by a WSGI container): build the app with defaults
    # XXX pass production config
    application = create_app()
else:
    # run as a script: serve the app with werkzeug's development server
    from optparse import OptionParser
    from werkzeug.serving import run_simple

    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)

    options, args = opt.parse_args()

    # NOTE(review): yaml.load can construct arbitrary objects; acceptable
    # only because the config is a resource shipped with the package
    configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
    config = yaml.load(configdata)

    # precedence: command line, then config file 'port', then default
    port = options.port
    if port is None:
        configured_port = config and config.get('port')
        port = configured_port or DEFAULT_PORT

    app = create_app(config)

    logging.debug('Starting CDX Server on port %s', port)
    try:
        # NOTE(review): interactive debugger enabled while listening on
        # all interfaces -- confirm this is only used for development
        run_simple('0.0.0.0', port, app,
                   use_reloader=True,
                   use_debugger=True)
    except KeyboardInterrupt:
        pass
    logging.debug('Stopping CDX Server')
|
@ -1,56 +0,0 @@
|
||||
import views
|
||||
import handlers
|
||||
import replay_views
|
||||
import logging
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
#=================================================================
|
||||
# Config Loading
|
||||
#=================================================================
|
||||
def load_template_file(file, desc = None, view_class = views.J2TemplateView):
    """
    Wrap a template file in a view object.

    :param file: template file to load; a false value is returned unchanged
    :param desc: optional label used in the debug log message
    :param view_class: callable creating the view from the file
    :return: view_class(file), or file itself when it is false
    """
    if file:
        # use a generic label when no description was given
        label = desc if desc else 'Template'
        logging.debug('Adding {0}: {1}'.format(label, file))
        file = view_class(file)

    return file
|
||||
|
||||
#=================================================================
|
||||
def create_wb_handler(cdx_server, config, ds_rules_file=None):
    """
    Assemble a handlers.WBHandler for one collection.

    Builds the record loader and resolving loader from config, wraps
    them in a replay view, then attaches the query and search views.
    """
    record_loader = ArcWarcRecordLoader(
        cookie_maker=config.get('cookie_maker'))

    archive_paths = config.get('archive_paths')

    content_loader = ResolvingLoader(paths=archive_paths,
                                     cdx_server=cdx_server,
                                     record_loader=record_loader)

    content_rewriter = RewriteContent(ds_rules_file=ds_rules_file)

    head_insert_view = load_template_file(config.get('head_insert_html'),
                                          'Head Insert')

    replayer = replay_views.ReplayView(
        content_loader=content_loader,
        content_rewriter=content_rewriter,
        head_insert_view=head_insert_view,
        buffer_response=config.get('buffer_response', True),
        redir_to_exact=config.get('redir_to_exact', True),
        reporter=config.get('reporter')
    )

    captures_view = load_template_file(config.get('query_html'),
                                       'Captures Page',
                                       views.J2HtmlCapturesView)

    search_view = load_template_file(config.get('search_html'),
                                     'Search Page')

    return handlers.WBHandler(cdx_server,
                              replayer,
                              html_view=captures_view,
                              search_view=search_view)
|
||||
|
0
pywb/core/__init__.py
Normal file
0
pywb/core/__init__.py
Normal file
43
pywb/core/cdx_handler.py
Normal file
43
pywb/core/cdx_handler.py
Normal file
@ -0,0 +1,43 @@
|
||||
from pywb.cdx.query import CDXQuery
|
||||
from pywb.cdx.cdxserver import create_cdx_server
|
||||
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
from pywb.framework.basehandlers import BaseHandler
|
||||
|
||||
from views import TextCapturesView
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXHandler(BaseHandler):
    """
    Handler which passes wsgi request to cdx server and
    returns a text-based cdx response
    """

    def __init__(self, index_reader, view=None):
        self.index_reader = index_reader

        # default to the plain text captures view
        if not view:
            view = TextCapturesView()
        self.view = view

    def __call__(self, wbrequest):
        # all query params from the wsgi env are passed to the cdx server
        query_params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
        cdx_lines = self.index_reader.load_cdx(**query_params)

        return self.view.render_response(wbrequest, cdx_lines)

    def __str__(self):
        return 'CDX Handler: {0}'.format(str(self.index_reader))
|
||||
|
||||
|
||||
#=================================================================
|
||||
DEFAULT_RULES = 'pywb/rules.yaml'
|
||||
|
||||
#=================================================================
|
||||
def create_cdx_server_app(config):
    """
    Create a cdx server config to be wrapped in a wsgi app
    Currently using single access point '/cdx'
    TODO: more complex example with multiple collections?
    """
    cdx_server = create_cdx_server(config, DEFAULT_RULES)
    port = config.get('port')

    # the only route: 'cdx', served by a CDXHandler over the cdx server
    cdx_route = Route('cdx', CDXHandler(cdx_server))

    return ArchivalRouter([cdx_route], port=port)
|
@ -1,30 +1,13 @@
|
||||
import urlparse
|
||||
import pkgutil
|
||||
import mimetypes
|
||||
import time
|
||||
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.cdx.query import CDXQuery
|
||||
from wbrequestresponse import WbResponse
|
||||
from wbexceptions import WbException, NotFoundException
|
||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
from pywb.framework.wbexceptions import WbException, NotFoundException
|
||||
from views import TextCapturesView
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BaseHandler(object):
|
||||
def __call__(self, wbrequest):
|
||||
return wbrequest
|
||||
|
||||
def get_wburl_type(self):
|
||||
return None
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbUrlHandler(BaseHandler):
|
||||
def get_wburl_type(self):
|
||||
return WbUrl
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Standard WB Handler
|
||||
#=================================================================
|
||||
@ -33,11 +16,15 @@ class WBHandler(WbUrlHandler):
|
||||
html_view=None, search_view=None):
|
||||
|
||||
self.index_reader = index_reader
|
||||
|
||||
self.replay = replay
|
||||
|
||||
self.text_view = TextCapturesView()
|
||||
self.text_query_view = TextCapturesView()
|
||||
|
||||
self.query_view = html_view
|
||||
if not self.query_view:
|
||||
self.query_view = text_query_view
|
||||
|
||||
self.html_view = html_view
|
||||
self.search_view = search_view
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
@ -49,11 +36,10 @@ class WBHandler(WbUrlHandler):
|
||||
|
||||
# new special modifier to always show cdx index
|
||||
if wbrequest.wb_url.mod == 'cdx_':
|
||||
return self.text_view.render_response(wbrequest, cdx_lines)
|
||||
return self.text_query_view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
||||
query_view = self.html_view if self.html_view else self.text_view
|
||||
return query_view.render_response(wbrequest, cdx_lines)
|
||||
return self.query_view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||
return self.replay(wbrequest, cdx_lines)
|
||||
@ -70,29 +56,11 @@ class WBHandler(WbUrlHandler):
|
||||
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# CDX-Server Handler -- pass all params to cdx server
|
||||
#=================================================================
|
||||
class CDXHandler(BaseHandler):
|
||||
def __init__(self, index_reader, view = None):
|
||||
self.index_reader = index_reader
|
||||
self.view = view if view else TextCapturesView()
|
||||
|
||||
def __call__(self, wbrequest):
|
||||
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
|
||||
cdx_lines = self.index_reader.load_cdx(**params)
|
||||
|
||||
return self.view.render_response(wbrequest, cdx_lines)
|
||||
|
||||
def __str__(self):
|
||||
return 'Index Reader: ' + str(self.index_reader)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Static Content Handler
|
||||
#=================================================================
|
||||
class StaticHandler(BaseHandler):
|
||||
def __init__(self, static_path, pkg = __package__):
|
||||
def __init__(self, static_path, pkg = 'pywb'):
|
||||
mimetypes.init()
|
||||
|
||||
self.static_path = static_path
|
@ -29,6 +29,7 @@ class IndexReader(object):
|
||||
params.update(wbrequest.custom_params)
|
||||
|
||||
params['allowFuzzy'] = True
|
||||
params['output'] = 'cdxobject'
|
||||
|
||||
cdxlines = self.load_cdx(url=wburl.url, **params)
|
||||
|
181
pywb/core/pywb_init.py
Normal file
181
pywb/core/pywb_init.py
Normal file
@ -0,0 +1,181 @@
|
||||
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||
from pywb.framework.proxy import ProxyArchivalRouter
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
from pywb.warc.resolvingloader import ResolvingLoader
|
||||
|
||||
from pywb.rewrite.rewrite_content import RewriteContent
|
||||
|
||||
from indexreader import IndexReader
|
||||
from views import J2TemplateView, J2HtmlCapturesView
|
||||
from replay_views import ReplayView
|
||||
|
||||
from handlers import WBHandler
|
||||
from handlers import StaticHandler
|
||||
from cdx_handler import CDXHandler
|
||||
from handlers import DebugEchoHandler, DebugEchoEnvHandler
|
||||
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
|
||||
|
||||
#=================================================================
|
||||
DEFAULTS = {
|
||||
'hostpaths': ['http://localhost:8080'],
|
||||
'collections': {'pywb': './sample_archive/cdx/'},
|
||||
'archive_paths': './sample_archive/warcs/',
|
||||
|
||||
'head_insert_html': 'ui/head_insert.html',
|
||||
'query_html': 'ui/query.html',
|
||||
'search_html': 'ui/search.html',
|
||||
'home_html': 'ui/index.html',
|
||||
'error_html': 'ui/error.html',
|
||||
|
||||
'static_routes': {'static/default': 'static/'},
|
||||
|
||||
'domain_specific_rules': 'pywb/rules.yaml',
|
||||
}
|
||||
|
||||
#=================================================================
|
||||
class DictChain(object):
    """
    Read-only lookup over an ordered chain of dict-like objects.

    Each object only needs a get(key) method, so chains can be nested.
    """

    def __init__(self, *dicts):
        """Store the dicts; earlier ones take precedence on lookup."""
        self.dicts = dicts

    def get(self, key, default_val=None):
        """
        Return the first value for key that is not None.

        A key stored with the value None is treated as absent, so the
        lookup continues with the next dict in the chain.

        :param key: key to look up in each dict in order
        :param default_val: returned when no dict has a non-None value
        """
        for d in self.dicts:
            val = d.get(key)
            if val is not None:
                return val

        return default_val
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_template_file(file, desc=None, view_class=J2TemplateView):
    """
    Wrap a template file in a view object.

    :param file: template file to load; a false value is returned unchanged
    :param desc: optional label used in the debug log message
    :param view_class: callable creating the view from the file
    :return: view_class(file), or file itself when it is false
    """
    if file:
        # use a generic label when no description was given
        label = desc if desc else 'Template'
        logging.debug('Adding {0}: {1}'.format(label, file))
        file = view_class(file)

    return file
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_wb_handler(cdx_server, config, ds_rules_file=None):
    """
    Assemble a WBHandler for one collection.

    Builds the record loader and resolving loader from config, wraps
    them in a ReplayView, then attaches the query and search views.
    """
    record_loader = ArcWarcRecordLoader(
        cookie_maker=config.get('cookie_maker'))

    archive_paths = config.get('archive_paths')

    content_loader = ResolvingLoader(paths=archive_paths,
                                     cdx_server=cdx_server,
                                     record_loader=record_loader)

    head_insert_view = load_template_file(config.get('head_insert_html'),
                                          'Head Insert')

    replay_options = dict(
        content_loader=content_loader,
        content_rewriter=RewriteContent(ds_rules_file=ds_rules_file),
        head_insert_view=head_insert_view,
        buffer_response=config.get('buffer_response', True),
        redir_to_exact=config.get('redir_to_exact', True),
        reporter=config.get('reporter'),
    )
    replayer = ReplayView(**replay_options)

    captures_view = load_template_file(config.get('query_html'),
                                       'Captures Page',
                                       J2HtmlCapturesView)

    search_view = load_template_file(config.get('search_html'),
                                     'Search Page')

    return WBHandler(cdx_server,
                     replayer,
                     html_view=captures_view,
                     search_view=search_view)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def create_wb_router(passed_config=None):
    """
    Build the router for the wayback app from a config dict.

    Values missing from passed_config are looked up in DEFAULTS.
    One route is added per entry in 'collections', plus an optional
    '<name>-cdx' route when 'enable_cdx_api' is set, optional debug
    echo routes, and one route per entry in 'static_routes'.

    :param passed_config: dict of settings; None means defaults only
    :return: a ProxyArchivalRouter if 'enable_http_proxy' is set,
        otherwise an ArchivalRouter
    """
    # avoid a shared mutable default argument
    if passed_config is None:
        passed_config = {}

    config = DictChain(passed_config, DEFAULTS)

    routes = []

    hostpaths = config.get('hostpaths')

    port = config.get('port')

    # collections based on cdx source
    collections = config.get('collections')

    for name, value in collections.iteritems():
        # a plain string is shorthand for the collection's index path
        if isinstance(value, str):
            value = {'index_paths': value}

        # per-collection settings take precedence over global ones
        route_config = DictChain(value, config)

        ds_rules_file = route_config.get('domain_specific_rules', None)
        cdx_server = IndexReader(route_config, ds_rules_file)

        wb_handler = create_wb_handler(
            cdx_server=cdx_server,
            config=route_config,
            ds_rules_file=ds_rules_file,
        )

        logging.debug('Adding Collection: ' + name)

        route_class = route_config.get('route_class', Route)

        routes.append(route_class(name, wb_handler, config=route_config))

        # cdx query handler
        if route_config.get('enable_cdx_api', False):
            routes.append(Route(name + '-cdx', CDXHandler(cdx_server)))

    if config.get('debug_echo_env', False):
        routes.append(Route('echo_env', DebugEchoEnvHandler()))

    if config.get('debug_echo_req', False):
        routes.append(Route('echo_req', DebugEchoHandler()))

    static_routes = config.get('static_routes')

    for static_name, static_path in static_routes.iteritems():
        routes.append(Route(static_name, StaticHandler(static_path)))

    # Check for new proxy mode!
    if config.get('enable_http_proxy', False):
        router = ProxyArchivalRouter
    else:
        router = ArchivalRouter

    # Finally, create wb router
    return router(
        routes,
        # Specify hostnames that pywb will be running on
        # This will help catch occasionally missed rewrites that
        # fall-through to the host
        # (See archivalrouter.ReferRedirect)
        hostpaths=hostpaths,
        port=port,

        abs_path=config.get('absolute_paths', True),

        home_view=load_template_file(config.get('home_html'), 'Home Page'),
        error_view=load_template_file(config.get('error_html'), 'Error Page')
    )
|
@ -2,9 +2,9 @@ import StringIO
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
from wbrequestresponse import WbResponse
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
from wbexceptions import CaptureException, InternalRedirect
|
||||
from pywb.framework.wbexceptions import CaptureException, InternalRedirect
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
|
||||
from pywb.utils.loaders import LimitReader
|
||||
@ -51,7 +51,7 @@ class ReplayView:
|
||||
self._redirect_if_needed(wbrequest, cdx)
|
||||
|
||||
# one more check for referrer-based self-redirect
|
||||
self._reject_referrer_self_redirect(wbrequest, status_headers)
|
||||
self._reject_referrer_self_redirect(wbrequest)
|
||||
|
||||
response = None
|
||||
|
||||
@ -177,25 +177,30 @@ class ReplayView:
|
||||
|
||||
|
||||
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
|
||||
# self-redirect via location
|
||||
"""
|
||||
Check if response is a 3xx redirect to the same url
|
||||
If so, reject this capture to avoid causing redirect loop
|
||||
"""
|
||||
if status_headers.statusline.startswith('3'):
|
||||
request_url = wbrequest.wb_url.url.lower()
|
||||
location_url = status_headers.get_header('Location').lower()
|
||||
|
||||
#TODO: canonicalize before testing?
|
||||
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
||||
raise CaptureException('Self Redirect: ' + str(cdx))
|
||||
|
||||
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
|
||||
# at correct timestamp now, but must check for referrer redirect
|
||||
# indirect self-redirect, via meta-refresh, if referrer is same as current url
|
||||
if status_headers.statusline.startswith('2'):
|
||||
# build full url even if using relative-rewriting
|
||||
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
|
||||
referrer_url = wbrequest.referrer
|
||||
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
|
||||
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
|
||||
|
||||
|
||||
def _reject_referrer_self_redirect(self, wbrequest):
|
||||
"""
|
||||
Perform final check for referrer based self-redirect.
|
||||
This method should be called after verifying request timestamp matches capture.
|
||||
if referrer is same as current url, reject this response and try another capture
|
||||
"""
|
||||
if not wbrequest.referrer:
|
||||
return
|
||||
|
||||
# build full url even if using relative-rewriting
|
||||
request_url = (wbrequest.host_prefix +
|
||||
wbrequest.rel_prefix + str(wbrequest.wb_url))
|
||||
|
||||
if (UrlRewriter.strip_protocol(request_url) ==
|
||||
UrlRewriter.strip_protocol(wbrequest.referrer)):
|
||||
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
|
@ -1,6 +1,6 @@
|
||||
import pywb.utils.timeutils as timeutils
|
||||
from pywb.utils.timeutils import timestamp_to_datetime
|
||||
from pywb.framework.wbrequestresponse import WbResponse
|
||||
|
||||
import wbrequestresponse
|
||||
import urlparse
|
||||
import time
|
||||
|
||||
@ -18,7 +18,7 @@ class StaticTextView:
|
||||
return self.text
|
||||
|
||||
def render_response(self, **kwargs):
|
||||
return wbrequestresponse.WbResponse.text_stream(self.text)
|
||||
return WbResponse.text_stream(self.text)
|
||||
|
||||
#=================================================================
|
||||
class J2TemplateView:
|
||||
@ -34,7 +34,7 @@ class J2TemplateView:
|
||||
if template_dir.startswith('.') or template_dir.startswith('file://'):
|
||||
loader = FileSystemLoader(template_dir)
|
||||
else:
|
||||
loader = PackageLoader(__package__, template_dir)
|
||||
loader = PackageLoader('pywb', template_dir)
|
||||
|
||||
jinja_env = Environment(loader = loader, trim_blocks = True)
|
||||
jinja_env.filters['format_ts'] = J2TemplateView.format_ts
|
||||
@ -51,13 +51,13 @@ class J2TemplateView:
|
||||
def render_response(self, **kwargs):
|
||||
template_result = self.render_to_string(**kwargs)
|
||||
status = kwargs.get('status', '200 OK')
|
||||
return wbrequestresponse.WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
|
||||
return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
|
||||
|
||||
|
||||
# Filters
|
||||
@staticmethod
|
||||
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
|
||||
value = timeutils.timestamp_to_datetime(value)
|
||||
value = timestamp_to_datetime(value)
|
||||
return value.strftime(format_)
|
||||
|
||||
@staticmethod
|
||||
@ -90,7 +90,7 @@ class TextCapturesView:
|
||||
cdx += '\n'
|
||||
return cdx
|
||||
cdx_lines = imap(to_str, cdx_lines)
|
||||
return wbrequestresponse.WbResponse.text_stream(cdx_lines)
|
||||
return WbResponse.text_stream(cdx_lines)
|
||||
|
||||
|
||||
|
0
pywb/framework/__init__.py
Normal file
0
pywb/framework/__init__.py
Normal file
@ -1,17 +1,31 @@
|
||||
import urlparse
|
||||
import re
|
||||
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from wbrequestresponse import WbRequest, WbResponse
|
||||
|
||||
|
||||
#=================================================================
|
||||
# ArchivalRouter -- route WB requests in archival mode
|
||||
#=================================================================
|
||||
class ArchivalRouter:
|
||||
def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
|
||||
class ArchivalRouter(object):
|
||||
def __init__(self, routes,
|
||||
hostpaths=None,
|
||||
port=None,
|
||||
abs_path=True,
|
||||
home_view=None,
|
||||
error_view=None):
|
||||
|
||||
self.routes = routes
|
||||
self.fallback = ReferRedirect(hostpaths)
|
||||
|
||||
# optional port setting may be ignored by wsgi container
|
||||
self.port = port
|
||||
|
||||
if hostpaths:
|
||||
self.fallback = ReferRedirect(hostpaths)
|
||||
else:
|
||||
self.fallback = None
|
||||
|
||||
self.abs_path = abs_path
|
||||
|
||||
self.home_view = home_view
|
||||
@ -29,26 +43,27 @@ class ArchivalRouter:
|
||||
|
||||
return self.fallback(env, self.routes) if self.fallback else None
|
||||
|
||||
|
||||
def render_home_page(self):
|
||||
# render the homepage!
|
||||
if self.home_view:
|
||||
return self.home_view.render_response(routes = self.routes)
|
||||
return self.home_view.render_response(routes=self.routes)
|
||||
else:
|
||||
# default home page template
|
||||
text = '\n'.join(map(str, self.routes))
|
||||
return WbResponse.text_response(text)
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Route by matching regex (or fixed prefix)
|
||||
# of request uri (excluding first '/')
|
||||
#=================================================================
|
||||
class Route:
|
||||
class Route(object):
|
||||
# match up to the next / or ? or end
|
||||
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
|
||||
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
|
||||
|
||||
def __init__(self, regex, handler, coll_group=0, config={},
|
||||
lookahead=SLASH_QUERY_LOOKAHEAD):
|
||||
|
||||
def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
|
||||
self.path = regex
|
||||
if regex:
|
||||
self.regex = re.compile(regex + lookahead)
|
||||
@ -59,12 +74,11 @@ class Route:
|
||||
self.coll_group = coll_group
|
||||
self._custom_init(config)
|
||||
|
||||
|
||||
def __call__(self, env, use_abs_prefix):
|
||||
wbrequest = self.parse_request(env, use_abs_prefix)
|
||||
return self.handler(wbrequest) if wbrequest else None
|
||||
|
||||
def parse_request(self, env, use_abs_prefix, request_uri = None):
|
||||
def parse_request(self, env, use_abs_prefix, request_uri=None):
|
||||
if not request_uri:
|
||||
request_uri = env['REL_REQUEST_URI']
|
||||
|
||||
@ -75,10 +89,12 @@ class Route:
|
||||
matched_str = matcher.group(0)
|
||||
if matched_str:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
|
||||
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
|
||||
# remove the '/' + rel_prefix part of uri
|
||||
wb_url_str = request_uri[len(matched_str) + 2:]
|
||||
else:
|
||||
rel_prefix = env['SCRIPT_NAME'] + '/'
|
||||
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
|
||||
# the request_uri is the wb_url, since no coll
|
||||
wb_url_str = request_uri[1:]
|
||||
|
||||
coll = matcher.group(self.coll_group)
|
||||
|
||||
@ -88,20 +104,19 @@ class Route:
|
||||
rel_prefix=rel_prefix,
|
||||
coll=coll,
|
||||
use_abs_prefix=use_abs_prefix,
|
||||
wburl_class = self.handler.get_wburl_type(),
|
||||
wburl_class=self.handler.get_wburl_type(),
|
||||
urlrewriter_class=UrlRewriter)
|
||||
|
||||
|
||||
# Allow for applying of additional filters
|
||||
self._apply_filters(wbrequest, matcher)
|
||||
|
||||
return wbrequest
|
||||
|
||||
|
||||
def _apply_filters(self, wbrequest, matcher):
|
||||
for filter in self.filters:
|
||||
last_grp = len(matcher.groups())
|
||||
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
|
||||
filter_str = filter.format(matcher.group(last_grp))
|
||||
wbrequest.query_filter.append(filter_str)
|
||||
|
||||
def _custom_init(self, config):
|
||||
self.filters = config.get('filters', [])
|
||||
@ -112,7 +127,8 @@ class Route:
|
||||
|
||||
|
||||
#=================================================================
|
||||
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
|
||||
# ReferRedirect -- redirect urls that have 'fallen through'
|
||||
# based on the referrer settings
|
||||
#=================================================================
|
||||
class ReferRedirect:
|
||||
def __init__(self, match_prefixs):
|
||||
@ -121,7 +137,6 @@ class ReferRedirect:
|
||||
else:
|
||||
self.match_prefixs = [match_prefixs]
|
||||
|
||||
|
||||
def __call__(self, env, routes):
|
||||
referrer = env.get('HTTP_REFERER')
|
||||
|
||||
@ -133,7 +148,7 @@ class ReferRedirect:
|
||||
ref_split = urlparse.urlsplit(referrer)
|
||||
|
||||
# ensure referrer starts with one of allowed hosts
|
||||
if not any (referrer.startswith(i) for i in self.match_prefixs):
|
||||
if not any(referrer.startswith(i) for i in self.match_prefixs):
|
||||
if ref_split.netloc != env.get('HTTP_HOST'):
|
||||
return None
|
||||
|
||||
@ -144,13 +159,12 @@ class ReferRedirect:
|
||||
if app_path:
|
||||
# must start with current app name, if not root
|
||||
if not path.startswith(app_path):
|
||||
return None
|
||||
return None
|
||||
|
||||
path = path[len(app_path):]
|
||||
|
||||
|
||||
for route in routes:
|
||||
ref_request = route.parse_request(env, False, request_uri = path)
|
||||
ref_request = route.parse_request(env, False, request_uri=path)
|
||||
if ref_request:
|
||||
break
|
||||
|
||||
@ -174,6 +188,10 @@ class ReferRedirect:
|
||||
# 2013/path.html -> /path.html
|
||||
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
|
||||
|
||||
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
|
||||
final_url = urlparse.urlunsplit((ref_split.scheme,
|
||||
ref_split.netloc,
|
||||
rewriter.rewrite(rel_request_uri),
|
||||
'',
|
||||
''))
|
||||
|
||||
return WbResponse.redir_response(final_url)
|
23
pywb/framework/basehandlers.py
Normal file
23
pywb/framework/basehandlers.py
Normal file
@ -0,0 +1,23 @@
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BaseHandler(object):
    """
    Root of the handler classes: a handler that accepts any request.

    Calling the handler hands the request straight back, and no
    wburl class is advertised.
    """

    def get_wburl_type(self):
        """Return the wburl class for this handler -- None here."""
        return None

    def __call__(self, wbrequest):
        """Return ``wbrequest`` unchanged."""
        return wbrequest
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WbUrlHandler(BaseHandler):
    """
    Handler for requests that are assumed to contain a WbUrl.

    Advertises WbUrl as its wburl class, to ensure that the WbUrl
    is parsed in the request.
    """

    def get_wburl_type(self):
        """Return the WbUrl class."""
        return WbUrl
|
@ -2,23 +2,37 @@ from wbrequestresponse import WbResponse, WbRequest
|
||||
from archivalrouter import ArchivalRouter
|
||||
import urlparse
|
||||
|
||||
|
||||
#=================================================================
|
||||
# An experimental router which combines both archival and proxy modes
|
||||
# http proxy mode support is very simple: only latest capture is available currently
|
||||
# http proxy mode support is very simple so far:
|
||||
# only latest capture is available currently
|
||||
#=================================================================
|
||||
class ProxyArchivalRouter(ArchivalRouter):
|
||||
def __init__(self, routes,
|
||||
hostpaths=None,
|
||||
port=None,
|
||||
abs_path=True,
|
||||
home_view=None,
|
||||
error_view=None):
|
||||
|
||||
(super(ProxyArchivalRouter, self).
|
||||
__init__(routes,
|
||||
hostpaths=hostpaths,
|
||||
port=port,
|
||||
abs_path=abs_path,
|
||||
home_view=home_view,
|
||||
error_view=error_view))
|
||||
|
||||
class ProxyArchivalRouter:
|
||||
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
|
||||
self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view)
|
||||
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
|
||||
self.error_view = error_view
|
||||
#self.error_view = error_view
|
||||
|
||||
def __call__(self, env):
|
||||
response = self.archival(env)
|
||||
response = self.proxy(env)
|
||||
if response:
|
||||
return response
|
||||
|
||||
response = self.proxy(env)
|
||||
response = super(ProxyArchivalRouter, self).__call__(env)
|
||||
if response:
|
||||
return response
|
||||
|
||||
@ -29,7 +43,7 @@ class ProxyArchivalRouter:
|
||||
# Only supports latest capture replay at the moment
|
||||
#=================================================================
|
||||
class ProxyRouter:
|
||||
def __init__(self, handler, hostpaths = None, error_view = None):
|
||||
def __init__(self, handler, hostpaths=None, error_view=None):
|
||||
self.handler = handler
|
||||
self.hostpaths = hostpaths
|
||||
|
||||
@ -56,27 +70,26 @@ class ProxyRouter:
|
||||
|
||||
return self.handler(wbrequest)
|
||||
|
||||
|
||||
# Proxy Auto-Config (PAC) script for the proxy
|
||||
def make_pac_response(self, env):
|
||||
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
|
||||
|
||||
buff = 'function FindProxyForURL (url, host) {\n'
|
||||
|
||||
direct_cond =' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
|
||||
direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
|
||||
|
||||
for hostpath in self.hostpaths:
|
||||
parts = urlparse.urlsplit(hostpath).netloc.split(':')
|
||||
buff += direct_cond.format(parts[0])
|
||||
buff += direct.format(parts[0])
|
||||
|
||||
buff += direct_cond.format(env['SERVER_NAME'])
|
||||
buff += direct.format(env['SERVER_NAME'])
|
||||
|
||||
#buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0])
|
||||
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
|
||||
|
||||
return WbResponse.text_response(buff, content_type = 'application/x-ns-proxy-autoconfig')
|
||||
|
||||
content_type = 'application/x-ns-proxy-autoconfig'
|
||||
|
||||
return WbResponse.text_response(buff, content_type=content_type)
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -85,10 +98,11 @@ class ProxyRouter:
|
||||
class ProxyHttpsUrlRewriter:
|
||||
HTTP = 'http://'
|
||||
HTTPS = 'https://'
|
||||
|
||||
def __init__(self, wbrequest, prefix):
|
||||
pass
|
||||
|
||||
def rewrite(self, url, mod = None):
|
||||
def rewrite(self, url, mod=None):
|
||||
if url.startswith(self.HTTPS):
|
||||
return self.HTTP + url[len(self.HTTPS):]
|
||||
else:
|
||||
@ -97,6 +111,5 @@ class ProxyHttpsUrlRewriter:
|
||||
def get_timestamp_url(self, timestamp, url):
|
||||
return url
|
||||
|
||||
def get_abs_url(self, url = ''):
|
||||
def get_abs_url(self, url=''):
|
||||
return url
|
||||
|
@ -84,8 +84,8 @@ False
|
||||
|
||||
"""
|
||||
|
||||
from pywb.archivalrouter import Route, ReferRedirect
|
||||
from pywb.handlers import BaseHandler, WbUrlHandler
|
||||
from pywb.framework.archivalrouter import Route, ReferRedirect
|
||||
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||
import pprint
|
||||
|
||||
def print_req(req):
|
@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
from pywb.wbrequestresponse import WbRequest, WbResponse
|
||||
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
|
||||
|
||||
|
||||
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
|
@ -1,23 +1,22 @@
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
|
||||
class WbException(Exception):
|
||||
pass
|
||||
|
||||
class NotFoundException(WbException):
|
||||
def status(self):
|
||||
return '404 Not Found'
|
||||
|
||||
|
||||
# Exceptions that affect a specific capture and result in a retry
|
||||
class CaptureException(WbException):
|
||||
def status(self):
|
||||
return '500 Internal Server Error'
|
||||
|
||||
|
||||
class InternalRedirect(WbException):
|
||||
def __init__(self, location, status = '302 Internal Redirect'):
|
||||
def __init__(self, location, status='302 Internal Redirect'):
|
||||
WbException.__init__(self, 'Redirecting -> ' + location)
|
||||
self.status = status
|
||||
self.httpHeaders = [('Location', location)]
|
||||
|
||||
def status(self):
|
||||
return self.status
|
||||
|
@ -26,7 +26,6 @@ class WbRequest:
|
||||
except KeyError:
|
||||
return ''
|
||||
|
||||
|
||||
def __init__(self, env,
|
||||
request_uri=None,
|
||||
rel_prefix='',
|
||||
@ -40,7 +39,10 @@ class WbRequest:
|
||||
|
||||
self.env = env
|
||||
|
||||
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
|
||||
if request_uri:
|
||||
self.request_uri = request_uri
|
||||
else:
|
||||
self.request_uri = env.get('REL_REQUEST_URI')
|
||||
|
||||
self.coll = coll
|
||||
|
||||
@ -55,7 +57,6 @@ class WbRequest:
|
||||
else:
|
||||
self.wb_prefix = rel_prefix
|
||||
|
||||
|
||||
if not wb_url_str:
|
||||
wb_url_str = '/'
|
||||
|
||||
@ -83,7 +84,6 @@ class WbRequest:
|
||||
# PERF
|
||||
env['X_PERF'] = {}
|
||||
|
||||
|
||||
def _is_ajax(self):
|
||||
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
||||
if not value:
|
||||
@ -96,7 +96,6 @@ class WbRequest:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
varlist = vars(self)
|
||||
varstr = pprint.pformat(varlist)
|
||||
@ -111,32 +110,39 @@ class WbResponse:
|
||||
Holds a status_headers object and a response iter, to be
|
||||
returned to wsgi container.
|
||||
"""
|
||||
def __init__(self, status_headers, value = []):
|
||||
def __init__(self, status_headers, value=[]):
|
||||
self.status_headers = status_headers
|
||||
self.body = value
|
||||
|
||||
@staticmethod
|
||||
def text_stream(text, status = '200 OK', content_type = 'text/plain'):
|
||||
return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = text)
|
||||
def text_stream(stream, status='200 OK', content_type='text/plain'):
|
||||
status_headers = StatusAndHeaders(status,
|
||||
[('Content-Type', content_type)])
|
||||
|
||||
return WbResponse(status_headers, value=stream)
|
||||
|
||||
@staticmethod
|
||||
def text_response(text, status = '200 OK', content_type = 'text/plain'):
|
||||
return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = [text])
|
||||
def text_response(text, status='200 OK', content_type='text/plain'):
|
||||
status_headers = StatusAndHeaders(status,
|
||||
[('Content-Type', content_type)])
|
||||
|
||||
return WbResponse(status_headers, value=[text])
|
||||
|
||||
@staticmethod
|
||||
def redir_response(location, status = '302 Redirect'):
|
||||
return WbResponse(StatusAndHeaders(status, [('Location', location)]))
|
||||
|
||||
def redir_response(location, status='302 Redirect'):
|
||||
return WbResponse(StatusAndHeaders(status,
|
||||
[('Location', location)]))
|
||||
|
||||
def __call__(self, env, start_response):
|
||||
|
||||
# PERF
|
||||
perfstats = env.get('X_PERF')
|
||||
if perfstats:
|
||||
self.status_headers.headers.append(('X-Archive-Perf-Stats', str(perfstats)))
|
||||
self.status_headers.headers.append(('X-Archive-Perf-Stats',
|
||||
str(perfstats)))
|
||||
|
||||
|
||||
start_response(self.status_headers.statusline, self.status_headers.headers)
|
||||
start_response(self.status_headers.statusline,
|
||||
self.status_headers.headers)
|
||||
|
||||
if env['REQUEST_METHOD'] == 'HEAD':
|
||||
if hasattr(self.body, 'close'):
|
||||
@ -148,6 +154,5 @@ class WbResponse:
|
||||
else:
|
||||
return [str(self.body)]
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return str(vars(self))
|
165
pywb/framework/wsgi_wrappers.py
Normal file
165
pywb/framework/wsgi_wrappers.py
Normal file
@ -0,0 +1,165 @@
|
||||
from pywb.utils.wbexception import WbException
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
|
||||
from wbexceptions import NotFoundException, InternalRedirect
|
||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||
|
||||
|
||||
import os
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
|
||||
DEFAULT_PORT = 8080
|
||||
|
||||
#=================================================================
|
||||
# adapted from wsgiref.request_uri, but doesn't include domain name
|
||||
# and allows all characters which are allowed in the path segment
|
||||
# according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
||||
# explained here:
|
||||
# http://stackoverflow.com/questions/4669692/
|
||||
# valid-characters-for-directory-part-of-a-url-for-short-links
|
||||
|
||||
|
||||
def rel_request_uri(environ, include_query=1):
    """
    Return the requested path, optionally including the query string

    The path is taken from PATH_INFO (empty string if missing) and
    percent-encoded, leaving '/' and the characters in the ``safe``
    set below untouched. If ``include_query`` is true and QUERY_STRING
    is non-empty, it is appended verbatim after a '?'.

    # Simple test:
    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
    '/web/example.com'

    # Test all unencoded special chars and double-quote
    # (double-quote must be encoded but not single quote)
    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
    "/web/example.com/0~!+$&'()*+,;=:%22"
    """
    # quote() moved to urllib.parse in Python 3; accept either location
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
    if include_query and environ.get('QUERY_STRING'):
        url += '?' + environ['QUERY_STRING']

    return url
|
||||
|
||||
|
||||
#=================================================================
|
||||
class WSGIApp(object):
    """
    WSGI callable wrapping a wb router.

    The router is called with the WSGI environ and is expected to
    return a response callable taking (env, start_response), or a
    falsy value if nothing handled the request.
    """
    def __init__(self, wb_router):
        self.wb_router = wb_router
        self.port = DEFAULT_PORT
        # the router may carry its own (possibly None) port setting
        if hasattr(wb_router, 'port'):
            self.port = wb_router.port

    # Top-level wsgi application
    def __call__(self, env, start_response):
        """
        Route the request and return the result of calling the
        response with (env, start_response).

        Sets env['REL_REQUEST_URI'] before routing. Exceptions raised
        by the router are converted to error responses.
        """
        if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
            env['REL_REQUEST_URI'] = rel_request_uri(env)
        else:
            env['REL_REQUEST_URI'] = env['REQUEST_URI']

        wb_router = self.wb_router
        response = None

        try:
            response = wb_router(env)

            if not response:
                msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
                raise NotFoundException(msg)

        except InternalRedirect as ir:
            response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))

        except WbException as e:
            # not every router defines an error_view (same as 'port'
            # above); a missing one must not mask the original error
            error_view = getattr(wb_router, 'error_view', None)
            response = handle_exception(env, error_view, e, False)

        except Exception as e:
            error_view = getattr(wb_router, 'error_view', None)
            response = handle_exception(env, error_view, e, True)

        return response(env, start_response)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def handle_exception(env, error_view, exc, print_trace):
    """
    Build an error response for an exception raised while handling
    a request.

    env: the WSGI environ (not used here)
    error_view: object with a render_response(err_msg, err_details,
                status) method, or a falsy value to get a plain
                text response instead
    exc: the exception being handled
    print_trace: if true, print the traceback of the exception
                 currently being handled and pass it to the view as
                 err_details; otherwise only log the message

    The status line comes from the exception's 'status' attribute
    if it has one, otherwise '400 Bad Request' is used.
    """
    if hasattr(exc, 'status'):
        status = exc.status
        # usually a method, but some exceptions (eg. InternalRedirect)
        # store the status line directly as an attribute
        if callable(status):
            status = status()
    else:
        status = '400 Bad Request'

    if print_trace:
        import traceback
        # format_exc() formats the exception currently being handled;
        # its only argument is a frame limit, not the exception
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(str(exc))
        err_details = None

    if error_view:
        return error_view.render_response(err_msg=str(exc),
                                          err_details=err_details,
                                          status=status)
    else:
        return WbResponse.text_response(status + ' Error: ' + str(exc),
                                        status=status)
|
||||
|
||||
#=================================================================
|
||||
DEFAULT_CONFIG_FILE = 'config.yaml'
|
||||
|
||||
|
||||
#=================================================================
|
||||
def init_app(init_func, load_yaml=True, config_file=None):
    """
    Create the wb router via ``init_func`` and wrap it in a WSGIApp.

    Logging is set up at DEBUG level first. If ``load_yaml`` is true,
    the config file path is taken from ``config_file``, then the
    PYWB_CONFIG_FILE environment variable, then DEFAULT_CONFIG_FILE;
    the loaded config is passed to ``init_func``. Otherwise
    ``init_func`` is called with no arguments.

    Any error raised by ``init_func`` is logged and re-raised.
    """
    log_format = '%(asctime)s: [%(levelname)s]: %(message)s'
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    logging.info('')

    init_args = ()

    if load_yaml:
        # explicit argument first, then environment, then the default
        config_file = (config_file or
                       os.environ.get('PYWB_CONFIG_FILE') or
                       DEFAULT_CONFIG_FILE)

        init_args = (load_yaml_config(config_file),)

    try:
        wb_router = init_func(*init_args)
    except:
        logging.exception('*** pywb app init FAILED config from "%s"!\n',
                          init_func.__name__)
        raise

    logging.info('*** pywb app inited with config from "%s"!\n',
                 init_func.__name__)

    return WSGIApp(wb_router)
|
||||
|
||||
|
||||
#=================================================================
|
||||
def start_wsgi_server(the_app):
    """
    Run ``the_app`` in the wsgiref reference server until interrupted.

    The port is chosen in this order: the -p/--port command line
    option, the app's own ``port`` attribute, then DEFAULT_PORT.
    """
    from wsgiref.simple_server import make_server
    from optparse import OptionParser

    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)

    options, args = opt.parse_args()

    # command line option takes precedence over the app's setting
    port = options.port

    if not port:
        port = the_app.port

    if not port:
        port = DEFAULT_PORT

    logging.debug('Starting CDX Server on port %s', port)

    try:
        httpd = make_server('', port, the_app)
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server
        pass

    logging.debug('Stopping CDX Server')
|
@ -1,128 +0,0 @@
|
||||
import handlers
|
||||
import archivalrouter
|
||||
import config_utils
|
||||
import proxy
|
||||
from indexreader import IndexReader
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import logging
|
||||
|
||||
#=================================================================
|
||||
DEFAULTS = {
|
||||
'hostpaths': ['http://localhost:8080'],
|
||||
'collections': {'pywb': './sample_archive/cdx/'},
|
||||
'archive_paths': './sample_archive/warcs/',
|
||||
|
||||
'head_insert_html': 'ui/head_insert.html',
|
||||
'query_html': 'ui/query.html',
|
||||
'search_html': 'ui/search.html',
|
||||
'home_html': 'ui/index.html',
|
||||
'error_html': 'ui/error.html',
|
||||
|
||||
'static_routes': {'static/default': 'static/'},
|
||||
|
||||
'domain_specific_rules': 'rules.yaml',
|
||||
}
|
||||
|
||||
class DictChain:
    """
    Look up keys across several dicts, in the order they were given.
    """
    def __init__(self, *dicts):
        self.dicts = dicts

    def get(self, key, default_val=None):
        """
        Return the first value for ``key`` that is not None,
        or ``default_val`` if no dict provides one.
        """
        candidates = (each.get(key) for each in self.dicts)
        return next((found for found in candidates if found is not None),
                    default_val)
|
||||
|
||||
|
||||
#=================================================================
|
||||
## Reference non-YAML config
|
||||
#=================================================================
|
||||
def pywb_config_manual(passed_config=None):
    """
    Build the wb router from a config dict, falling back to DEFAULTS
    for any setting that is missing.

    passed_config: dict of settings; None (for example the result of
                   loading an empty YAML file) is treated as empty.

    Adds one route per entry in 'collections' (plus a '<name>-cdx'
    route when 'enable_cdx_api' is set), optional debug echo routes,
    and one route per entry in 'static_routes'. Returns a
    ProxyArchivalRouter if 'enable_http_proxy' is set, otherwise an
    ArchivalRouter.
    """
    # avoid a shared mutable default, and accept an explicit None
    if passed_config is None:
        passed_config = {}

    config = DictChain(passed_config, DEFAULTS)

    routes = []

    hostpaths = config.get('hostpaths')

    # collections based on cdx source
    collections = config.get('collections')

    for name, value in collections.iteritems():
        # a plain string is shorthand for the index path
        if isinstance(value, str):
            value = {'index_paths': value}

        # per-collection settings override the global ones
        route_config = DictChain(value, config)

        ds_rules_file = route_config.get('domain_specific_rules', None)
        cdx_server = IndexReader(route_config, ds_rules_file)

        wb_handler = config_utils.create_wb_handler(
            cdx_server=cdx_server,
            config=route_config,
            ds_rules_file=ds_rules_file,
        )

        logging.debug('Adding Collection: ' + name)

        route_class = route_config.get('route_class', archivalrouter.Route)

        routes.append(route_class(name, wb_handler, config=route_config))

        # cdx query handler
        if route_config.get('enable_cdx_api', False):
            routes.append(archivalrouter.Route(name + '-cdx',
                                               handlers.CDXHandler(cdx_server)))

    if config.get('debug_echo_env', False):
        routes.append(archivalrouter.Route('echo_env',
                                           handlers.DebugEchoEnvHandler()))

    if config.get('debug_echo_req', False):
        routes.append(archivalrouter.Route('echo_req',
                                           handlers.DebugEchoHandler()))

    static_routes = config.get('static_routes')

    for static_name, static_path in static_routes.iteritems():
        routes.append(archivalrouter.Route(static_name,
                                           handlers.StaticHandler(static_path)))

    # Check for new proxy mode!
    if config.get('enable_http_proxy', False):
        router = proxy.ProxyArchivalRouter
    else:
        router = archivalrouter.ArchivalRouter

    home_view = config_utils.load_template_file(config.get('home_html'),
                                                'Home Page')
    error_view = config_utils.load_template_file(config.get('error_html'),
                                                 'Error Page')

    # Finally, create wb router
    return router(
        routes,
        # Specify hostnames that pywb will be running on
        # This will help catch occasionally missed rewrites
        # that fall-through to the host
        # (See archivalrouter.ReferRedirect)
        hostpaths=hostpaths,

        abs_path=config.get('absolute_paths', True),

        home_view=home_view,
        error_view=error_view
    )
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# YAML config loader
|
||||
#=================================================================
|
||||
DEFAULT_CONFIG_FILE = 'config.yaml'
|
||||
|
||||
|
||||
def pywb_config(config_file=None):
    """
    Load a YAML config file and build the wb router from it.

    If ``config_file`` is not given, the path is read from the
    PYWB_CONFIG environment variable, falling back to
    DEFAULT_CONFIG_FILE.
    """
    config_path = config_file or os.environ.get('PYWB_CONFIG',
                                                DEFAULT_CONFIG_FILE)

    # NOTE(review): yaml.load can construct arbitrary objects; fine for
    # an operator-supplied config file, not for untrusted input
    with open(config_path) as stream:
        parsed = yaml.load(stream)

    return pywb_config_manual(parsed)
|
||||
|
@ -4,6 +4,9 @@
|
||||
import surt
|
||||
import urlparse
|
||||
|
||||
from wbexception import WbException
|
||||
|
||||
|
||||
#=================================================================
|
||||
class UrlCanonicalizer(object):
|
||||
def __init__(self, surt_ordered=True):
|
||||
@ -14,7 +17,7 @@ class UrlCanonicalizer(object):
|
||||
|
||||
|
||||
#=================================================================
|
||||
class UrlCanonicalizeException(Exception):
|
||||
class UrlCanonicalizeException(WbException):
|
||||
def status(self):
|
||||
return '400 Bad Request'
|
||||
|
||||
@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
||||
|
||||
elif match_type == 'domain':
|
||||
if not surt_ordered:
|
||||
raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
|
||||
msg = 'matchType=domain unsupported for non-surt'
|
||||
raise UrlCanonicalizeException(msg)
|
||||
|
||||
host = start_key.split(')/')[0]
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
import yaml
|
||||
import pkgutil
|
||||
from loaders import load_yaml_config
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
||||
DEFAULT_RULES_FILE = 'rules.yaml'
|
||||
DEFAULT_RULES_PKG = 'pywb'
|
||||
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -23,10 +22,14 @@ class RuleSet(object):
|
||||
|
||||
self.rules = []
|
||||
|
||||
ds_rules_file = kwargs.get('ds_rules_file')
|
||||
default_rule_config = kwargs.get('default_rule_config')
|
||||
|
||||
config = self.load_default_rules(ds_rules_file)
|
||||
ds_rules_file = kwargs.get('ds_rules_file')
|
||||
|
||||
if not ds_rules_file:
|
||||
ds_rules_file = DEFAULT_RULES_FILE
|
||||
|
||||
config = load_yaml_config(ds_rules_file)
|
||||
|
||||
rulesmap = config.get('rules') if config else None
|
||||
|
||||
@ -53,22 +56,6 @@ class RuleSet(object):
|
||||
if not def_key_found and default_rule_config is not None:
|
||||
self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))
|
||||
|
||||
@staticmethod
|
||||
def load_default_rules(filename=None, pkg=None):
|
||||
config = None
|
||||
|
||||
if not filename:
|
||||
filename = DEFAULT_RULES_FILE
|
||||
|
||||
if not pkg:
|
||||
pkg = DEFAULT_RULES_PKG
|
||||
|
||||
if filename:
|
||||
yaml_str = pkgutil.get_data(pkg, filename)
|
||||
config = yaml.load(yaml_str)
|
||||
|
||||
return config
|
||||
|
||||
def iter_matching(self, urlkey):
|
||||
"""
|
||||
Iterate over all matching rules for given urlkey
|
||||
|
@ -7,11 +7,20 @@ import os
|
||||
import hmac
|
||||
import urllib2
|
||||
import time
|
||||
import pkg_resources
|
||||
|
||||
|
||||
#=================================================================
|
||||
def is_http(filename):
|
||||
return any(filename.startswith(x) for x in ['http://', 'https://'])
|
||||
return filename.startswith(('http://', 'https://'))
|
||||
|
||||
|
||||
#=================================================================
|
||||
def load_yaml_config(config_file):
|
||||
import yaml
|
||||
configdata = BlockLoader().load(config_file)
|
||||
config = yaml.load(configdata)
|
||||
return config
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -24,27 +33,46 @@ class BlockLoader(object):
|
||||
def __init__(self, cookie_maker=None):
|
||||
self.cookie_maker = cookie_maker
|
||||
|
||||
def load(self, url, offset, length):
|
||||
def load(self, url, offset=0, length=-1):
|
||||
"""
|
||||
Determine loading method based on uri
|
||||
"""
|
||||
if is_http(url):
|
||||
return self.load_http(url, offset, length)
|
||||
else:
|
||||
return self.load_file(url, offset, length)
|
||||
return self.load_file_or_resource(url, offset, length)
|
||||
|
||||
def load_file(self, url, offset, length):
|
||||
def load_file_or_resource(self, url, offset, length):
|
||||
"""
|
||||
Load a file-like reader from the local file system
|
||||
"""
|
||||
|
||||
file_only = False
|
||||
|
||||
if url.startswith('file://'):
|
||||
url = url[len('file://'):]
|
||||
file_only = True
|
||||
|
||||
afile = open(url, 'rb')
|
||||
afile.seek(offset)
|
||||
try:
|
||||
# first, try as file
|
||||
afile = open(url, 'rb')
|
||||
|
||||
if length > 0:
|
||||
except IOError:
|
||||
if file_only:
|
||||
raise
|
||||
|
||||
# then, try as package.path/file
|
||||
pkg_split = url.split('/', 1)
|
||||
if len(pkg_split) == 1:
|
||||
raise
|
||||
|
||||
afile = pkg_resources.resource_stream(pkg_split[0],
|
||||
pkg_split[1])
|
||||
|
||||
if offset > 0:
|
||||
afile.seek(offset)
|
||||
|
||||
if length >= 0:
|
||||
return LimitReader(afile, length)
|
||||
else:
|
||||
return afile
|
||||
|
@ -30,9 +30,9 @@
|
||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
|
||||
' CDX N b a m s k r M S V g\\n'
|
||||
|
||||
#DecompressingBufferedReader readline() with decompression
|
||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
||||
' CDX N b a m s k r M S V g\\n'
|
||||
#DecompressingBufferedReader readline() with decompression (zipnum file, no header)
|
||||
>>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
||||
'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\\n'
|
||||
|
||||
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
||||
'Example Domain'
|
||||
@ -60,7 +60,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
from pywb import get_test_dir
|
||||
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
|
||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||
|
||||
test_zip_dir = get_test_dir() + 'zipcdx/'
|
||||
|
||||
def read_multiple(reader, inc_reads):
|
||||
result = None
|
||||
|
@ -171,7 +171,6 @@ def timestamp_to_datetime(string):
|
||||
# pad to 6 digits
|
||||
string = _pad_timestamp(string, PAD_6)
|
||||
|
||||
|
||||
def clamp(val, min_, max_):
|
||||
try:
|
||||
val = int(val)
|
||||
|
3
pywb/utils/wbexception.py
Normal file
3
pywb/utils/wbexception.py
Normal file
@ -0,0 +1,3 @@
|
||||
class WbException(Exception):
    """
    Exception that can report an HTTP status line for itself.
    """

    def status(self):
        """Return the status line to respond with for this error."""
        return '500 Internal Server Error'
|
@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
|
||||
|
||||
#=================================================================
|
||||
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
|
||||
'type, rec_headers, ' +
|
||||
@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord',
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ArchiveLoadFailed(Exception):
|
||||
class ArchiveLoadFailed(WbException):
|
||||
def __init__(self, reason, filename=''):
|
||||
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
|
||||
#self.filename = filename
|
||||
@ -62,9 +65,9 @@ class ArcWarcRecordLoader:
|
||||
decomp_type = 'gzip'
|
||||
|
||||
# Create decompressing stream
|
||||
stream = DecompressingBufferedReader(stream = raw,
|
||||
decomp_type = decomp_type,
|
||||
block_size = self.block_size)
|
||||
stream = DecompressingBufferedReader(stream=raw,
|
||||
decomp_type=decomp_type,
|
||||
block_size=self.block_size)
|
||||
|
||||
(the_format, rec_headers) = self._detect_type_load_headers(stream)
|
||||
|
||||
|
@ -176,6 +176,6 @@ class ResolvingLoader:
|
||||
params = {'url': url,
|
||||
'closest': timestamp,
|
||||
'filter': 'digest:' + digest,
|
||||
'output': 'raw'}
|
||||
'output': 'cdxobject'}
|
||||
|
||||
return self.cdx_server.load_cdx(**params)
|
||||
|
124
pywb/wbapp.py
124
pywb/wbapp.py
@ -1,124 +0,0 @@
|
||||
from wbexceptions import WbException, NotFoundException, InternalRedirect
|
||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||
|
||||
from pywb.cdx.cdxserver import CDXException
|
||||
from pywb.utils.canonicalize import UrlCanonicalizeException
|
||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||
|
||||
import os
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
|
||||
|
||||
#=================================================================
|
||||
# adapted from wsgiref.util.request_uri, but doesn't include domain name and allows all characters
|
||||
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
||||
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
|
||||
def rel_request_uri(environ, include_query=1):
    """
    Return the requested path, optionally including the query string

    The path is read from ``PATH_INFO`` (empty string when absent) and
    percent-encoded, leaving ``/~!$&'()*+,;=:@`` unescaped.

    :param environ: WSGI environ dict
    :param include_query: when true and ``QUERY_STRING`` is non-empty,
        ``'?' + QUERY_STRING`` is appended as-is (it is not re-encoded)
    :return: the relative request uri as a string

    # Simple test:
    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
    '/web/example.com'

    # Test all unencoded special chars and double-quote
    # (double-quote must be encoded but not single quote)
    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
    "/web/example.com/0~!+$&'()*+,;=:%22"
    """
    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')

    if include_query and environ.get('QUERY_STRING'):
        url += '?' + environ['QUERY_STRING']

    return url
|
||||
|
||||
#=================================================================
|
||||
def create_wb_app(wb_router):
    """
    Wrap ``wb_router`` in a top-level WSGI callable.

    The returned ``application(env, start_response)`` sets
    ``env['REL_REQUEST_URI']``, calls ``wb_router(env)`` and converts any
    exception raised along the way into a response, which is then invoked
    as a WSGI app itself.
    """

    def application(env, start_response):
        # Trust REQUEST_URI only when it is present and the app is not
        # mounted under a SCRIPT_NAME; otherwise rebuild it from PATH_INFO
        if env.get('REQUEST_URI') and not env.get('SCRIPT_NAME'):
            env['REL_REQUEST_URI'] = env['REQUEST_URI']
        else:
            env['REL_REQUEST_URI'] = rel_request_uri(env)

        response = None

        try:
            response = wb_router(env)

            # a falsy result means no route matched; raised inside the
            # try so it is rendered by the handlers below
            if not response:
                msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
                raise NotFoundException(msg)

        except InternalRedirect as redirect:
            status_headers = StatusAndHeaders(redirect.status,
                                              redirect.httpHeaders)
            response = WbResponse(status_headers)

        except (WbException, CDXException,
                UrlCanonicalizeException, ArchiveLoadFailed) as known_err:
            # expected error types: reported without a stack trace
            response = handle_exception(env, wb_router.error_view,
                                        known_err, False)

        except Exception as unknown_err:
            # anything else: reported with a stack trace
            response = handle_exception(env, wb_router.error_view,
                                        unknown_err, True)

        return response(env, start_response)

    return application
|
||||
|
||||
|
||||
def handle_exception(env, error_view, exc, print_trace):
    """
    Build an error response for ``exc``.

    :param env: WSGI environ of the failed request (not used here)
    :param error_view: object with a ``render_response(err_msg=...,
        err_details=..., status=...)`` method, or a falsy value to fall
        back to a plain-text response
    :param exc: the exception being reported; if it has a ``status()``
        method its result is used as the HTTP status line, otherwise
        ``'400 Bad Request'``
    :param print_trace: when true, the traceback of the exception currently
        being handled is printed and passed on as ``err_details``; when
        false, only ``str(exc)`` is logged at INFO level
    :return: whatever ``error_view.render_response`` or
        ``WbResponse.text_response`` returns
    """
    if hasattr(exc, 'status'):
        status = exc.status()
    else:
        status = '400 Bad Request'

    if print_trace:
        import traceback
        # format_exc() takes an optional *limit*, not the exception: it
        # formats the exception currently being handled, so this function
        # must be called from inside an ``except`` block
        err_details = traceback.format_exc()
        # parenthesized single-argument form prints the same on py2 and py3
        print(err_details)
    else:
        logging.info(str(exc))
        err_details = None

    if error_view:
        return error_view.render_response(err_msg=str(exc),
                                          err_details=err_details,
                                          status=status)
    else:
        return WbResponse.text_response(status + ' Error: ' + str(exc),
                                        status=status)
|
||||
|
||||
|
||||
#=================================================================
|
||||
DEFAULT_CONFIG_FILE = 'config.yaml'
|
||||
|
||||
def main():
    """
    Configure logging, load the config module and build the WSGI app.

    The module name comes from the ``PYWB_CONFIG_MODULE`` environment
    variable, defaulting to ``pywb.pywb_init``. The module's
    ``pywb_config()`` result is passed to ``create_wb_app``. Any failure
    is logged with its traceback and re-raised.
    """
    inited_msg = '*** pywb inited with settings from {0}.pywb_config()!\n'
    failed_msg = '*** pywb could not init with settings from {0}.pywb_config()!\n'

    try:
        logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                            level=logging.DEBUG)

        # a custom init module may be named through the environment
        config_name = os.environ.get('PYWB_CONFIG_MODULE')

        if not config_name:
            # nothing custom requested: use the default module
            config_name = 'pywb.pywb_init'
            logging.info('Loading from default config module "{0}"'.format(config_name))
            logging.info('')

        config_module = importlib.import_module(config_name)
        wb_app = create_wb_app(config_module.pywb_config())

        logging.info('')
        logging.info(inited_msg.format(config_name))
        return wb_app

    except Exception:
        logging.exception(failed_msg.format(config_name))
        raise
|
||||
|
||||
#=================================================================
|
||||
# Build the module-level WSGI ``application`` only when this module is
# imported; running the file directly as a script does nothing.
if __name__ != "__main__":
    application = main()
|
4
run.sh
4
run.sh
@ -10,14 +10,14 @@ mypath=$(cd `dirname $0` && pwd)
|
||||
# ex: my_pywb.pywb_config()
|
||||
#export 'PYWB_CONFIG=my_pywb'
|
||||
|
||||
app="pywb.wbapp"
|
||||
app="pywb.apps.wayback"
|
||||
|
||||
params="--http-socket :8080 -b 65536"
|
||||
#params="--static-map /static=$mypath/static --http-socket :8080 -b 65536"
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
# Standard root config
|
||||
params="$params --wsgi pywb.wbapp"
|
||||
params="$params --wsgi $app"
|
||||
else
|
||||
# run with --mount
|
||||
# requires a file not a package, so creating a mount_run.py to load the package
|
||||
|
Binary file not shown.
10
setup.py
10
setup.py
@ -14,7 +14,14 @@ setup(
|
||||
license='GPL',
|
||||
packages=find_packages(),
|
||||
provides=[
    # one name per line, each followed by a comma: adjacent string
    # literals without a comma are silently concatenated into one name
    'pywb',
    'pywb.utils',
    'pywb.cdx',
    'pywb.warc',
    'pywb.rewrite',
    'pywb.framework',
    'pywb.core',
    'pywb.apps',
    ],
|
||||
package_data={
|
||||
'pywb': ['ui/*', 'static/*', '*.yaml'],
|
||||
@ -34,7 +41,6 @@ setup(
|
||||
'pyyaml',
|
||||
'WebTest',
|
||||
'pytest',
|
||||
'werkzeug>=0.9.4',
|
||||
],
|
||||
# tests_require=['WebTest', 'pytest'],
|
||||
zip_safe=False
|
||||
|
@ -90,6 +90,9 @@ enable_http_proxy: true
|
||||
# enable cdx server api for querying cdx directly (experimental)
|
||||
enable_cdx_api: true
|
||||
|
||||
# test different port
|
||||
port: 9000
|
||||
|
||||
# optional reporter callback func
|
||||
# if set, called with request and cdx object
|
||||
reporter: !!python/object/new:tests.fixture.PrintReporter []
|
||||
|
@ -1,32 +1,26 @@
|
||||
import os
|
||||
import re
|
||||
import webtest
|
||||
|
||||
import pytest
|
||||
from urllib import urlencode
|
||||
|
||||
from werkzeug.test import Client
|
||||
from werkzeug.wrappers import BaseResponse, Response
|
||||
|
||||
import yaml
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
from pywb.cdx.wsgi_cdxserver import create_app
|
||||
from pywb.apps.cdx_server import application
|
||||
|
||||
from tests.fixture import testconfig
|
||||
import pytest
|
||||
|
||||
#================================================================
|
||||
@pytest.fixture
|
||||
def client(testconfig):
|
||||
app = create_app(testconfig)
|
||||
return Client(app, Response)
|
||||
def client():
|
||||
return webtest.TestApp(application)
|
||||
|
||||
# ================================================================
|
||||
|
||||
def query(client, url, **params):
|
||||
#================================================================
|
||||
def query(client, url, is_error=False, **params):
|
||||
params['url'] = url
|
||||
return client.get('/cdx?' + urlencode(params, doseq=1))
|
||||
return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
||||
|
||||
# ================================================================
|
||||
|
||||
#================================================================
|
||||
def test_exact_url(client):
|
||||
"""
|
||||
basic exact match, no filters, etc.
|
||||
@ -34,48 +28,54 @@ def test_exact_url(client):
|
||||
resp = query(client, 'http://www.iana.org/')
|
||||
|
||||
assert resp.status_code == 200
|
||||
print resp.data
|
||||
print resp.body
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_prefix_match(client):
|
||||
"""
|
||||
prefix match test
|
||||
"""
|
||||
resp = query(client, 'http://www.iana.org/', matchType='prefix')
|
||||
|
||||
print resp.data.splitlines()
|
||||
print resp.body.splitlines()
|
||||
assert resp.status_code == 200
|
||||
|
||||
suburls = 0
|
||||
for l in resp.data.splitlines():
|
||||
for l in resp.body.splitlines():
|
||||
fields = l.split(' ')
|
||||
if len(fields[0]) > len('org,iana)/'):
|
||||
suburls += 1
|
||||
assert suburls > 0
|
||||
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_filters(client):
|
||||
"""
|
||||
filter cdxes by mimetype and filename field, exact match.
|
||||
"""
|
||||
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
||||
filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz'))
|
||||
|
||||
assert resp.status_code == 200
|
||||
assert resp.mimetype == 'text/plain'
|
||||
|
||||
for l in resp.data.splitlines():
|
||||
assert resp.status_code == 200
|
||||
assert resp.content_type == 'text/plain'
|
||||
|
||||
for l in resp.body.splitlines():
|
||||
fields = l.split(' ')
|
||||
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||
assert fields[3] == 'warc/revisit'
|
||||
assert fields[10] == 'dupes.warc.gz'
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_limit(client):
|
||||
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
||||
limit='1')
|
||||
|
||||
assert resp.status_code == 200
|
||||
assert resp.mimetype == 'text/plain'
|
||||
assert resp.content_type == 'text/plain'
|
||||
|
||||
cdxes = resp.data.splitlines()
|
||||
cdxes = resp.body.splitlines()
|
||||
assert len(cdxes) == 1
|
||||
fields = cdxes[0].split(' ')
|
||||
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||
@ -86,15 +86,17 @@ def test_limit(client):
|
||||
limit='1', reverse='1')
|
||||
|
||||
assert resp.status_code == 200
|
||||
assert resp.mimetype == 'text/plain'
|
||||
assert resp.content_type == 'text/plain'
|
||||
|
||||
cdxes = resp.data.splitlines()
|
||||
cdxes = resp.body.splitlines()
|
||||
assert len(cdxes) == 1
|
||||
fields = cdxes[0].split(' ')
|
||||
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||
assert fields[1] == '20140127171239'
|
||||
assert fields[3] == 'warc/revisit'
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_fields(client):
|
||||
"""
|
||||
retrieve subset of fields with ``fields`` parameter.
|
||||
@ -104,7 +106,7 @@ def test_fields(client):
|
||||
|
||||
assert resp.status_code == 200
|
||||
|
||||
cdxes = resp.data.splitlines()
|
||||
cdxes = resp.body.splitlines()
|
||||
|
||||
for cdx in cdxes:
|
||||
fields = cdx.split(' ')
|
||||
@ -113,16 +115,21 @@ def test_fields(client):
|
||||
assert re.match(r'\d{14}$', fields[1])
|
||||
assert re.match(r'\d{3}|-', fields[2])
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_fields_undefined(client):
    """
    server shall respond with Bad Request and the name of the undefined
    field when ``fields`` parameter contains undefined name(s).
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 is_error=True,
                 fields='urlkey,nosuchfield')

    # a bare ``==`` is evaluated and thrown away; it must be asserted
    # NOTE(review): confirm the cdx server really answers 400 here
    assert resp.status_code == 400
|
||||
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_resolveRevisits(client):
|
||||
"""
|
||||
with ``resolveRevisits=true``, server adds three fields pointing to
|
||||
@ -132,9 +139,9 @@ def test_resolveRevisits(client):
|
||||
resolveRevisits='true'
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
assert resp.mimetype == 'text/plain'
|
||||
assert resp.content_type == 'text/plain'
|
||||
|
||||
cdxes = resp.data.splitlines()
|
||||
cdxes = resp.body.splitlines()
|
||||
originals = {}
|
||||
for cdx in cdxes:
|
||||
fields = cdx.split(' ')
|
||||
@ -151,6 +158,8 @@ def test_resolveRevisits(client):
|
||||
orig = originals.get(sha)
|
||||
assert orig == (int(orig_size), int(orig_offset), orig_fn)
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_resolveRevisits_orig_fields(client):
|
||||
"""
|
||||
when resolveRevisits=true, extra three fields are named
|
||||
@ -162,9 +171,9 @@ def test_resolveRevisits_orig_fields(client):
|
||||
fields='urlkey,orig.length,orig.offset,orig.filename'
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
assert resp.mimetype == 'text/plain'
|
||||
assert resp.content_type == 'text/plain'
|
||||
|
||||
cdxes = resp.data.splitlines()
|
||||
cdxes = resp.body.splitlines()
|
||||
for cdx in cdxes:
|
||||
fields = cdx.split(' ')
|
||||
assert len(fields) == 4
|
||||
@ -172,6 +181,8 @@ def test_resolveRevisits_orig_fields(client):
|
||||
assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or
|
||||
(int(orig_len), int(orig_offset), orig_fn))
|
||||
|
||||
|
||||
#================================================================
|
||||
def test_collapseTime_resolveRevisits_reverse(client):
|
||||
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||
collapseTime='11',
|
||||
@ -179,11 +190,10 @@ def test_collapseTime_resolveRevisits_reverse(client):
|
||||
reverse='true'
|
||||
)
|
||||
|
||||
cdxes = [CDXObject(l) for l in resp.data.splitlines()]
|
||||
|
||||
cdxes = [CDXObject(l) for l in resp.body.splitlines()]
|
||||
|
||||
assert len(cdxes) == 3
|
||||
|
||||
# timestamp is in descending order
|
||||
for i in range(len(cdxes) - 1):
|
||||
assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp']
|
||||
|
@ -1,6 +1,6 @@
|
||||
import webtest
|
||||
from pywb.pywb_init import pywb_config
|
||||
from pywb.wbapp import create_wb_app
|
||||
from pywb.core.pywb_init import create_wb_router
|
||||
from pywb.framework.wsgi_wrappers import init_app
|
||||
from pywb.cdx.cdxobject import CDXObject
|
||||
|
||||
from fixture import TestExclusionPerms
|
||||
@ -11,8 +11,13 @@ class TestWb:
|
||||
def setup(self):
|
||||
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
||||
# save it in self - useful for debugging
|
||||
self.router = pywb_config(self.TEST_CONFIG)
|
||||
self.app = create_wb_app(self.router)
|
||||
self.app = init_app(create_wb_router,
|
||||
load_yaml=True,
|
||||
config_file=self.TEST_CONFIG)
|
||||
|
||||
#self.router = pywb_config(self.TEST_CONFIG)
|
||||
#self.app = create_wb_app(self.router)
|
||||
|
||||
self.testapp = webtest.TestApp(self.app)
|
||||
|
||||
def _assert_basic_html(self, resp):
|
||||
|
Loading…
x
Reference in New Issue
Block a user