mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Merge pull request #28 from ikreymer/pkg-reorg
pywb pkg refactoring: create pywb.framework, pywb.core and pywb.apps
This commit is contained in:
commit
5a28bc6992
0
pywb/apps/__init__.py
Normal file
0
pywb/apps/__init__.py
Normal file
17
pywb/apps/cdx_server.py
Normal file
17
pywb/apps/cdx_server.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||||
|
|
||||||
|
from pywb.core.cdx_handler import create_cdx_server_app
|
||||||
|
|
||||||
|
#=================================================================
# init cdx server app
#=================================================================

# cdx-server only config: path of the YAML file handed to init_app below
DEFAULT_CONFIG = 'pywb/cdx/config.yaml'

# Module-level WSGI callable, built once at import time.
# create_cdx_server_app is passed as the factory; what init_app does with
# load_yaml/config_file is defined in pywb.framework.wsgi_wrappers.
application = init_app(
    create_cdx_server_app,
    load_yaml=True,
    config_file=DEFAULT_CONFIG
)


# Allow running this module directly as a stand-alone server
if __name__ == "__main__":
    start_wsgi_server(application)
|
10
pywb/apps/wayback.py
Normal file
10
pywb/apps/wayback.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
|
||||||
|
from pywb.core.pywb_init import create_wb_router
|
||||||
|
|
||||||
|
#=================================================================
# init pywb app
#=================================================================

# Module-level WSGI callable, built once at import time from the
# create_wb_router factory (no explicit config_file is passed here).
application = init_app(
    create_wb_router,
    load_yaml=True
)


# Allow running this module directly as a stand-alone server
if __name__ == "__main__":
    start_wsgi_server(application)
|
@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
|
|||||||
|
|
||||||
from query import CDXQuery
|
from query import CDXQuery
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||||
"""
|
"""
|
||||||
|
@ -4,9 +4,11 @@ import itertools
|
|||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
from urlparse import parse_qs
|
from urlparse import parse_qs
|
||||||
|
|
||||||
|
from pywb.utils.wbexception import WbException
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXException(Exception):
|
class CDXException(WbException):
|
||||||
def status(self):
|
def status(self):
|
||||||
return '400 Bad Request'
|
return '400 Bad Request'
|
||||||
|
|
||||||
@ -61,7 +63,7 @@ class CDXObject(OrderedDict):
|
|||||||
cdxformat = i
|
cdxformat = i
|
||||||
|
|
||||||
if not cdxformat:
|
if not cdxformat:
|
||||||
raise Exception('unknown {0}-field cdx format'.format(len(fields)))
|
raise CDXException('unknown {0}-field cdx format'.format(len(fields)))
|
||||||
|
|
||||||
for header, field in itertools.izip(cdxformat, fields):
|
for header, field in itertools.izip(cdxformat, fields):
|
||||||
self[header] = field
|
self[header] = field
|
||||||
@ -85,8 +87,15 @@ class CDXObject(OrderedDict):
|
|||||||
"""
|
"""
|
||||||
if fields is None:
|
if fields is None:
|
||||||
return str(self) + '\n'
|
return str(self) + '\n'
|
||||||
else:
|
|
||||||
return ' '.join(self[x] for x in fields) + '\n'
|
try:
|
||||||
|
result = ' '.join(self[x] for x in fields) + '\n'
|
||||||
|
except KeyError as ke:
|
||||||
|
msg = 'Invalid field "{0}" found in fields= argument'
|
||||||
|
msg = msg.format(ke.message)
|
||||||
|
raise CDXException(msg)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if self.cdxline:
|
if self.cdxline:
|
||||||
@ -109,7 +118,7 @@ class IDXObject(OrderedDict):
|
|||||||
|
|
||||||
if len(fields) < self.NUM_REQ_FIELDS:
|
if len(fields) < self.NUM_REQ_FIELDS:
|
||||||
msg = 'invalid idx format: {0} fields found, {1} required'
|
msg = 'invalid idx format: {0} fields found, {1} required'
|
||||||
raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))
|
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
|
||||||
|
|
||||||
for header, field in itertools.izip(self.FORMAT, fields):
|
for header, field in itertools.izip(self.FORMAT, fields):
|
||||||
self[header] = field
|
self[header] = field
|
||||||
|
@ -31,8 +31,18 @@ def cdx_load(sources, query, perms_checker=None, process=True):
|
|||||||
if perms_checker:
|
if perms_checker:
|
||||||
cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
|
cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
|
||||||
|
|
||||||
|
if query.output == 'text':
|
||||||
|
cdx_iter = cdx_to_text(cdx_iter, query.fields)
|
||||||
|
|
||||||
return cdx_iter
|
return cdx_iter
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def cdx_to_text(cdx_iter, fields):
    """
    Lazily convert each item of cdx_iter to its text form.

    :param cdx_iter: iterable of objects exposing ``to_text(fields)``
    :param fields: passed unchanged to every ``to_text`` call
                   (may be None)
    :return: generator yielding the result of ``cdx.to_text(fields)``
             for each item, in input order

    Nothing is consumed from cdx_iter until the result is iterated, so
    any exception raised by ``to_text`` surfaces during iteration.
    """
    for cdx in cdx_iter:
        yield cdx.to_text(fields)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def restrict_cdx(cdx_iter, query, perms_checker):
|
def restrict_cdx(cdx_iter, query, perms_checker):
|
||||||
"""
|
"""
|
||||||
@ -56,6 +66,7 @@ def restrict_cdx(cdx_iter, query, perms_checker):
|
|||||||
|
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def process_cdx(cdx_iter, query):
|
def process_cdx(cdx_iter, query):
|
||||||
if query.resolve_revisits:
|
if query.resolve_revisits:
|
||||||
@ -255,7 +266,6 @@ def cdx_resolve_revisits(cdx_iter):
|
|||||||
originals = {}
|
originals = {}
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
|
|
||||||
is_revisit = cdx.is_revisit()
|
is_revisit = cdx.is_revisit()
|
||||||
|
|
||||||
digest = cdx['digest']
|
digest = cdx['digest']
|
||||||
|
@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer):
|
|||||||
logging.warn('No CDX Sources configured from paths=%s', paths)
|
logging.warn('No CDX Sources configured from paths=%s', paths)
|
||||||
|
|
||||||
def _add_cdx_source(self, source):
|
def _add_cdx_source(self, source):
|
||||||
if source is None: return
|
if source is None:
|
||||||
|
return
|
||||||
|
|
||||||
logging.debug('Adding CDX Source: %s', source)
|
logging.debug('Adding CDX Source: %s', source)
|
||||||
self.sources.append(source)
|
self.sources.append(source)
|
||||||
|
|
||||||
def add_cdx_source(self, source, config):
|
def add_cdx_source(self, source, config):
|
||||||
if source is None: return
|
if source is None:
|
||||||
|
return
|
||||||
|
|
||||||
if isinstance(source, CDXSource):
|
if isinstance(source, CDXSource):
|
||||||
self._add_cdx_source(source)
|
self._add_cdx_source(source)
|
||||||
|
|
||||||
elif isinstance(source, str):
|
elif isinstance(source, str):
|
||||||
if os.path.isdir(source):
|
if os.path.isdir(source):
|
||||||
for fn in os.listdir(source):
|
for fn in os.listdir(source):
|
||||||
@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
surt_ordered=surt_ordered,
|
surt_ordered=surt_ordered,
|
||||||
ds_rules_file=ds_rules_file,
|
ds_rules_file=ds_rules_file,
|
||||||
perms_checker=perms_checker)
|
perms_checker=perms_checker)
|
||||||
|
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ import urllib
|
|||||||
import urllib2
|
import urllib2
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXSource(object):
|
class CDXSource(object):
|
||||||
"""
|
"""
|
||||||
@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource):
|
|||||||
if config:
|
if config:
|
||||||
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
|
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
|
||||||
|
|
||||||
|
|
||||||
def load_cdx(self, query):
|
def load_cdx(self, query):
|
||||||
"""
|
"""
|
||||||
Load cdx from redis cache, from an ordered list
|
Load cdx from redis cache, from an ordered list
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
from urlparse import parse_qs
|
from urlparse import parse_qs
|
||||||
|
from cdxobject import CDXException
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -62,6 +63,9 @@ class CDXQuery(object):
|
|||||||
@property
|
@property
|
||||||
def fields(self):
|
def fields(self):
|
||||||
v = self.params.get('fields')
|
v = self.params.get('fields')
|
||||||
|
# check old param name
|
||||||
|
if not v:
|
||||||
|
v = self.params.get('fl')
|
||||||
return v.split(',') if v else None
|
return v.split(',') if v else None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@ -105,9 +109,6 @@ class CDXQuery(object):
|
|||||||
"""
|
"""
|
||||||
params = parse_qs(env['QUERY_STRING'])
|
params = parse_qs(env['QUERY_STRING'])
|
||||||
|
|
||||||
if not 'output' in params:
|
|
||||||
params['output'] = 'text'
|
|
||||||
|
|
||||||
# parse_qs produces arrays for single values
|
# parse_qs produces arrays for single values
|
||||||
# cdx processing expects singleton params for all params,
|
# cdx processing expects singleton params for all params,
|
||||||
# except filters, so convert here
|
# except filters, so convert here
|
||||||
@ -116,4 +117,8 @@ class CDXQuery(object):
|
|||||||
if name != 'filter':
|
if name != 'filter':
|
||||||
params[name] = val[0]
|
params[name] = val[0]
|
||||||
|
|
||||||
|
if not 'output' in params:
|
||||||
|
params['output'] = 'text'
|
||||||
|
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
@ -187,6 +187,7 @@ import pytest
|
|||||||
|
|
||||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||||
kwparams['url'] = url
|
kwparams['url'] = url
|
||||||
|
kwparams['output'] = 'cdxobject'
|
||||||
fields = kwparams.get('fields')
|
fields = kwparams.get('fields')
|
||||||
if fields:
|
if fields:
|
||||||
fields = fields.split(',')
|
fields = fields.split(',')
|
||||||
|
@ -1,15 +0,0 @@
|
|||||||
import webtest
|
|
||||||
from pywb.cdx.wsgi_cdxserver import create_app
|
|
||||||
from pywb import get_test_dir
|
|
||||||
|
|
||||||
class TestCdx:
|
|
||||||
def setup(self):
|
|
||||||
self.app = create_app(get_test_dir() + 'cdx/')
|
|
||||||
self.testapp = webtest.TestApp(self.app)
|
|
||||||
|
|
||||||
def test_cdx(self):
|
|
||||||
resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css')
|
|
||||||
assert resp.content_type == 'text/plain'
|
|
||||||
assert resp.content_length > 0
|
|
||||||
|
|
||||||
|
|
@ -1,103 +0,0 @@
|
|||||||
from werkzeug.wrappers import BaseResponse
|
|
||||||
from cdxserver import create_cdx_server
|
|
||||||
from pywb import get_test_dir
|
|
||||||
from query import CDXQuery
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import yaml
|
|
||||||
import pkg_resources
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
CONFIG_FILE = 'config.yaml'
|
|
||||||
|
|
||||||
RULES_FILE = 'rules.yaml'
|
|
||||||
|
|
||||||
DEFAULT_PORT = 8080
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
|
|
||||||
class CDXQueryRequest(object):
|
|
||||||
def __init__(self, environ):
|
|
||||||
self.query = CDXQuery.from_wsgi_env(environ)
|
|
||||||
|
|
||||||
|
|
||||||
class WSGICDXServer(object):
|
|
||||||
def __init__(self, config, rules_file):
|
|
||||||
self.cdxserver = create_cdx_server(config, rules_file)
|
|
||||||
|
|
||||||
def __call__(self, environ, start_response):
|
|
||||||
request = CDXQueryRequest(environ)
|
|
||||||
try:
|
|
||||||
logging.debug('request.args=%s', request.query)
|
|
||||||
result = self.cdxserver.load_cdx_query(request.query)
|
|
||||||
|
|
||||||
# TODO: select response type by "output" parameter
|
|
||||||
response = PlainTextResponse(result, request.query.fields)
|
|
||||||
return response(environ, start_response)
|
|
||||||
except Exception as exc:
|
|
||||||
logging.error('load_cdx failed', exc_info=1)
|
|
||||||
# TODO: error response should be different for each response
|
|
||||||
# type
|
|
||||||
start_response('400 Error', [('Content-Type', 'text/plain')])
|
|
||||||
return [str(exc)]
|
|
||||||
|
|
||||||
def cdx_text_out(cdx, fields):
|
|
||||||
if not fields:
|
|
||||||
return str(cdx) + '\n'
|
|
||||||
else:
|
|
||||||
logging.info('cdx fields=%s', cdx.keys)
|
|
||||||
# TODO: this will results in an exception if fields contain
|
|
||||||
# non-existent field name.
|
|
||||||
return ' '.join(cdx[x] for x in fields) + '\n'
|
|
||||||
|
|
||||||
class PlainTextResponse(BaseResponse):
|
|
||||||
def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
|
|
||||||
super(PlainTextResponse, self).__init__(
|
|
||||||
response=(
|
|
||||||
cdx.to_text(fields) for cdx in cdxitr
|
|
||||||
),
|
|
||||||
status=status, content_type=content_type)
|
|
||||||
|
|
||||||
# class JsonResponse(Response):
|
|
||||||
# pass
|
|
||||||
# class MementoResponse(Response):
|
|
||||||
# pass
|
|
||||||
|
|
||||||
def create_app(config=None):
|
|
||||||
logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
|
|
||||||
level=logging.DEBUG)
|
|
||||||
|
|
||||||
if not config:
|
|
||||||
index_paths = get_test_dir() + 'cdx/'
|
|
||||||
config = dict(index_paths=index_paths)
|
|
||||||
|
|
||||||
return WSGICDXServer(config, RULES_FILE)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
from optparse import OptionParser
|
|
||||||
from werkzeug.serving import run_simple
|
|
||||||
|
|
||||||
opt = OptionParser('%prog [OPTIONS]')
|
|
||||||
opt.add_option('-p', '--port', type='int', default=None)
|
|
||||||
|
|
||||||
options, args = opt.parse_args()
|
|
||||||
|
|
||||||
configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
|
|
||||||
config = yaml.load(configdata)
|
|
||||||
|
|
||||||
port = options.port
|
|
||||||
if port is None:
|
|
||||||
port = (config and config.get('port')) or DEFAULT_PORT
|
|
||||||
|
|
||||||
app = create_app(config)
|
|
||||||
|
|
||||||
logging.debug('Starting CDX Server on port %s', port)
|
|
||||||
try:
|
|
||||||
run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True)
|
|
||||||
except KeyboardInterrupt as ex:
|
|
||||||
pass
|
|
||||||
logging.debug('Stopping CDX Server')
|
|
||||||
else:
|
|
||||||
# XXX pass production config
|
|
||||||
application = create_app()
|
|
@ -1,56 +0,0 @@
|
|||||||
import views
|
|
||||||
import handlers
|
|
||||||
import replay_views
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
|
||||||
from pywb.warc.resolvingloader import ResolvingLoader
|
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# Config Loading
|
|
||||||
#=================================================================
|
|
||||||
def load_template_file(file, desc = None, view_class = views.J2TemplateView):
|
|
||||||
if file:
|
|
||||||
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
|
|
||||||
file = view_class(file)
|
|
||||||
|
|
||||||
return file
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def create_wb_handler(cdx_server, config, ds_rules_file=None):
|
|
||||||
|
|
||||||
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
|
|
||||||
paths = config.get('archive_paths')
|
|
||||||
|
|
||||||
resolving_loader = ResolvingLoader(paths=paths,
|
|
||||||
cdx_server=cdx_server,
|
|
||||||
record_loader=record_loader)
|
|
||||||
|
|
||||||
replayer = replay_views.ReplayView(
|
|
||||||
content_loader = resolving_loader,
|
|
||||||
|
|
||||||
content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),
|
|
||||||
|
|
||||||
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
|
|
||||||
|
|
||||||
buffer_response = config.get('buffer_response', True),
|
|
||||||
|
|
||||||
redir_to_exact = config.get('redir_to_exact', True),
|
|
||||||
|
|
||||||
reporter = config.get('reporter')
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
wb_handler = handlers.WBHandler(
|
|
||||||
cdx_server,
|
|
||||||
|
|
||||||
replayer,
|
|
||||||
|
|
||||||
html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView),
|
|
||||||
|
|
||||||
search_view = load_template_file(config.get('search_html'), 'Search Page'),
|
|
||||||
)
|
|
||||||
|
|
||||||
return wb_handler
|
|
||||||
|
|
0
pywb/core/__init__.py
Normal file
0
pywb/core/__init__.py
Normal file
43
pywb/core/cdx_handler.py
Normal file
43
pywb/core/cdx_handler.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
from pywb.cdx.query import CDXQuery
|
||||||
|
from pywb.cdx.cdxserver import create_cdx_server
|
||||||
|
|
||||||
|
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||||
|
from pywb.framework.basehandlers import BaseHandler
|
||||||
|
|
||||||
|
from views import TextCapturesView
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXHandler(BaseHandler):
    """
    Handler which passes wsgi request to cdx server and
    returns a text-based cdx response
    """
    def __init__(self, index_reader, view=None):
        # index_reader must provide load_cdx(**params)
        self.index_reader = index_reader
        # any falsy view falls back to a fresh TextCapturesView
        self.view = view or TextCapturesView()

    def __call__(self, wbrequest):
        # query params are taken from the wsgi environ of the request
        query_params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
        results = self.index_reader.load_cdx(**query_params)
        return self.view.render_response(wbrequest, results)

    def __str__(self):
        reader_desc = str(self.index_reader)
        return 'CDX Handler: ' + reader_desc
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
DEFAULT_RULES = 'pywb/rules.yaml'
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def create_cdx_server_app(config):
    """
    Create a cdx server config to be wrapped in a wsgi app
    Currently using single access point '/cdx'
    TODO: more complex example with multiple collections?

    :param config: mapping-like object; read via ``config.get('port')``
                   and passed through to create_cdx_server
    :return: an ArchivalRouter holding a single 'cdx' Route
    """
    server = create_cdx_server(config, DEFAULT_RULES)
    listen_port = config.get('port')

    cdx_route = Route('cdx', CDXHandler(server))
    return ArchivalRouter([cdx_route], port=listen_port)
|
@ -1,30 +1,13 @@
|
|||||||
import urlparse
|
|
||||||
import pkgutil
|
import pkgutil
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from pywb.rewrite.wburl import WbUrl
|
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||||
from pywb.cdx.query import CDXQuery
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
from wbrequestresponse import WbResponse
|
from pywb.framework.wbexceptions import WbException, NotFoundException
|
||||||
from wbexceptions import WbException, NotFoundException
|
|
||||||
from views import TextCapturesView
|
from views import TextCapturesView
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class BaseHandler(object):
|
|
||||||
def __call__(self, wbrequest):
|
|
||||||
return wbrequest
|
|
||||||
|
|
||||||
def get_wburl_type(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
class WbUrlHandler(BaseHandler):
|
|
||||||
def get_wburl_type(self):
|
|
||||||
return WbUrl
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Standard WB Handler
|
# Standard WB Handler
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -33,11 +16,15 @@ class WBHandler(WbUrlHandler):
|
|||||||
html_view=None, search_view=None):
|
html_view=None, search_view=None):
|
||||||
|
|
||||||
self.index_reader = index_reader
|
self.index_reader = index_reader
|
||||||
|
|
||||||
self.replay = replay
|
self.replay = replay
|
||||||
|
|
||||||
self.text_view = TextCapturesView()
|
self.text_query_view = TextCapturesView()
|
||||||
|
|
||||||
|
self.query_view = html_view
|
||||||
|
if not self.query_view:
|
||||||
|
self.query_view = text_query_view
|
||||||
|
|
||||||
self.html_view = html_view
|
|
||||||
self.search_view = search_view
|
self.search_view = search_view
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
def __call__(self, wbrequest):
|
||||||
@ -49,11 +36,10 @@ class WBHandler(WbUrlHandler):
|
|||||||
|
|
||||||
# new special modifier to always show cdx index
|
# new special modifier to always show cdx index
|
||||||
if wbrequest.wb_url.mod == 'cdx_':
|
if wbrequest.wb_url.mod == 'cdx_':
|
||||||
return self.text_view.render_response(wbrequest, cdx_lines)
|
return self.text_query_view.render_response(wbrequest, cdx_lines)
|
||||||
|
|
||||||
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
|
||||||
query_view = self.html_view if self.html_view else self.text_view
|
return self.query_view.render_response(wbrequest, cdx_lines)
|
||||||
return query_view.render_response(wbrequest, cdx_lines)
|
|
||||||
|
|
||||||
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
|
||||||
return self.replay(wbrequest, cdx_lines)
|
return self.replay(wbrequest, cdx_lines)
|
||||||
@ -70,29 +56,11 @@ class WBHandler(WbUrlHandler):
|
|||||||
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
|
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# CDX-Server Handler -- pass all params to cdx server
|
|
||||||
#=================================================================
|
|
||||||
class CDXHandler(BaseHandler):
|
|
||||||
def __init__(self, index_reader, view = None):
|
|
||||||
self.index_reader = index_reader
|
|
||||||
self.view = view if view else TextCapturesView()
|
|
||||||
|
|
||||||
def __call__(self, wbrequest):
|
|
||||||
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
|
|
||||||
cdx_lines = self.index_reader.load_cdx(**params)
|
|
||||||
|
|
||||||
return self.view.render_response(wbrequest, cdx_lines)
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return 'Index Reader: ' + str(self.index_reader)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Static Content Handler
|
# Static Content Handler
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class StaticHandler(BaseHandler):
|
class StaticHandler(BaseHandler):
|
||||||
def __init__(self, static_path, pkg = __package__):
|
def __init__(self, static_path, pkg = 'pywb'):
|
||||||
mimetypes.init()
|
mimetypes.init()
|
||||||
|
|
||||||
self.static_path = static_path
|
self.static_path = static_path
|
@ -29,6 +29,7 @@ class IndexReader(object):
|
|||||||
params.update(wbrequest.custom_params)
|
params.update(wbrequest.custom_params)
|
||||||
|
|
||||||
params['allowFuzzy'] = True
|
params['allowFuzzy'] = True
|
||||||
|
params['output'] = 'cdxobject'
|
||||||
|
|
||||||
cdxlines = self.load_cdx(url=wburl.url, **params)
|
cdxlines = self.load_cdx(url=wburl.url, **params)
|
||||||
|
|
181
pywb/core/pywb_init.py
Normal file
181
pywb/core/pywb_init.py
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
from pywb.framework.archivalrouter import ArchivalRouter, Route
|
||||||
|
from pywb.framework.proxy import ProxyArchivalRouter
|
||||||
|
|
||||||
|
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||||
|
from pywb.warc.resolvingloader import ResolvingLoader
|
||||||
|
|
||||||
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
|
|
||||||
|
from indexreader import IndexReader
|
||||||
|
from views import J2TemplateView, J2HtmlCapturesView
|
||||||
|
from replay_views import ReplayView
|
||||||
|
|
||||||
|
from handlers import WBHandler
|
||||||
|
from handlers import StaticHandler
|
||||||
|
from cdx_handler import CDXHandler
|
||||||
|
from handlers import DebugEchoHandler, DebugEchoEnvHandler
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# Fallback settings, consulted (via DictChain) when a key is absent
# from the passed-in config.
DEFAULTS = {
    'hostpaths': ['http://localhost:8080'],
    'collections': {'pywb': './sample_archive/cdx/'},
    'archive_paths': './sample_archive/warcs/',

    # template files, loaded through load_template_file
    'head_insert_html': 'ui/head_insert.html',
    'query_html': 'ui/query.html',
    'search_html': 'ui/search.html',
    'home_html': 'ui/index.html',
    'error_html': 'ui/error.html',

    # route name -> static path, each wrapped in a StaticHandler
    'static_routes': {'static/default': 'static/'},

    'domain_specific_rules': 'pywb/rules.yaml',
}
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class DictChain(object):
    """
    Read-only layered lookup over several dict-like objects.

    Each layer only needs a ``get(key)`` method, so a DictChain can
    itself be used as a layer of another DictChain.
    """
    def __init__(self, *dicts):
        # layers are searched in the order given; earlier ones win
        self.dicts = dicts

    def get(self, key, default_val=None):
        """
        Return the first non-None value stored under key.

        A layer holding None for key is treated the same as a layer
        missing the key, so lookup continues with the next layer.
        Falsy values other than None (0, '', False) are returned as-is.

        :param key: key to look up in each layer
        :param default_val: returned when no layer has a non-None value
        """
        for d in self.dicts:
            val = d.get(key)
            if val is not None:
                return val

        return default_val
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def load_template_file(file, desc=None, view_class=J2TemplateView):
    """
    Wrap a template path in a view object, if a path was given.

    :param file: template path; any falsy value (None, '') is
                 returned unchanged without creating a view
    :param desc: optional label used only in the debug log message
    :param view_class: callable invoked as ``view_class(file)``
    :return: ``view_class(file)`` when file is truthy, else file itself
    """
    if file:
        # The previous fallback referenced an undefined variable
        # ('name') and raised NameError whenever desc was omitted;
        # fall back to the view class name instead.
        label = desc if desc else getattr(view_class, '__name__', 'Template')
        logging.debug('Adding %s: %s', label, file)
        file = view_class(file)

    return file
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def create_wb_handler(cdx_server, config, ds_rules_file=None):
    """
    Assemble a WBHandler (index lookup + replay) from config.

    :param cdx_server: index source, handed both to the ResolvingLoader
                       and to the resulting WBHandler
    :param config: object with ``get(key[, default])``; the keys read
                   here are cookie_maker, archive_paths,
                   head_insert_html, buffer_response, redir_to_exact,
                   reporter, query_html and search_html
    :param ds_rules_file: passed to RewriteContent
    :return: the constructed WBHandler
    """
    # record loading
    loader = ArcWarcRecordLoader(cookie_maker=config.get('cookie_maker'))

    archive_paths = config.get('archive_paths')

    content_loader = ResolvingLoader(paths=archive_paths,
                                     cdx_server=cdx_server,
                                     record_loader=loader)

    # replay view
    head_insert = load_template_file(config.get('head_insert_html'),
                                     'Head Insert')

    rewriter = RewriteContent(ds_rules_file=ds_rules_file)

    replay_view = ReplayView(
        content_loader=content_loader,
        content_rewriter=rewriter,
        head_insert_view=head_insert,
        buffer_response=config.get('buffer_response', True),
        redir_to_exact=config.get('redir_to_exact', True),
        reporter=config.get('reporter')
    )

    # query / search pages
    captures_page = load_template_file(config.get('query_html'),
                                       'Captures Page',
                                       J2HtmlCapturesView)

    search_page = load_template_file(config.get('search_html'),
                                     'Search Page')

    return WBHandler(
        cdx_server,
        replay_view,
        html_view=captures_page,
        search_view=search_page,
    )
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def create_wb_router(passed_config=None):
    """
    Build the top-level router from the given config.

    Values missing from passed_config are taken from DEFAULTS.
    One route is added per entry in 'collections' (plus an optional
    '<name>-cdx' route when 'enable_cdx_api' is set), followed by the
    optional debug echo routes and the static routes.

    :param passed_config: dict-like config (needs ``get``); None or
                          omitted means "use DEFAULTS only"
    :return: a ProxyArchivalRouter when 'enable_http_proxy' is set,
             otherwise an ArchivalRouter
    """
    # avoid a shared mutable default argument; also tolerates an
    # explicit None (e.g. an empty config file)
    if passed_config is None:
        passed_config = {}

    config = DictChain(passed_config, DEFAULTS)

    routes = []

    hostpaths = config.get('hostpaths')

    port = config.get('port')

    # collections based on cdx source
    collections = config.get('collections')

    for name, value in collections.iteritems():
        # shorthand form: a plain string is the index path
        if isinstance(value, str):
            value = {'index_paths': value}

        # per-collection settings override the global ones
        route_config = DictChain(value, config)

        ds_rules_file = route_config.get('domain_specific_rules', None)
        cdx_server = IndexReader(route_config, ds_rules_file)

        wb_handler = create_wb_handler(
            cdx_server=cdx_server,
            config=route_config,
            ds_rules_file=ds_rules_file,
        )

        logging.debug('Adding Collection: %s', name)

        route_class = route_config.get('route_class', Route)

        routes.append(route_class(name, wb_handler, config=route_config))

        # cdx query handler
        if route_config.get('enable_cdx_api', False):
            routes.append(Route(name + '-cdx', CDXHandler(cdx_server)))

    if config.get('debug_echo_env', False):
        routes.append(Route('echo_env', DebugEchoEnvHandler()))

    if config.get('debug_echo_req', False):
        routes.append(Route('echo_req', DebugEchoHandler()))

    static_routes = config.get('static_routes')

    for static_name, static_path in static_routes.iteritems():
        routes.append(Route(static_name, StaticHandler(static_path)))

    # Check for new proxy mode!
    if config.get('enable_http_proxy', False):
        router = ProxyArchivalRouter
    else:
        router = ArchivalRouter

    # Finally, create wb router
    return router(
        routes,
        # Specify hostnames that pywb will be running on
        # This will help catch occasionally missed rewrites that
        # fall-through to the host
        # (See archivalrouter.ReferRedirect)
        hostpaths=hostpaths,
        port=port,

        abs_path=config.get('absolute_paths', True),

        home_view=load_template_file(config.get('home_html'), 'Home Page'),
        error_view=load_template_file(config.get('error_html'), 'Error Page')
    )
|
@ -2,9 +2,9 @@ import StringIO
|
|||||||
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||||
from wbrequestresponse import WbResponse
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
from wbexceptions import CaptureException, InternalRedirect
|
from pywb.framework.wbexceptions import CaptureException, InternalRedirect
|
||||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||||
|
|
||||||
from pywb.utils.loaders import LimitReader
|
from pywb.utils.loaders import LimitReader
|
||||||
@ -51,7 +51,7 @@ class ReplayView:
|
|||||||
self._redirect_if_needed(wbrequest, cdx)
|
self._redirect_if_needed(wbrequest, cdx)
|
||||||
|
|
||||||
# one more check for referrer-based self-redirect
|
# one more check for referrer-based self-redirect
|
||||||
self._reject_referrer_self_redirect(wbrequest, status_headers)
|
self._reject_referrer_self_redirect(wbrequest)
|
||||||
|
|
||||||
response = None
|
response = None
|
||||||
|
|
||||||
@ -177,25 +177,30 @@ class ReplayView:
|
|||||||
|
|
||||||
|
|
||||||
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
|
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
|
||||||
# self-redirect via location
|
"""
|
||||||
|
Check if response is a 3xx redirect to the same url
|
||||||
|
If so, reject this capture to avoid causing redirect loop
|
||||||
|
"""
|
||||||
if status_headers.statusline.startswith('3'):
|
if status_headers.statusline.startswith('3'):
|
||||||
request_url = wbrequest.wb_url.url.lower()
|
request_url = wbrequest.wb_url.url.lower()
|
||||||
location_url = status_headers.get_header('Location').lower()
|
location_url = status_headers.get_header('Location').lower()
|
||||||
|
|
||||||
#TODO: canonicalize before testing?
|
|
||||||
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
|
||||||
raise CaptureException('Self Redirect: ' + str(cdx))
|
raise CaptureException('Self Redirect: ' + str(cdx))
|
||||||
|
|
||||||
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
|
def _reject_referrer_self_redirect(self, wbrequest):
|
||||||
# at correct timestamp now, but must check for referrer redirect
|
"""
|
||||||
# indirect self-redirect, via meta-refresh, if referrer is same as current url
|
Perform final check for referrer based self-redirect.
|
||||||
if status_headers.statusline.startswith('2'):
|
This method should be called after verifying request timestamp matches capture.
|
||||||
# build full url even if using relative-rewriting
|
if referrer is same as current url, reject this response and try another capture
|
||||||
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
|
"""
|
||||||
referrer_url = wbrequest.referrer
|
if not wbrequest.referrer:
|
||||||
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
|
return
|
||||||
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# build full url even if using relative-rewriting
|
||||||
|
request_url = (wbrequest.host_prefix +
|
||||||
|
wbrequest.rel_prefix + str(wbrequest.wb_url))
|
||||||
|
|
||||||
|
if (UrlRewriter.strip_protocol(request_url) ==
|
||||||
|
UrlRewriter.strip_protocol(wbrequest.referrer)):
|
||||||
|
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
|
@ -1,6 +1,6 @@
|
|||||||
import pywb.utils.timeutils as timeutils
|
from pywb.utils.timeutils import timestamp_to_datetime
|
||||||
|
from pywb.framework.wbrequestresponse import WbResponse
|
||||||
|
|
||||||
import wbrequestresponse
|
|
||||||
import urlparse
|
import urlparse
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -18,7 +18,7 @@ class StaticTextView:
|
|||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
def render_response(self, **kwargs):
|
def render_response(self, **kwargs):
|
||||||
return wbrequestresponse.WbResponse.text_stream(self.text)
|
return WbResponse.text_stream(self.text)
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class J2TemplateView:
|
class J2TemplateView:
|
||||||
@ -34,7 +34,7 @@ class J2TemplateView:
|
|||||||
if template_dir.startswith('.') or template_dir.startswith('file://'):
|
if template_dir.startswith('.') or template_dir.startswith('file://'):
|
||||||
loader = FileSystemLoader(template_dir)
|
loader = FileSystemLoader(template_dir)
|
||||||
else:
|
else:
|
||||||
loader = PackageLoader(__package__, template_dir)
|
loader = PackageLoader('pywb', template_dir)
|
||||||
|
|
||||||
jinja_env = Environment(loader = loader, trim_blocks = True)
|
jinja_env = Environment(loader = loader, trim_blocks = True)
|
||||||
jinja_env.filters['format_ts'] = J2TemplateView.format_ts
|
jinja_env.filters['format_ts'] = J2TemplateView.format_ts
|
||||||
@ -51,13 +51,13 @@ class J2TemplateView:
|
|||||||
def render_response(self, **kwargs):
|
def render_response(self, **kwargs):
|
||||||
template_result = self.render_to_string(**kwargs)
|
template_result = self.render_to_string(**kwargs)
|
||||||
status = kwargs.get('status', '200 OK')
|
status = kwargs.get('status', '200 OK')
|
||||||
return wbrequestresponse.WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
|
return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
|
||||||
|
|
||||||
|
|
||||||
# Filters
|
# Filters
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
|
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
|
||||||
value = timeutils.timestamp_to_datetime(value)
|
value = timestamp_to_datetime(value)
|
||||||
return value.strftime(format_)
|
return value.strftime(format_)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -90,7 +90,7 @@ class TextCapturesView:
|
|||||||
cdx += '\n'
|
cdx += '\n'
|
||||||
return cdx
|
return cdx
|
||||||
cdx_lines = imap(to_str, cdx_lines)
|
cdx_lines = imap(to_str, cdx_lines)
|
||||||
return wbrequestresponse.WbResponse.text_stream(cdx_lines)
|
return WbResponse.text_stream(cdx_lines)
|
||||||
|
|
||||||
|
|
||||||
|
|
0
pywb/framework/__init__.py
Normal file
0
pywb/framework/__init__.py
Normal file
@ -1,17 +1,31 @@
|
|||||||
import urlparse
|
import urlparse
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from wbrequestresponse import WbRequest, WbResponse
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
|
from wbrequestresponse import WbRequest, WbResponse
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# ArchivalRouter -- route WB requests in archival mode
|
# ArchivalRouter -- route WB requests in archival mode
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ArchivalRouter:
|
class ArchivalRouter(object):
|
||||||
def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
|
def __init__(self, routes,
|
||||||
|
hostpaths=None,
|
||||||
|
port=None,
|
||||||
|
abs_path=True,
|
||||||
|
home_view=None,
|
||||||
|
error_view=None):
|
||||||
|
|
||||||
self.routes = routes
|
self.routes = routes
|
||||||
self.fallback = ReferRedirect(hostpaths)
|
|
||||||
|
# optional port setting may be ignored by wsgi container
|
||||||
|
self.port = port
|
||||||
|
|
||||||
|
if hostpaths:
|
||||||
|
self.fallback = ReferRedirect(hostpaths)
|
||||||
|
else:
|
||||||
|
self.fallback = None
|
||||||
|
|
||||||
self.abs_path = abs_path
|
self.abs_path = abs_path
|
||||||
|
|
||||||
self.home_view = home_view
|
self.home_view = home_view
|
||||||
@ -29,26 +43,27 @@ class ArchivalRouter:
|
|||||||
|
|
||||||
return self.fallback(env, self.routes) if self.fallback else None
|
return self.fallback(env, self.routes) if self.fallback else None
|
||||||
|
|
||||||
|
|
||||||
def render_home_page(self):
|
def render_home_page(self):
|
||||||
# render the homepage!
|
# render the homepage!
|
||||||
if self.home_view:
|
if self.home_view:
|
||||||
return self.home_view.render_response(routes = self.routes)
|
return self.home_view.render_response(routes=self.routes)
|
||||||
else:
|
else:
|
||||||
# default home page template
|
# default home page template
|
||||||
text = '\n'.join(map(str, self.routes))
|
text = '\n'.join(map(str, self.routes))
|
||||||
return WbResponse.text_response(text)
|
return WbResponse.text_response(text)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Route by matching regex (or fixed prefix)
|
# Route by matching regex (or fixed prefix)
|
||||||
# of request uri (excluding first '/')
|
# of request uri (excluding first '/')
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class Route:
|
class Route(object):
|
||||||
# match upto next / or ? or end
|
# match upto next / or ? or end
|
||||||
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
|
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
|
||||||
|
|
||||||
|
def __init__(self, regex, handler, coll_group=0, config={},
|
||||||
|
lookahead=SLASH_QUERY_LOOKAHEAD):
|
||||||
|
|
||||||
def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
|
|
||||||
self.path = regex
|
self.path = regex
|
||||||
if regex:
|
if regex:
|
||||||
self.regex = re.compile(regex + lookahead)
|
self.regex = re.compile(regex + lookahead)
|
||||||
@ -59,12 +74,11 @@ class Route:
|
|||||||
self.coll_group = coll_group
|
self.coll_group = coll_group
|
||||||
self._custom_init(config)
|
self._custom_init(config)
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, env, use_abs_prefix):
|
def __call__(self, env, use_abs_prefix):
|
||||||
wbrequest = self.parse_request(env, use_abs_prefix)
|
wbrequest = self.parse_request(env, use_abs_prefix)
|
||||||
return self.handler(wbrequest) if wbrequest else None
|
return self.handler(wbrequest) if wbrequest else None
|
||||||
|
|
||||||
def parse_request(self, env, use_abs_prefix, request_uri = None):
|
def parse_request(self, env, use_abs_prefix, request_uri=None):
|
||||||
if not request_uri:
|
if not request_uri:
|
||||||
request_uri = env['REL_REQUEST_URI']
|
request_uri = env['REL_REQUEST_URI']
|
||||||
|
|
||||||
@ -75,10 +89,12 @@ class Route:
|
|||||||
matched_str = matcher.group(0)
|
matched_str = matcher.group(0)
|
||||||
if matched_str:
|
if matched_str:
|
||||||
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
|
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
|
||||||
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
|
# remove the '/' + rel_prefix part of uri
|
||||||
|
wb_url_str = request_uri[len(matched_str) + 2:]
|
||||||
else:
|
else:
|
||||||
rel_prefix = env['SCRIPT_NAME'] + '/'
|
rel_prefix = env['SCRIPT_NAME'] + '/'
|
||||||
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
|
# the request_uri is the wb_url, since no coll
|
||||||
|
wb_url_str = request_uri[1:]
|
||||||
|
|
||||||
coll = matcher.group(self.coll_group)
|
coll = matcher.group(self.coll_group)
|
||||||
|
|
||||||
@ -88,20 +104,19 @@ class Route:
|
|||||||
rel_prefix=rel_prefix,
|
rel_prefix=rel_prefix,
|
||||||
coll=coll,
|
coll=coll,
|
||||||
use_abs_prefix=use_abs_prefix,
|
use_abs_prefix=use_abs_prefix,
|
||||||
wburl_class = self.handler.get_wburl_type(),
|
wburl_class=self.handler.get_wburl_type(),
|
||||||
urlrewriter_class=UrlRewriter)
|
urlrewriter_class=UrlRewriter)
|
||||||
|
|
||||||
|
|
||||||
# Allow for applying of additional filters
|
# Allow for applying of additional filters
|
||||||
self._apply_filters(wbrequest, matcher)
|
self._apply_filters(wbrequest, matcher)
|
||||||
|
|
||||||
return wbrequest
|
return wbrequest
|
||||||
|
|
||||||
|
|
||||||
def _apply_filters(self, wbrequest, matcher):
|
def _apply_filters(self, wbrequest, matcher):
|
||||||
for filter in self.filters:
|
for filter in self.filters:
|
||||||
last_grp = len(matcher.groups())
|
last_grp = len(matcher.groups())
|
||||||
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
|
filter_str = filter.format(matcher.group(last_grp))
|
||||||
|
wbrequest.query_filter.append(filter_str)
|
||||||
|
|
||||||
def _custom_init(self, config):
|
def _custom_init(self, config):
|
||||||
self.filters = config.get('filters', [])
|
self.filters = config.get('filters', [])
|
||||||
@ -112,7 +127,8 @@ class Route:
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
|
# ReferRedirect -- redirect urls that have 'fallen through'
|
||||||
|
# based on the referrer settings
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ReferRedirect:
|
class ReferRedirect:
|
||||||
def __init__(self, match_prefixs):
|
def __init__(self, match_prefixs):
|
||||||
@ -121,7 +137,6 @@ class ReferRedirect:
|
|||||||
else:
|
else:
|
||||||
self.match_prefixs = [match_prefixs]
|
self.match_prefixs = [match_prefixs]
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, env, routes):
|
def __call__(self, env, routes):
|
||||||
referrer = env.get('HTTP_REFERER')
|
referrer = env.get('HTTP_REFERER')
|
||||||
|
|
||||||
@ -133,7 +148,7 @@ class ReferRedirect:
|
|||||||
ref_split = urlparse.urlsplit(referrer)
|
ref_split = urlparse.urlsplit(referrer)
|
||||||
|
|
||||||
# ensure referrer starts with one of allowed hosts
|
# ensure referrer starts with one of allowed hosts
|
||||||
if not any (referrer.startswith(i) for i in self.match_prefixs):
|
if not any(referrer.startswith(i) for i in self.match_prefixs):
|
||||||
if ref_split.netloc != env.get('HTTP_HOST'):
|
if ref_split.netloc != env.get('HTTP_HOST'):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -144,13 +159,12 @@ class ReferRedirect:
|
|||||||
if app_path:
|
if app_path:
|
||||||
# must start with current app name, if not root
|
# must start with current app name, if not root
|
||||||
if not path.startswith(app_path):
|
if not path.startswith(app_path):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
path = path[len(app_path):]
|
path = path[len(app_path):]
|
||||||
|
|
||||||
|
|
||||||
for route in routes:
|
for route in routes:
|
||||||
ref_request = route.parse_request(env, False, request_uri = path)
|
ref_request = route.parse_request(env, False, request_uri=path)
|
||||||
if ref_request:
|
if ref_request:
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -174,6 +188,10 @@ class ReferRedirect:
|
|||||||
# 2013/path.html -> /path.html
|
# 2013/path.html -> /path.html
|
||||||
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
|
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
|
||||||
|
|
||||||
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
|
final_url = urlparse.urlunsplit((ref_split.scheme,
|
||||||
|
ref_split.netloc,
|
||||||
|
rewriter.rewrite(rel_request_uri),
|
||||||
|
'',
|
||||||
|
''))
|
||||||
|
|
||||||
return WbResponse.redir_response(final_url)
|
return WbResponse.redir_response(final_url)
|
23
pywb/framework/basehandlers.py
Normal file
23
pywb/framework/basehandlers.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from pywb.rewrite.wburl import WbUrl
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class BaseHandler(object):
    """
    Minimal handler that accepts any request and hands it straight back.
    """
    def get_wburl_type(self):
        # the generic handler names no wburl class
        return None

    def __call__(self, wbrequest):
        # identity: the request itself is the result
        return wbrequest
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class WbUrlHandler(BaseHandler):
    """
    Handler for requests that are expected to carry a WbUrl.

    Reports WbUrl as its wburl type, so that the WbUrl
    is parsed in the request
    """
    def get_wburl_type(self):
        # class used to parse the wburl portion of the request
        return WbUrl
|
@ -2,23 +2,37 @@ from wbrequestresponse import WbResponse, WbRequest
|
|||||||
from archivalrouter import ArchivalRouter
|
from archivalrouter import ArchivalRouter
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# An experimental router which combines both archival and proxy modes
|
# An experimental router which combines both archival and proxy modes
|
||||||
# http proxy mode support is very simple: only latest capture is available currently
|
# http proxy mode support is very simple so far:
|
||||||
|
# only latest capture is available currently
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
class ProxyArchivalRouter(ArchivalRouter):
|
||||||
|
def __init__(self, routes,
|
||||||
|
hostpaths=None,
|
||||||
|
port=None,
|
||||||
|
abs_path=True,
|
||||||
|
home_view=None,
|
||||||
|
error_view=None):
|
||||||
|
|
||||||
|
(super(ProxyArchivalRouter, self).
|
||||||
|
__init__(routes,
|
||||||
|
hostpaths=hostpaths,
|
||||||
|
port=port,
|
||||||
|
abs_path=abs_path,
|
||||||
|
home_view=home_view,
|
||||||
|
error_view=error_view))
|
||||||
|
|
||||||
class ProxyArchivalRouter:
|
|
||||||
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
|
|
||||||
self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view)
|
|
||||||
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
|
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
|
||||||
self.error_view = error_view
|
#self.error_view = error_view
|
||||||
|
|
||||||
def __call__(self, env):
|
def __call__(self, env):
|
||||||
response = self.archival(env)
|
response = self.proxy(env)
|
||||||
if response:
|
if response:
|
||||||
return response
|
return response
|
||||||
|
|
||||||
response = self.proxy(env)
|
response = super(ProxyArchivalRouter, self).__call__(env)
|
||||||
if response:
|
if response:
|
||||||
return response
|
return response
|
||||||
|
|
||||||
@ -29,7 +43,7 @@ class ProxyArchivalRouter:
|
|||||||
# Only supports latest capture replay at the moment
|
# Only supports latest capture replay at the moment
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ProxyRouter:
|
class ProxyRouter:
|
||||||
def __init__(self, handler, hostpaths = None, error_view = None):
|
def __init__(self, handler, hostpaths=None, error_view=None):
|
||||||
self.handler = handler
|
self.handler = handler
|
||||||
self.hostpaths = hostpaths
|
self.hostpaths = hostpaths
|
||||||
|
|
||||||
@ -56,27 +70,26 @@ class ProxyRouter:
|
|||||||
|
|
||||||
return self.handler(wbrequest)
|
return self.handler(wbrequest)
|
||||||
|
|
||||||
|
|
||||||
# Proxy Auto-Config (PAC) script for the proxy
|
# Proxy Auto-Config (PAC) script for the proxy
|
||||||
def make_pac_response(self, env):
|
def make_pac_response(self, env):
|
||||||
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
|
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
|
||||||
|
|
||||||
buff = 'function FindProxyForURL (url, host) {\n'
|
buff = 'function FindProxyForURL (url, host) {\n'
|
||||||
|
|
||||||
direct_cond =' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
|
direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
|
||||||
|
|
||||||
for hostpath in self.hostpaths:
|
for hostpath in self.hostpaths:
|
||||||
parts = urlparse.urlsplit(hostpath).netloc.split(':')
|
parts = urlparse.urlsplit(hostpath).netloc.split(':')
|
||||||
buff += direct_cond.format(parts[0])
|
buff += direct.format(parts[0])
|
||||||
|
|
||||||
buff += direct_cond.format(env['SERVER_NAME'])
|
buff += direct.format(env['SERVER_NAME'])
|
||||||
|
|
||||||
#buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0])
|
#buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0])
|
||||||
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
|
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
|
||||||
|
|
||||||
return WbResponse.text_response(buff, content_type = 'application/x-ns-proxy-autoconfig')
|
content_type = 'application/x-ns-proxy-autoconfig'
|
||||||
|
|
||||||
|
|
||||||
|
return WbResponse.text_response(buff, content_type=content_type)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -85,10 +98,11 @@ class ProxyRouter:
|
|||||||
class ProxyHttpsUrlRewriter:
|
class ProxyHttpsUrlRewriter:
|
||||||
HTTP = 'http://'
|
HTTP = 'http://'
|
||||||
HTTPS = 'https://'
|
HTTPS = 'https://'
|
||||||
|
|
||||||
def __init__(self, wbrequest, prefix):
|
def __init__(self, wbrequest, prefix):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def rewrite(self, url, mod = None):
|
def rewrite(self, url, mod=None):
|
||||||
if url.startswith(self.HTTPS):
|
if url.startswith(self.HTTPS):
|
||||||
return self.HTTP + url[len(self.HTTPS):]
|
return self.HTTP + url[len(self.HTTPS):]
|
||||||
else:
|
else:
|
||||||
@ -97,6 +111,5 @@ class ProxyHttpsUrlRewriter:
|
|||||||
def get_timestamp_url(self, timestamp, url):
|
def get_timestamp_url(self, timestamp, url):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def get_abs_url(self, url = ''):
|
def get_abs_url(self, url=''):
|
||||||
return url
|
return url
|
||||||
|
|
@ -84,8 +84,8 @@ False
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pywb.archivalrouter import Route, ReferRedirect
|
from pywb.framework.archivalrouter import Route, ReferRedirect
|
||||||
from pywb.handlers import BaseHandler, WbUrlHandler
|
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
|
||||||
import pprint
|
import pprint
|
||||||
|
|
||||||
def print_req(req):
|
def print_req(req):
|
@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl
|
|||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
|
|
||||||
from pywb.wbrequestresponse import WbRequest, WbResponse
|
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
|
||||||
|
|
||||||
|
|
||||||
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
|
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):
|
@ -1,23 +1,22 @@
|
|||||||
|
from pywb.utils.wbexception import WbException
|
||||||
|
|
||||||
|
|
||||||
class WbException(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class NotFoundException(WbException):
|
class NotFoundException(WbException):
|
||||||
def status(self):
|
def status(self):
|
||||||
return '404 Not Found'
|
return '404 Not Found'
|
||||||
|
|
||||||
|
|
||||||
# Exceptions that effect a specific capture and result in a retry
|
# Exceptions that effect a specific capture and result in a retry
|
||||||
class CaptureException(WbException):
|
class CaptureException(WbException):
|
||||||
def status(self):
|
def status(self):
|
||||||
return '500 Internal Server Error'
|
return '500 Internal Server Error'
|
||||||
|
|
||||||
|
|
||||||
class InternalRedirect(WbException):
|
class InternalRedirect(WbException):
|
||||||
def __init__(self, location, status = '302 Internal Redirect'):
|
def __init__(self, location, status='302 Internal Redirect'):
|
||||||
WbException.__init__(self, 'Redirecting -> ' + location)
|
WbException.__init__(self, 'Redirecting -> ' + location)
|
||||||
self.status = status
|
self.status = status
|
||||||
self.httpHeaders = [('Location', location)]
|
self.httpHeaders = [('Location', location)]
|
||||||
|
|
||||||
def status(self):
|
def status(self):
|
||||||
return self.status
|
return self.status
|
||||||
|
|
@ -26,7 +26,6 @@ class WbRequest:
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, env,
|
def __init__(self, env,
|
||||||
request_uri=None,
|
request_uri=None,
|
||||||
rel_prefix='',
|
rel_prefix='',
|
||||||
@ -40,7 +39,10 @@ class WbRequest:
|
|||||||
|
|
||||||
self.env = env
|
self.env = env
|
||||||
|
|
||||||
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
|
if request_uri:
|
||||||
|
self.request_uri = request_uri
|
||||||
|
else:
|
||||||
|
self.request_uri = env.get('REL_REQUEST_URI')
|
||||||
|
|
||||||
self.coll = coll
|
self.coll = coll
|
||||||
|
|
||||||
@ -55,7 +57,6 @@ class WbRequest:
|
|||||||
else:
|
else:
|
||||||
self.wb_prefix = rel_prefix
|
self.wb_prefix = rel_prefix
|
||||||
|
|
||||||
|
|
||||||
if not wb_url_str:
|
if not wb_url_str:
|
||||||
wb_url_str = '/'
|
wb_url_str = '/'
|
||||||
|
|
||||||
@ -83,7 +84,6 @@ class WbRequest:
|
|||||||
# PERF
|
# PERF
|
||||||
env['X_PERF'] = {}
|
env['X_PERF'] = {}
|
||||||
|
|
||||||
|
|
||||||
def _is_ajax(self):
|
def _is_ajax(self):
|
||||||
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
value = self.env.get('HTTP_X_REQUESTED_WITH')
|
||||||
if not value:
|
if not value:
|
||||||
@ -96,7 +96,6 @@ class WbRequest:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
varlist = vars(self)
|
varlist = vars(self)
|
||||||
varstr = pprint.pformat(varlist)
|
varstr = pprint.pformat(varlist)
|
||||||
@ -111,32 +110,39 @@ class WbResponse:
|
|||||||
Holds a status_headers object and a response iter, to be
|
Holds a status_headers object and a response iter, to be
|
||||||
returned to wsgi container.
|
returned to wsgi container.
|
||||||
"""
|
"""
|
||||||
def __init__(self, status_headers, value = []):
|
def __init__(self, status_headers, value=[]):
|
||||||
self.status_headers = status_headers
|
self.status_headers = status_headers
|
||||||
self.body = value
|
self.body = value
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def text_stream(text, status = '200 OK', content_type = 'text/plain'):
|
def text_stream(stream, status='200 OK', content_type='text/plain'):
|
||||||
return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = text)
|
status_headers = StatusAndHeaders(status,
|
||||||
|
[('Content-Type', content_type)])
|
||||||
|
|
||||||
|
return WbResponse(status_headers, value=stream)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def text_response(text, status = '200 OK', content_type = 'text/plain'):
|
def text_response(text, status='200 OK', content_type='text/plain'):
|
||||||
return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = [text])
|
status_headers = StatusAndHeaders(status,
|
||||||
|
[('Content-Type', content_type)])
|
||||||
|
|
||||||
|
return WbResponse(status_headers, value=[text])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def redir_response(location, status = '302 Redirect'):
|
def redir_response(location, status='302 Redirect'):
|
||||||
return WbResponse(StatusAndHeaders(status, [('Location', location)]))
|
return WbResponse(StatusAndHeaders(status,
|
||||||
|
[('Location', location)]))
|
||||||
|
|
||||||
def __call__(self, env, start_response):
|
def __call__(self, env, start_response):
|
||||||
|
|
||||||
# PERF
|
# PERF
|
||||||
perfstats = env.get('X_PERF')
|
perfstats = env.get('X_PERF')
|
||||||
if perfstats:
|
if perfstats:
|
||||||
self.status_headers.headers.append(('X-Archive-Perf-Stats', str(perfstats)))
|
self.status_headers.headers.append(('X-Archive-Perf-Stats',
|
||||||
|
str(perfstats)))
|
||||||
|
|
||||||
|
start_response(self.status_headers.statusline,
|
||||||
start_response(self.status_headers.statusline, self.status_headers.headers)
|
self.status_headers.headers)
|
||||||
|
|
||||||
if env['REQUEST_METHOD'] == 'HEAD':
|
if env['REQUEST_METHOD'] == 'HEAD':
|
||||||
if hasattr(self.body, 'close'):
|
if hasattr(self.body, 'close'):
|
||||||
@ -148,6 +154,5 @@ class WbResponse:
|
|||||||
else:
|
else:
|
||||||
return [str(self.body)]
|
return [str(self.body)]
|
||||||
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return str(vars(self))
|
return str(vars(self))
|
165
pywb/framework/wsgi_wrappers.py
Normal file
165
pywb/framework/wsgi_wrappers.py
Normal file
@ -0,0 +1,165 @@
|
|||||||
|
from pywb.utils.wbexception import WbException
|
||||||
|
from pywb.utils.loaders import load_yaml_config
|
||||||
|
|
||||||
|
from wbexceptions import NotFoundException, InternalRedirect
|
||||||
|
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
import importlib
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_PORT = 8080
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
# adapted from wsgiref.request_uri, but doesn't include domain name
|
||||||
|
# and allows all characters which are allowed in the path segment
|
||||||
|
# according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
||||||
|
# explained here:
|
||||||
|
# http://stackoverflow.com/questions/4669692/
|
||||||
|
# valid-characters-for-directory-part-of-a-url-for-short-links
|
||||||
|
|
||||||
|
|
||||||
|
def rel_request_uri(environ, include_query=1):
    """
    Return the requested path, optionally including the query string

    The path is read from PATH_INFO (empty string if missing) and
    percent-encoded, leaving the characters listed in ``safe`` below
    untouched. If include_query is true and QUERY_STRING is non-empty,
    the raw query string is appended after a '?'.

    # Simple test:
    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
    '/web/example.com'

    # Test all unencoded special chars and double-quote
    # (double-quote must be encoded but not single quote)
    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
    "/web/example.com/0~!+$&'()*+,;=:%22"
    """
    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
    if include_query and environ.get('QUERY_STRING'):
        url += '?' + environ['QUERY_STRING']

    return url
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class WSGIApp(object):
    """
    Top-level WSGI application wrapping a router.

    The router is called with the WSGI environ and must return either a
    response callable (invoked as ``response(env, start_response)``) or a
    false value when no handler matched the request.
    """
    def __init__(self, wb_router):
        self.wb_router = wb_router

        # A router may lack a 'port' attribute or carry port=None (the
        # default when no port is configured); fall back to DEFAULT_PORT
        # in both cases instead of storing None.
        port = getattr(wb_router, 'port', None)
        self.port = DEFAULT_PORT if port is None else port

    # Top-level wsgi application
    def __call__(self, env, start_response):
        # Rebuild the relative uri from PATH_INFO/QUERY_STRING when the app
        # runs under a SCRIPT_NAME or the container gave no REQUEST_URI
        if env.get('SCRIPT_NAME') or not env.get('REQUEST_URI'):
            env['REL_REQUEST_URI'] = rel_request_uri(env)
        else:
            env['REL_REQUEST_URI'] = env['REQUEST_URI']

        wb_router = self.wb_router
        response = None

        try:
            response = wb_router(env)

            if not response:
                msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
                raise NotFoundException(msg)

        except InternalRedirect as ir:
            response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))

        except WbException as e:
            # expected application error: no stack trace
            # (a router without an error_view gets the plain-text fallback)
            error_view = getattr(wb_router, 'error_view', None)
            response = handle_exception(env, error_view, e, False)

        except Exception as e:
            # unexpected error: include the stack trace
            error_view = getattr(wb_router, 'error_view', None)
            response = handle_exception(env, error_view, e, True)

        return response(env, start_response)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def handle_exception(env, error_view, exc, print_trace):
    """
    Build an error response for an exception raised while routing.

    env         -- WSGI environ (currently unused)
    error_view  -- optional view; if truthy, its render_response() is
                   called with err_msg, err_details and status
    exc         -- the exception; if it has a status() method, that
                   supplies the status line, else '400 Bad Request'
    print_trace -- if true, print the traceback and pass it to the view
                   as err_details; otherwise log the message at INFO

    Must be called from inside the 'except' block handling exc when
    print_trace is set, since the traceback comes from the exception
    currently being handled.
    """
    if hasattr(exc, 'status'):
        status = exc.status()
    else:
        status = '400 Bad Request'

    if print_trace:
        import traceback
        # format_exc() takes an optional frame limit, not the exception:
        # passing exc breaks on Python 3
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(str(exc))
        err_details = None

    if error_view:
        return error_view.render_response(err_msg=str(exc),
                                          err_details=err_details,
                                          status=status)

    return WbResponse.text_response(status + ' Error: ' + str(exc),
                                    status=status)
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
DEFAULT_CONFIG_FILE = 'config.yaml'
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def init_app(init_func, load_yaml=True, config_file=None):
    """
    Configure logging, create the router via init_func and wrap it
    in a WSGIApp.

    init_func   -- factory returning the router; called with the loaded
                   config when load_yaml is true, else with no arguments
    load_yaml   -- whether to load a YAML config first
    config_file -- config location; falls back to the PYWB_CONFIG_FILE
                   environment variable, then DEFAULT_CONFIG_FILE

    Failures in init_func are logged with a traceback and re-raised.
    """
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)
    logging.info('')

    if load_yaml:
        if not config_file:
            config_file = os.environ.get('PYWB_CONFIG_FILE')

        if not config_file:
            config_file = DEFAULT_CONFIG_FILE

        config = load_yaml_config(config_file)

    try:
        if load_yaml:
            wb_router = init_func(config)
        else:
            wb_router = init_func()

    # only real errors count as an init failure; KeyboardInterrupt and
    # SystemExit propagate untouched
    except Exception:
        msg = '*** pywb app init FAILED config from "%s"!\n'
        logging.exception(msg, init_func.__name__)
        raise
    else:
        msg = '*** pywb app inited with config from "%s"!\n'
        logging.info(msg, init_func.__name__)

    return WSGIApp(wb_router)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def start_wsgi_server(the_app):
    """
    Serve the_app with the wsgiref reference server until interrupted.

    Port precedence: the -p/--port command line option, then the
    app's own 'port' attribute, then DEFAULT_PORT.
    """
    from wsgiref.simple_server import make_server
    from optparse import OptionParser

    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)

    options, args = opt.parse_args()

    # command line wins; previously it was always overwritten by the
    # app port
    port = options.port

    if not port:
        port = getattr(the_app, 'port', None)

    if not port:
        port = DEFAULT_PORT

    logging.debug('Starting CDX Server on port %s', port)

    try:
        httpd = make_server('', port, the_app)
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the dev server
        pass

    logging.debug('Stopping CDX Server')
|
@ -1,128 +0,0 @@
|
|||||||
import handlers
|
|
||||||
import archivalrouter
|
|
||||||
import config_utils
|
|
||||||
import proxy
|
|
||||||
from indexreader import IndexReader
|
|
||||||
|
|
||||||
import os
|
|
||||||
import yaml
|
|
||||||
import logging
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
DEFAULTS = {
|
|
||||||
'hostpaths': ['http://localhost:8080'],
|
|
||||||
'collections': {'pywb': './sample_archive/cdx/'},
|
|
||||||
'archive_paths': './sample_archive/warcs/',
|
|
||||||
|
|
||||||
'head_insert_html': 'ui/head_insert.html',
|
|
||||||
'query_html': 'ui/query.html',
|
|
||||||
'search_html': 'ui/search.html',
|
|
||||||
'home_html': 'ui/index.html',
|
|
||||||
'error_html': 'ui/error.html',
|
|
||||||
|
|
||||||
'static_routes': {'static/default': 'static/'},
|
|
||||||
|
|
||||||
'domain_specific_rules': 'rules.yaml',
|
|
||||||
}
|
|
||||||
|
|
||||||
class DictChain:
    """
    Read-only lookup over several dict-like objects, consulted in the
    order given.
    """
    def __init__(self, *dicts):
        self.dicts = dicts

    def get(self, key, default_val=None):
        """
        Return the first value for key that is not None, searching the
        dicts in order; default_val if none has one.
        """
        candidates = (mapping.get(key) for mapping in self.dicts)
        return next((found for found in candidates if found is not None),
                    default_val)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
## Reference non-YAML config
|
|
||||||
#=================================================================
|
|
||||||
def pywb_config_manual(passed_config=None):
    """
    Build the router from a plain config mapping.

    passed_config -- mapping of settings; any key it lacks (or maps to
                     None) falls back to DEFAULTS. None is treated as
                     an empty mapping.

    Creates one route per entry in 'collections' (plus a '<name>-cdx'
    route when 'enable_cdx_api' is set), optional debug echo routes,
    the static routes, and returns either a ProxyArchivalRouter or an
    ArchivalRouter depending on 'enable_http_proxy'.
    """
    # no shared mutable default; also tolerates an explicit None
    config = DictChain(passed_config or {}, DEFAULTS)

    routes = []

    hostpaths = config.get('hostpaths')

    # collections based on cdx source
    collections = config.get('collections')

    for name, value in collections.items():
        # shorthand: a bare string is the index path
        if isinstance(value, str):
            value = {'index_paths': value}

        # per-collection settings override the global ones
        route_config = DictChain(value, config)

        ds_rules_file = route_config.get('domain_specific_rules', None)
        cdx_server = IndexReader(route_config, ds_rules_file)

        wb_handler = config_utils.create_wb_handler(
            cdx_server=cdx_server,
            config=route_config,
            ds_rules_file=ds_rules_file,
        )

        logging.debug('Adding Collection: ' + name)

        route_class = route_config.get('route_class', archivalrouter.Route)

        routes.append(route_class(name, wb_handler, config=route_config))

        # cdx query handler
        if route_config.get('enable_cdx_api', False):
            routes.append(archivalrouter.Route(name + '-cdx',
                                               handlers.CDXHandler(cdx_server)))

    if config.get('debug_echo_env', False):
        routes.append(archivalrouter.Route('echo_env',
                                           handlers.DebugEchoEnvHandler()))

    if config.get('debug_echo_req', False):
        routes.append(archivalrouter.Route('echo_req',
                                           handlers.DebugEchoHandler()))

    static_routes = config.get('static_routes')

    for static_name, static_path in static_routes.items():
        routes.append(archivalrouter.Route(static_name,
                                           handlers.StaticHandler(static_path)))

    # Check for new proxy mode!
    if config.get('enable_http_proxy', False):
        router = proxy.ProxyArchivalRouter
    else:
        router = archivalrouter.ArchivalRouter

    # Finally, create wb router
    return router(
        routes,
        # Specify hostnames that pywb will be running on
        # This will help catch occasionally missed rewrites that
        # fall-through to the host
        # (See archivalrouter.ReferRedirect)
        hostpaths=hostpaths,

        abs_path=config.get('absolute_paths', True),

        home_view=config_utils.load_template_file(config.get('home_html'),
                                                  'Home Page'),
        error_view=config_utils.load_template_file(config.get('error_html'),
                                                   'Error Page')
    )
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# YAML config loader
|
|
||||||
#=================================================================
|
|
||||||
DEFAULT_CONFIG_FILE = 'config.yaml'
|
|
||||||
|
|
||||||
|
|
||||||
def pywb_config(config_file = None):
    """
    Load a YAML config file and build the router from it.

    config_file -- path to the YAML file; when not given, the
                   PYWB_CONFIG environment variable is used, falling
                   back to DEFAULT_CONFIG_FILE
    """
    path = config_file
    if not path:
        path = os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)

    # NOTE(review): yaml.load can construct arbitrary python objects;
    # only point this at trusted config files
    with open(path) as stream:
        settings = yaml.load(stream)

    return pywb_config_manual(settings)
|
|
||||||
|
|
@ -4,6 +4,9 @@
|
|||||||
import surt
|
import surt
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
|
from wbexception import WbException
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class UrlCanonicalizer(object):
|
class UrlCanonicalizer(object):
|
||||||
def __init__(self, surt_ordered=True):
|
def __init__(self, surt_ordered=True):
|
||||||
@ -14,7 +17,7 @@ class UrlCanonicalizer(object):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class UrlCanonicalizeException(Exception):
|
class UrlCanonicalizeException(WbException):
|
||||||
def status(self):
|
def status(self):
|
||||||
return '400 Bad Request'
|
return '400 Bad Request'
|
||||||
|
|
||||||
@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
|
|||||||
|
|
||||||
elif match_type == 'domain':
|
elif match_type == 'domain':
|
||||||
if not surt_ordered:
|
if not surt_ordered:
|
||||||
raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
|
msg = 'matchType=domain unsupported for non-surt'
|
||||||
|
raise UrlCanonicalizeException(msg)
|
||||||
|
|
||||||
host = start_key.split(')/')[0]
|
host = start_key.split(')/')[0]
|
||||||
|
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
import yaml
|
|
||||||
import pkgutil
|
import pkgutil
|
||||||
|
from loaders import load_yaml_config
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
|
||||||
DEFAULT_RULES_FILE = 'rules.yaml'
|
|
||||||
DEFAULT_RULES_PKG = 'pywb'
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -23,10 +22,14 @@ class RuleSet(object):
|
|||||||
|
|
||||||
self.rules = []
|
self.rules = []
|
||||||
|
|
||||||
ds_rules_file = kwargs.get('ds_rules_file')
|
|
||||||
default_rule_config = kwargs.get('default_rule_config')
|
default_rule_config = kwargs.get('default_rule_config')
|
||||||
|
|
||||||
config = self.load_default_rules(ds_rules_file)
|
ds_rules_file = kwargs.get('ds_rules_file')
|
||||||
|
|
||||||
|
if not ds_rules_file:
|
||||||
|
ds_rules_file = DEFAULT_RULES_FILE
|
||||||
|
|
||||||
|
config = load_yaml_config(ds_rules_file)
|
||||||
|
|
||||||
rulesmap = config.get('rules') if config else None
|
rulesmap = config.get('rules') if config else None
|
||||||
|
|
||||||
@ -53,22 +56,6 @@ class RuleSet(object):
|
|||||||
if not def_key_found and default_rule_config is not None:
|
if not def_key_found and default_rule_config is not None:
|
||||||
self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))
|
self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def load_default_rules(filename=None, pkg=None):
|
|
||||||
config = None
|
|
||||||
|
|
||||||
if not filename:
|
|
||||||
filename = DEFAULT_RULES_FILE
|
|
||||||
|
|
||||||
if not pkg:
|
|
||||||
pkg = DEFAULT_RULES_PKG
|
|
||||||
|
|
||||||
if filename:
|
|
||||||
yaml_str = pkgutil.get_data(pkg, filename)
|
|
||||||
config = yaml.load(yaml_str)
|
|
||||||
|
|
||||||
return config
|
|
||||||
|
|
||||||
def iter_matching(self, urlkey):
|
def iter_matching(self, urlkey):
|
||||||
"""
|
"""
|
||||||
Iterate over all matching rules for given urlkey
|
Iterate over all matching rules for given urlkey
|
||||||
|
@ -7,11 +7,20 @@ import os
|
|||||||
import hmac
|
import hmac
|
||||||
import urllib2
|
import urllib2
|
||||||
import time
|
import time
|
||||||
|
import pkg_resources
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def is_http(filename):
|
def is_http(filename):
|
||||||
return any(filename.startswith(x) for x in ['http://', 'https://'])
|
return filename.startswith(('http://', 'https://'))
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def load_yaml_config(config_file):
    """
    Load config_file through BlockLoader and parse it as YAML,
    returning the parsed result.
    """
    import yaml

    # NOTE(review): yaml.load can construct arbitrary python objects;
    # only use with trusted config sources
    return yaml.load(BlockLoader().load(config_file))
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -24,27 +33,46 @@ class BlockLoader(object):
|
|||||||
def __init__(self, cookie_maker=None):
|
def __init__(self, cookie_maker=None):
|
||||||
self.cookie_maker = cookie_maker
|
self.cookie_maker = cookie_maker
|
||||||
|
|
||||||
def load(self, url, offset, length):
|
def load(self, url, offset=0, length=-1):
|
||||||
"""
|
"""
|
||||||
Determine loading method based on uri
|
Determine loading method based on uri
|
||||||
"""
|
"""
|
||||||
if is_http(url):
|
if is_http(url):
|
||||||
return self.load_http(url, offset, length)
|
return self.load_http(url, offset, length)
|
||||||
else:
|
else:
|
||||||
return self.load_file(url, offset, length)
|
return self.load_file_or_resource(url, offset, length)
|
||||||
|
|
||||||
def load_file(self, url, offset, length):
|
def load_file_or_resource(self, url, offset, length):
|
||||||
"""
|
"""
|
||||||
Load a file-like reader from the local file system
|
Load a file-like reader from the local file system
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
file_only = False
|
||||||
|
|
||||||
if url.startswith('file://'):
|
if url.startswith('file://'):
|
||||||
url = url[len('file://'):]
|
url = url[len('file://'):]
|
||||||
|
file_only = True
|
||||||
|
|
||||||
afile = open(url, 'rb')
|
try:
|
||||||
afile.seek(offset)
|
# first, try as file
|
||||||
|
afile = open(url, 'rb')
|
||||||
|
|
||||||
if length > 0:
|
except IOError:
|
||||||
|
if file_only:
|
||||||
|
raise
|
||||||
|
|
||||||
|
# then, try as package.path/file
|
||||||
|
pkg_split = url.split('/', 1)
|
||||||
|
if len(pkg_split) == 1:
|
||||||
|
raise
|
||||||
|
|
||||||
|
afile = pkg_resources.resource_stream(pkg_split[0],
|
||||||
|
pkg_split[1])
|
||||||
|
|
||||||
|
if offset > 0:
|
||||||
|
afile.seek(offset)
|
||||||
|
|
||||||
|
if length >= 0:
|
||||||
return LimitReader(afile, length)
|
return LimitReader(afile, length)
|
||||||
else:
|
else:
|
||||||
return afile
|
return afile
|
||||||
|
@ -30,9 +30,9 @@
|
|||||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
|
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
|
||||||
' CDX N b a m s k r M S V g\\n'
|
' CDX N b a m s k r M S V g\\n'
|
||||||
|
|
||||||
#DecompressingBufferedReader readline() with decompression
|
#DecompressingBufferedReader readline() with decompression (zipnum file, no header)
|
||||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
>>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
||||||
' CDX N b a m s k r M S V g\\n'
|
'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\\n'
|
||||||
|
|
||||||
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
||||||
'Example Domain'
|
'Example Domain'
|
||||||
@ -60,7 +60,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
|||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
|
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
|
||||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||||
|
test_zip_dir = get_test_dir() + 'zipcdx/'
|
||||||
|
|
||||||
def read_multiple(reader, inc_reads):
|
def read_multiple(reader, inc_reads):
|
||||||
result = None
|
result = None
|
||||||
|
@ -171,7 +171,6 @@ def timestamp_to_datetime(string):
|
|||||||
# pad to 6 digits
|
# pad to 6 digits
|
||||||
string = _pad_timestamp(string, PAD_6)
|
string = _pad_timestamp(string, PAD_6)
|
||||||
|
|
||||||
|
|
||||||
def clamp(val, min_, max_):
|
def clamp(val, min_, max_):
|
||||||
try:
|
try:
|
||||||
val = int(val)
|
val = int(val)
|
||||||
|
3
pywb/utils/wbexception.py
Normal file
3
pywb/utils/wbexception.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
class WbException(Exception):
    """
    Exception that reports an HTTP status line via status().
    """
    def status(self):
        """Return the HTTP status line for this error."""
        return '500 Internal Server Error'
|
@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
|||||||
from pywb.utils.loaders import BlockLoader
|
from pywb.utils.loaders import BlockLoader
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||||
|
|
||||||
|
from pywb.utils.wbexception import WbException
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
|
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
|
||||||
'type, rec_headers, ' +
|
'type, rec_headers, ' +
|
||||||
@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord',
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ArchiveLoadFailed(Exception):
|
class ArchiveLoadFailed(WbException):
|
||||||
def __init__(self, reason, filename=''):
|
def __init__(self, reason, filename=''):
|
||||||
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
|
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
|
||||||
#self.filename = filename
|
#self.filename = filename
|
||||||
@ -62,9 +65,9 @@ class ArcWarcRecordLoader:
|
|||||||
decomp_type = 'gzip'
|
decomp_type = 'gzip'
|
||||||
|
|
||||||
# Create decompressing stream
|
# Create decompressing stream
|
||||||
stream = DecompressingBufferedReader(stream = raw,
|
stream = DecompressingBufferedReader(stream=raw,
|
||||||
decomp_type = decomp_type,
|
decomp_type=decomp_type,
|
||||||
block_size = self.block_size)
|
block_size=self.block_size)
|
||||||
|
|
||||||
(the_format, rec_headers) = self._detect_type_load_headers(stream)
|
(the_format, rec_headers) = self._detect_type_load_headers(stream)
|
||||||
|
|
||||||
|
@ -176,6 +176,6 @@ class ResolvingLoader:
|
|||||||
params = {'url': url,
|
params = {'url': url,
|
||||||
'closest': timestamp,
|
'closest': timestamp,
|
||||||
'filter': 'digest:' + digest,
|
'filter': 'digest:' + digest,
|
||||||
'output': 'raw'}
|
'output': 'cdxobject'}
|
||||||
|
|
||||||
return self.cdx_server.load_cdx(**params)
|
return self.cdx_server.load_cdx(**params)
|
||||||
|
124
pywb/wbapp.py
124
pywb/wbapp.py
@ -1,124 +0,0 @@
|
|||||||
from wbexceptions import WbException, NotFoundException, InternalRedirect
|
|
||||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
|
||||||
|
|
||||||
from pywb.cdx.cdxserver import CDXException
|
|
||||||
from pywb.utils.canonicalize import UrlCanonicalizeException
|
|
||||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
|
||||||
|
|
||||||
import os
|
|
||||||
import importlib
|
|
||||||
import logging
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
# adapted -from wsgiref.request_uri, but doesn't include domain name and allows all characters
|
|
||||||
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
|
|
||||||
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
|
|
||||||
def rel_request_uri(environ, include_query=1):
    """
    Return the requested path, optionally including the query string

    # Simple test:
    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
    '/web/example.com'

    # Test all unencoded special chars and double-quote
    # (double-quote must be encoded but not single quote)
    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
    "/web/example.com/0~!+$&'()*+,;=:%22"
    """
    # quote() lives in urllib on Python 2 and urllib.parse on Python 3
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    # PATH_INFO is percent-encoded except for the listed safe chars;
    # QUERY_STRING is appended as-is
    url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
    if include_query and environ.get('QUERY_STRING'):
        url += '?' + environ['QUERY_STRING']

    return url
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def create_wb_app(wb_router):
    """
    Wrap wb_router in a WSGI application function and return it.
    """

    # Top-level wsgi application
    def application(env, start_response):
        # rebuild the uri when SCRIPT_NAME is set or REQUEST_URI is
        # missing, otherwise use REQUEST_URI verbatim
        rebuild_uri = env.get('SCRIPT_NAME') or not env.get('REQUEST_URI')
        if rebuild_uri:
            rel_uri = rel_request_uri(env)
        else:
            rel_uri = env['REQUEST_URI']
        env['REL_REQUEST_URI'] = rel_uri

        try:
            response = wb_router(env)

            # a falsy result means no route matched
            if not response:
                msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
                raise NotFoundException(msg)

        except InternalRedirect as redirect:
            headers = StatusAndHeaders(redirect.status, redirect.httpHeaders)
            response = WbResponse(headers)

        except (WbException, CDXException,
                UrlCanonicalizeException, ArchiveLoadFailed) as known_err:
            # expected error types: no traceback
            response = handle_exception(env, wb_router.error_view,
                                        known_err, False)

        except Exception as err:
            # anything else: include traceback
            response = handle_exception(env, wb_router.error_view,
                                        err, True)

        return response(env, start_response)

    return application
|
|
||||||
|
|
||||||
|
|
||||||
def handle_exception(env, error_view, exc, print_trace):
    """
    Build an error response for an exception raised while routing.

    env         -- WSGI environ (currently unused)
    error_view  -- optional view; if truthy, its render_response() is
                   called with err_msg, err_details and status
    exc         -- the exception; if it has a status() method, that
                   supplies the status line, else '400 Bad Request'
    print_trace -- if true, print the traceback and pass it to the view
                   as err_details; otherwise log the message at INFO

    Must be called from inside the 'except' block handling exc when
    print_trace is set, since the traceback comes from the exception
    currently being handled.
    """
    if hasattr(exc, 'status'):
        status = exc.status()
    else:
        status = '400 Bad Request'

    if print_trace:
        import traceback
        # format_exc() takes an optional frame limit, not the exception:
        # passing exc breaks on Python 3
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(str(exc))
        err_details = None

    if error_view:
        return error_view.render_response(err_msg=str(exc),
                                          err_details=err_details,
                                          status=status)

    return WbResponse.text_response(status + ' Error: ' + str(exc),
                                    status=status)
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
DEFAULT_CONFIG_FILE = 'config.yaml'
|
|
||||||
|
|
||||||
def main():
    """
    Create the WSGI application from a config module.

    The module named by the PYWB_CONFIG_MODULE environment variable
    (default 'pywb.pywb_init') is imported and its pywb_config() result
    is passed to create_wb_app(). Any Exception is logged with a
    traceback and re-raised.
    """
    try:
        logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)

        # see if there's a custom init module
        config_name = os.environ.get('PYWB_CONFIG_MODULE')

        if not config_name:
            # use default module
            config_name = 'pywb.pywb_init'
            logging.info('Loading from default config module "{0}"'.format(config_name))
            logging.info('')

        config_module = importlib.import_module(config_name)
        wb_app = create_wb_app(config_module.pywb_config())

        logging.info('')
        logging.info('*** pywb inited with settings from {0}.pywb_config()!\n'.format(config_name))

    except Exception:
        logging.exception('*** pywb could not init with settings from {0}.pywb_config()!\n'.format(config_name))
        raise

    else:
        return wb_app
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
if __name__ == "__main__":
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
application = main()
|
|
4
run.sh
4
run.sh
@ -10,14 +10,14 @@ mypath=$(cd `dirname $0` && pwd)
|
|||||||
# ex: my_pywb.pywb_config()
|
# ex: my_pywb.pywb_config()
|
||||||
#export 'PYWB_CONFIG=my_pywb'
|
#export 'PYWB_CONFIG=my_pywb'
|
||||||
|
|
||||||
app="pywb.wbapp"
|
app="pywb.apps.wayback"
|
||||||
|
|
||||||
params="--http-socket :8080 -b 65536"
|
params="--http-socket :8080 -b 65536"
|
||||||
#params="--static-map /static=$mypath/static --http-socket :8080 -b 65536"
|
#params="--static-map /static=$mypath/static --http-socket :8080 -b 65536"
|
||||||
|
|
||||||
if [ -z "$1" ]; then
|
if [ -z "$1" ]; then
|
||||||
# Standard root config
|
# Standard root config
|
||||||
params="$params --wsgi pywb.wbapp"
|
params="$params --wsgi $app"
|
||||||
else
|
else
|
||||||
# run with --mount
|
# run with --mount
|
||||||
# requires a file not a package, so creating a mount_run.py to load the package
|
# requires a file not a package, so creating a mount_run.py to load the package
|
||||||
|
Binary file not shown.
10
setup.py
10
setup.py
@ -14,7 +14,14 @@ setup(
|
|||||||
license='GPL',
|
license='GPL',
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
provides=[
|
provides=[
|
||||||
'pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'
|
'pywb',
|
||||||
|
'pywb.utils',
|
||||||
|
'pywb.cdx',
|
||||||
|
'pywb.warc',
|
||||||
|
'pywb.rewrite',
|
||||||
|
'pywb.framework',
|
||||||
|
'pywb.core',
|
||||||
|
'pywb.apps'
|
||||||
],
|
],
|
||||||
package_data={
|
package_data={
|
||||||
'pywb': ['ui/*', 'static/*', '*.yaml'],
|
'pywb': ['ui/*', 'static/*', '*.yaml'],
|
||||||
@ -34,7 +41,6 @@ setup(
|
|||||||
'pyyaml',
|
'pyyaml',
|
||||||
'WebTest',
|
'WebTest',
|
||||||
'pytest',
|
'pytest',
|
||||||
'werkzeug>=0.9.4',
|
|
||||||
],
|
],
|
||||||
# tests_require=['WebTest', 'pytest'],
|
# tests_require=['WebTest', 'pytest'],
|
||||||
zip_safe=False
|
zip_safe=False
|
||||||
|
@ -90,6 +90,9 @@ enable_http_proxy: true
|
|||||||
# enable cdx server api for querying cdx directly (experimental)
|
# enable cdx server api for querying cdx directly (experimental)
|
||||||
enable_cdx_api: true
|
enable_cdx_api: true
|
||||||
|
|
||||||
|
# test different port
|
||||||
|
port: 9000
|
||||||
|
|
||||||
# optional reporter callback func
|
# optional reporter callback func
|
||||||
# if set, called with request and cdx object
|
# if set, called with request and cdx object
|
||||||
reporter: !!python/object/new:tests.fixture.PrintReporter []
|
reporter: !!python/object/new:tests.fixture.PrintReporter []
|
||||||
|
@ -1,32 +1,26 @@
|
|||||||
import os
|
|
||||||
import re
|
import re
|
||||||
|
import webtest
|
||||||
|
|
||||||
import pytest
|
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
|
|
||||||
from werkzeug.test import Client
|
|
||||||
from werkzeug.wrappers import BaseResponse, Response
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
from pywb.cdx.wsgi_cdxserver import create_app
|
from pywb.apps.cdx_server import application
|
||||||
|
|
||||||
from tests.fixture import testconfig
|
import pytest
|
||||||
|
|
||||||
|
#================================================================
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def client(testconfig):
|
def client():
|
||||||
app = create_app(testconfig)
|
return webtest.TestApp(application)
|
||||||
return Client(app, Response)
|
|
||||||
|
|
||||||
# ================================================================
|
|
||||||
|
|
||||||
def query(client, url, **params):
|
#================================================================
|
||||||
|
def query(client, url, is_error=False, **params):
|
||||||
params['url'] = url
|
params['url'] = url
|
||||||
return client.get('/cdx?' + urlencode(params, doseq=1))
|
return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
|
||||||
|
|
||||||
# ================================================================
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_exact_url(client):
|
def test_exact_url(client):
|
||||||
"""
|
"""
|
||||||
basic exact match, no filters, etc.
|
basic exact match, no filters, etc.
|
||||||
@ -34,48 +28,54 @@ def test_exact_url(client):
|
|||||||
resp = query(client, 'http://www.iana.org/')
|
resp = query(client, 'http://www.iana.org/')
|
||||||
|
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
print resp.data
|
print resp.body
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_prefix_match(client):
|
def test_prefix_match(client):
|
||||||
"""
|
"""
|
||||||
prefix match test
|
prefix match test
|
||||||
"""
|
"""
|
||||||
resp = query(client, 'http://www.iana.org/', matchType='prefix')
|
resp = query(client, 'http://www.iana.org/', matchType='prefix')
|
||||||
|
|
||||||
print resp.data.splitlines()
|
print resp.body.splitlines()
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
|
|
||||||
suburls = 0
|
suburls = 0
|
||||||
for l in resp.data.splitlines():
|
for l in resp.body.splitlines():
|
||||||
fields = l.split(' ')
|
fields = l.split(' ')
|
||||||
if len(fields[0]) > len('org,iana)/'):
|
if len(fields[0]) > len('org,iana)/'):
|
||||||
suburls += 1
|
suburls += 1
|
||||||
assert suburls > 0
|
assert suburls > 0
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_filters(client):
|
def test_filters(client):
|
||||||
"""
|
"""
|
||||||
filter cdxes by mimetype and filename field, exact match.
|
filter cdxes by mimetype and filename field, exact match.
|
||||||
"""
|
"""
|
||||||
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
||||||
filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz'))
|
filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz'))
|
||||||
|
|
||||||
assert resp.status_code == 200
|
|
||||||
assert resp.mimetype == 'text/plain'
|
|
||||||
|
|
||||||
for l in resp.data.splitlines():
|
assert resp.status_code == 200
|
||||||
|
assert resp.content_type == 'text/plain'
|
||||||
|
|
||||||
|
for l in resp.body.splitlines():
|
||||||
fields = l.split(' ')
|
fields = l.split(' ')
|
||||||
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||||
assert fields[3] == 'warc/revisit'
|
assert fields[3] == 'warc/revisit'
|
||||||
assert fields[10] == 'dupes.warc.gz'
|
assert fields[10] == 'dupes.warc.gz'
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_limit(client):
|
def test_limit(client):
|
||||||
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
|
||||||
limit='1')
|
limit='1')
|
||||||
|
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert resp.mimetype == 'text/plain'
|
assert resp.content_type == 'text/plain'
|
||||||
|
|
||||||
cdxes = resp.data.splitlines()
|
cdxes = resp.body.splitlines()
|
||||||
assert len(cdxes) == 1
|
assert len(cdxes) == 1
|
||||||
fields = cdxes[0].split(' ')
|
fields = cdxes[0].split(' ')
|
||||||
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||||
@ -86,15 +86,17 @@ def test_limit(client):
|
|||||||
limit='1', reverse='1')
|
limit='1', reverse='1')
|
||||||
|
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert resp.mimetype == 'text/plain'
|
assert resp.content_type == 'text/plain'
|
||||||
|
|
||||||
cdxes = resp.data.splitlines()
|
cdxes = resp.body.splitlines()
|
||||||
assert len(cdxes) == 1
|
assert len(cdxes) == 1
|
||||||
fields = cdxes[0].split(' ')
|
fields = cdxes[0].split(' ')
|
||||||
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
|
||||||
assert fields[1] == '20140127171239'
|
assert fields[1] == '20140127171239'
|
||||||
assert fields[3] == 'warc/revisit'
|
assert fields[3] == 'warc/revisit'
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_fields(client):
|
def test_fields(client):
|
||||||
"""
|
"""
|
||||||
retrieve subset of fields with ``fields`` parameter.
|
retrieve subset of fields with ``fields`` parameter.
|
||||||
@ -104,7 +106,7 @@ def test_fields(client):
|
|||||||
|
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
|
|
||||||
cdxes = resp.data.splitlines()
|
cdxes = resp.body.splitlines()
|
||||||
|
|
||||||
for cdx in cdxes:
|
for cdx in cdxes:
|
||||||
fields = cdx.split(' ')
|
fields = cdx.split(' ')
|
||||||
@ -113,16 +115,21 @@ def test_fields(client):
|
|||||||
assert re.match(r'\d{14}$', fields[1])
|
assert re.match(r'\d{14}$', fields[1])
|
||||||
assert re.match(r'\d{3}|-', fields[2])
|
assert re.match(r'\d{3}|-', fields[2])
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_fields_undefined(client):
|
def test_fields_undefined(client):
|
||||||
"""
|
"""
|
||||||
server shall respond with Bad Request (TODO: with proper explanation),
|
server shall respond with Bad Request and name of undefined
|
||||||
when ``fields`` parameter contains undefined name(s).
|
when ``fields`` parameter contains undefined name(s).
|
||||||
"""
|
"""
|
||||||
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||||
|
is_error=True,
|
||||||
fields='urlkey,nosuchfield')
|
fields='urlkey,nosuchfield')
|
||||||
|
|
||||||
resp.status_code == 400
|
resp.status_code == 400
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_resolveRevisits(client):
|
def test_resolveRevisits(client):
|
||||||
"""
|
"""
|
||||||
with ``resolveRevisits=true``, server adds three fields pointing to
|
with ``resolveRevisits=true``, server adds three fields pointing to
|
||||||
@ -132,9 +139,9 @@ def test_resolveRevisits(client):
|
|||||||
resolveRevisits='true'
|
resolveRevisits='true'
|
||||||
)
|
)
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert resp.mimetype == 'text/plain'
|
assert resp.content_type == 'text/plain'
|
||||||
|
|
||||||
cdxes = resp.data.splitlines()
|
cdxes = resp.body.splitlines()
|
||||||
originals = {}
|
originals = {}
|
||||||
for cdx in cdxes:
|
for cdx in cdxes:
|
||||||
fields = cdx.split(' ')
|
fields = cdx.split(' ')
|
||||||
@ -151,6 +158,8 @@ def test_resolveRevisits(client):
|
|||||||
orig = originals.get(sha)
|
orig = originals.get(sha)
|
||||||
assert orig == (int(orig_size), int(orig_offset), orig_fn)
|
assert orig == (int(orig_size), int(orig_offset), orig_fn)
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_resolveRevisits_orig_fields(client):
|
def test_resolveRevisits_orig_fields(client):
|
||||||
"""
|
"""
|
||||||
when resolveRevisits=true, extra three fields are named
|
when resolveRevisits=true, extra three fields are named
|
||||||
@ -162,9 +171,9 @@ def test_resolveRevisits_orig_fields(client):
|
|||||||
fields='urlkey,orig.length,orig.offset,orig.filename'
|
fields='urlkey,orig.length,orig.offset,orig.filename'
|
||||||
)
|
)
|
||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert resp.mimetype == 'text/plain'
|
assert resp.content_type == 'text/plain'
|
||||||
|
|
||||||
cdxes = resp.data.splitlines()
|
cdxes = resp.body.splitlines()
|
||||||
for cdx in cdxes:
|
for cdx in cdxes:
|
||||||
fields = cdx.split(' ')
|
fields = cdx.split(' ')
|
||||||
assert len(fields) == 4
|
assert len(fields) == 4
|
||||||
@ -172,6 +181,8 @@ def test_resolveRevisits_orig_fields(client):
|
|||||||
assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or
|
assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or
|
||||||
(int(orig_len), int(orig_offset), orig_fn))
|
(int(orig_len), int(orig_offset), orig_fn))
|
||||||
|
|
||||||
|
|
||||||
|
#================================================================
|
||||||
def test_collapseTime_resolveRevisits_reverse(client):
|
def test_collapseTime_resolveRevisits_reverse(client):
|
||||||
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
|
||||||
collapseTime='11',
|
collapseTime='11',
|
||||||
@ -179,11 +190,10 @@ def test_collapseTime_resolveRevisits_reverse(client):
|
|||||||
reverse='true'
|
reverse='true'
|
||||||
)
|
)
|
||||||
|
|
||||||
cdxes = [CDXObject(l) for l in resp.data.splitlines()]
|
cdxes = [CDXObject(l) for l in resp.body.splitlines()]
|
||||||
|
|
||||||
assert len(cdxes) == 3
|
assert len(cdxes) == 3
|
||||||
|
|
||||||
# timestamp is in descending order
|
# timestamp is in descending order
|
||||||
for i in range(len(cdxes) - 1):
|
for i in range(len(cdxes) - 1):
|
||||||
assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp']
|
assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp']
|
||||||
|
|
@ -1,6 +1,6 @@
|
|||||||
import webtest
|
import webtest
|
||||||
from pywb.pywb_init import pywb_config
|
from pywb.core.pywb_init import create_wb_router
|
||||||
from pywb.wbapp import create_wb_app
|
from pywb.framework.wsgi_wrappers import init_app
|
||||||
from pywb.cdx.cdxobject import CDXObject
|
from pywb.cdx.cdxobject import CDXObject
|
||||||
|
|
||||||
from fixture import TestExclusionPerms
|
from fixture import TestExclusionPerms
|
||||||
@ -11,8 +11,13 @@ class TestWb:
|
|||||||
def setup(self):
|
def setup(self):
|
||||||
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
|
||||||
# save it in self - useful for debugging
|
# save it in self - useful for debugging
|
||||||
self.router = pywb_config(self.TEST_CONFIG)
|
self.app = init_app(create_wb_router,
|
||||||
self.app = create_wb_app(self.router)
|
load_yaml=True,
|
||||||
|
config_file=self.TEST_CONFIG)
|
||||||
|
|
||||||
|
#self.router = pywb_config(self.TEST_CONFIG)
|
||||||
|
#self.app = create_wb_app(self.router)
|
||||||
|
|
||||||
self.testapp = webtest.TestApp(self.app)
|
self.testapp = webtest.TestApp(self.app)
|
||||||
|
|
||||||
def _assert_basic_html(self, resp):
|
def _assert_basic_html(self, resp):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user