1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Merge pull request #28 from ikreymer/pkg-reorg

pywb pkg refactoring: create pywb.framework, pywb.core and pywb.apps
This commit is contained in:
ikreymer 2014-03-03 12:04:12 -08:00
commit 5a28bc6992
45 changed files with 759 additions and 660 deletions

0
pywb/apps/__init__.py Normal file
View File

17
pywb/apps/cdx_server.py Normal file
View File

@ -0,0 +1,17 @@
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.core.cdx_handler import create_cdx_server_app
#=================================================================
# init cdx server app
#=================================================================
# Config file handed to init_app for the stand-alone cdx server
DEFAULT_CONFIG = 'pywb/cdx/config.yaml'

# Module-level WSGI callable, created once when this module is imported
application = init_app(
    create_cdx_server_app,
    load_yaml=True,
    config_file=DEFAULT_CONFIG,
)

# When executed directly as a script, hand the app to start_wsgi_server
if __name__ == "__main__":
    start_wsgi_server(application)

10
pywb/apps/wayback.py Normal file
View File

@ -0,0 +1,10 @@
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.core.pywb_init import create_wb_router
#=================================================================
# init pywb app
#=================================================================
# Module-level WSGI callable, created once when this module is imported
application = init_app(
    create_wb_router,
    load_yaml=True,
)

# When executed directly as a script, hand the app to start_wsgi_server
if __name__ == "__main__":
    start_wsgi_server(application)

View File

@ -9,6 +9,7 @@ from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from query import CDXQuery
#=================================================================
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
"""

View File

@ -4,9 +4,11 @@ import itertools
from urllib import urlencode
from urlparse import parse_qs
from pywb.utils.wbexception import WbException
#=================================================================
class CDXException(Exception):
class CDXException(WbException):
def status(self):
return '400 Bad Request'
@ -61,7 +63,7 @@ class CDXObject(OrderedDict):
cdxformat = i
if not cdxformat:
raise Exception('unknown {0}-field cdx format'.format(len(fields)))
raise CDXException('unknown {0}-field cdx format'.format(len(fields)))
for header, field in itertools.izip(cdxformat, fields):
self[header] = field
@ -85,8 +87,15 @@ class CDXObject(OrderedDict):
"""
if fields is None:
return str(self) + '\n'
else:
return ' '.join(self[x] for x in fields) + '\n'
try:
result = ' '.join(self[x] for x in fields) + '\n'
except KeyError as ke:
msg = 'Invalid field "{0}" found in fields= argument'
msg = msg.format(ke.message)
raise CDXException(msg)
return result
def __str__(self):
if self.cdxline:
@ -109,7 +118,7 @@ class IDXObject(OrderedDict):
if len(fields) < self.NUM_REQ_FIELDS:
msg = 'invalid idx format: {0} fields found, {1} required'
raise Exception(msg.format(len(fields), self.NUM_REQ_FIELDS))
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
for header, field in itertools.izip(self.FORMAT, fields):
self[header] = field

View File

@ -31,8 +31,18 @@ def cdx_load(sources, query, perms_checker=None, process=True):
if perms_checker:
cdx_iter = restrict_cdx(cdx_iter, query, perms_checker)
if query.output == 'text':
cdx_iter = cdx_to_text(cdx_iter, query.fields)
return cdx_iter
#=================================================================
def cdx_to_text(cdx_iter, fields):
    """
    Lazily convert each cdx object of an iterator to its text form

    :param cdx_iter: iterable of objects providing ``to_text(fields)``
    :param fields: passed through unchanged to every ``to_text`` call
    :return: generator yielding one ``to_text`` result per input item
    """
    for entry in cdx_iter:
        text_line = entry.to_text(fields)
        yield text_line
#=================================================================
def restrict_cdx(cdx_iter, query, perms_checker):
"""
@ -56,6 +66,7 @@ def restrict_cdx(cdx_iter, query, perms_checker):
yield cdx
#=================================================================
def process_cdx(cdx_iter, query):
if query.resolve_revisits:
@ -255,7 +266,6 @@ def cdx_resolve_revisits(cdx_iter):
originals = {}
for cdx in cdx_iter:
is_revisit = cdx.is_revisit()
digest = cdx['digest']

View File

@ -126,14 +126,19 @@ class CDXServer(BaseCDXServer):
logging.warn('No CDX Sources configured from paths=%s', paths)
def _add_cdx_source(self, source):
if source is None: return
if source is None:
return
logging.debug('Adding CDX Source: %s', source)
self.sources.append(source)
def add_cdx_source(self, source, config):
if source is None: return
if source is None:
return
if isinstance(source, CDXSource):
self._add_cdx_source(source)
elif isinstance(source, str):
if os.path.isdir(source):
for fn in os.listdir(source):
@ -213,5 +218,3 @@ def create_cdx_server(config, ds_rules_file=None):
surt_ordered=surt_ordered,
ds_rules_file=ds_rules_file,
perms_checker=perms_checker)

View File

@ -8,6 +8,7 @@ import urllib
import urllib2
import itertools
#=================================================================
class CDXSource(object):
"""
@ -92,7 +93,6 @@ class RedisCDXSource(CDXSource):
if config:
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, query):
"""
Load cdx from redis cache, from an ordered list

View File

@ -1,5 +1,6 @@
from urllib import urlencode
from urlparse import parse_qs
from cdxobject import CDXException
#=================================================================
@ -62,6 +63,9 @@ class CDXQuery(object):
@property
def fields(self):
    """
    List of requested output field names, or None if not specified

    Reads the comma-separated ``fields`` param, falling back to the
    older ``fl`` param name when ``fields`` is missing or empty.
    """
    raw = self.params.get('fields') or self.params.get('fl')
    if not raw:
        return None
    return raw.split(',')
@property
@ -105,9 +109,6 @@ class CDXQuery(object):
"""
params = parse_qs(env['QUERY_STRING'])
if not 'output' in params:
params['output'] = 'text'
# parse_qs produces arrays for single values
# cdx processing expects singleton params for all params,
# except filters, so convert here
@ -116,4 +117,8 @@ class CDXQuery(object):
if name != 'filter':
params[name] = val[0]
if not 'output' in params:
params['output'] = 'text'
return params

View File

@ -187,6 +187,7 @@ import pytest
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
kwparams['url'] = url
kwparams['output'] = 'cdxobject'
fields = kwparams.get('fields')
if fields:
fields = fields.split(',')

View File

@ -1,15 +0,0 @@
import webtest
from pywb.cdx.wsgi_cdxserver import create_app
from pywb import get_test_dir
class TestCdx:
    """
    Smoke test issuing one cdx query against the app built by create_app
    """
    def setup(self):
        # build the app from the 'cdx/' directory under the test dir
        cdx_dir = get_test_dir() + 'cdx/'
        self.app = create_app(cdx_dir)
        self.testapp = webtest.TestApp(self.app)

    def test_cdx(self):
        query_url = '/cdx?url=http://www.iana.org/_css/2013.1/screen.css'
        resp = self.testapp.get(query_url)

        # expect a non-empty plain text response
        assert resp.content_type == 'text/plain'
        assert resp.content_length > 0

View File

@ -1,103 +0,0 @@
from werkzeug.wrappers import BaseResponse
from cdxserver import create_cdx_server
from pywb import get_test_dir
from query import CDXQuery
import logging
import os
import yaml
import pkg_resources
#=================================================================
# Name of the yaml config read as a package resource when run as a script
CONFIG_FILE = 'config.yaml'
# Rules file name handed to WSGICDXServer by create_app()
RULES_FILE = 'rules.yaml'
# Port used when neither the -p option nor the config provides one
DEFAULT_PORT = 8080
#=================================================================
class CDXQueryRequest(object):
    """
    Request wrapper exposing the cdx query built from a wsgi environ
    as the ``query`` attribute
    """
    def __init__(self, environ):
        parsed_query = CDXQuery.from_wsgi_env(environ)
        self.query = parsed_query
class WSGICDXServer(object):
    """
    WSGI callable which runs a cdx query for each request and
    returns the result as a plain text response
    """
    def __init__(self, config, rules_file):
        self.cdxserver = create_cdx_server(config, rules_file)

    def __call__(self, environ, start_response):
        cdx_query = CDXQueryRequest(environ).query
        try:
            logging.debug('request.args=%s', cdx_query)
            cdx_iter = self.cdxserver.load_cdx_query(cdx_query)

            # TODO: select response type by "output" parameter
            text_response = PlainTextResponse(cdx_iter, cdx_query.fields)
            return text_response(environ, start_response)
        except Exception as exc:
            # log with traceback, then report the error text to the client
            logging.error('load_cdx failed', exc_info=1)
            # TODO: error response should be different for each response
            # type
            error_headers = [('Content-Type', 'text/plain')]
            start_response('400 Error', error_headers)
            return [str(exc)]
def cdx_text_out(cdx, fields):
    """
    Format a single cdx entry as one line of text

    :param cdx: mapping of field name to string value; ``str(cdx)`` is
                used as-is when no fields are requested
    :param fields: sequence of field names to output, or a falsy value
                   to output the whole entry
    :return: the text line, terminated by a newline
    :raises KeyError: if ``fields`` names a field missing from ``cdx``
    """
    if not fields:
        return str(cdx) + '\n'

    # keys must be called: logging the bare attribute only printed the
    # method object rather than the available field names
    logging.info('cdx fields=%s', cdx.keys())
    # TODO: this will result in an exception if fields contain
    # non-existent field name.
    return ' '.join(cdx[x] for x in fields) + '\n'
class PlainTextResponse(BaseResponse):
    """
    Response whose body is produced lazily by calling ``to_text(fields)``
    on each item of the cdx iterator
    """
    def __init__(self, cdxitr, fields, status=200, content_type='text/plain'):
        text_lines = (cdx.to_text(fields) for cdx in cdxitr)
        super(PlainTextResponse, self).__init__(
            response=text_lines,
            status=status,
            content_type=content_type)
# class JsonResponse(Response):
# pass
# class MementoResponse(Response):
# pass
def create_app(config=None):
    """
    Configure debug logging and build the WSGICDXServer application

    :param config: server config; when falsy, a config whose
                   ``index_paths`` is the 'cdx/' dir under the test dir
                   is used instead
    :return: a WSGICDXServer created with the config and RULES_FILE
    """
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)

    # get_test_dir() is only consulted when no config was supplied
    config = config or {'index_paths': get_test_dir() + 'cdx/'}

    return WSGICDXServer(config, RULES_FILE)
if __name__ == "__main__":
    # script mode: parse options, load the packaged config, run a dev server
    from optparse import OptionParser
    from werkzeug.serving import run_simple

    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)
    options, args = opt.parse_args()

    # NOTE(review): yaml.load is applied to a packaged resource here;
    # confirm this data is never user-supplied
    configdata = pkg_resources.resource_string(__name__, CONFIG_FILE)
    config = yaml.load(configdata)

    # precedence: command line option, then config 'port', then default
    if options.port is not None:
        port = options.port
    else:
        port = (config and config.get('port')) or DEFAULT_PORT

    app = create_app(config)

    logging.debug('Starting CDX Server on port %s', port)
    try:
        run_simple('0.0.0.0', port, app, use_reloader=True, use_debugger=True)
    except KeyboardInterrupt:
        # Ctrl-C just falls through to the shutdown message
        pass
    logging.debug('Stopping CDX Server')
else:
    # imported (e.g. by a wsgi container): expose a default application
    # XXX pass production config
    application = create_app()

View File

@ -1,56 +0,0 @@
import views
import handlers
import replay_views
import logging
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
#=================================================================
# Config Loading
#=================================================================
def load_template_file(file, desc = None, view_class = views.J2TemplateView):
    """
    Wrap a template path in a view object

    :param file: template path/name, or a falsy value when none is set
    :param desc: optional label used only in the debug log message
    :param view_class: callable invoked with ``file`` to build the view
    :return: ``view_class(file)`` when ``file`` is truthy, otherwise
             ``file`` unchanged
    """
    if file:
        # the previous fallback referenced an undefined ``name`` and raised
        # NameError whenever desc was omitted; use a generic label instead
        logging.debug('Adding {0}: {1}'.format(desc if desc else 'Template', file))
        file = view_class(file)

    return file
#=================================================================
def create_wb_handler(cdx_server, config, ds_rules_file=None):
    """
    Assemble a WBHandler (record loading, replay and query views)
    from the settings found in config

    :param cdx_server: index source given to both the resolving loader
                       and the resulting handler
    :param config: object providing ``get(key[, default])``
    :param ds_rules_file: passed to RewriteContent
    :return: the constructed handlers.WBHandler
    """
    cookie_maker = config.get('cookie_maker')
    record_loader = ArcWarcRecordLoader(cookie_maker=cookie_maker)

    resolving_loader = ResolvingLoader(paths=config.get('archive_paths'),
                                       cdx_server=cdx_server,
                                       record_loader=record_loader)

    content_rewriter = RewriteContent(ds_rules_file=ds_rules_file)
    head_insert_view = load_template_file(config.get('head_insert_html'),
                                          'Head Insert')

    replayer = replay_views.ReplayView(
        content_loader=resolving_loader,
        content_rewriter=content_rewriter,
        head_insert_view=head_insert_view,
        buffer_response=config.get('buffer_response', True),
        redir_to_exact=config.get('redir_to_exact', True),
        reporter=config.get('reporter'),
    )

    html_view = load_template_file(config.get('query_html'),
                                   'Captures Page',
                                   views.J2HtmlCapturesView)
    search_view = load_template_file(config.get('search_html'),
                                     'Search Page')

    return handlers.WBHandler(cdx_server,
                              replayer,
                              html_view=html_view,
                              search_view=search_view)

0
pywb/core/__init__.py Normal file
View File

43
pywb/core/cdx_handler.py Normal file
View File

@ -0,0 +1,43 @@
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxserver import create_cdx_server
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.basehandlers import BaseHandler
from views import TextCapturesView
#=================================================================
class CDXHandler(BaseHandler):
    """
    Handler which extracts cdx query params from the wsgi env,
    loads the matching cdx lines from the index reader and
    renders them through a view (text captures view by default)
    """
    def __init__(self, index_reader, view=None):
        self.index_reader = index_reader
        # fall back to a TextCapturesView when no view is supplied
        self.view = view or TextCapturesView()

    def __call__(self, wbrequest):
        query_params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
        cdx_iter = self.index_reader.load_cdx(**query_params)
        return self.view.render_response(wbrequest, cdx_iter)

    def __str__(self):
        reader_desc = str(self.index_reader)
        return 'CDX Handler: ' + reader_desc
#=================================================================
# rules file handed to create_cdx_server by create_cdx_server_app
DEFAULT_RULES = 'pywb/rules.yaml'

#=================================================================
def create_cdx_server_app(config):
    """
    Build a router exposing a cdx server, to be wrapped in a wsgi app

    A single route, 'cdx', is registered; the optional 'port' config
    value is forwarded to the router.
    TODO: more complex example with multiple collections?
    """
    server = create_cdx_server(config, DEFAULT_RULES)
    port = config.get('port')

    cdx_route = Route('cdx', CDXHandler(server))
    return ArchivalRouter([cdx_route], port=port)

View File

@ -1,30 +1,13 @@
import urlparse
import pkgutil
import mimetypes
import time
from pywb.rewrite.wburl import WbUrl
from pywb.cdx.query import CDXQuery
from wbrequestresponse import WbResponse
from wbexceptions import WbException, NotFoundException
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.wbexceptions import WbException, NotFoundException
from views import TextCapturesView
#=================================================================
class BaseHandler(object):
def __call__(self, wbrequest):
return wbrequest
def get_wburl_type(self):
return None
#=================================================================
class WbUrlHandler(BaseHandler):
def get_wburl_type(self):
return WbUrl
#=================================================================
# Standard WB Handler
#=================================================================
@ -33,11 +16,15 @@ class WBHandler(WbUrlHandler):
html_view=None, search_view=None):
self.index_reader = index_reader
self.replay = replay
self.text_view = TextCapturesView()
self.text_query_view = TextCapturesView()
self.query_view = html_view
if not self.query_view:
self.query_view = text_query_view
self.html_view = html_view
self.search_view = search_view
def __call__(self, wbrequest):
@ -49,11 +36,10 @@ class WBHandler(WbUrlHandler):
# new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_':
return self.text_view.render_response(wbrequest, cdx_lines)
return self.text_query_view.render_response(wbrequest, cdx_lines)
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
query_view = self.html_view if self.html_view else self.text_view
return query_view.render_response(wbrequest, cdx_lines)
return self.query_view.render_response(wbrequest, cdx_lines)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines)
@ -70,29 +56,11 @@ class WBHandler(WbUrlHandler):
return 'WBHandler: ' + str(self.index_reader) + ', ' + str(self.replay)
#=================================================================
# CDX-Server Handler -- pass all params to cdx server
#=================================================================
class CDXHandler(BaseHandler):
def __init__(self, index_reader, view = None):
self.index_reader = index_reader
self.view = view if view else TextCapturesView()
def __call__(self, wbrequest):
params = CDXQuery.extract_params_from_wsgi_env(wbrequest.env)
cdx_lines = self.index_reader.load_cdx(**params)
return self.view.render_response(wbrequest, cdx_lines)
def __str__(self):
return 'Index Reader: ' + str(self.index_reader)
#=================================================================
# Static Content Handler
#=================================================================
class StaticHandler(BaseHandler):
def __init__(self, static_path, pkg = __package__):
def __init__(self, static_path, pkg = 'pywb'):
mimetypes.init()
self.static_path = static_path

View File

@ -29,6 +29,7 @@ class IndexReader(object):
params.update(wbrequest.custom_params)
params['allowFuzzy'] = True
params['output'] = 'cdxobject'
cdxlines = self.load_cdx(url=wburl.url, **params)

181
pywb/core/pywb_init.py Normal file
View File

@ -0,0 +1,181 @@
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.proxy import ProxyArchivalRouter
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from indexreader import IndexReader
from views import J2TemplateView, J2HtmlCapturesView
from replay_views import ReplayView
from handlers import WBHandler
from handlers import StaticHandler
from cdx_handler import CDXHandler
from handlers import DebugEchoHandler, DebugEchoEnvHandler
import os
import yaml
import logging
#=================================================================
# Fallback settings, consulted by create_wb_router for any key
# missing from the passed-in config
DEFAULTS = dict(
    hostpaths=['http://localhost:8080'],

    collections={'pywb': './sample_archive/cdx/'},
    archive_paths='./sample_archive/warcs/',

    head_insert_html='ui/head_insert.html',
    query_html='ui/query.html',
    search_html='ui/search.html',
    home_html='ui/index.html',
    error_html='ui/error.html',

    static_routes={'static/default': 'static/'},

    domain_specific_rules='pywb/rules.yaml',
)
#=================================================================
class DictChain:
    """
    Read-only lookup across several dict-like objects, in order:
    the first value which is not None wins
    """
    def __init__(self, *dicts):
        self.dicts = dicts

    def get(self, key, default_val=None):
        """
        Return the first non-None value stored under key in any of the
        chained dicts, or default_val if there is none
        """
        candidates = (source.get(key) for source in self.dicts)
        # lazily stop at the first dict which yields a real value
        return next((found for found in candidates if found is not None),
                    default_val)
#=================================================================
def load_template_file(file, desc=None, view_class=J2TemplateView):
    """
    Wrap a template path in a view object

    :param file: template path/name, or a falsy value when none is set
    :param desc: optional label used only in the debug log message
    :param view_class: callable invoked with ``file`` to build the view
    :return: ``view_class(file)`` when ``file`` is truthy, otherwise
             ``file`` unchanged
    """
    if file:
        # the previous fallback referenced an undefined ``name`` and raised
        # NameError whenever desc was omitted; use a generic label instead
        logging.debug('Adding {0}: {1}'.format(desc if desc else 'Template', file))
        file = view_class(file)

    return file
#=================================================================
def create_wb_handler(cdx_server, config, ds_rules_file=None):
    """
    Assemble a WBHandler (record loading, replay and query views)
    from the settings found in config

    :param cdx_server: index source given to both the resolving loader
                       and the resulting handler
    :param config: object providing ``get(key[, default])``
    :param ds_rules_file: passed to RewriteContent
    :return: the constructed WBHandler
    """
    record_loader = ArcWarcRecordLoader(
        cookie_maker=config.get('cookie_maker'))

    resolving_loader = ResolvingLoader(paths=config.get('archive_paths'),
                                       cdx_server=cdx_server,
                                       record_loader=record_loader)

    # collect the replay settings first, then build the view in one call
    replay_settings = dict(
        content_loader=resolving_loader,
        head_insert_view=load_template_file(config.get('head_insert_html'),
                                            'Head Insert'),
        content_rewriter=RewriteContent(ds_rules_file=ds_rules_file),
        buffer_response=config.get('buffer_response', True),
        redir_to_exact=config.get('redir_to_exact', True),
        reporter=config.get('reporter'),
    )
    replayer = ReplayView(**replay_settings)

    return WBHandler(
        cdx_server,
        replayer,
        html_view=load_template_file(config.get('query_html'),
                                     'Captures Page',
                                     J2HtmlCapturesView),
        search_view=load_template_file(config.get('search_html'),
                                       'Search Page'),
    )
#=================================================================
def create_wb_router(passed_config=None):
    """
    Build the top-level router for the wayback application

    Settings are looked up in passed_config first and fall back to
    DEFAULTS. One route is created per entry of 'collections' (plus an
    optional '<name>-cdx' route when 'enable_cdx_api' is set), followed
    by optional debug echo routes and the static routes.

    :param passed_config: dict-like config overriding DEFAULTS;
                          None or omitted means DEFAULTS only
    :return: a ProxyArchivalRouter when 'enable_http_proxy' is set,
             otherwise an ArchivalRouter
    """
    # avoid a shared mutable default argument; also tolerates explicit None
    if passed_config is None:
        passed_config = {}

    config = DictChain(passed_config, DEFAULTS)

    routes = []

    hostpaths = config.get('hostpaths')

    port = config.get('port')

    # collections based on cdx source
    collections = config.get('collections')

    for name, value in collections.iteritems():
        # shorthand: a plain string is treated as the index path(s)
        if isinstance(value, str):
            value = {'index_paths': value}

        # per-collection settings override the global ones
        route_config = DictChain(value, config)

        ds_rules_file = route_config.get('domain_specific_rules', None)
        cdx_server = IndexReader(route_config, ds_rules_file)

        wb_handler = create_wb_handler(
            cdx_server=cdx_server,
            config=route_config,
            ds_rules_file=ds_rules_file,
        )

        logging.debug('Adding Collection: ' + name)

        route_class = route_config.get('route_class', Route)

        routes.append(route_class(name, wb_handler, config=route_config))

        # cdx query handler
        if route_config.get('enable_cdx_api', False):
            routes.append(Route(name + '-cdx', CDXHandler(cdx_server)))

    if config.get('debug_echo_env', False):
        routes.append(Route('echo_env', DebugEchoEnvHandler()))

    if config.get('debug_echo_req', False):
        routes.append(Route('echo_req', DebugEchoHandler()))

    static_routes = config.get('static_routes')

    for static_name, static_path in static_routes.iteritems():
        routes.append(Route(static_name, StaticHandler(static_path)))

    # Check for new proxy mode!
    if config.get('enable_http_proxy', False):
        router = ProxyArchivalRouter
    else:
        router = ArchivalRouter

    # Finally, create wb router
    return router(
        routes,
        # Specify hostnames that pywb will be running on
        # This will help catch occasionally missed rewrites that
        # fall-through to the host
        # (See archivalrouter.ReferRedirect)
        hostpaths=hostpaths,
        port=port,

        abs_path=config.get('absolute_paths', True),

        home_view=load_template_file(config.get('home_html'), 'Home Page'),
        error_view=load_template_file(config.get('error_html'), 'Error Page')
    )

View File

@ -2,9 +2,9 @@ import StringIO
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.bufferedreaders import ChunkedDataReader
from wbrequestresponse import WbResponse
from pywb.framework.wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect
from pywb.framework.wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import LimitReader
@ -51,7 +51,7 @@ class ReplayView:
self._redirect_if_needed(wbrequest, cdx)
# one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest, status_headers)
self._reject_referrer_self_redirect(wbrequest)
response = None
@ -177,25 +177,30 @@ class ReplayView:
def _reject_self_redirect(self, wbrequest, cdx, status_headers):
# self-redirect via location
"""
Check if response is a 3xx redirect to the same url
If so, reject this capture to avoid causing redirect loop
"""
if status_headers.statusline.startswith('3'):
request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location').lower()
#TODO: canonicalize before testing?
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest, status_headers):
# at correct timestamp now, but must check for referrer redirect
# indirect self-redirect, via meta-refresh, if referrer is same as current url
if status_headers.statusline.startswith('2'):
# build full url even if using relative-rewriting
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url)
referrer_url = wbrequest.referrer
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
def _reject_referrer_self_redirect(self, wbrequest):
"""
Perform final check for referrer based self-redirect.
This method should be called after verifying request timestamp matches capture.
if referrer is same as current url, reject this response and try another capture
"""
if not wbrequest.referrer:
return
# build full url even if using relative-rewriting
request_url = (wbrequest.host_prefix +
wbrequest.rel_prefix + str(wbrequest.wb_url))
if (UrlRewriter.strip_protocol(request_url) ==
UrlRewriter.strip_protocol(wbrequest.referrer)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))

View File

@ -1,6 +1,6 @@
import pywb.utils.timeutils as timeutils
from pywb.utils.timeutils import timestamp_to_datetime
from pywb.framework.wbrequestresponse import WbResponse
import wbrequestresponse
import urlparse
import time
@ -18,7 +18,7 @@ class StaticTextView:
return self.text
def render_response(self, **kwargs):
return wbrequestresponse.WbResponse.text_stream(self.text)
return WbResponse.text_stream(self.text)
#=================================================================
class J2TemplateView:
@ -34,7 +34,7 @@ class J2TemplateView:
if template_dir.startswith('.') or template_dir.startswith('file://'):
loader = FileSystemLoader(template_dir)
else:
loader = PackageLoader(__package__, template_dir)
loader = PackageLoader('pywb', template_dir)
jinja_env = Environment(loader = loader, trim_blocks = True)
jinja_env.filters['format_ts'] = J2TemplateView.format_ts
@ -51,13 +51,13 @@ class J2TemplateView:
def render_response(self, **kwargs):
template_result = self.render_to_string(**kwargs)
status = kwargs.get('status', '200 OK')
return wbrequestresponse.WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
# Filters
@staticmethod
def format_ts(value, format_='%a, %b %d %Y %H:%M:%S'):
value = timeutils.timestamp_to_datetime(value)
value = timestamp_to_datetime(value)
return value.strftime(format_)
@staticmethod
@ -90,7 +90,7 @@ class TextCapturesView:
cdx += '\n'
return cdx
cdx_lines = imap(to_str, cdx_lines)
return wbrequestresponse.WbResponse.text_stream(cdx_lines)
return WbResponse.text_stream(cdx_lines)

View File

View File

@ -1,17 +1,31 @@
import urlparse
import re
from wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter
from wbrequestresponse import WbRequest, WbResponse
#=================================================================
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter:
def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
class ArchivalRouter(object):
def __init__(self, routes,
hostpaths=None,
port=None,
abs_path=True,
home_view=None,
error_view=None):
self.routes = routes
self.fallback = ReferRedirect(hostpaths)
# optional port setting may be ignored by wsgi container
self.port = port
if hostpaths:
self.fallback = ReferRedirect(hostpaths)
else:
self.fallback = None
self.abs_path = abs_path
self.home_view = home_view
@ -29,26 +43,27 @@ class ArchivalRouter:
return self.fallback(env, self.routes) if self.fallback else None
def render_home_page(self):
# render the homepage!
if self.home_view:
return self.home_view.render_response(routes = self.routes)
return self.home_view.render_response(routes=self.routes)
else:
# default home page template
text = '\n'.join(map(str, self.routes))
return WbResponse.text_response(text)
#=================================================================
# Route by matching regex (or fixed prefix)
# of request uri (excluding first '/')
#=================================================================
class Route:
class Route(object):
# match upto next / or ? or end
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
def __init__(self, regex, handler, coll_group=0, config={},
lookahead=SLASH_QUERY_LOOKAHEAD):
def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
self.path = regex
if regex:
self.regex = re.compile(regex + lookahead)
@ -59,12 +74,11 @@ class Route:
self.coll_group = coll_group
self._custom_init(config)
def __call__(self, env, use_abs_prefix):
wbrequest = self.parse_request(env, use_abs_prefix)
return self.handler(wbrequest) if wbrequest else None
def parse_request(self, env, use_abs_prefix, request_uri = None):
def parse_request(self, env, use_abs_prefix, request_uri=None):
if not request_uri:
request_uri = env['REL_REQUEST_URI']
@ -75,10 +89,12 @@ class Route:
matched_str = matcher.group(0)
if matched_str:
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
# remove the '/' + rel_prefix part of uri
wb_url_str = request_uri[len(matched_str) + 2:]
else:
rel_prefix = env['SCRIPT_NAME'] + '/'
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
# the request_uri is the wb_url, since no coll
wb_url_str = request_uri[1:]
coll = matcher.group(self.coll_group)
@ -88,20 +104,19 @@ class Route:
rel_prefix=rel_prefix,
coll=coll,
use_abs_prefix=use_abs_prefix,
wburl_class = self.handler.get_wburl_type(),
wburl_class=self.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter)
# Allow for applying of additional filters
self._apply_filters(wbrequest, matcher)
return wbrequest
def _apply_filters(self, wbrequest, matcher):
for filter in self.filters:
last_grp = len(matcher.groups())
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
filter_str = filter.format(matcher.group(last_grp))
wbrequest.query_filter.append(filter_str)
def _custom_init(self, config):
self.filters = config.get('filters', [])
@ -112,7 +127,8 @@ class Route:
#=================================================================
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
# ReferRedirect -- redirect urls that have 'fallen through'
# based on the referrer settings
#=================================================================
class ReferRedirect:
def __init__(self, match_prefixs):
@ -121,7 +137,6 @@ class ReferRedirect:
else:
self.match_prefixs = [match_prefixs]
def __call__(self, env, routes):
referrer = env.get('HTTP_REFERER')
@ -133,7 +148,7 @@ class ReferRedirect:
ref_split = urlparse.urlsplit(referrer)
# ensure referrer starts with one of allowed hosts
if not any (referrer.startswith(i) for i in self.match_prefixs):
if not any(referrer.startswith(i) for i in self.match_prefixs):
if ref_split.netloc != env.get('HTTP_HOST'):
return None
@ -144,13 +159,12 @@ class ReferRedirect:
if app_path:
# must start with current app name, if not root
if not path.startswith(app_path):
return None
return None
path = path[len(app_path):]
for route in routes:
ref_request = route.parse_request(env, False, request_uri = path)
ref_request = route.parse_request(env, False, request_uri=path)
if ref_request:
break
@ -174,6 +188,10 @@ class ReferRedirect:
# 2013/path.html -> /path.html
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
final_url = urlparse.urlunsplit((ref_split.scheme,
ref_split.netloc,
rewriter.rewrite(rel_request_uri),
'',
''))
return WbResponse.redir_response(final_url)

View File

@ -0,0 +1,23 @@
from pywb.rewrite.wburl import WbUrl
#=================================================================
class BaseHandler(object):
    """
    Base class for request handlers

    The default implementation returns the request it is called with
    and declares no wburl type.
    """
    def __call__(self, wbrequest):
        # pass-through: subclasses override this to produce a response
        return wbrequest

    def get_wburl_type(self):
        # no wburl class is associated with the base handler
        return None
#=================================================================
class WbUrlHandler(BaseHandler):
    """
    Handler for requests which are expected to contain a WbUrl

    Reports WbUrl as its wburl type.
    """
    def get_wburl_type(self):
        wburl_class = WbUrl
        return wburl_class

View File

@ -2,23 +2,37 @@ from wbrequestresponse import WbResponse, WbRequest
from archivalrouter import ArchivalRouter
import urlparse
#=================================================================
# An experimental router which combines both archival and proxy modes
# http proxy mode support is very simple: only latest capture is available currently
# http proxy mode support is very simple so far:
# only latest capture is available currently
#=================================================================
class ProxyArchivalRouter(ArchivalRouter):
def __init__(self, routes,
hostpaths=None,
port=None,
abs_path=True,
home_view=None,
error_view=None):
(super(ProxyArchivalRouter, self).
__init__(routes,
hostpaths=hostpaths,
port=port,
abs_path=abs_path,
home_view=home_view,
error_view=error_view))
class ProxyArchivalRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view)
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
self.error_view = error_view
#self.error_view = error_view
def __call__(self, env):
response = self.archival(env)
response = self.proxy(env)
if response:
return response
response = self.proxy(env)
response = super(ProxyArchivalRouter, self).__call__(env)
if response:
return response
@ -29,7 +43,7 @@ class ProxyArchivalRouter:
# Only supports latest capture replay at the moment
#=================================================================
class ProxyRouter:
def __init__(self, handler, hostpaths = None, error_view = None):
def __init__(self, handler, hostpaths=None, error_view=None):
self.handler = handler
self.hostpaths = hostpaths
@ -56,27 +70,26 @@ class ProxyRouter:
return self.handler(wbrequest)
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']
buff = 'function FindProxyForURL (url, host) {\n'
direct_cond =' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'
for hostpath in self.hostpaths:
parts = urlparse.urlsplit(hostpath).netloc.split(':')
buff += direct_cond.format(parts[0])
buff += direct.format(parts[0])
buff += direct_cond.format(env['SERVER_NAME'])
buff += direct.format(env['SERVER_NAME'])
#buff += '\n return "PROXY {0}";\n}}\n'.format(self.hostpaths[0])
buff += '\n return "PROXY {0}";\n}}\n'.format(server_hostport)
return WbResponse.text_response(buff, content_type = 'application/x-ns-proxy-autoconfig')
content_type = 'application/x-ns-proxy-autoconfig'
return WbResponse.text_response(buff, content_type=content_type)
#=================================================================
@ -85,10 +98,11 @@ class ProxyRouter:
class ProxyHttpsUrlRewriter:
HTTP = 'http://'
HTTPS = 'https://'
def __init__(self, wbrequest, prefix):
pass
def rewrite(self, url, mod = None):
def rewrite(self, url, mod=None):
if url.startswith(self.HTTPS):
return self.HTTP + url[len(self.HTTPS):]
else:
@ -97,6 +111,5 @@ class ProxyHttpsUrlRewriter:
def get_timestamp_url(self, timestamp, url):
return url
def get_abs_url(self, url = ''):
def get_abs_url(self, url=''):
return url

View File

@ -84,8 +84,8 @@ False
"""
from pywb.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler, WbUrlHandler
from pywb.framework.archivalrouter import Route, ReferRedirect
from pywb.framework.basehandlers import BaseHandler, WbUrlHandler
import pprint
def print_req(req):

View File

@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.wbrequestresponse import WbRequest, WbResponse
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):

View File

@ -1,23 +1,22 @@
from pywb.utils.wbexception import WbException
class WbException(Exception):
pass
class NotFoundException(WbException):
def status(self):
return '404 Not Found'
# Exceptions that affect a specific capture and result in a retry
class CaptureException(WbException):
def status(self):
return '500 Internal Server Error'
class InternalRedirect(WbException):
def __init__(self, location, status = '302 Internal Redirect'):
def __init__(self, location, status='302 Internal Redirect'):
WbException.__init__(self, 'Redirecting -> ' + location)
self.status = status
self.httpHeaders = [('Location', location)]
def status(self):
return self.status

View File

@ -26,7 +26,6 @@ class WbRequest:
except KeyError:
return ''
def __init__(self, env,
request_uri=None,
rel_prefix='',
@ -40,7 +39,10 @@ class WbRequest:
self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
if request_uri:
self.request_uri = request_uri
else:
self.request_uri = env.get('REL_REQUEST_URI')
self.coll = coll
@ -55,7 +57,6 @@ class WbRequest:
else:
self.wb_prefix = rel_prefix
if not wb_url_str:
wb_url_str = '/'
@ -83,7 +84,6 @@ class WbRequest:
# PERF
env['X_PERF'] = {}
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value:
@ -96,7 +96,6 @@ class WbRequest:
return True
return False
def __repr__(self):
varlist = vars(self)
varstr = pprint.pformat(varlist)
@ -111,32 +110,39 @@ class WbResponse:
Holds a status_headers object and a response iter, to be
returned to wsgi container.
"""
def __init__(self, status_headers, value = []):
def __init__(self, status_headers, value=[]):
self.status_headers = status_headers
self.body = value
@staticmethod
def text_stream(text, status = '200 OK', content_type = 'text/plain'):
return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = text)
def text_stream(stream, status='200 OK', content_type='text/plain'):
status_headers = StatusAndHeaders(status,
[('Content-Type', content_type)])
return WbResponse(status_headers, value=stream)
@staticmethod
def text_response(text, status = '200 OK', content_type = 'text/plain'):
return WbResponse(StatusAndHeaders(status, [('Content-Type', content_type)]), value = [text])
def text_response(text, status='200 OK', content_type='text/plain'):
status_headers = StatusAndHeaders(status,
[('Content-Type', content_type)])
return WbResponse(status_headers, value=[text])
@staticmethod
def redir_response(location, status = '302 Redirect'):
return WbResponse(StatusAndHeaders(status, [('Location', location)]))
def redir_response(location, status='302 Redirect'):
return WbResponse(StatusAndHeaders(status,
[('Location', location)]))
def __call__(self, env, start_response):
# PERF
perfstats = env.get('X_PERF')
if perfstats:
self.status_headers.headers.append(('X-Archive-Perf-Stats', str(perfstats)))
self.status_headers.headers.append(('X-Archive-Perf-Stats',
str(perfstats)))
start_response(self.status_headers.statusline, self.status_headers.headers)
start_response(self.status_headers.statusline,
self.status_headers.headers)
if env['REQUEST_METHOD'] == 'HEAD':
if hasattr(self.body, 'close'):
@ -148,6 +154,5 @@ class WbResponse:
else:
return [str(self.body)]
def __repr__(self):
return str(vars(self))

View File

@ -0,0 +1,165 @@
from pywb.utils.wbexception import WbException
from pywb.utils.loaders import load_yaml_config
from wbexceptions import NotFoundException, InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders
import os
import importlib
import logging
DEFAULT_PORT = 8080
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name
# and allows all characters which are allowed in the path segment
# according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here:
# http://stackoverflow.com/questions/4669692/
# valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
    """
    Return the requested path, optionally including the query string

    Only the path (and query) are returned, never the scheme or host.
    The characters listed in ``safe`` below are left unescaped in the path.

    :param environ: WSGI environ dict; PATH_INFO and QUERY_STRING are read
    :param include_query: if true, append '?' + QUERY_STRING when the
                          query string is non-empty
    :return: the quoted path, followed by the query string if requested

    # Simple test:
    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
    '/web/example.com'

    # Test all unencoded special chars and double-quote
    # (double-quote must be encoded but not single quote)
    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
    "/web/example.com/0~!+$&'()*+,;=:%22"
    """
    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')

    # the query string is passed through untouched
    if include_query and environ.get('QUERY_STRING'):
        url += '?' + environ['QUERY_STRING']

    return url
#=================================================================
class WSGIApp(object):
    """
    Top-level WSGI application wrapping a router.

    The router is called with the WSGI environ and should return a
    response callable, or a false value if nothing handled the request.
    Exceptions raised by the router are turned into error responses.
    """
    def __init__(self, wb_router):
        self.wb_router = wb_router

        # use the port exposed by the router, if it has one
        if hasattr(wb_router, 'port'):
            self.port = wb_router.port
        else:
            self.port = DEFAULT_PORT

    # Top-level wsgi application
    def __call__(self, env, start_response):
        # REQUEST_URI is used as-is only when it is present and no
        # SCRIPT_NAME is set; otherwise the uri is rebuilt from the environ
        script_name = env.get('SCRIPT_NAME')
        if not script_name and env.get('REQUEST_URI'):
            env['REL_REQUEST_URI'] = env['REQUEST_URI']
        else:
            env['REL_REQUEST_URI'] = rel_request_uri(env)

        router = self.wb_router
        result = None

        try:
            result = router(env)

            if not result:
                raise NotFoundException(
                    'No handler for "{0}"'.format(env['REL_REQUEST_URI']))

        except InternalRedirect as redirect:
            # must be caught ahead of the WbException handler below
            result = WbResponse(StatusAndHeaders(redirect.status,
                                                 redirect.httpHeaders))

        except WbException as wb_exc:
            # expected errors: no traceback
            result = handle_exception(env, router.error_view, wb_exc, False)

        except Exception as exc:
            # anything else: include the traceback
            result = handle_exception(env, router.error_view, exc, True)

        return result(env, start_response)
#=================================================================
def handle_exception(env, error_view, exc, print_trace):
    """
    Build the error response for an exception raised while routing.

    :param env: WSGI environ of the failed request (not used here)
    :param error_view: optional object whose ``render_response`` renders
                       the error; if false, a plain-text response is built
    :param exc: the exception; if it has a ``status`` attribute, that is
                called to get the status line, else '400 Bad Request'
    :param print_trace: if true, the current traceback is printed and
                        passed on as ``err_details``; otherwise the error
                        message is only logged at info level
    :return: the result of ``error_view.render_response`` or of
             ``WbResponse.text_response``
    """
    if hasattr(exc, 'status'):
        status = exc.status()
    else:
        status = '400 Bad Request'

    if print_trace:
        import traceback
        # format_exc() formats the exception currently being handled; its
        # only positional argument is a frame limit, not the exception
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(str(exc))
        err_details = None

    if error_view:
        return error_view.render_response(err_msg=str(exc),
                                          err_details=err_details,
                                          status=status)

    return WbResponse.text_response(status + ' Error: ' + str(exc),
                                    status=status)
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
#=================================================================
def init_app(init_func, load_yaml=True, config_file=None):
    """
    Configure logging, create the router and wrap it in a WSGIApp.

    :param init_func: factory creating the router; called with the parsed
                      config when ``load_yaml`` is true, else with no args
    :param load_yaml: whether to load a YAML config for ``init_func``
    :param config_file: config location; if not given, the
                        PYWB_CONFIG_FILE environment variable and then
                        DEFAULT_CONFIG_FILE are used
    :return: WSGIApp wrapping the created router
    """
    logging.basicConfig(format='%(asctime)s: [%(levelname)s]: %(message)s',
                        level=logging.DEBUG)
    logging.info('')

    init_args = []

    if load_yaml:
        # explicit argument first, then environment, then the default
        config_file = (config_file or
                       os.environ.get('PYWB_CONFIG_FILE') or
                       DEFAULT_CONFIG_FILE)

        init_args.append(load_yaml_config(config_file))

    try:
        wb_router = init_func(*init_args)
    except:
        # log with traceback, then let the failure propagate
        # (the placeholder is filled with the init function's name)
        logging.exception('*** pywb app init FAILED config from "%s"!\n',
                          init_func.__name__)
        raise

    logging.info('*** pywb app inited with config from "%s"!\n',
                 init_func.__name__)

    return WSGIApp(wb_router)
#=================================================================
def start_wsgi_server(the_app):
    """
    Serve ``the_app`` with the wsgiref reference server until interrupted.

    The port is taken, in order of precedence, from the ``-p/--port``
    command line option, from ``the_app.port`` and finally from
    DEFAULT_PORT.

    :param the_app: WSGI application to serve
    """
    from wsgiref.simple_server import make_server
    from optparse import OptionParser

    opt = OptionParser('%prog [OPTIONS]')
    opt.add_option('-p', '--port', type='int', default=None)

    options, args = opt.parse_args()

    port = options.port

    # only fall back to the app's port when none was given on the
    # command line, so that -p is not silently ignored
    if not port:
        port = getattr(the_app, 'port', None)

    if not port:
        port = DEFAULT_PORT

    logging.debug('Starting CDX Server on port %s', port)

    try:
        httpd = make_server('', port, the_app)
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server
        pass

    logging.debug('Stopping CDX Server')

View File

@ -1,128 +0,0 @@
import handlers
import archivalrouter
import config_utils
import proxy
from indexreader import IndexReader
import os
import yaml
import logging
#=================================================================
DEFAULTS = {
'hostpaths': ['http://localhost:8080'],
'collections': {'pywb': './sample_archive/cdx/'},
'archive_paths': './sample_archive/warcs/',
'head_insert_html': 'ui/head_insert.html',
'query_html': 'ui/query.html',
'search_html': 'ui/search.html',
'home_html': 'ui/index.html',
'error_html': 'ui/error.html',
'static_routes': {'static/default': 'static/'},
'domain_specific_rules': 'rules.yaml',
}
class DictChain:
    """
    Read-only lookup over several dicts, searched in the order given.
    """
    def __init__(self, *dicts):
        self.dicts = dicts

    def get(self, key, default_val=None):
        """
        Return the first value for ``key`` that is not None, searching the
        dicts in order; ``default_val`` if no dict provides one.
        """
        candidates = (d.get(key) for d in self.dicts)
        return next((val for val in candidates if val is not None),
                    default_val)
#=================================================================
## Reference non-YAML config
#=================================================================
def pywb_config_manual(passed_config={}):
    """
    Build the top-level router from a settings dict.

    Settings missing from ``passed_config`` are looked up in DEFAULTS, and
    per-collection settings fall back to the top-level settings.

    :param passed_config: dict of settings taking priority over DEFAULTS
    :return: proxy.ProxyArchivalRouter if 'enable_http_proxy' is set,
             otherwise archivalrouter.ArchivalRouter, created over all
             configured routes
    """
    settings = DictChain(passed_config, DEFAULTS)
    Route = archivalrouter.Route

    routes = []
    hostpaths = settings.get('hostpaths')

    # collections based on cdx source
    for coll_name, coll_value in settings.get('collections').iteritems():
        # a plain string is wrapped as the collection's 'index_paths'
        if isinstance(coll_value, str):
            coll_value = {'index_paths': coll_value}

        coll_config = DictChain(coll_value, settings)
        rules_file = coll_config.get('domain_specific_rules', None)
        index_reader = IndexReader(coll_config, rules_file)

        replay_handler = config_utils.create_wb_handler(
            cdx_server=index_reader,
            config=coll_config,
            ds_rules_file=rules_file,
        )

        logging.debug('Adding Collection: ' + coll_name)

        route_cls = coll_config.get('route_class', Route)
        routes.append(route_cls(coll_name, replay_handler,
                                config=coll_config))

        # cdx query handler
        if coll_config.get('enable_cdx_api', False):
            routes.append(Route(coll_name + '-cdx',
                                handlers.CDXHandler(index_reader)))

    # optional debugging routes
    if settings.get('debug_echo_env', False):
        routes.append(Route('echo_env', handlers.DebugEchoEnvHandler()))

    if settings.get('debug_echo_req', False):
        routes.append(Route('echo_req', handlers.DebugEchoHandler()))

    for static_name, static_path in settings.get('static_routes').iteritems():
        routes.append(Route(static_name,
                            handlers.StaticHandler(static_path)))

    # Check for new proxy mode!
    router_cls = (proxy.ProxyArchivalRouter
                  if settings.get('enable_http_proxy', False)
                  else archivalrouter.ArchivalRouter)

    home_view = config_utils.load_template_file(settings.get('home_html'),
                                                'Home Page')
    error_view = config_utils.load_template_file(settings.get('error_html'),
                                                 'Error Page')

    # Finally, create wb router
    # hostpaths: hostnames that pywb will be running on. This helps catch
    # occasionally missed rewrites that fall through to the host
    # (See archivalrouter.ReferRedirect)
    return router_cls(routes,
                      hostpaths=hostpaths,
                      abs_path=settings.get('absolute_paths', True),
                      home_view=home_view,
                      error_view=error_view)
#=================================================================
# YAML config loader
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
def pywb_config(config_file=None):
    """
    Load settings from a YAML file and build the router from them.

    :param config_file: path of the YAML file; if not given, the
                        PYWB_CONFIG environment variable is used, falling
                        back to DEFAULT_CONFIG_FILE
    :return: the router created by pywb_config_manual()
    """
    path = config_file or os.environ.get('PYWB_CONFIG', DEFAULT_CONFIG_FILE)

    with open(path) as stream:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # the config file must come from a trusted source
        settings = yaml.load(stream)

    return pywb_config_manual(settings)

View File

@ -4,6 +4,9 @@
import surt
import urlparse
from wbexception import WbException
#=================================================================
class UrlCanonicalizer(object):
def __init__(self, surt_ordered=True):
@ -14,7 +17,7 @@ class UrlCanonicalizer(object):
#=================================================================
class UrlCanonicalizeException(Exception):
class UrlCanonicalizeException(WbException):
def status(self):
return '400 Bad Request'
@ -164,7 +167,8 @@ def calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
elif match_type == 'domain':
if not surt_ordered:
raise UrlCanonicalizeException('matchType=domain unsupported for non-surt')
msg = 'matchType=domain unsupported for non-surt'
raise UrlCanonicalizeException(msg)
host = start_key.split(')/')[0]

View File

@ -1,10 +1,9 @@
import yaml
import pkgutil
from loaders import load_yaml_config
#=================================================================
DEFAULT_RULES_FILE = 'rules.yaml'
DEFAULT_RULES_PKG = 'pywb'
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
#=================================================================
@ -23,10 +22,14 @@ class RuleSet(object):
self.rules = []
ds_rules_file = kwargs.get('ds_rules_file')
default_rule_config = kwargs.get('default_rule_config')
config = self.load_default_rules(ds_rules_file)
ds_rules_file = kwargs.get('ds_rules_file')
if not ds_rules_file:
ds_rules_file = DEFAULT_RULES_FILE
config = load_yaml_config(ds_rules_file)
rulesmap = config.get('rules') if config else None
@ -53,22 +56,6 @@ class RuleSet(object):
if not def_key_found and default_rule_config is not None:
self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))
@staticmethod
def load_default_rules(filename=None, pkg=None):
config = None
if not filename:
filename = DEFAULT_RULES_FILE
if not pkg:
pkg = DEFAULT_RULES_PKG
if filename:
yaml_str = pkgutil.get_data(pkg, filename)
config = yaml.load(yaml_str)
return config
def iter_matching(self, urlkey):
"""
Iterate over all matching rules for given urlkey

View File

@ -7,11 +7,20 @@ import os
import hmac
import urllib2
import time
import pkg_resources
#=================================================================
def is_http(filename):
return any(filename.startswith(x) for x in ['http://', 'https://'])
return filename.startswith(('http://', 'https://'))
#=================================================================
def load_yaml_config(config_file):
    """
    Load and parse a YAML config file.

    :param config_file: location handed to ``BlockLoader().load()``
    :return: the parsed YAML document
    """
    import yaml

    configdata = BlockLoader().load(config_file)

    try:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # only use this with trusted config files
        return yaml.load(configdata)
    finally:
        # the reader was opened here, so release it here as well
        close = getattr(configdata, 'close', None)
        if close:
            close()
#=================================================================
@ -24,27 +33,46 @@ class BlockLoader(object):
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
def load(self, url, offset, length):
def load(self, url, offset=0, length=-1):
"""
Determine loading method based on uri
"""
if is_http(url):
return self.load_http(url, offset, length)
else:
return self.load_file(url, offset, length)
return self.load_file_or_resource(url, offset, length)
def load_file(self, url, offset, length):
def load_file_or_resource(self, url, offset, length):
"""
Load a file-like reader from the local file system
"""
file_only = False
if url.startswith('file://'):
url = url[len('file://'):]
file_only = True
afile = open(url, 'rb')
afile.seek(offset)
try:
# first, try as file
afile = open(url, 'rb')
if length > 0:
except IOError:
if file_only:
raise
# then, try as package.path/file
pkg_split = url.split('/', 1)
if len(pkg_split) == 1:
raise
afile = pkg_resources.resource_stream(pkg_split[0],
pkg_split[1])
if offset > 0:
afile.seek(offset)
if length >= 0:
return LimitReader(afile, length)
else:
return afile

View File

@ -30,9 +30,9 @@
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx', 'rb')).readline()
' CDX N b a m s k r M S V g\\n'
#DecompressingBufferedReader readline() with decompression
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
' CDX N b a m s k r M S V g\\n'
#DecompressingBufferedReader readline() with decompression (zipnum file, no header)
>>> DecompressingBufferedReader(open(test_zip_dir + 'zipnum-sample.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
'com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz\\n'
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
'Example Domain'
@ -60,7 +60,7 @@ from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir
#test_cdx_dir = os.path.dirname(os.path.realpath(__file__)) + '/../sample-data/'
test_cdx_dir = get_test_dir() + 'cdx/'
test_zip_dir = get_test_dir() + 'zipcdx/'
def read_multiple(reader, inc_reads):
result = None

View File

@ -171,7 +171,6 @@ def timestamp_to_datetime(string):
# pad to 6 digits
string = _pad_timestamp(string, PAD_6)
def clamp(val, min_, max_):
try:
val = int(val)

View File

@ -0,0 +1,3 @@
class WbException(Exception):
    """
    Exception which can report the HTTP status line for its error response.
    """
    def status(self):
        """
        Return the HTTP status line for this error.
        """
        return '500 Internal Server Error'

View File

@ -9,6 +9,9 @@ from pywb.utils.statusandheaders import StatusAndHeadersParserException
from pywb.utils.loaders import BlockLoader
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb.utils.wbexception import WbException
#=================================================================
ArcWarcRecord = collections.namedtuple('ArchiveRecord',
'type, rec_headers, ' +
@ -16,7 +19,7 @@ ArcWarcRecord = collections.namedtuple('ArchiveRecord',
#=================================================================
class ArchiveLoadFailed(Exception):
class ArchiveLoadFailed(WbException):
def __init__(self, reason, filename=''):
super(ArchiveLoadFailed, self).__init__(filename + ':' + str(reason))
#self.filename = filename
@ -62,9 +65,9 @@ class ArcWarcRecordLoader:
decomp_type = 'gzip'
# Create decompressing stream
stream = DecompressingBufferedReader(stream = raw,
decomp_type = decomp_type,
block_size = self.block_size)
stream = DecompressingBufferedReader(stream=raw,
decomp_type=decomp_type,
block_size=self.block_size)
(the_format, rec_headers) = self._detect_type_load_headers(stream)

View File

@ -176,6 +176,6 @@ class ResolvingLoader:
params = {'url': url,
'closest': timestamp,
'filter': 'digest:' + digest,
'output': 'raw'}
'output': 'cdxobject'}
return self.cdx_server.load_cdx(**params)

View File

@ -1,124 +0,0 @@
from wbexceptions import WbException, NotFoundException, InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.cdx.cdxserver import CDXException
from pywb.utils.canonicalize import UrlCanonicalizeException
from pywb.warc.recordloader import ArchiveLoadFailed
import os
import importlib
import logging
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here: http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
    """
    Return the requested path, optionally including the query string

    Only the path (and query) are returned, never the scheme or host.
    The characters listed in ``safe`` below are left unescaped in the path.

    :param environ: WSGI environ dict; PATH_INFO and QUERY_STRING are read
    :param include_query: if true, append '?' + QUERY_STRING when the
                          query string is non-empty
    :return: the quoted path, followed by the query string if requested

    # Simple test:
    >>> rel_request_uri({'PATH_INFO': '/web/example.com'})
    '/web/example.com'

    # Test all unencoded special chars and double-quote
    # (double-quote must be encoded but not single quote)
    >>> rel_request_uri({'PATH_INFO': "/web/example.com/0~!+$&'()*+,;=:\\\""})
    "/web/example.com/0~!+$&'()*+,;=:%22"
    """
    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')

    # the query string is passed through untouched
    if include_query and environ.get('QUERY_STRING'):
        url += '?' + environ['QUERY_STRING']

    return url
#=================================================================
def create_wb_app(wb_router):
    """
    Wrap a router in a WSGI application function.

    :param wb_router: callable taking the WSGI environ and returning a
                      response callable, or a false value if nothing
                      handled the request
    :return: the WSGI application function
    """
    # Top-level wsgi application
    def application(env, start_response):
        # REQUEST_URI is used as-is only when it is present and no
        # SCRIPT_NAME is set; otherwise the uri is rebuilt from the environ
        script_name = env.get('SCRIPT_NAME')
        if not script_name and env.get('REQUEST_URI'):
            env['REL_REQUEST_URI'] = env['REQUEST_URI']
        else:
            env['REL_REQUEST_URI'] = rel_request_uri(env)

        result = None

        try:
            result = wb_router(env)

            if not result:
                msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
                raise NotFoundException(msg)

        except InternalRedirect as redirect:
            # must be caught ahead of the WbException handler below
            result = WbResponse(StatusAndHeaders(redirect.status,
                                                 redirect.httpHeaders))

        except (WbException, CDXException,
                UrlCanonicalizeException, ArchiveLoadFailed) as known_exc:
            # expected errors: no traceback
            result = handle_exception(env, wb_router.error_view,
                                      known_exc, False)

        except Exception as exc:
            # anything else: include the traceback
            result = handle_exception(env, wb_router.error_view, exc, True)

        return result(env, start_response)

    return application
def handle_exception(env, error_view, exc, print_trace):
    """
    Build the error response for an exception raised while routing.

    :param env: WSGI environ of the failed request (not used here)
    :param error_view: optional object whose ``render_response`` renders
                       the error; if false, a plain-text response is built
    :param exc: the exception; if it has a ``status`` attribute, that is
                called to get the status line, else '400 Bad Request'
    :param print_trace: if true, the current traceback is printed and
                        passed on as ``err_details``; otherwise the error
                        message is only logged at info level
    :return: the result of ``error_view.render_response`` or of
             ``WbResponse.text_response``
    """
    if hasattr(exc, 'status'):
        status = exc.status()
    else:
        status = '400 Bad Request'

    if print_trace:
        import traceback
        # format_exc() formats the exception currently being handled; its
        # only positional argument is a frame limit, not the exception
        err_details = traceback.format_exc()
        print(err_details)
    else:
        logging.info(str(exc))
        err_details = None

    if error_view:
        return error_view.render_response(err_msg=str(exc),
                                          err_details=err_details,
                                          status=status)

    return WbResponse.text_response(status + ' Error: ' + str(exc),
                                    status=status)
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
def main():
    """
    Create the WSGI application from a config module.

    The module named by the PYWB_CONFIG_MODULE environment variable
    (default: 'pywb.pywb_init') is imported and its ``pywb_config()``
    result is wrapped with create_wb_app().

    :return: the WSGI application
    """
    try:
        logging.basicConfig(
            format = '%(asctime)s: [%(levelname)s]: %(message)s',
            level = logging.DEBUG)

        # see if there's a custom init module
        config_name = os.environ.get('PYWB_CONFIG_MODULE')

        if not config_name:
            # use default module
            config_name = 'pywb.pywb_init'
            msg = 'Loading from default config module "{0}"'
            logging.info(msg.format(config_name))
            logging.info('')

        config_module = importlib.import_module(config_name)
        wb_router = config_module.pywb_config()
        app = create_wb_app(wb_router)

        logging.info('')
        msg = '*** pywb inited with settings from {0}.pywb_config()!\n'
        logging.info(msg.format(config_name))
        return app

    except Exception:
        # log with traceback, then let the failure propagate
        msg = '*** pywb could not init with settings from {0}.pywb_config()!\n'
        logging.exception(msg.format(config_name))
        raise
#=================================================================
# Create the app only when this module is imported, not when run directly
if __name__ != "__main__":
    application = main()

4
run.sh
View File

@ -10,14 +10,14 @@ mypath=$(cd `dirname $0` && pwd)
# ex: my_pywb.pywb_config()
#export 'PYWB_CONFIG=my_pywb'
app="pywb.wbapp"
app="pywb.apps.wayback"
params="--http-socket :8080 -b 65536"
#params="--static-map /static=$mypath/static --http-socket :8080 -b 65536"
if [ -z "$1" ]; then
# Standard root config
params="$params --wsgi pywb.wbapp"
params="$params --wsgi $app"
else
# run with --mount
# requires a file not a package, so creating a mount_run.py to load the package

Binary file not shown.

View File

@ -14,7 +14,14 @@ setup(
license='GPL',
packages=find_packages(),
provides=[
'pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'
'pywb',
'pywb.utils',
'pywb.cdx',
'pywb.warc',
'pywb.rewrite',
'pywb.framework',
'pywb.core',
'pywb.apps'
],
package_data={
'pywb': ['ui/*', 'static/*', '*.yaml'],
@ -34,7 +41,6 @@ setup(
'pyyaml',
'WebTest',
'pytest',
'werkzeug>=0.9.4',
],
# tests_require=['WebTest', 'pytest'],
zip_safe=False

View File

@ -90,6 +90,9 @@ enable_http_proxy: true
# enable cdx server api for querying cdx directly (experimental)
enable_cdx_api: true
# test different port
port: 9000
# optional reporter callback func
# if set, called with request and cdx object
reporter: !!python/object/new:tests.fixture.PrintReporter []

View File

@ -1,32 +1,26 @@
import os
import re
import webtest
import pytest
from urllib import urlencode
from werkzeug.test import Client
from werkzeug.wrappers import BaseResponse, Response
import yaml
from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.wsgi_cdxserver import create_app
from pywb.apps.cdx_server import application
from tests.fixture import testconfig
import pytest
#================================================================
@pytest.fixture
def client(testconfig):
app = create_app(testconfig)
return Client(app, Response)
def client():
return webtest.TestApp(application)
# ================================================================
def query(client, url, **params):
#================================================================
def query(client, url, is_error=False, **params):
params['url'] = url
return client.get('/cdx?' + urlencode(params, doseq=1))
return client.get('/cdx?' + urlencode(params, doseq=1), expect_errors=is_error)
# ================================================================
#================================================================
def test_exact_url(client):
"""
basic exact match, no filters, etc.
@ -34,48 +28,54 @@ def test_exact_url(client):
resp = query(client, 'http://www.iana.org/')
assert resp.status_code == 200
print resp.data
print resp.body
#================================================================
def test_prefix_match(client):
"""
prefix match test
"""
resp = query(client, 'http://www.iana.org/', matchType='prefix')
print resp.data.splitlines()
print resp.body.splitlines()
assert resp.status_code == 200
suburls = 0
for l in resp.data.splitlines():
for l in resp.body.splitlines():
fields = l.split(' ')
if len(fields[0]) > len('org,iana)/'):
suburls += 1
assert suburls > 0
#================================================================
def test_filters(client):
"""
filter cdxes by mimetype and filename field, exact match.
"""
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
filter=('mimetype:warc/revisit', 'filename:dupes.warc.gz'))
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
for l in resp.data.splitlines():
assert resp.status_code == 200
assert resp.content_type == 'text/plain'
for l in resp.body.splitlines():
fields = l.split(' ')
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
assert fields[3] == 'warc/revisit'
assert fields[10] == 'dupes.warc.gz'
#================================================================
def test_limit(client):
resp = query(client, 'http://www.iana.org/_css/2013.1/screen.css',
limit='1')
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
assert len(cdxes) == 1
fields = cdxes[0].split(' ')
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
@ -86,15 +86,17 @@ def test_limit(client):
limit='1', reverse='1')
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
assert len(cdxes) == 1
fields = cdxes[0].split(' ')
assert fields[0] == 'org,iana)/_css/2013.1/screen.css'
assert fields[1] == '20140127171239'
assert fields[3] == 'warc/revisit'
#================================================================
def test_fields(client):
"""
retrieve subset of fields with ``fields`` parameter.
@ -104,7 +106,7 @@ def test_fields(client):
assert resp.status_code == 200
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
for cdx in cdxes:
fields = cdx.split(' ')
@ -113,16 +115,21 @@ def test_fields(client):
assert re.match(r'\d{14}$', fields[1])
assert re.match(r'\d{3}|-', fields[2])
#================================================================
def test_fields_undefined(client):
    """
    server shall respond with Bad Request when the ``fields`` parameter
    contains undefined name(s).

    TODO: also check that the response names the undefined field.
    """
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 is_error=True,
                 fields='urlkey,nosuchfield')

    # without the assert this comparison was a no-op and could never fail
    assert resp.status_code == 400
#================================================================
def test_resolveRevisits(client):
"""
with ``resolveRevisits=true``, server adds three fields pointing to
@ -132,9 +139,9 @@ def test_resolveRevisits(client):
resolveRevisits='true'
)
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
originals = {}
for cdx in cdxes:
fields = cdx.split(' ')
@ -151,6 +158,8 @@ def test_resolveRevisits(client):
orig = originals.get(sha)
assert orig == (int(orig_size), int(orig_offset), orig_fn)
#================================================================
def test_resolveRevisits_orig_fields(client):
"""
when resolveRevisits=true, extra three fields are named
@ -162,9 +171,9 @@ def test_resolveRevisits_orig_fields(client):
fields='urlkey,orig.length,orig.offset,orig.filename'
)
assert resp.status_code == 200
assert resp.mimetype == 'text/plain'
assert resp.content_type == 'text/plain'
cdxes = resp.data.splitlines()
cdxes = resp.body.splitlines()
for cdx in cdxes:
fields = cdx.split(' ')
assert len(fields) == 4
@ -172,6 +181,8 @@ def test_resolveRevisits_orig_fields(client):
assert (orig_len == '-' and orig_offset == '-' and orig_fn == '-' or
(int(orig_len), int(orig_offset), orig_fn))
#================================================================
def test_collapseTime_resolveRevisits_reverse(client):
resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
collapseTime='11',
@ -179,11 +190,10 @@ def test_collapseTime_resolveRevisits_reverse(client):
reverse='true'
)
cdxes = [CDXObject(l) for l in resp.data.splitlines()]
cdxes = [CDXObject(l) for l in resp.body.splitlines()]
assert len(cdxes) == 3
# timestamp is in descending order
for i in range(len(cdxes) - 1):
assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp']

View File

@ -1,6 +1,6 @@
import webtest
from pywb.pywb_init import pywb_config
from pywb.wbapp import create_wb_app
from pywb.core.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
from fixture import TestExclusionPerms
@ -11,8 +11,13 @@ class TestWb:
def setup(self):
#self.app = pywb.wbapp.create_wb_app(pywb.pywb_init.pywb_config())
# save it in self - useful for debugging
self.router = pywb_config(self.TEST_CONFIG)
self.app = create_wb_app(self.router)
self.app = init_app(create_wb_router,
load_yaml=True,
config_file=self.TEST_CONFIG)
#self.router = pywb_config(self.TEST_CONFIG)
#self.app = create_wb_app(self.router)
self.testapp = webtest.TestApp(self.app)
def _assert_basic_html(self, resp):