1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

Package reorganization

Split up the remaining modules of the pywb root package
into the core, dispatch and bootstrap subpackages.
This commit is contained in:
Ilya Kreymer 2014-02-24 03:00:01 -08:00
parent 9194e867ea
commit 51d61a8738
19 changed files with 73 additions and 59 deletions

View File

View File

@ -1,16 +1,16 @@
import views
import handlers
import replay_views
import logging import logging
from pywb.warc.recordloader import ArcWarcRecordLoader from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.core.views import J2TemplateView, J2HtmlCapturesView
from pywb.core.handlers import WBHandler
from pywb.core.replay_views import ReplayView
#================================================================= #=================================================================
# Config Loading # Config Loading
#================================================================= #=================================================================
def load_template_file(file, desc = None, view_class = views.J2TemplateView): def load_template_file(file, desc = None, view_class = J2TemplateView):
if file: if file:
logging.debug('Adding {0}: {1}'.format(desc if desc else name, file)) logging.debug('Adding {0}: {1}'.format(desc if desc else name, file))
file = view_class(file) file = view_class(file)
@ -25,7 +25,7 @@ def create_wb_handler(cdx_server, config):
resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
replayer = replay_views.ReplayView( replayer = ReplayView(
content_loader = resolving_loader, content_loader = resolving_loader,
content_rewriter = RewriteContent(), content_rewriter = RewriteContent(),
@ -40,12 +40,12 @@ def create_wb_handler(cdx_server, config):
) )
wb_handler = handlers.WBHandler( wb_handler = WBHandler(
cdx_server, cdx_server,
replayer, replayer,
html_view = load_template_file(config.get('query_html'), 'Captures Page', views.J2HtmlCapturesView), html_view = load_template_file(config.get('query_html'), 'Captures Page', J2HtmlCapturesView),
search_view = load_template_file(config.get('search_html'), 'Search Page'), search_view = load_template_file(config.get('search_html'), 'Search Page'),
) )

View File

@ -1,8 +1,10 @@
import handlers from pywb.core.handlers import CDXHandler, StaticHandler
import archivalrouter from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.dispatch.archivalrouter import ArchivalRouter, Route
from pywb.dispatch.proxy import ProxyArchivalRouter
from pywb.core.indexreader import IndexReader
import config_utils import config_utils
import proxy
from indexreader import IndexReader
import os import os
import yaml import yaml
@ -67,32 +69,32 @@ def pywb_config_manual(passed_config = {}):
logging.debug('Adding Collection: ' + name) logging.debug('Adding Collection: ' + name)
route_class = route_config.get('route_class', archivalrouter.Route) route_class = route_config.get('route_class', Route)
routes.append(route_class(name, wb_handler, config = route_config)) routes.append(route_class(name, wb_handler, config = route_config))
# cdx query handler # cdx query handler
if route_config.get('enable_cdx_api', False): if route_config.get('enable_cdx_api', False):
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_server))) routes.append(Route(name + '-cdx', CDXHandler(cdx_server)))
if config.get('debug_echo_env', False): if config.get('debug_echo_env', False):
routes.append(archivalrouter.Route('echo_env', handlers.DebugEchoEnvHandler())) routes.append(Route('echo_env', DebugEchoEnvHandler()))
if config.get('debug_echo_req', False): if config.get('debug_echo_req', False):
routes.append(archivalrouter.Route('echo_req', handlers.DebugEchoHandler())) routes.append(Route('echo_req', DebugEchoHandler()))
static_routes = config.get('static_routes') static_routes = config.get('static_routes')
for static_name, static_path in static_routes.iteritems(): for static_name, static_path in static_routes.iteritems():
routes.append(archivalrouter.Route(static_name, handlers.StaticHandler(static_path))) routes.append(Route(static_name, StaticHandler(static_path)))
# Check for new proxy mode! # Check for new proxy mode!
if config.get('enable_http_proxy', False): if config.get('enable_http_proxy', False):
router = proxy.ProxyArchivalRouter router = ProxyArchivalRouter
else: else:
router = archivalrouter.ArchivalRouter router = ArchivalRouter
# Finally, create wb router # Finally, create wb router
return router( return router(

View File

@ -1,5 +1,5 @@
from wbexceptions import WbException, NotFoundException, InternalRedirect from pywb.core.wbexceptions import WbException, NotFoundException, InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.cdx.cdxserver import CDXException from pywb.cdx.cdxserver import CDXException
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
@ -91,6 +91,10 @@ def handle_exception(env, error_view, exc, print_trace):
#================================================================= #=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml' DEFAULT_CONFIG_FILE = 'config.yaml'
DEFAULT_INIT_MODULE = 'pywb.bootstrap.pywb_init'
#=================================================================
def main(): def main():
try: try:
logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG) logging.basicConfig(format = '%(asctime)s: [%(levelname)s]: %(message)s', level = logging.DEBUG)
@ -100,7 +104,7 @@ def main():
if not config_name: if not config_name:
# use default module # use default module
config_name = 'pywb.pywb_init' config_name = DEFAULT_INIT_MODULE
logging.info('Loading from default config module "{0}"'.format(config_name)) logging.info('Loading from default config module "{0}"'.format(config_name))
logging.info('') logging.info('')

0
pywb/core/__init__.py Normal file
View File

View File

@ -33,11 +33,15 @@ class WBHandler(WbUrlHandler):
html_view=None, search_view=None): html_view=None, search_view=None):
self.index_reader = index_reader self.index_reader = index_reader
self.replay = replay self.replay = replay
self.text_view = TextCapturesView() self.text_query_view = TextCapturesView()
self.query_view = html_view
if not self.query_view:
self.query_view = text_query_view
self.html_view = html_view
self.search_view = search_view self.search_view = search_view
def __call__(self, wbrequest): def __call__(self, wbrequest):
@ -49,11 +53,10 @@ class WBHandler(WbUrlHandler):
# new special modifier to always show cdx index # new special modifier to always show cdx index
if wbrequest.wb_url.mod == 'cdx_': if wbrequest.wb_url.mod == 'cdx_':
return self.text_view.render_response(wbrequest, cdx_lines) return self.text_query_view.render_response(wbrequest, cdx_lines)
if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY): if (wbrequest.wb_url.type == wbrequest.wb_url.QUERY) or (wbrequest.wb_url.type == wbrequest.wb_url.URL_QUERY):
query_view = self.html_view if self.html_view else self.text_view return self.query_view.render_response(wbrequest, cdx_lines)
return query_view.render_response(wbrequest, cdx_lines)
with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t: with PerfTimer(wbrequest.env.get('X_PERF'), 'replay') as t:
return self.replay(wbrequest, cdx_lines) return self.replay(wbrequest, cdx_lines)
@ -92,7 +95,7 @@ class CDXHandler(BaseHandler):
# Static Content Handler # Static Content Handler
#================================================================= #=================================================================
class StaticHandler(BaseHandler): class StaticHandler(BaseHandler):
def __init__(self, static_path, pkg = __package__): def __init__(self, static_path, pkg = 'pywb'):
mimetypes.init() mimetypes.init()
self.static_path = static_path self.static_path = static_path

View File

@ -49,7 +49,7 @@ class ReplayView:
self._redirect_if_needed(wbrequest, cdx) self._redirect_if_needed(wbrequest, cdx)
# one more check for referrer-based self-redirect # one more check for referrer-based self-redirect
self._reject_referrer_self_redirect(wbrequest, status_headers) self._reject_referrer_self_redirect(wbrequest)
response = None response = None
@ -150,25 +150,30 @@ class ReplayView:
def _reject_self_redirect(self, wbrequest, cdx, status_headers): def _reject_self_redirect(self, wbrequest, cdx, status_headers):
# self-redirect via location """
Check if response is a 3xx redirect to the same url
If so, reject this capture to avoid causing redirect loop
"""
if status_headers.statusline.startswith('3'): if status_headers.statusline.startswith('3'):
request_url = wbrequest.wb_url.url.lower() request_url = wbrequest.wb_url.url.lower()
location_url = status_headers.get_header('Location').lower() location_url = status_headers.get_header('Location').lower()
#TODO: canonicalize before testing?
if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)): if (UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(location_url)):
raise CaptureException('Self Redirect: ' + str(cdx)) raise CaptureException('Self Redirect: ' + str(cdx))
def _reject_referrer_self_redirect(self, wbrequest, status_headers): def _reject_referrer_self_redirect(self, wbrequest):
# at correct timestamp now, but must check for referrer redirect """
# indirect self-redirect, via meta-refresh, if referrer is same as current url Perform final check for referrer based self-redirect.
if status_headers.statusline.startswith('2'): This method should be called after verifying request timestamp matches capture.
# build full url even if using relative-rewriting if referrer is same as current url, reject this response and try another capture
request_url = wbrequest.host_prefix + wbrequest.rel_prefix + str(wbrequest.wb_url) """
referrer_url = wbrequest.referrer if not wbrequest.referrer:
if (referrer_url and UrlRewriter.strip_protocol(request_url) == UrlRewriter.strip_protocol(referrer_url)): return
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))
# build full url even if using relative-rewriting
request_url = (wbrequest.host_prefix +
wbrequest.rel_prefix + str(wbrequest.wb_url))
if (UrlRewriter.strip_protocol(request_url) ==
UrlRewriter.strip_protocol(wbrequest.referrer)):
raise CaptureException('Self Redirect via Referrer: ' + str(wbrequest.wb_url))

View File

@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.wbrequestresponse import WbRequest, WbResponse from pywb.core.wbrequestresponse import WbRequest, WbResponse
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False): def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):

View File

@ -1,6 +1,6 @@
import pywb.utils.timeutils as timeutils from pywb.utils.timeutils import timestamp_to_datetime
from wbrequestresponse import WbResponse
import wbrequestresponse
import urlparse import urlparse
import time import time
@ -18,7 +18,7 @@ class StaticTextView:
return self.text return self.text
def render_response(self, **kwargs): def render_response(self, **kwargs):
return wbrequestresponse.WbResponse.text_stream(self.text) return WbResponse.text_stream(self.text)
#================================================================= #=================================================================
class J2TemplateView: class J2TemplateView:
@ -34,7 +34,7 @@ class J2TemplateView:
if template_dir.startswith('.') or template_dir.startswith('file://'): if template_dir.startswith('.') or template_dir.startswith('file://'):
loader = FileSystemLoader(template_dir) loader = FileSystemLoader(template_dir)
else: else:
loader = PackageLoader(__package__, template_dir) loader = PackageLoader('pywb', template_dir)
jinja_env = Environment(loader = loader, trim_blocks = True) jinja_env = Environment(loader = loader, trim_blocks = True)
jinja_env.filters['format_ts'] = J2TemplateView.format_ts jinja_env.filters['format_ts'] = J2TemplateView.format_ts
@ -51,13 +51,13 @@ class J2TemplateView:
def render_response(self, **kwargs): def render_response(self, **kwargs):
template_result = self.render_to_string(**kwargs) template_result = self.render_to_string(**kwargs)
status = kwargs.get('status', '200 OK') status = kwargs.get('status', '200 OK')
return wbrequestresponse.WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8') return WbResponse.text_response(str(template_result), status = status, content_type = 'text/html; charset=utf-8')
# Filters # Filters
@staticmethod @staticmethod
def format_ts(value, format='%a, %b %d %Y %H:%M:%S'): def format_ts(value, format='%a, %b %d %Y %H:%M:%S'):
value = timeutils.timestamp_to_datetime(value) value = timestamp_to_datetime(value)
return time.strftime(format, value) return time.strftime(format, value)
@staticmethod @staticmethod
@ -90,7 +90,7 @@ class TextCapturesView:
cdx += '\n' cdx += '\n'
return cdx return cdx
cdx_lines = imap(to_str, cdx_lines) cdx_lines = imap(to_str, cdx_lines)
return wbrequestresponse.WbResponse.text_stream(cdx_lines) return WbResponse.text_stream(cdx_lines)

View File

View File

@ -1,7 +1,7 @@
import urlparse import urlparse
import re import re
from wbrequestresponse import WbRequest, WbResponse from pywb.core.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter

View File

@ -1,4 +1,4 @@
from wbrequestresponse import WbResponse, WbRequest from pywb.core.wbrequestresponse import WbResponse, WbRequest
from archivalrouter import ArchivalRouter from archivalrouter import ArchivalRouter
import urlparse import urlparse

View File

@ -70,8 +70,8 @@ False
""" """
from pywb.archivalrouter import Route, ReferRedirect from pywb.dispatch.archivalrouter import Route, ReferRedirect
from pywb.handlers import BaseHandler, WbUrlHandler from pywb.core.handlers import BaseHandler, WbUrlHandler
import pprint import pprint
def print_req(req): def print_req(req):

4
run.sh
View File

@ -10,14 +10,14 @@ mypath=$(cd `dirname $0` && pwd)
# ex: my_pywb.pywb_config() # ex: my_pywb.pywb_config()
#export 'PYWB_CONFIG=my_pywb' #export 'PYWB_CONFIG=my_pywb'
app="pywb.wbapp" app="pywb.bootstrap.wbapp"
params="--http-socket :8080 -b 65536" params="--http-socket :8080 -b 65536"
#params="--static-map /static=$mypath/static --http-socket :8080 -b 65536" #params="--static-map /static=$mypath/static --http-socket :8080 -b 65536"
if [ -z "$1" ]; then if [ -z "$1" ]; then
# Standard root config # Standard root config
params="$params --wsgi pywb.wbapp" params="$params --wsgi $app"
else else
# run with --mount # run with --mount
# requires a file not a package, so creating a mount_run.py to load the package # requires a file not a package, so creating a mount_run.py to load the package

View File

@ -11,8 +11,8 @@ setuptools.setup(name='pywb',
author_email='ilya@archive.org', author_email='ilya@archive.org',
long_description=open('README.md').read(), long_description=open('README.md').read(),
license='GPL', license='GPL',
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'],
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite','pywb.core','pywb.dispatch','pywb.bootstrap'],
package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))], ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*'))],

View File

@ -1,6 +1,6 @@
import webtest import webtest
from pywb.pywb_init import pywb_config from pywb.bootstrap.pywb_init import pywb_config
from pywb.wbapp import create_wb_app from pywb.bootstrap.wbapp import create_wb_app
from pywb.cdx.cdxobject import CDXObject from pywb.cdx.cdxobject import CDXObject
class TestWb: class TestWb: