1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

more refactoring!

create 'framework' subpackage for general purpose components!
contains routing, request/response, exceptions and wsgi wrappers
update framework package for pep8
dsrules: using load_yaml_config() (pushed to utils)
to init default config
This commit is contained in:
Ilya Kreymer 2014-03-02 21:42:05 -08:00
parent f1acad53fc
commit f0a0976038
17 changed files with 138 additions and 121 deletions

View File

@ -1,5 +1,5 @@
from pywb.bootstrap.wsgi_wrappers import init_app, start_wsgi_server
from pywb.bootstrap.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app, start_wsgi_server
from pywb.core.pywb_init import create_wb_router
#=================================================================
# init pywb app

View File

@ -5,7 +5,7 @@ import time
from pywb.rewrite.wburl import WbUrl
from pywb.cdx.query import CDXQuery
from wbrequestresponse import WbResponse
from pywb.framework.wbrequestresponse import WbResponse
from wbexceptions import WbException, NotFoundException
from views import TextCapturesView

View File

@ -1,25 +1,25 @@
from pywb.dispatch.archivalrouter import ArchivalRouter, Route
from pywb.dispatch.proxy import ProxyArchivalRouter
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.framework.proxy import ProxyArchivalRouter
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewrite_content import RewriteContent
from pywb.core.indexreader import IndexReader
from pywb.core.views import J2TemplateView, J2HtmlCapturesView
from pywb.core.handlers import WBHandler
from pywb.core.replay_views import ReplayView
from indexreader import IndexReader
from views import J2TemplateView, J2HtmlCapturesView
from replay_views import ReplayView
from pywb.core.handlers import CDXHandler, StaticHandler
from pywb.core.handlers import DebugEchoHandler, DebugEchoEnvHandler
from handlers import WBHandler
from handlers import CDXHandler, StaticHandler
from handlers import DebugEchoHandler, DebugEchoEnvHandler
from pywb.utils.loaders import BlockLoader
import os
import yaml
import logging
#=================================================================
DEFAULTS = {
'hostpaths': ['http://localhost:8080'],
@ -34,7 +34,7 @@ DEFAULTS = {
'static_routes': {'static/default': 'static/'},
'domain_specific_rules': 'rules.yaml',
'domain_specific_rules': 'pywb/rules.yaml',
}
#=================================================================

View File

@ -2,9 +2,9 @@ import StringIO
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.bufferedreaders import ChunkedDataReader
from wbrequestresponse import WbResponse
from pywb.framework.wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect
from pywb.framework.wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed
from pywb.utils.loaders import LimitReader

View File

@ -1,5 +1,5 @@
from pywb.utils.timeutils import timestamp_to_datetime
from wbrequestresponse import WbResponse
from pywb.framework.wbrequestresponse import WbResponse
import urlparse
import time

View File

@ -1,15 +1,17 @@
import urlparse
import re
from pywb.core.wbrequestresponse import WbRequest, WbResponse
from pywb.rewrite.url_rewriter import UrlRewriter
from wbrequestresponse import WbRequest, WbResponse
#=================================================================
# ArchivalRouter -- route WB requests in archival mode
#=================================================================
class ArchivalRouter:
def __init__(self, routes, hostpaths=None, abs_path=True, home_view=None, error_view=None):
class ArchivalRouter(object):
def __init__(self, routes, hostpaths=None, abs_path=True,
home_view=None, error_view=None):
self.routes = routes
self.fallback = ReferRedirect(hostpaths)
self.abs_path = abs_path
@ -29,26 +31,27 @@ class ArchivalRouter:
return self.fallback(env, self.routes) if self.fallback else None
def render_home_page(self):
    """
    Render the home page response.

    Delegates to ``self.home_view.render_response`` when a home view is
    set, passing the route list as ``routes``. Otherwise returns a
    plain-text response listing ``str(route)`` for each route, one per
    line.
    """
    if self.home_view:
        return self.home_view.render_response(routes=self.routes)

    # default home page: plain listing of the configured routes
    text = '\n'.join(str(route) for route in self.routes)
    return WbResponse.text_response(text)
#=================================================================
# Route by matching regex (or fixed prefix)
# of request uri (excluding first '/')
#=================================================================
class Route:
class Route(object):
# match upto next / or ? or end
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
SLASH_QUERY_LOOKAHEAD = '(?=/|$|\?)'
def __init__(self, regex, handler, coll_group=0, config={},
lookahead=SLASH_QUERY_LOOKAHEAD):
def __init__(self, regex, handler, coll_group = 0, config = {}, lookahead = SLASH_QUERY_LOOKAHEAD):
self.path = regex
if regex:
self.regex = re.compile(regex + lookahead)
@ -59,12 +62,11 @@ class Route:
self.coll_group = coll_group
self._custom_init(config)
def __call__(self, env, use_abs_prefix):
wbrequest = self.parse_request(env, use_abs_prefix)
return self.handler(wbrequest) if wbrequest else None
def parse_request(self, env, use_abs_prefix, request_uri = None):
def parse_request(self, env, use_abs_prefix, request_uri=None):
if not request_uri:
request_uri = env['REL_REQUEST_URI']
@ -75,10 +77,12 @@ class Route:
matched_str = matcher.group(0)
if matched_str:
rel_prefix = env['SCRIPT_NAME'] + '/' + matched_str + '/'
wb_url_str = request_uri[len(matched_str) + 2:] # remove the '/' + rel_prefix part of uri
# remove the '/' + rel_prefix part of uri
wb_url_str = request_uri[len(matched_str) + 2:]
else:
rel_prefix = env['SCRIPT_NAME'] + '/'
wb_url_str = request_uri[1:] # the request_uri is the wb_url, since no coll
# the request_uri is the wb_url, since no coll
wb_url_str = request_uri[1:]
coll = matcher.group(self.coll_group)
@ -88,20 +92,19 @@ class Route:
rel_prefix=rel_prefix,
coll=coll,
use_abs_prefix=use_abs_prefix,
wburl_class = self.handler.get_wburl_type(),
wburl_class=self.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter)
# Allow for applying of additional filters
self._apply_filters(wbrequest, matcher)
return wbrequest
def _apply_filters(self, wbrequest, matcher):
for filter in self.filters:
last_grp = len(matcher.groups())
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
filter_str = filter.format(matcher.group(last_grp))
wbrequest.query_filter.append(filter_str)
def _custom_init(self, config):
self.filters = config.get('filters', [])
@ -112,7 +115,8 @@ class Route:
#=================================================================
# ReferRedirect -- redirect urls that have 'fallen through' based on the referrer settings
# ReferRedirect -- redirect urls that have 'fallen through'
# based on the referrer settings
#=================================================================
class ReferRedirect:
def __init__(self, match_prefixs):
@ -121,7 +125,6 @@ class ReferRedirect:
else:
self.match_prefixs = [match_prefixs]
def __call__(self, env, routes):
referrer = env.get('HTTP_REFERER')
@ -133,7 +136,7 @@ class ReferRedirect:
ref_split = urlparse.urlsplit(referrer)
# ensure referrer starts with one of allowed hosts
if not any (referrer.startswith(i) for i in self.match_prefixs):
if not any(referrer.startswith(i) for i in self.match_prefixs):
if ref_split.netloc != env.get('HTTP_HOST'):
return None
@ -144,13 +147,12 @@ class ReferRedirect:
if app_path:
# must start with current app name, if not root
if not path.startswith(app_path):
return None
return None
path = path[len(app_path):]
for route in routes:
ref_request = route.parse_request(env, False, request_uri = path)
ref_request = route.parse_request(env, False, request_uri=path)
if ref_request:
break
@ -174,6 +176,10 @@ class ReferRedirect:
# 2013/path.html -> /path.html
rel_request_uri = rel_request_uri[len(timestamp_path) - 1:]
final_url = urlparse.urlunsplit((ref_split.scheme, ref_split.netloc, rewriter.rewrite(rel_request_uri), '', ''))
final_url = urlparse.urlunsplit((ref_split.scheme,
ref_split.netloc,
rewriter.rewrite(rel_request_uri),
'',
''))
return WbResponse.redir_response(final_url)

View File

@ -1,15 +1,19 @@
from pywb.core.wbrequestresponse import WbResponse, WbRequest
from wbrequestresponse import WbResponse, WbRequest
from archivalrouter import ArchivalRouter
import urlparse
#=================================================================
# An experimental router which combines both archival and proxy modes
# http proxy mode support is very simple: only latest capture is available currently
# http proxy mode support is very simple so far:
# only latest capture is available currently
#=================================================================
class ProxyArchivalRouter:
def __init__(self, routes, hostpaths = None, abs_path = True, home_view = None, error_view = None):
self.archival = ArchivalRouter(routes, hostpaths, abs_path, home_view, error_view)
def __init__(self, routes, hostpaths=None, abs_path=True,
home_view=None, error_view=None):
self.archival = ArchivalRouter(routes, hostpaths, abs_path,
home_view, error_view)
self.proxy = ProxyRouter(routes[0].handler, hostpaths, error_view)
self.error_view = error_view
@ -29,7 +33,7 @@ class ProxyArchivalRouter:
# Only supports latest capture replay at the moment
#=================================================================
class ProxyRouter:
def __init__(self, handler, hostpaths = None, error_view = None):
def __init__(self, handler, hostpaths=None, error_view=None):
self.handler = handler
self.hostpaths = hostpaths
@ -56,27 +60,26 @@ class ProxyRouter:
return self.handler(wbrequest)
# Proxy Auto-Config (PAC) script for the proxy
def make_pac_response(self, env):
    """
    Build the Proxy Auto-Config (PAC) script response for this proxy.

    The generated ``FindProxyForURL`` returns "DIRECT" for the host part
    of every entry in ``self.hostpaths`` and for ``env['SERVER_NAME']``;
    any other host is sent to "PROXY <SERVER_NAME>:<SERVER_PORT>".

    :param env: WSGI environ; 'SERVER_NAME' and 'SERVER_PORT' are read
    :return: result of ``WbResponse.text_response`` with content type
             'application/x-ns-proxy-autoconfig'
    """
    server_hostport = env['SERVER_NAME'] + ':' + env['SERVER_PORT']

    # doubled braces come out as literal braces after str.format
    direct = ' if (shExpMatch(host, "{0}")) {{ return "DIRECT"; }}\n'

    parts = ['function FindProxyForURL (url, host) {\n']

    # hostpaths may be None (the constructor default): treat as empty
    for hostpath in self.hostpaths or []:
        host = urlparse.urlsplit(hostpath).netloc.split(':')[0]
        parts.append(direct.format(host))

    parts.append(direct.format(env['SERVER_NAME']))
    parts.append('\n return "PROXY {0}";\n}}\n'.format(server_hostport))

    script = ''.join(parts)
    content_type = 'application/x-ns-proxy-autoconfig'
    return WbResponse.text_response(script, content_type=content_type)
#=================================================================
@ -85,10 +88,11 @@ class ProxyRouter:
class ProxyHttpsUrlRewriter:
HTTP = 'http://'
HTTPS = 'https://'
def __init__(self, wbrequest, prefix):
pass
def rewrite(self, url, mod = None):
def rewrite(self, url, mod=None):
if url.startswith(self.HTTPS):
return self.HTTP + url[len(self.HTTPS):]
else:
@ -97,6 +101,5 @@ class ProxyHttpsUrlRewriter:
def get_timestamp_url(self, timestamp, url):
    """Return ``url`` unchanged; ``timestamp`` is not used."""
    return url
def get_abs_url(self, url=''):
    """Return ``url`` unchanged (empty string when omitted)."""
    return url

View File

@ -84,7 +84,7 @@ False
"""
from pywb.dispatch.archivalrouter import Route, ReferRedirect
from pywb.framework.archivalrouter import Route, ReferRedirect
from pywb.core.handlers import BaseHandler, WbUrlHandler
import pprint

View File

@ -41,7 +41,7 @@ from pywb.rewrite.wburl import WbUrl
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.core.wbrequestresponse import WbRequest, WbResponse
from pywb.framework.wbrequestresponse import WbRequest, WbResponse
def print_req_from_uri(request_uri, env={}, use_abs_prefix=False):

View File

@ -26,7 +26,6 @@ class WbRequest:
except KeyError:
return ''
def __init__(self, env,
request_uri=None,
rel_prefix='',
@ -40,7 +39,10 @@ class WbRequest:
self.env = env
self.request_uri = request_uri if request_uri else env.get('REL_REQUEST_URI')
if request_uri:
self.request_uri = request_uri
else:
self.request_uri = env.get('REL_REQUEST_URI')
self.coll = coll
@ -55,7 +57,6 @@ class WbRequest:
else:
self.wb_prefix = rel_prefix
if not wb_url_str:
wb_url_str = '/'
@ -83,7 +84,6 @@ class WbRequest:
# PERF
env['X_PERF'] = {}
def _is_ajax(self):
value = self.env.get('HTTP_X_REQUESTED_WITH')
if not value:
@ -96,7 +96,6 @@ class WbRequest:
return True
return False
def __repr__(self):
varlist = vars(self)
varstr = pprint.pformat(varlist)
@ -111,32 +110,39 @@ class WbResponse:
Holds a status_headers object and a response iter, to be
returned to wsgi container.
"""
def __init__(self, status_headers, value = []):
def __init__(self, status_headers, value=[]):
self.status_headers = status_headers
self.body = value
@staticmethod
def text_stream(stream, status='200 OK', content_type='text/plain'):
    """
    Build a WbResponse whose body is ``stream`` itself.

    :param stream: body value, passed through as-is (not wrapped)
    :param status: status line for the response
    :param content_type: value of the single Content-Type header
    """
    headers = [('Content-Type', content_type)]
    return WbResponse(StatusAndHeaders(status, headers), value=stream)
@staticmethod
def text_response(text, status='200 OK', content_type='text/plain'):
    """
    Build a WbResponse whose body is the one-element list ``[text]``.

    :param text: the response text
    :param status: status line for the response
    :param content_type: value of the single Content-Type header
    """
    headers = [('Content-Type', content_type)]
    return WbResponse(StatusAndHeaders(status, headers), value=[text])
@staticmethod
def redir_response(location, status='302 Redirect'):
    """
    Build a body-less WbResponse carrying a Location header.

    :param location: value of the Location header
    :param status: status line for the response
    """
    headers = [('Location', location)]
    return WbResponse(StatusAndHeaders(status, headers))
def __call__(self, env, start_response):
# PERF
perfstats = env.get('X_PERF')
if perfstats:
self.status_headers.headers.append(('X-Archive-Perf-Stats', str(perfstats)))
self.status_headers.headers.append(('X-Archive-Perf-Stats',
str(perfstats)))
start_response(self.status_headers.statusline, self.status_headers.headers)
start_response(self.status_headers.statusline,
self.status_headers.headers)
if env['REQUEST_METHOD'] == 'HEAD':
if hasattr(self.body, 'close'):
@ -148,6 +154,5 @@ class WbResponse:
else:
return [str(self.body)]
def __repr__(self):
return str(vars(self))

View File

@ -1,8 +1,9 @@
from pywb.utils.wbexception import WbException
from pywb.core.wbexceptions import NotFoundException, InternalRedirect
from pywb.core.wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.utils.loaders import load_yaml_config
from wbexceptions import NotFoundException, InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.utils.loaders import BlockLoader
import os
import importlib
@ -10,10 +11,13 @@ import logging
#=================================================================
# adapted from wsgiref.request_uri, but doesn't include domain name and allows all characters
# allowed in the path segment according to: http://tools.ietf.org/html/rfc3986#section-3.3
# adapted from wsgiref.request_uri, but doesn't include domain name
# and allows all characters which are allowed in the path segment
# according to: http://tools.ietf.org/html/rfc3986#section-3.3
# explained here:
# http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
# http://stackoverflow.com/questions/4669692/
# valid-characters-for-directory-part-of-a-url-for-short-links
def rel_request_uri(environ, include_query=1):
"""
Return the requested path, optionally including the query string
@ -28,7 +32,7 @@ def rel_request_uri(environ, include_query=1):
"/web/example.com/0~!+$&'()*+,;=:%22"
"""
from urllib import quote
url = quote(environ.get('PATH_INFO',''), safe='/~!$&\'()*+,;=:@')
url = quote(environ.get('PATH_INFO', ''), safe='/~!$&\'()*+,;=:@')
if include_query and environ.get('QUERY_STRING'):
url += '?' + environ['QUERY_STRING']
@ -50,7 +54,8 @@ def create_wb_app(wb_router):
response = wb_router(env)
if not response:
raise NotFoundException('No handler for "{0}"'.format(env['REL_REQUEST_URI']))
msg = 'No handler for "{0}"'.format(env['REL_REQUEST_URI'])
raise NotFoundException(msg)
except InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
@ -63,7 +68,6 @@ def create_wb_app(wb_router):
return response(env, start_response)
return application
@ -94,16 +98,6 @@ def handle_exception(env, error_view, exc, print_trace):
#=================================================================
DEFAULT_CONFIG_FILE = 'config.yaml'
def load_yaml_config(config_file=None):
    """
    Load a YAML config file through ``BlockLoader`` and parse it.

    :param config_file: location handed to ``BlockLoader().load``;
                        ``DEFAULT_CONFIG_FILE`` is used when empty
    :return: the object produced by ``yaml.load`` for the file contents
    """
    import yaml

    if not config_file:
        config_file = DEFAULT_CONFIG_FILE

    configdata = BlockLoader().load(config_file)
    try:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # acceptable only while config files are trusted -- consider
        # yaml.safe_load
        return yaml.load(configdata)
    finally:
        # release the reader once parsing is done
        if hasattr(configdata, 'close'):
            configdata.close()
#=================================================================
def init_app(init_func, load_yaml=True, config_file=None):
@ -114,6 +108,9 @@ def init_app(init_func, load_yaml=True, config_file=None):
if load_yaml:
if not config_file:
config_file = os.environ.get('PYWB_CONFIG_FILE')
if not config_file:
config_file = DEFAULT_CONFIG_FILE
config = load_yaml_config(config_file)
try:
@ -135,6 +132,7 @@ def init_app(init_func, load_yaml=True, config_file=None):
#=================================================================
DEFAULT_PORT = 8080
def start_wsgi_server(the_app):
from wsgiref.simple_server import make_server
from optparse import OptionParser
@ -153,7 +151,6 @@ def start_wsgi_server(the_app):
except:
port = DEFAULT_PORT
logging.debug('Starting CDX Server on port %s', port)
try:

View File

@ -1,11 +1,10 @@
import yaml
import pkgutil
from loaders import load_yaml_config
#=================================================================
DEFAULT_RULES_FILE = 'rules.yaml'
DEFAULT_RULES_PKG = 'pywb'
DEFAULT_RULES_FILE = 'pywb/rules.yaml'
#=================================================================
class RuleSet(object):
@ -23,10 +22,14 @@ class RuleSet(object):
self.rules = []
ds_rules_file = kwargs.get('ds_rules_file')
default_rule_config = kwargs.get('default_rule_config')
config = self.load_default_rules(ds_rules_file)
ds_rules_file = kwargs.get('ds_rules_file')
if not ds_rules_file:
ds_rules_file = DEFAULT_RULES_FILE
config = load_yaml_config(ds_rules_file)
rulesmap = config.get('rules') if config else None
@ -53,22 +56,6 @@ class RuleSet(object):
if not def_key_found and default_rule_config is not None:
self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))
@staticmethod
def load_default_rules(filename=None, pkg=None):
    """
    Load a rules YAML file shipped as package data.

    :param filename: resource name inside ``pkg``; ``DEFAULT_RULES_FILE``
                     is used when empty
    :param pkg: package to read from; ``DEFAULT_RULES_PKG`` when empty
    :return: the object produced by ``yaml.load`` for the resource
    """
    filename = filename or DEFAULT_RULES_FILE
    pkg = pkg or DEFAULT_RULES_PKG

    yaml_str = pkgutil.get_data(pkg, filename)
    # NOTE(review): yaml.load can construct arbitrary Python objects;
    # fine for rules bundled with the package -- consider yaml.safe_load
    # if this is ever pointed at external files
    return yaml.load(yaml_str)
def iter_matching(self, urlkey):
"""
Iterate over all matching rules for given urlkey

View File

@ -7,12 +7,20 @@ import os
import hmac
import urllib2
import time
from pkg_resources import resource_stream
import pkg_resources
#=================================================================
def is_http(filename):
    """Return True when ``filename`` starts with http:// or https://."""
    # str.startswith accepts a tuple: one call checks both schemes
    return filename.startswith(('http://', 'https://'))
#=================================================================
def load_yaml_config(config_file):
    """
    Load ``config_file`` through ``BlockLoader`` and parse it as YAML.

    :param config_file: location handed unchanged to ``BlockLoader().load``
    :return: the object produced by ``yaml.load`` for the file contents
    """
    import yaml

    configdata = BlockLoader().load(config_file)
    try:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # acceptable only while config files are trusted -- consider
        # yaml.safe_load
        return yaml.load(configdata)
    finally:
        # release the reader once parsing is done
        if hasattr(configdata, 'close'):
            configdata.close()
#=================================================================
@ -39,16 +47,27 @@ class BlockLoader(object):
Load a file-like reader from the local file system
"""
file_only = False
if url.startswith('file://'):
url = url[len('file://'):]
file_only = True
try:
# first, try as file
afile = open(url, 'rb')
except IOError as file_err:
except IOError:
#if file_only:
# raise
# then, try as package.path/file
pkg_split = url.split('/', 1)
afile = resource_stream(pkg_split[0], pkg_split[1])
#if len(pkg_split) == 1:
# raise
afile = pkg_resources.resource_stream(pkg_split[0],
pkg_split[1])
if offset > 0:
afile.seek(offset)

View File

@ -1,6 +1,6 @@
import webtest
from pywb.bootstrap.pywb_init import create_wb_router
from pywb.bootstrap.wsgi_wrappers import init_app
from pywb.core.pywb_init import create_wb_router
from pywb.framework.wsgi_wrappers import init_app
from pywb.cdx.cdxobject import CDXObject
from fixture import TestExclusionPerms