1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

refactor domain specific rules:

- head insert callback passed in with rule, up to template
to handle additional inserts based on rule properties
- ability to pass in custom rules config to both cdx server
and content rewriter
- move canonicalize to utils pkg
- add wombat, modify wb.js to remove wombat-related settings
This commit is contained in:
Ilya Kreymer 2014-02-26 22:04:37 -08:00
parent 5a41f59f39
commit 453ab678ed
16 changed files with 482 additions and 55 deletions

View File

@ -5,11 +5,11 @@ import pkgutil
from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.dsrules import BaseRule, RuleSet
from canonicalize import unsurt, UrlCanonicalizer from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
#================================================================= #=================================================================
def load_domain_specific_cdx_rules(filename, surt_ordered): def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
#fh = pkgutil.get_data(__package__, filename) #fh = pkgutil.get_data(__package__, filename)
#config = yaml.load(fh) #config = yaml.load(fh)
@ -17,7 +17,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered):
fuzzy = None fuzzy = None
# Load Canonicalizer Rules # Load Canonicalizer Rules
rules = RuleSet(CDXDomainSpecificRule, 'canonicalize') rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
ds_rules_file=ds_rules_file)
if not surt_ordered: if not surt_ordered:
for rule in rules: for rule in rules:
@ -27,7 +28,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered):
canon = CustomUrlCanonicalizer(rules, surt_ordered) canon = CustomUrlCanonicalizer(rules, surt_ordered)
# Load Fuzzy Lookup Rules # Load Fuzzy Lookup Rules
rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup') rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
ds_rules_file=ds_rules_file)
if not surt_ordered: if not surt_ordered:
for rule in rules: for rule in rules:

View File

@ -1,4 +1,4 @@
from canonicalize import UrlCanonicalizer, calc_search_range from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
@ -17,13 +17,13 @@ import urlparse
#================================================================= #=================================================================
class BaseCDXServer(object): class BaseCDXServer(object):
def __init__(self, **kwargs): def __init__(self, **kwargs):
ds_rules = kwargs.get('ds_rules') ds_rules_file = kwargs.get('ds_rules_file')
surt_ordered = kwargs.get('surt_ordered', True) surt_ordered = kwargs.get('surt_ordered', True)
# load from domain-specific rules # load from domain-specific rules
if ds_rules: if ds_rules_file:
self.url_canon, self.fuzzy_query = ( self.url_canon, self.fuzzy_query = (
load_domain_specific_cdx_rules(ds_rules, surt_ordered)) load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
# or custom passed in canonicalizer # or custom passed in canonicalizer
else: else:
self.url_canon = kwargs.get('url_canon') self.url_canon = kwargs.get('url_canon')
@ -166,7 +166,7 @@ def create_cdx_server(config, ds_rules_file=None):
return server_cls(paths, return server_cls(paths,
config=pass_config, config=pass_config,
surt_ordered=surt_ordered, surt_ordered=surt_ordered,
ds_rules=ds_rules_file, ds_rules_file=ds_rules_file,
perms_checker=perms_checker) perms_checker=perms_checker)

View File

@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
return file return file
#================================================================= #=================================================================
def create_wb_handler(cdx_server, config): def create_wb_handler(cdx_server, config, ds_rules_file=None):
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
paths = config.get('archive_paths') paths = config.get('archive_paths')
resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) resolving_loader = ResolvingLoader(paths=paths,
cdx_server=cdx_server,
record_loader=record_loader)
replayer = replay_views.ReplayView( replayer = replay_views.ReplayView(
content_loader = resolving_loader, content_loader = resolving_loader,
content_rewriter = RewriteContent(), content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),

View File

@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}):
route_config = DictChain(value, config) route_config = DictChain(value, config)
ds_rules = route_config.get('domain_specific_rules', None) ds_rules_file = route_config.get('domain_specific_rules', None)
cdx_server = IndexReader(route_config, ds_rules) cdx_server = IndexReader(route_config, ds_rules_file)
wb_handler = config_utils.create_wb_handler( wb_handler = config_utils.create_wb_handler(
cdx_server = cdx_server, cdx_server=cdx_server,
config = route_config, config=route_config,
ds_rules_file=ds_rules_file,
) )
logging.debug('Adding Collection: ' + name) logging.debug('Adding Collection: ' + name)

View File

@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
from wbexceptions import CaptureException, InternalRedirect from wbexceptions import CaptureException, InternalRedirect
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
#================================================================= #=================================================================
class ReplayView: class ReplayView:
def __init__(self, content_loader, content_rewriter, head_insert_view = None, def __init__(self, content_loader, content_rewriter, head_insert_view = None,
@ -99,20 +100,34 @@ class ReplayView:
def rewrite_content(self, wbrequest, cdx, status_headers, stream): def rewrite_content(self, wbrequest, cdx, status_headers, stream):
urlrewriter = wbrequest.urlrewriter urlrewriter = wbrequest.urlrewriter
(rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) result = self.content_rewriter.rewrite_headers(urlrewriter,
status_headers,
stream,
cdx['urlkey'])
(rewritten_headers, stream) = result
# no rewriting needed! # no rewriting needed!
if rewritten_headers.text_type is None: if rewritten_headers.text_type is None:
response_iter = self.stream_to_iter(stream) response_iter = self.stream_to_iter(stream)
return WbResponse(rewritten_headers.status_headers, response_iter) return WbResponse(rewritten_headers.status_headers, response_iter)
# do head insert def make_head_insert(rule):
return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
cdx=cdx,
rule=rule))
# do head insert
if self.head_insert_view: if self.head_insert_view:
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) head_insert_func = make_head_insert
else: else:
head_insert_str = None head_insert_func = None
(status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) result = self.content_rewriter.rewrite_content(urlrewriter,
rewritten_headers,
stream,
head_insert_func,
cdx['urlkey'])
(status_headers, response_gen) = result
if self.buffer_response: if self.buffer_response:
if wbrequest.wb_url.mod == 'id_': if wbrequest.wb_url.mod == 'id_':

View File

@ -11,9 +11,12 @@ from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
#=================================================================
class RewriteContent: class RewriteContent:
def __init__(self, config=None): def __init__(self, ds_rules_file=None):
self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {}) self.ruleset = RuleSet(RewriteRules, 'rewrite',
default_rule_config={},
ds_rules_file=ds_rules_file)
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''): def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header'] header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
@ -31,7 +34,7 @@ class RewriteContent:
return (rewritten_headers, stream) return (rewritten_headers, stream)
def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''): def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''):
# see if we've already rewritten headers # see if we've already rewritten headers
if isinstance(headers, RewrittenStatusAndHeaders): if isinstance(headers, RewrittenStatusAndHeaders):
@ -65,7 +68,6 @@ class RewriteContent:
text_type = rewritten_headers.text_type text_type = rewritten_headers.text_type
#rewriter_class = self.rewriters.get(text_type)
rule = self.ruleset.get_first_match(urlkey) rule = self.ruleset.get_first_match(urlkey)
try: try:
@ -74,10 +76,13 @@ class RewriteContent:
raise Exception('Unknown Text Type for Rewrite: ' + text_type) raise Exception('Unknown Text Type for Rewrite: ' + text_type)
#import sys #import sys
#sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey)))) #sys.stderr.write(str(vars(rule)))
if text_type == 'html': if text_type == 'html':
head_insert_str = rule.create_head_inserts() + head_insert_str head_insert_str = ''
if head_insert_func:
head_insert_str = head_insert_func(rule)
rewriter = rewriter_class(urlrewriter, rewriter = rewriter_class(urlrewriter,
outstream=None, outstream=None,

View File

@ -7,11 +7,11 @@ import mimetypes
from pywb.utils.loaders import is_http from pywb.utils.loaders import is_http
from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.timeutils import datetime_to_timestamp
from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeaders
from pywb.utils.canonicalize import canonicalize
from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.rewrite_content import RewriteContent from pywb.rewrite.rewrite_content import RewriteContent
from pywb.cdx.canonicalize import canonicalize
""" """
Fetch a url from live web and apply rewriting rules Fetch a url from live web and apply rewriting rules
@ -43,7 +43,7 @@ def get_local_file(uri):
return (status_headers, stream) return (status_headers, stream)
#================================================================= #=================================================================
def get_rewritten(url, urlrewriter, urlkey=None): def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
if is_http(url): if is_http(url):
(status_headers, stream) = get_status_and_stream(url) (status_headers, stream) = get_status_and_stream(url)
else: else:
@ -53,11 +53,15 @@ def get_rewritten(url, urlrewriter, urlkey=None):
if not urlkey: if not urlkey:
urlkey = canonicalize(url) urlkey = canonicalize(url)
status_headers, gen = RewriteContent().rewrite_content(urlrewriter, rewriter = RewriteContent()
status_headers,
stream, result = rewriter.rewrite_content(urlrewriter,
head_insert_str='', status_headers,
urlkey=urlkey) stream,
head_insert_func=head_insert_func,
urlkey=urlkey)
status_headers, gen = result
buff = '' buff = ''
for x in gen: for x in gen:

View File

@ -0,0 +1,53 @@
from pywb.utils.dsrules import BaseRule
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
from html_rewriter import HTMLRewriter
from header_rewriter import HeaderRewriter
import itertools
class RewriteRules(BaseRule):
    """
    Rewriting rule for a given url_prefix.

    Holds, per content type ('header', 'css', 'xml', 'html', 'js'),
    the rewriter class (or factory callable) to use for urls
    matching this rule. Each entry may be overridden from the
    rule config.
    """

    def __init__(self, url_prefix, config=None):
        """
        :param url_prefix: prefix (or list of prefixes) this rule applies to
        :param config: dict of settings for this rule; recognized keys are
            'header_class', 'css_class', 'xml_class', 'html_class',
            'js_class', 'js_rewrite_location' and 'js_regexs'.
            Defaults to an empty config.
        """
        # a fresh dict per call instead of a shared mutable default
        if config is None:
            config = {}

        super(RewriteRules, self).__init__(url_prefix, config)

        self.rewriters = {}

        self.rewriters['header'] = config.get('header_class', HeaderRewriter)
        self.rewriters['css'] = config.get('css_class', CSSRewriter)
        self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
        self.rewriters['html'] = config.get('html_class', HTMLRewriter)

        # Custom handling for js rewriting, often the most complex
        # ability to toggle location rewriting, always stored as a bool
        self.js_rewrite_location = bool(config.get('js_rewrite_location',
                                                   True))

        if self.js_rewrite_location:
            js_default_class = JSLinkAndLocationRewriter
        else:
            js_default_class = JSLinkOnlyRewriter

        # set js class, using either default or override from config
        self.rewriters['js'] = config.get('js_class', js_default_class)

        # add any regexs for js rewriter
        self._add_custom_regexs('js', config)

    def _add_custom_regexs(self, field, config):
        """
        If config contains '<field>_regexs', replace the rewriter for
        `field` with a factory which creates the current rewriter class
        with the parsed regex rules passed as a second argument.
        Does nothing if no regexs are configured.
        """
        regexs = config.get(field + '_regexs')
        if not regexs:
            return

        rewriter_cls = self.rewriters[field]

        rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs)

        def extend_rewriter_with_regex(urlrewriter):
            return rewriter_cls(urlrewriter, rule_def_tuples)

        self.rewriters[field] = extend_rewriter_with_regex

View File

@ -8,9 +8,18 @@ from pywb import get_test_dir
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
def head_insert_func(rule):
    """
    Test head-insert callback: return the wombat.js script tag when
    the matched rule has js location rewriting enabled, otherwise
    an empty string.
    """
    if rule.js_rewrite_location:
        return '<script src="/static/default/wombat.js"> </script>'

    return ''
def test_local_1(): def test_local_1():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/') status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
'com,example,test)/',
head_insert_func)
# wombat insert added # wombat insert added
assert '<head><script src="/static/default/wombat.js"> </script>' in buff assert '<head><script src="/static/default/wombat.js"> </script>' in buff
@ -23,7 +32,10 @@ def test_local_1():
def test_local_2_no_js_location_rewrite(): def test_local_2_no_js_location_rewrite():
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite') status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
urlrewriter,
'example,example,test)/nolocation_rewrite',
head_insert_func)
# no wombat insert # no wombat insert
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
@ -55,6 +67,6 @@ def test_example_domain_specific_3():
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
# comment out bootloader # comment out bootloader
assert '/* Bootloader.configurePage' in buff, buff assert '/* Bootloader.configurePage' in buff

View File

@ -1,18 +1,21 @@
/*
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
This file is part of pywb.
// Rewritten location and domain obj setup pywb is free software: you can redistribute it and/or modify
window.WB_wombat_location = window.location it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
if (window.top != window) { pywb is distributed in the hope that it will be useful,
window.top.WB_wombat_location = window.top.location but WITHOUT ANY WARRANTY; without even the implied warranty of
} MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
if (window.opener) {
window.opener.WB_wombat_location = window.opener.location
}
document.WB_wombat_domain = document.domain
You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/
function initBanner() function initBanner()
{ {

219
pywb/static/wombat.js Normal file
View File

@ -0,0 +1,219 @@
/*
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
This file is part of pywb.
pywb is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pywb is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with pywb. If not, see <http://www.gnu.org/licenses/>.
*/
//============================================
// Wombat JS-Rewriting Library
//============================================
// Module state, all assigned by WB_wombat_Init()

// Replay prefix without the capture date, as passed to WB_wombat_Init()
var WB_wombat_replayPrefix;
// replayPrefix + captureDate + "/"
var WB_wombat_replayDatePrefix;
// "/" + captureDate + "/"
var WB_wombat_captureDatePart;
// "http://" + original host of the capture
var WB_wombat_origHost;
// Remove an explicit port from the host part of an http:// url.
// Urls without an explicit port (or not starting with http://)
// are returned unchanged.
function WB_StripPort(str)
{
    var found = str.match(/^http:\/\/[\w\d@.-]+:\d+/);

    if (!found) {
        return str;
    }

    var schemeHostPort = found[0];
    var portStart = schemeHostPort.lastIndexOf(':');

    // scheme + host (port cut off), followed by the rest of the url
    var schemeHost = schemeHostPort.substr(0, portStart);
    var remainder = str.substr(schemeHostPort.length);

    return schemeHost + remainder;
}
// Guess whether a scheme-less string starts with a host name.
// Returns true for strings starting with "www.", for hostname:port
// and for ip[:port], when the matched part is shorter than 64 chars.
function WB_IsHostUrl(str)
{
    var MAX_MATCH_LEN = 64;

    // Good guess that it's a hostname
    if (str.indexOf("www.") == 0) {
        return true;
    }

    var hostPatterns = [
        // hostname:port (port required)
        /^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/,
        // ip:port
        /^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/
    ];

    for (var i = 0; i < hostPatterns.length; i++) {
        var found = str.match(hostPatterns[i]);
        if (found && (found[0].length < MAX_MATCH_LEN)) {
            return true;
        }
    }

    return false;
}
// Rewrite a url so that it points back into the replay.
// Non-string or empty values, urls already starting with the replay
// prefix, and anything not recognized as a url are returned as is.
function WB_RewriteUrl(url)
{
    var httpPrefix = "http://";

    // Not a (non-empty) string: nothing to rewrite
    if (!url || (typeof url) != "string") {
        return url;
    }

    // Already rewritten. Only the replay prefix (no date) is checked,
    // as the date may be different for each capture
    if (url.indexOf(WB_wombat_replayPrefix) == 0) {
        return url;
    }

    // Server relative url
    if (url.charAt(0) == "/") {
        // Capture date already part of the path, leave untouched
        var hasCaptureDate = (url.indexOf(WB_wombat_captureDatePart) >= 0);

        if (hasCaptureDate) {
            return url;
        }

        // Otherwise add date prefix and original host
        return WB_wombat_replayDatePrefix + WB_wombat_origHost + url;
    }

    // Full url starting with http://, just add the date prefix
    if (url.indexOf(httpPrefix) == 0) {
        return WB_wombat_replayDatePrefix + url;
    }

    // Possibly a bare hostname: if so, add date prefix and http://
    if (WB_IsHostUrl(url)) {
        return WB_wombat_replayDatePrefix + httpPrefix + url;
    }

    return url;
}
// Return a new plain object holding a shallow copy of every enumerable
// property of obj (including inherited ones) whose value is not a function.
function WB_CopyObjectFields(obj)
{
    var newObj = {};

    // 'var' keeps the loop variable local: without it 'prop' leaks
    // into the global scope (and throws in strict mode)
    for (var prop in obj) {
        if ((typeof obj[prop]) != "function") {
            newObj[prop] = obj[prop];
        }
    }

    return newObj;
}
// Extract the original url from a replay url: everything from the
// first "/http" found after position 0 (leading slash excluded).
// Returns "" for empty values, and the string form of href unchanged
// if no embedded url is found.
function WB_ExtractOrig(href)
{
    if (!href) {
        return "";
    }

    var hrefStr = href.toString();
    var slashPos = hrefStr.indexOf("/http", 1);

    return (slashPos > 0) ? hrefStr.substr(slashPos + 1) : hrefStr;
}
// Create a plain-object stand-in for a location object.
// Non-function fields are copied, href is replaced with the original
// (un-rewritten) url, and replace/assign/reload forward to the real
// location object, rewriting the target url where one is given.
function WB_CopyLocationObj(loc)
{
    var newLoc = WB_CopyObjectFields(loc);

    // Keep the real location and its href at copy time, used later
    // to detect changes made to the copy
    newLoc._origLoc = loc;
    newLoc._origHref = loc.href;

    // Rewrite replace and assign functions
    newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); }
    newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); }

    // Forward reload to the real location: copying loc.reload directly
    // would invoke the native method with the copy as its receiver
    newLoc.reload = function() { return this._origLoc.reload.apply(this._origLoc, arguments); }

    newLoc.href = WB_ExtractOrig(newLoc._origHref);
    newLoc.toString = function() { return this.href; }

    return newLoc;
}
// Navigate 'location' to the rewritten form of reqHref, but only if
// reqHref is set and refers to a different original url than origHref.
function WB_wombat_updateLoc(reqHref, origHref, location)
{
    if (!reqHref) {
        return;
    }

    var currentOrig = WB_ExtractOrig(origHref);
    var requestedOrig = WB_ExtractOrig(reqHref);

    if (currentOrig == requestedOrig) {
        return;
    }

    location.href = WB_RewriteUrl(reqHref);
}
// Compare a WB_wombat_location value against the real location
// (top window if isTop, else current window) and navigate if the
// page has changed it.
function WB_wombat_checkLocationChange(wbLoc, isTop)
{
    var kind = (typeof wbLoc);
    var realLoc = isTop ? window.top.location : window.location;

    switch (kind) {
    case "string":
        // A string was assigned over the location object itself
        WB_wombat_updateLoc(wbLoc, realLoc.href, realLoc);
        break;

    case "object":
        // Location copy: compare its href with the href saved at copy time
        WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, realLoc);
        break;
    }
}
// Guard flag: true while a location check is in progress
var wombat_updating = false;

// Check the current window's (and, when framed, the top window's)
// WB_wombat_location for changes and apply them.
// Returns false if a check is already in progress.
function WB_wombat_checkLocations()
{
    if (wombat_updating) {
        return false;
    }

    wombat_updating = true;

    // Always clear the flag, even if a check throws, otherwise
    // every later check would be skipped
    try {
        WB_wombat_checkLocationChange(window.WB_wombat_location, false);

        if (window.self.location != window.top.location) {
            WB_wombat_checkLocationChange(window.top.WB_wombat_location, true);
        }
    } finally {
        wombat_updating = false;
    }
}
// Initialize wombat for the current page: store the replay settings
// in the module globals and install WB_wombat_location copies on the
// current window, the top window (when framed) and the opener (if any),
// plus document.WB_wombat_domain.
function WB_wombat_Init(replayPrefix, captureDate, origHost)
{
    // Replay settings used by WB_RewriteUrl
    WB_wombat_replayPrefix = replayPrefix;
    WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/";
    WB_wombat_captureDatePart = "/" + captureDate + "/";
    WB_wombat_origHost = "http://" + origHost;

    // Location copy for this window
    window.WB_wombat_location = WB_CopyLocationObj(window.self.location);

    // Location copy for the top window, only when inside a frame
    var isFramed = (window.self.location != window.top.location);
    if (isFramed) {
        window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location);
    }

    // Location copy for the opener window, if there is one
    if (window.opener) {
        window.opener.WB_wombat_location = WB_CopyLocationObj(window.opener.location);
    }

    document.WB_wombat_domain = origHost;
}
// Check quickly (100 ms) after this script is loaded
setTimeout(WB_wombat_checkLocations, 100);
// Then keep checking periodically, every 500 ms
setInterval(WB_wombat_checkLocations, 500);

View File

@ -1,7 +1,14 @@
<!-- WB Insert --> <!-- WB Insert -->
{% if rule.js_rewrite_location %}
<script src='{{ wbrequest.host_prefix }}/static/default/wombat.js'> </script>
<script> <script>
wbinfo = {} WB_wombat_Init("{{wbrequest.wb_prefix}}", "{{cdx['timestamp']}}", "{{cdx['original'] | host}}");
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}"; </script>
{% endif %}
<script>
wbinfo = {}
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
</script> </script>
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script> <script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/> <link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>

View File

@ -3,8 +3,6 @@
import surt import surt
import urlparse import urlparse
from cdxobject import CDXException
#================================================================= #=================================================================
class UrlCanonicalizer(object): class UrlCanonicalizer(object):
@ -15,6 +13,12 @@ class UrlCanonicalizer(object):
return canonicalize(url, self.surt_ordered) return canonicalize(url, self.surt_ordered)
#=================================================================
class UrlCanonicalizeException(Exception):
    """
    Raised when a url can not be canonicalized.
    """

    def status(self):
        """
        Return the HTTP status line to report for this error.
        """
        http_status = '400 Bad Request'
        return http_status
#================================================================= #=================================================================
def canonicalize(url, surt_ordered=True): def canonicalize(url, surt_ordered=True):
""" """
@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True):
try: try:
key = surt.surt(url) key = surt.surt(url)
except Exception as e: except Exception as e:
raise CDXException('Invalid Url: ' + url) raise UrlCanonicalizeException('Invalid Url: ' + url)
# if not surt, unsurt the surt to get canonicalized non-surt url # if not surt, unsurt the surt to get canonicalized non-surt url
if not surt_ordered: if not surt_ordered:

98
pywb/utils/dsrules.py Normal file
View File

@ -0,0 +1,98 @@
import yaml
import pkgutil
#=================================================================
DEFAULT_RULES_FILE = 'rules.yaml'
DEFAULT_RULES_PKG = 'pywb'
#=================================================================
class RuleSet(object):
    """
    An ordered list of domain specific rules of a single type,
    loaded from a rules config.
    """

    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        A domain specific rules block, loaded from a rules config file.
        If no rules file is specified, it is loaded from the default
        location.

        The config contains a 'rules' list, each entry holding
        a 'url_prefix' and one or more rule fields. Each RuleSet
        loads only its own field type (fieldname) from the list,
        creating one rule_cls(url_prefix, field_config) per entry
        that defines that field.

        Recognized keyword args:
        - ds_rules_file: name of rules file to load instead of default
        - default_rule_config: if not None, a rule for DEFAULT_KEY is
          added with this config, unless the rules file already
          defines one
        """
        self.rules = []

        ds_rules_file = kwargs.get('ds_rules_file')
        default_rule_config = kwargs.get('default_rule_config')

        config = self.load_default_rules(ds_rules_file)

        # a missing config or missing/empty 'rules' section is simply
        # an empty rule list, not an error
        rulesmap = (config.get('rules') if config else None) or []

        def_key_found = False

        # iterate over master rules file
        for value in rulesmap:
            url_prefix = value.get('url_prefix')
            rules_def = value.get(fieldname)
            if not rules_def:
                continue

            if url_prefix == self.DEFAULT_KEY:
                def_key_found = True

            self.rules.append(rule_cls(url_prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if not def_key_found and default_rule_config is not None:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    @staticmethod
    def load_default_rules(filename=None, pkg=None):
        """
        Load and parse a yaml rules file from package data.

        filename and pkg default to DEFAULT_RULES_FILE and
        DEFAULT_RULES_PKG. Returns the parsed config, or None if
        the package data could not be read.
        """
        filename = filename or DEFAULT_RULES_FILE
        pkg = pkg or DEFAULT_RULES_PKG

        yaml_str = pkgutil.get_data(pkg, filename)
        if yaml_str is None:
            return None

        # NOTE(review): yaml.load can construct arbitrary python objects;
        # consider yaml.safe_load if rules files may come from
        # untrusted sources -- confirm no rules rely on python tags
        return yaml.load(yaml_str)

    def __iter__(self):
        """
        Iterate over all rules, in load order
        """
        return iter(self.rules)

    def iter_matching(self, urlkey):
        """
        Iterate over all matching rules for given urlkey
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                yield rule

    def get_first_match(self, urlkey):
        """
        Return the first rule matching the given urlkey,
        or None if no rule matches
        """
        return next(self.iter_matching(urlkey), None)
#=================================================================
class BaseRule(object):
    """
    Common base for all domain specific rules.

    Stores the url_prefix key(s) the rule was defined for and
    tests whether a given urlkey falls under any of them.
    Subclasses interpret the rule config themselves.
    """

    def __init__(self, url_prefix, rules):
        # always keep prefixes as a list, wrapping a single value
        if isinstance(url_prefix, list):
            self.url_prefix = url_prefix
        else:
            self.url_prefix = [url_prefix]

    def applies(self, urlkey):
        """
        Return True if urlkey starts with any prefix of this rule
        """
        for prefix in self.url_prefix:
            if urlkey.startswith(prefix):
                return True

        return False

View File

@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect
from wbrequestresponse import WbResponse, StatusAndHeaders from wbrequestresponse import WbResponse, StatusAndHeaders
from pywb.cdx.cdxserver import CDXException from pywb.cdx.cdxserver import CDXException
from pywb.utils.canonicalize import UrlCanonicalizeException
from pywb.warc.recordloader import ArchiveLoadFailed from pywb.warc.recordloader import ArchiveLoadFailed
import os import os
@ -55,7 +56,8 @@ def create_wb_app(wb_router):
except InternalRedirect as ir: except InternalRedirect as ir:
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
except (WbException, CDXException, ArchiveLoadFailed) as e: except (WbException, CDXException,
UrlCanonicalizeException, ArchiveLoadFailed) as e:
response = handle_exception(env, wb_router.error_view, e, False) response = handle_exception(env, wb_router.error_view, e, False)
except Exception as e: except Exception as e:

View File

@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
license='GPL', license='GPL',
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))], ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],