diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index a9e06778..006dd88d 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -5,11 +5,11 @@ import pkgutil from pywb.utils.dsrules import BaseRule, RuleSet -from canonicalize import unsurt, UrlCanonicalizer +from pywb.utils.canonicalize import unsurt, UrlCanonicalizer #================================================================= -def load_domain_specific_cdx_rules(filename, surt_ordered): +def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): #fh = pkgutil.get_data(__package__, filename) #config = yaml.load(fh) @@ -17,7 +17,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered): fuzzy = None # Load Canonicalizer Rules - rules = RuleSet(CDXDomainSpecificRule, 'canonicalize') + rules = RuleSet(CDXDomainSpecificRule, 'canonicalize', + ds_rules_file=ds_rules_file) if not surt_ordered: for rule in rules: @@ -27,7 +28,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered): canon = CustomUrlCanonicalizer(rules, surt_ordered) # Load Fuzzy Lookup Rules - rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup') + rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup', + ds_rules_file=ds_rules_file) if not surt_ordered: for rule in rules: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 1a68f7e4..7f548ec4 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,4 +1,4 @@ -from canonicalize import UrlCanonicalizer, calc_search_range +from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource @@ -17,13 +17,13 @@ import urlparse #================================================================= class BaseCDXServer(object): def __init__(self, **kwargs): - ds_rules = kwargs.get('ds_rules') + ds_rules_file = kwargs.get('ds_rules_file') surt_ordered = kwargs.get('surt_ordered', True) # load from 
domain-specific rules - if ds_rules: + if ds_rules_file: self.url_canon, self.fuzzy_query = ( - load_domain_specific_cdx_rules(ds_rules, surt_ordered)) + load_domain_specific_cdx_rules(ds_rules_file, surt_ordered)) # or custom passed in canonicalizer else: self.url_canon = kwargs.get('url_canon') @@ -166,7 +166,7 @@ def create_cdx_server(config, ds_rules_file=None): return server_cls(paths, config=pass_config, surt_ordered=surt_ordered, - ds_rules=ds_rules_file, + ds_rules_file=ds_rules_file, perms_checker=perms_checker) diff --git a/pywb/config_utils.py b/pywb/config_utils.py index 672e8735..05844a2e 100644 --- a/pywb/config_utils.py +++ b/pywb/config_utils.py @@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView): return file #================================================================= -def create_wb_handler(cdx_server, config): +def create_wb_handler(cdx_server, config, ds_rules_file=None): record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) paths = config.get('archive_paths') - resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) replayer = replay_views.ReplayView( content_loader = resolving_loader, - content_rewriter = RewriteContent(), + content_rewriter = RewriteContent(ds_rules_file=ds_rules_file), head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index be4bdded..bd63bfd5 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}): route_config = DictChain(value, config) - ds_rules = route_config.get('domain_specific_rules', None) - cdx_server = IndexReader(route_config, ds_rules) + ds_rules_file = route_config.get('domain_specific_rules', None) + cdx_server = IndexReader(route_config, 
ds_rules_file) wb_handler = config_utils.create_wb_handler( - cdx_server = cdx_server, - config = route_config, + cdx_server=cdx_server, + config=route_config, + ds_rules_file=ds_rules_file, ) logging.debug('Adding Collection: ' + name) diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 4c6907eb..9113ad5f 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed + #================================================================= class ReplayView: def __init__(self, content_loader, content_rewriter, head_insert_view = None, @@ -99,20 +100,34 @@ class ReplayView: def rewrite_content(self, wbrequest, cdx, status_headers, stream): urlrewriter = wbrequest.urlrewriter - (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) + result = self.content_rewriter.rewrite_headers(urlrewriter, + status_headers, + stream, + cdx['urlkey']) + (rewritten_headers, stream) = result # no rewriting needed! 
if rewritten_headers.text_type is None: response_iter = self.stream_to_iter(stream) return WbResponse(rewritten_headers.status_headers, response_iter) - # do head insert + def make_head_insert(rule): + return (self.head_insert_view.render_to_string(wbrequest=wbrequest, + cdx=cdx, + rule=rule)) + # do head insert if self.head_insert_view: - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) + head_insert_func = make_head_insert else: - head_insert_str = None + head_insert_func = None - (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) + result = self.content_rewriter.rewrite_content(urlrewriter, + rewritten_headers, + stream, + head_insert_func, + cdx['urlkey']) + + (status_headers, response_gen) = result if self.buffer_response: if wbrequest.wb_url.mod == 'id_': diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 80daf7e3..1ba3d321 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -11,9 +11,12 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader +#================================================================= class RewriteContent: - def __init__(self, config=None): - self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {}) + def __init__(self, ds_rules_file=None): + self.ruleset = RuleSet(RewriteRules, 'rewrite', + default_rule_config={}, + ds_rules_file=ds_rules_file) def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''): header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header'] @@ -31,7 +34,7 @@ class RewriteContent: return (rewritten_headers, stream) - def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''): + def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''): # see 
if we've already rewritten headers if isinstance(headers, RewrittenStatusAndHeaders): @@ -65,7 +68,6 @@ class RewriteContent: text_type = rewritten_headers.text_type - #rewriter_class = self.rewriters.get(text_type) rule = self.ruleset.get_first_match(urlkey) try: @@ -74,10 +76,13 @@ class RewriteContent: raise Exception('Unknown Text Type for Rewrite: ' + text_type) #import sys - #sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey)))) + #sys.stderr.write(str(vars(rule))) if text_type == 'html': - head_insert_str = rule.create_head_inserts() + head_insert_str + head_insert_str = '' + + if head_insert_func: + head_insert_str = head_insert_func(rule) rewriter = rewriter_class(urlrewriter, outstream=None, diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 9d752d10..63783234 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -7,11 +7,11 @@ import mimetypes from pywb.utils.loaders import is_http from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.utils.canonicalize import canonicalize from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.rewrite_content import RewriteContent -from pywb.cdx.canonicalize import canonicalize """ Fetch a url from live web and apply rewriting rules @@ -43,7 +43,7 @@ def get_local_file(uri): return (status_headers, stream) #================================================================= -def get_rewritten(url, urlrewriter, urlkey=None): +def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None): if is_http(url): (status_headers, stream) = get_status_and_stream(url) else: @@ -53,11 +53,15 @@ def get_rewritten(url, urlrewriter, urlkey=None): if not urlkey: urlkey = canonicalize(url) - status_headers, gen = RewriteContent().rewrite_content(urlrewriter, - status_headers, - stream, - head_insert_str='', - urlkey=urlkey) + rewriter = RewriteContent() + + result = 
rewriter.rewrite_content(urlrewriter, + status_headers, + stream, + head_insert_func=head_insert_func, + urlkey=urlkey) + + status_headers, gen = result buff = '' for x in gen: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py new file mode 100644 index 00000000..e1584162 --- /dev/null +++ b/pywb/rewrite/rewriterules.py @@ -0,0 +1,53 @@ +from pywb.utils.dsrules import BaseRule + +from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter +from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter +from html_rewriter import HTMLRewriter +from header_rewriter import HeaderRewriter + +import itertools + +class RewriteRules(BaseRule): + def __init__(self, url_prefix, config={}): + super(RewriteRules, self).__init__(url_prefix, config) + + self.rewriters = {} + + #self._script_head_inserts = config.get('script_head_inserts', {}) + + self.rewriters['header'] = config.get('header_class', HeaderRewriter) + self.rewriters['css'] = config.get('css_class', CSSRewriter) + self.rewriters['xml'] = config.get('xml_class', XMLRewriter) + self.rewriters['html'] = config.get('html_class', HTMLRewriter) + + # Custom handling for js rewriting, often the most complex + self.js_rewrite_location = config.get('js_rewrite_location', True) + self.js_rewrite_location = bool(self.js_rewrite_location) + + # ability to toggle rewriting + if self.js_rewrite_location: + js_default_class = JSLinkAndLocationRewriter + else: + js_default_class = JSLinkOnlyRewriter + + # set js class, using either default or override from config + self.rewriters['js'] = config.get('js_class', js_default_class) + + # add any regexs for js rewriter + self._add_custom_regexs('js', config) + + def _add_custom_regexs(self, field, config): + regexs = config.get(field + '_regexs') + if not regexs: + return + + rewriter_cls = self.rewriters[field] + + rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs) + + def extend_rewriter_with_regex(urlrewriter): + #import sys + 
#sys.stderr.write('\n\nEXTEND: ' + str(rule_def_tuples)) + return rewriter_cls(urlrewriter, rule_def_tuples) + + self.rewriters[field] = extend_rewriter_with_regex diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index f3a7667a..36e74848 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -8,9 +8,18 @@ from pywb import get_test_dir urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +def head_insert_func(rule): + if rule.js_rewrite_location == True: + return '' + else: + return '' + def test_local_1(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/') + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'com,example,test)/', + head_insert_func) # wombat insert added assert '' in buff @@ -23,7 +32,10 @@ def test_local_1(): def test_local_2_no_js_location_rewrite(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite') + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'example,example,test)/nolocation_rewrite', + head_insert_func) # no wombat insert assert '' not in buff @@ -55,6 +67,6 @@ def test_example_domain_specific_3(): status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) # comment out bootloader - assert '/* Bootloader.configurePage' in buff, buff + assert '/* Bootloader.configurePage' in buff diff --git a/pywb/static/wb.js b/pywb/static/wb.js index a7b39370..c4798da8 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -1,18 +1,21 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. +This file is part of pywb. 
-// Rewritten location and domain obj setup -window.WB_wombat_location = window.location + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. -if (window.top != window) { - window.top.WB_wombat_location = window.top.location -} - -if (window.opener) { - window.opener.WB_wombat_location = window.opener.location -} - -document.WB_wombat_domain = document.domain + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with pywb. If not, see . +*/ function initBanner() { diff --git a/pywb/static/wombat.js b/pywb/static/wombat.js new file mode 100644 index 00000000..d2b7d12c --- /dev/null +++ b/pywb/static/wombat.js @@ -0,0 +1,219 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. + +This file is part of pywb. + + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with pywb. If not, see . 
+*/ + +//============================================ +// Wombat JS-Rewriting Library +//============================================ + +var WB_wombat_replayPrefix; +var WB_wombat_replayDatePrefix; +var WB_wombat_captureDatePart; +var WB_wombat_origHost; + + +function WB_StripPort(str) +{ + var hostWithPort = str.match(/^http:\/\/[\w\d@.-]+:\d+/); + if (hostWithPort) { + var hostName = hostWithPort[0].substr(0, hostWithPort[0].lastIndexOf(':')); + return hostName + str.substr(hostWithPort[0].length); + } + + return str; +} + +function WB_IsHostUrl(str) +{ + // Good guess that's its a hostname + if (str.indexOf("www.") == 0) { + return true; + } + + // hostname:port (port required) + var matches = str.match(/^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/); + if (matches && (matches[0].length < 64)) { + return true; + } + + // ip:port + matches = str.match(/^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/); + if (matches && (matches[0].length < 64)) { + return true; + } + + return false; +} + +function WB_RewriteUrl(url) +{ + var httpPrefix = "http://"; + + // If not dealing with a string, just return it + if (!url || (typeof url) != "string") { + return url; + } + + // If starts with prefix, no rewriting needed + // Only check replay prefix (no date) as date may be different for each capture + if (url.indexOf(WB_wombat_replayPrefix) == 0) { + return url; + } + + // If server relative url, add prefix and original host + if (url.charAt(0) == "/") { + + // Already a relative url, don't make any changes! 
+ if (url.indexOf(WB_wombat_captureDatePart) >= 0) { + return url; + } + + return WB_wombat_replayDatePrefix + WB_wombat_origHost + url; + } + + // If full url starting with http://, add prefix + if (url.indexOf(httpPrefix) == 0) { + return WB_wombat_replayDatePrefix + url; + } + + // May or may not be a hostname, call function to determine + // If it is, add the prefix and make sure port is removed + if (WB_IsHostUrl(url)) { + return WB_wombat_replayDatePrefix + httpPrefix + url; + } + + return url; +} + +function WB_CopyObjectFields(obj) +{ + var newObj = {}; + + for (prop in obj) { + if ((typeof obj[prop]) != "function") { + newObj[prop] = obj[prop]; + } + } + + return newObj; +} + +function WB_ExtractOrig(href) +{ + if (!href) { + return ""; + } + href = href.toString(); + var index = href.indexOf("/http", 1); + if (index > 0) { + return href.substr(index + 1); + } else { + return href; + } +} + +function WB_CopyLocationObj(loc) +{ + var newLoc = WB_CopyObjectFields(loc); + + newLoc._origLoc = loc; + newLoc._origHref = loc.href; + + // Rewrite replace and assign functions + newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); } + newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); } + newLoc.reload = loc.reload; + newLoc.href = WB_ExtractOrig(newLoc._origHref); + newLoc.toString = function() { return this.href; } + + return newLoc; +} + +function WB_wombat_updateLoc(reqHref, origHref, location) +{ + if (reqHref && (WB_ExtractOrig(origHref) != WB_ExtractOrig(reqHref))) { + var finalHref = WB_RewriteUrl(reqHref); + + location.href = finalHref; + } +} + +function WB_wombat_checkLocationChange(wbLoc, isTop) +{ + var locType = (typeof wbLoc); + + var location = (isTop ? 
window.top.location : window.location); + + // String has been assigned to location, so assign it + if (locType == "string") { + WB_wombat_updateLoc(wbLoc, location.href, location) + + } else if (locType == "object") { + WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, location); + } +} + +var wombat_updating = false; + +function WB_wombat_checkLocations() +{ + if (wombat_updating) { + return false; + } + + wombat_updating = true; + + WB_wombat_checkLocationChange(window.WB_wombat_location, false); + + if (window.self.location != window.top.location) { + WB_wombat_checkLocationChange(window.top.WB_wombat_location, true); + } + + wombat_updating = false; +} + +function WB_wombat_Init(replayPrefix, captureDate, origHost) +{ + WB_wombat_replayPrefix = replayPrefix; + WB_wombat_replayDatePrefix = replayPrefix + captureDate + "/"; + WB_wombat_captureDatePart = "/" + captureDate + "/"; + + WB_wombat_origHost = "http://" + origHost; + + window.WB_wombat_location = WB_CopyLocationObj(window.self.location); + + + if (window.self.location != window.top.location) { + window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location); + } + + if (window.opener) { + window.opener.WB_wombat_location = (window.opener ? 
WB_CopyLocationObj(window.opener.location) : null); + } + + + document.WB_wombat_domain = origHost; + +} + +// Check quickly after page load +setTimeout(WB_wombat_checkLocations, 100); + + +// Check periodically every few seconds +setInterval(WB_wombat_checkLocations, 500); diff --git a/pywb/ui/head_insert.html b/pywb/ui/head_insert.html index b30cd015..aa910442 100644 --- a/pywb/ui/head_insert.html +++ b/pywb/ui/head_insert.html @@ -1,7 +1,14 @@ +{% if rule.js_rewrite_location %} + +{% endif %} + + diff --git a/pywb/cdx/canonicalize.py b/pywb/utils/canonicalize.py similarity index 95% rename from pywb/cdx/canonicalize.py rename to pywb/utils/canonicalize.py index e2f818b9..bd21e4ca 100644 --- a/pywb/cdx/canonicalize.py +++ b/pywb/utils/canonicalize.py @@ -3,8 +3,6 @@ import surt import urlparse -from cdxobject import CDXException - #================================================================= class UrlCanonicalizer(object): @@ -15,6 +13,12 @@ class UrlCanonicalizer(object): return canonicalize(url, self.surt_ordered) +#================================================================= +class UrlCanonicalizeException(Exception): + def status(self): + return '400 Bad Request' + + #================================================================= def canonicalize(url, surt_ordered=True): """ @@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True): try: key = surt.surt(url) except Exception as e: - raise CDXException('Invalid Url: ' + url) + raise UrlCanonicalizeException('Invalid Url: ' + url) # if not surt, unsurt the surt to get canonicalized non-surt url if not surt_ordered: diff --git a/pywb/utils/dsrules.py b/pywb/utils/dsrules.py new file mode 100644 index 00000000..2e6f9626 --- /dev/null +++ b/pywb/utils/dsrules.py @@ -0,0 +1,98 @@ +import yaml +import pkgutil + +#================================================================= + +DEFAULT_RULES_FILE = 'rules.yaml' +DEFAULT_RULES_PKG = 'pywb' + + 
#=================================================================
class RuleSet(object):
    """
    An ordered list of domain-specific rules for a single config field.

    The rules file is expected to parse to a mapping with a top-level
    ``rules`` list. Each entry of that list may carry a ``url_prefix``
    and a definition stored under ``fieldname``; entries that do not
    define ``fieldname`` are skipped. One ``rule_cls`` instance is
    created per remaining entry, in file order.
    """
    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        Load the rules file and build the rule list for ``fieldname``.

        :param rule_cls: callable invoked as
            ``rule_cls(url_prefix, rules_def)`` for every matching entry.
        :param fieldname: key looked up in each entry of the ``rules`` list.
        :param ds_rules_file: (keyword, optional) file name handed to
            :meth:`load_default_rules`; a falsy value selects the default.
        :param default_rule_config: (keyword, optional) when not ``None``,
            a rule keyed on ``DEFAULT_KEY`` is appended with this config
            unless the file already defined one for ``fieldname``.
        """
        self.rules = []

        ds_rules_file = kwargs.get('ds_rules_file')
        default_rule_config = kwargs.get('default_rule_config')

        config = self.load_default_rules(ds_rules_file)

        # A missing config, or a config without a (non-empty) 'rules'
        # entry, is treated as "no rules" instead of iterating None.
        rulesmap = (config.get('rules') if config else None) or []

        def_key_found = False

        # iterate over master rules file
        for value in rulesmap:
            url_prefix = value.get('url_prefix')
            rules_def = value.get(fieldname)
            if not rules_def:
                continue

            if url_prefix == self.DEFAULT_KEY:
                def_key_found = True

            self.rules.append(rule_cls(url_prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if not def_key_found and default_rule_config is not None:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    @staticmethod
    def load_default_rules(filename=None, pkg=None):
        """
        Read ``filename`` as package data of ``pkg`` and parse it as YAML.

        Falsy arguments fall back to ``DEFAULT_RULES_FILE`` and
        ``DEFAULT_RULES_PKG``. Returns the parsed document, or ``None``
        when no file name is available or ``pkgutil.get_data`` yields
        no data.
        """
        if not filename:
            filename = DEFAULT_RULES_FILE

        if not pkg:
            pkg = DEFAULT_RULES_PKG

        if not filename:
            return None

        yaml_str = pkgutil.get_data(pkg, filename)
        if yaml_str is None:
            # get_data returns None when the package cannot be located or
            # its loader does not support get_data; report "no config".
            return None

        # NOTE(review): yaml.load without an explicit safe loader can build
        # arbitrary Python objects from tagged YAML. That is only acceptable
        # while rules files are trusted; consider yaml.safe_load if no rule
        # depends on python-specific tags.
        return yaml.load(yaml_str)

    def iter_matching(self, urlkey):
        """
        Yield, in stored order, every rule whose ``applies(urlkey)`` is true.
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                yield rule

    def get_first_match(self, urlkey):
        """
        Return the first rule that applies to ``urlkey``, or ``None``
        when no rule applies.
        """
        return next(self.iter_matching(urlkey), None)
+#================================================================= +class BaseRule(object): + """ + Base rule class -- subclassed to handle specific + rules for given url_prefix key + """ + def __init__(self, url_prefix, rules): + self.url_prefix = url_prefix + if not isinstance(self.url_prefix, list): + self.url_prefix = [self.url_prefix] + + def applies(self, urlkey): + return any(urlkey.startswith(x) for x in self.url_prefix) diff --git a/pywb/wbapp.py b/pywb/wbapp.py index 0befa172..ac51ba9d 100644 --- a/pywb/wbapp.py +++ b/pywb/wbapp.py @@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect from wbrequestresponse import WbResponse, StatusAndHeaders from pywb.cdx.cdxserver import CDXException +from pywb.utils.canonicalize import UrlCanonicalizeException from pywb.warc.recordloader import ArchiveLoadFailed import os @@ -55,7 +56,8 @@ def create_wb_app(wb_router): except InternalRedirect as ir: response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders)) - except (WbException, CDXException, ArchiveLoadFailed) as e: + except (WbException, CDXException, + UrlCanonicalizeException, ArchiveLoadFailed) as e: response = handle_exception(env, wb_router.error_view, e, False) except Exception as e: diff --git a/setup.py b/setup.py index dac8a907..0750fe55 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ setuptools.setup(name='pywb', license='GPL', packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'], - package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']}, + package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']}, data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')), ('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')), ('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],