mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
refactor domain specific rules:
- head insert callback is passed in with the rule; it is up to the template to handle additional inserts based on rule properties
- ability to pass in a custom rules config to both the cdx server and the content rewriter
- move canonicalize to the utils pkg
- add wombat, modify wb.js to remove wombat-related settings
This commit is contained in:
parent
5a41f59f39
commit
453ab678ed
@ -5,11 +5,11 @@ import pkgutil
|
|||||||
|
|
||||||
from pywb.utils.dsrules import BaseRule, RuleSet
|
from pywb.utils.dsrules import BaseRule, RuleSet
|
||||||
|
|
||||||
from canonicalize import unsurt, UrlCanonicalizer
|
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def load_domain_specific_cdx_rules(filename, surt_ordered):
|
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||||
#fh = pkgutil.get_data(__package__, filename)
|
#fh = pkgutil.get_data(__package__, filename)
|
||||||
#config = yaml.load(fh)
|
#config = yaml.load(fh)
|
||||||
|
|
||||||
@ -17,7 +17,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered):
|
|||||||
fuzzy = None
|
fuzzy = None
|
||||||
|
|
||||||
# Load Canonicalizer Rules
|
# Load Canonicalizer Rules
|
||||||
rules = RuleSet(CDXDomainSpecificRule, 'canonicalize')
|
rules = RuleSet(CDXDomainSpecificRule, 'canonicalize',
|
||||||
|
ds_rules_file=ds_rules_file)
|
||||||
|
|
||||||
if not surt_ordered:
|
if not surt_ordered:
|
||||||
for rule in rules:
|
for rule in rules:
|
||||||
@ -27,7 +28,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered):
|
|||||||
canon = CustomUrlCanonicalizer(rules, surt_ordered)
|
canon = CustomUrlCanonicalizer(rules, surt_ordered)
|
||||||
|
|
||||||
# Load Fuzzy Lookup Rules
|
# Load Fuzzy Lookup Rules
|
||||||
rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup')
|
rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup',
|
||||||
|
ds_rules_file=ds_rules_file)
|
||||||
|
|
||||||
if not surt_ordered:
|
if not surt_ordered:
|
||||||
for rule in rules:
|
for rule in rules:
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from canonicalize import UrlCanonicalizer, calc_search_range
|
from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
|
||||||
|
|
||||||
from cdxops import cdx_load
|
from cdxops import cdx_load
|
||||||
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
|
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
|
||||||
@ -17,13 +17,13 @@ import urlparse
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
class BaseCDXServer(object):
|
class BaseCDXServer(object):
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
ds_rules = kwargs.get('ds_rules')
|
ds_rules_file = kwargs.get('ds_rules_file')
|
||||||
surt_ordered = kwargs.get('surt_ordered', True)
|
surt_ordered = kwargs.get('surt_ordered', True)
|
||||||
|
|
||||||
# load from domain-specific rules
|
# load from domain-specific rules
|
||||||
if ds_rules:
|
if ds_rules_file:
|
||||||
self.url_canon, self.fuzzy_query = (
|
self.url_canon, self.fuzzy_query = (
|
||||||
load_domain_specific_cdx_rules(ds_rules, surt_ordered))
|
load_domain_specific_cdx_rules(ds_rules_file, surt_ordered))
|
||||||
# or custom passed in canonicalizer
|
# or custom passed in canonicalizer
|
||||||
else:
|
else:
|
||||||
self.url_canon = kwargs.get('url_canon')
|
self.url_canon = kwargs.get('url_canon')
|
||||||
@ -166,7 +166,7 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
return server_cls(paths,
|
return server_cls(paths,
|
||||||
config=pass_config,
|
config=pass_config,
|
||||||
surt_ordered=surt_ordered,
|
surt_ordered=surt_ordered,
|
||||||
ds_rules=ds_rules_file,
|
ds_rules_file=ds_rules_file,
|
||||||
perms_checker=perms_checker)
|
perms_checker=perms_checker)
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView):
|
|||||||
return file
|
return file
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def create_wb_handler(cdx_server, config):
|
def create_wb_handler(cdx_server, config, ds_rules_file=None):
|
||||||
|
|
||||||
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
|
record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker'))
|
||||||
paths = config.get('archive_paths')
|
paths = config.get('archive_paths')
|
||||||
|
|
||||||
resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader)
|
resolving_loader = ResolvingLoader(paths=paths,
|
||||||
|
cdx_server=cdx_server,
|
||||||
|
record_loader=record_loader)
|
||||||
|
|
||||||
replayer = replay_views.ReplayView(
|
replayer = replay_views.ReplayView(
|
||||||
content_loader = resolving_loader,
|
content_loader = resolving_loader,
|
||||||
|
|
||||||
content_rewriter = RewriteContent(),
|
content_rewriter = RewriteContent(ds_rules_file=ds_rules_file),
|
||||||
|
|
||||||
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
|
head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'),
|
||||||
|
|
||||||
|
@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}):
|
|||||||
|
|
||||||
route_config = DictChain(value, config)
|
route_config = DictChain(value, config)
|
||||||
|
|
||||||
ds_rules = route_config.get('domain_specific_rules', None)
|
ds_rules_file = route_config.get('domain_specific_rules', None)
|
||||||
cdx_server = IndexReader(route_config, ds_rules)
|
cdx_server = IndexReader(route_config, ds_rules_file)
|
||||||
|
|
||||||
wb_handler = config_utils.create_wb_handler(
|
wb_handler = config_utils.create_wb_handler(
|
||||||
cdx_server = cdx_server,
|
cdx_server=cdx_server,
|
||||||
config = route_config,
|
config=route_config,
|
||||||
|
ds_rules_file=ds_rules_file,
|
||||||
)
|
)
|
||||||
|
|
||||||
logging.debug('Adding Collection: ' + name)
|
logging.debug('Adding Collection: ' + name)
|
||||||
|
@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse
|
|||||||
from wbexceptions import CaptureException, InternalRedirect
|
from wbexceptions import CaptureException, InternalRedirect
|
||||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ReplayView:
|
class ReplayView:
|
||||||
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
|
def __init__(self, content_loader, content_rewriter, head_insert_view = None,
|
||||||
@ -99,20 +100,34 @@ class ReplayView:
|
|||||||
def rewrite_content(self, wbrequest, cdx, status_headers, stream):
|
def rewrite_content(self, wbrequest, cdx, status_headers, stream):
|
||||||
urlrewriter = wbrequest.urlrewriter
|
urlrewriter = wbrequest.urlrewriter
|
||||||
|
|
||||||
(rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream)
|
result = self.content_rewriter.rewrite_headers(urlrewriter,
|
||||||
|
status_headers,
|
||||||
|
stream,
|
||||||
|
cdx['urlkey'])
|
||||||
|
(rewritten_headers, stream) = result
|
||||||
|
|
||||||
# no rewriting needed!
|
# no rewriting needed!
|
||||||
if rewritten_headers.text_type is None:
|
if rewritten_headers.text_type is None:
|
||||||
response_iter = self.stream_to_iter(stream)
|
response_iter = self.stream_to_iter(stream)
|
||||||
return WbResponse(rewritten_headers.status_headers, response_iter)
|
return WbResponse(rewritten_headers.status_headers, response_iter)
|
||||||
|
|
||||||
# do head insert
|
def make_head_insert(rule):
|
||||||
|
return (self.head_insert_view.render_to_string(wbrequest=wbrequest,
|
||||||
|
cdx=cdx,
|
||||||
|
rule=rule))
|
||||||
|
# do head insert
|
||||||
if self.head_insert_view:
|
if self.head_insert_view:
|
||||||
head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx)
|
head_insert_func = make_head_insert
|
||||||
else:
|
else:
|
||||||
head_insert_str = None
|
head_insert_func = None
|
||||||
|
|
||||||
(status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str)
|
result = self.content_rewriter.rewrite_content(urlrewriter,
|
||||||
|
rewritten_headers,
|
||||||
|
stream,
|
||||||
|
head_insert_func,
|
||||||
|
cdx['urlkey'])
|
||||||
|
|
||||||
|
(status_headers, response_gen) = result
|
||||||
|
|
||||||
if self.buffer_response:
|
if self.buffer_response:
|
||||||
if wbrequest.wb_url.mod == 'id_':
|
if wbrequest.wb_url.mod == 'id_':
|
||||||
|
@ -11,9 +11,12 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
|||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
class RewriteContent:
|
class RewriteContent:
|
||||||
def __init__(self, config=None):
|
def __init__(self, ds_rules_file=None):
|
||||||
self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {})
|
self.ruleset = RuleSet(RewriteRules, 'rewrite',
|
||||||
|
default_rule_config={},
|
||||||
|
ds_rules_file=ds_rules_file)
|
||||||
|
|
||||||
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
|
def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''):
|
||||||
header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
|
header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header']
|
||||||
@ -31,7 +34,7 @@ class RewriteContent:
|
|||||||
|
|
||||||
return (rewritten_headers, stream)
|
return (rewritten_headers, stream)
|
||||||
|
|
||||||
def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''):
|
def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''):
|
||||||
|
|
||||||
# see if we've already rewritten headers
|
# see if we've already rewritten headers
|
||||||
if isinstance(headers, RewrittenStatusAndHeaders):
|
if isinstance(headers, RewrittenStatusAndHeaders):
|
||||||
@ -65,7 +68,6 @@ class RewriteContent:
|
|||||||
|
|
||||||
text_type = rewritten_headers.text_type
|
text_type = rewritten_headers.text_type
|
||||||
|
|
||||||
#rewriter_class = self.rewriters.get(text_type)
|
|
||||||
rule = self.ruleset.get_first_match(urlkey)
|
rule = self.ruleset.get_first_match(urlkey)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -74,10 +76,13 @@ class RewriteContent:
|
|||||||
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
|
raise Exception('Unknown Text Type for Rewrite: ' + text_type)
|
||||||
|
|
||||||
#import sys
|
#import sys
|
||||||
#sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey))))
|
#sys.stderr.write(str(vars(rule)))
|
||||||
|
|
||||||
if text_type == 'html':
|
if text_type == 'html':
|
||||||
head_insert_str = rule.create_head_inserts() + head_insert_str
|
head_insert_str = ''
|
||||||
|
|
||||||
|
if head_insert_func:
|
||||||
|
head_insert_str = head_insert_func(rule)
|
||||||
|
|
||||||
rewriter = rewriter_class(urlrewriter,
|
rewriter = rewriter_class(urlrewriter,
|
||||||
outstream=None,
|
outstream=None,
|
||||||
|
@ -7,11 +7,11 @@ import mimetypes
|
|||||||
from pywb.utils.loaders import is_http
|
from pywb.utils.loaders import is_http
|
||||||
from pywb.utils.timeutils import datetime_to_timestamp
|
from pywb.utils.timeutils import datetime_to_timestamp
|
||||||
from pywb.utils.statusandheaders import StatusAndHeaders
|
from pywb.utils.statusandheaders import StatusAndHeaders
|
||||||
|
from pywb.utils.canonicalize import canonicalize
|
||||||
|
|
||||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||||
from pywb.rewrite.rewrite_content import RewriteContent
|
from pywb.rewrite.rewrite_content import RewriteContent
|
||||||
|
|
||||||
from pywb.cdx.canonicalize import canonicalize
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Fetch a url from live web and apply rewriting rules
|
Fetch a url from live web and apply rewriting rules
|
||||||
@ -43,7 +43,7 @@ def get_local_file(uri):
|
|||||||
return (status_headers, stream)
|
return (status_headers, stream)
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def get_rewritten(url, urlrewriter, urlkey=None):
|
def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
|
||||||
if is_http(url):
|
if is_http(url):
|
||||||
(status_headers, stream) = get_status_and_stream(url)
|
(status_headers, stream) = get_status_and_stream(url)
|
||||||
else:
|
else:
|
||||||
@ -53,11 +53,15 @@ def get_rewritten(url, urlrewriter, urlkey=None):
|
|||||||
if not urlkey:
|
if not urlkey:
|
||||||
urlkey = canonicalize(url)
|
urlkey = canonicalize(url)
|
||||||
|
|
||||||
status_headers, gen = RewriteContent().rewrite_content(urlrewriter,
|
rewriter = RewriteContent()
|
||||||
status_headers,
|
|
||||||
stream,
|
result = rewriter.rewrite_content(urlrewriter,
|
||||||
head_insert_str='',
|
status_headers,
|
||||||
urlkey=urlkey)
|
stream,
|
||||||
|
head_insert_func=head_insert_func,
|
||||||
|
urlkey=urlkey)
|
||||||
|
|
||||||
|
status_headers, gen = result
|
||||||
|
|
||||||
buff = ''
|
buff = ''
|
||||||
for x in gen:
|
for x in gen:
|
||||||
|
53
pywb/rewrite/rewriterules.py
Normal file
53
pywb/rewrite/rewriterules.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
from pywb.utils.dsrules import BaseRule
|
||||||
|
|
||||||
|
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||||
|
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||||
|
from html_rewriter import HTMLRewriter
|
||||||
|
from header_rewriter import HeaderRewriter
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
|
||||||
|
class RewriteRules(BaseRule):
    """
    Rewriting rule for one url prefix (or list of prefixes).

    Holds, per content type, the rewriter to use for urls matching
    this rule. The ``rewriters`` dict is keyed by 'header', 'css',
    'xml', 'html' and 'js'.
    """
    def __init__(self, url_prefix, config=None):
        """
        :param url_prefix: prefix (or list of prefixes) this rule
            applies to, passed through to BaseRule
        :param config: optional dict of overrides. Recognized keys:
            'header_class', 'css_class', 'xml_class', 'html_class',
            'js_class', 'js_rewrite_location' and 'js_regexs'.
        """
        # avoid a shared mutable default argument
        if config is None:
            config = {}

        super(RewriteRules, self).__init__(url_prefix, config)

        self.rewriters = {
            'header': config.get('header_class', HeaderRewriter),
            'css': config.get('css_class', CSSRewriter),
            'xml': config.get('xml_class', XMLRewriter),
            'html': config.get('html_class', HTMLRewriter),
        }

        # Custom handling for js rewriting, often the most complex.
        # The flag is also exposed on the rule so that a head insert
        # callback/template can check it.
        self.js_rewrite_location = bool(config.get('js_rewrite_location',
                                                   True))

        # ability to toggle location rewriting
        if self.js_rewrite_location:
            js_default_class = JSLinkAndLocationRewriter
        else:
            js_default_class = JSLinkOnlyRewriter

        # set js class, using either default or override from config
        self.rewriters['js'] = config.get('js_class', js_default_class)

        # add any custom regexs for the js rewriter
        self._add_custom_regexs('js', config)

    def _add_custom_regexs(self, field, config):
        """
        If ``config`` has a '<field>_regexs' entry, wrap the rewriter
        currently registered for ``field`` so that it is constructed
        with the extra regex rules.

        After wrapping, ``self.rewriters[field]`` is a function taking
        only the urlrewriter, not the rewriter class itself.
        """
        regexs = config.get(field + '_regexs')
        if not regexs:
            return

        rewriter_cls = self.rewriters[field]

        rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs)

        def extend_rewriter_with_regex(urlrewriter):
            return rewriter_cls(urlrewriter, rule_def_tuples)

        self.rewriters[field] = extend_rewriter_with_regex
|
@ -8,9 +8,18 @@ from pywb import get_test_dir
|
|||||||
|
|
||||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||||
|
|
||||||
|
def head_insert_func(rule):
    """
    Head insert callback used by the tests: emit the wombat script
    tag only for rules which have js location rewriting enabled.
    """
    if rule.js_rewrite_location:
        return '<script src="/static/default/wombat.js"> </script>'

    return ''
|
||||||
|
|
||||||
|
|
||||||
def test_local_1():
|
def test_local_1():
|
||||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/')
|
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||||
|
urlrewriter,
|
||||||
|
'com,example,test)/',
|
||||||
|
head_insert_func)
|
||||||
|
|
||||||
# wombat insert added
|
# wombat insert added
|
||||||
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
|
assert '<head><script src="/static/default/wombat.js"> </script>' in buff
|
||||||
@ -23,7 +32,10 @@ def test_local_1():
|
|||||||
|
|
||||||
|
|
||||||
def test_local_2_no_js_location_rewrite():
|
def test_local_2_no_js_location_rewrite():
|
||||||
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite')
|
status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html',
|
||||||
|
urlrewriter,
|
||||||
|
'example,example,test)/nolocation_rewrite',
|
||||||
|
head_insert_func)
|
||||||
|
|
||||||
# no wombat insert
|
# no wombat insert
|
||||||
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
|
assert '<head><script src="/static/default/wombat.js"> </script>' not in buff
|
||||||
@ -55,6 +67,6 @@ def test_example_domain_specific_3():
|
|||||||
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
|
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
|
||||||
|
|
||||||
# comment out bootloader
|
# comment out bootloader
|
||||||
assert '/* Bootloader.configurePage' in buff, buff
|
assert '/* Bootloader.configurePage' in buff
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,18 +1,21 @@
|
|||||||
|
/*
|
||||||
|
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
|
||||||
|
|
||||||
|
This file is part of pywb.
|
||||||
|
|
||||||
// Rewritten location and domain obj setup
|
pywb is free software: you can redistribute it and/or modify
|
||||||
window.WB_wombat_location = window.location
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
if (window.top != window) {
|
pywb is distributed in the hope that it will be useful,
|
||||||
window.top.WB_wombat_location = window.top.location
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
}
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
if (window.opener) {
|
|
||||||
window.opener.WB_wombat_location = window.opener.location
|
|
||||||
}
|
|
||||||
|
|
||||||
document.WB_wombat_domain = document.domain
|
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with pywb. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
function initBanner()
|
function initBanner()
|
||||||
{
|
{
|
||||||
|
219
pywb/static/wombat.js
Normal file
219
pywb/static/wombat.js
Normal file
@ -0,0 +1,219 @@
|
|||||||
|
/*
|
||||||
|
Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License.
|
||||||
|
|
||||||
|
This file is part of pywb.
|
||||||
|
|
||||||
|
pywb is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
pywb is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with pywb. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
//============================================
|
||||||
|
// Wombat JS-Rewriting Library
|
||||||
|
//============================================
|
||||||
|
|
||||||
|
var WB_wombat_replayPrefix;
|
||||||
|
var WB_wombat_replayDatePrefix;
|
||||||
|
var WB_wombat_captureDatePart;
|
||||||
|
var WB_wombat_origHost;
|
||||||
|
|
||||||
|
|
||||||
|
// Remove an explicit ":port" from the host part of an absolute
// http:// or https:// url. Any other string is returned unchanged.
function WB_StripPort(str)
{
    // scheme + host (optionally with userinfo) followed by ":<digits>"
    var hostWithPort = str.match(/^https?:\/\/[\w\d@.-]+:\d+/);

    if (hostWithPort) {
        var matched = hostWithPort[0];
        // the last ':' in the match is the one introducing the port
        var hostName = matched.substring(0, matched.lastIndexOf(':'));
        return hostName + str.substring(matched.length);
    }

    return str;
}
|
||||||
|
|
||||||
|
// Heuristic: does this scheme-less string look like it starts with a host?
function WB_IsHostUrl(str)
{
    // Anything starting with "www." is assumed to be a hostname
    if (str.indexOf("www.") == 0) {
        return true;
    }

    var hostPatterns = [
        // hostname:port (port required)
        /^[\w-]+(\.[\w-_]+)+(:\d+)(\/|$)/,
        // dotted ip address, port optional
        /^\d+\.\d+\.\d+\.\d+(:\d+)?(\/|$)/
    ];

    for (var i = 0; i < hostPatterns.length; i++) {
        var found = str.match(hostPatterns[i]);

        // overly long matches are not treated as hosts
        if (found && (found[0].length < 64)) {
            return true;
        }
    }

    return false;
}
|
||||||
|
|
||||||
|
// Rewrite a url so that it points back into the replay.
// Non-string values and urls which are already rewritten (or which
// cannot be recognized) are returned unchanged.
function WB_RewriteUrl(url)
{
    var httpPrefix = "http://";
    var httpsPrefix = "https://";

    // If not dealing with a string, just return it
    if (!url || (typeof url) != "string") {
        return url;
    }

    // If starts with prefix, no rewriting needed
    // Only check replay prefix (no date) as date may be different for each capture
    if (url.indexOf(WB_wombat_replayPrefix) == 0) {
        return url;
    }

    // If server relative url, add prefix and original host
    if (url.charAt(0) == "/") {

        // Already contains the capture date part, don't make any changes!
        if (url.indexOf(WB_wombat_captureDatePart) >= 0) {
            return url;
        }

        return WB_wombat_replayDatePrefix + WB_wombat_origHost + url;
    }

    // If full url starting with http:// or https://, add prefix
    // (https urls were previously returned unrewritten)
    if (url.indexOf(httpPrefix) == 0 || url.indexOf(httpsPrefix) == 0) {
        return WB_wombat_replayDatePrefix + url;
    }

    // May or may not be a hostname, call function to determine.
    // If it is, add the prefix and the http:// scheme
    // (note: the port, if any, is kept as-is)
    if (WB_IsHostUrl(url)) {
        return WB_wombat_replayDatePrefix + httpPrefix + url;
    }

    return url;
}
|
||||||
|
|
||||||
|
// Return a new plain object holding every enumerable, non-function
// property of obj (shallow copy, inherited properties included).
function WB_CopyObjectFields(obj)
{
    var newObj = {};

    // 'var' keeps the loop variable local; without it 'prop' leaks
    // into the global scope (and throws in strict mode)
    for (var prop in obj) {
        if ((typeof obj[prop]) != "function") {
            newObj[prop] = obj[prop];
        }
    }

    return newObj;
}
|
||||||
|
|
||||||
|
// Given a replay href, return the original url embedded in it:
// everything from the first "/http" found after position 0, minus
// the leading slash. Falsy input yields "", and an href without an
// embedded url is returned as a string unchanged.
function WB_ExtractOrig(href)
{
    if (!href) {
        return "";
    }

    var text = href.toString();
    var pos = text.indexOf("/http", 1);

    return (pos > 0) ? text.substr(pos + 1) : text;
}
|
||||||
|
|
||||||
|
// Build a plain-object stand-in for a location object.
// Data fields are copied, href is replaced with the original
// (un-rewritten) url, and navigation methods are forwarded to the
// real location object.
function WB_CopyLocationObj(loc)
{
    var newLoc = WB_CopyObjectFields(loc);

    // keep the real location and its href at copy time, so that
    // later changes to newLoc.href can be detected
    newLoc._origLoc = loc;
    newLoc._origHref = loc.href;

    // Rewrite replace and assign functions
    newLoc.replace = function(url) { this._origLoc.replace(WB_RewriteUrl(url)); }
    newLoc.assign = function(url) { this._origLoc.assign(WB_RewriteUrl(url)); }

    // Forward reload to the real location: copying loc.reload directly
    // would invoke the native method with the copy as 'this'
    newLoc.reload = function() { this._origLoc.reload(); }

    newLoc.href = WB_ExtractOrig(newLoc._origHref);
    newLoc.toString = function() { return this.href; }

    return newLoc;
}
|
||||||
|
|
||||||
|
// Navigate 'location' to the rewritten form of reqHref, but only if
// reqHref is set and refers to a different original url than origHref.
function WB_wombat_updateLoc(reqHref, origHref, location)
{
    if (!reqHref) {
        return;
    }

    // same original url: nothing has changed, no navigation needed
    if (WB_ExtractOrig(origHref) == WB_ExtractOrig(reqHref)) {
        return;
    }

    location.href = WB_RewriteUrl(reqHref);
}
|
||||||
|
|
||||||
|
// Compare a WB_wombat_location value against the real location (of
// the top window if isTop is set, of this window otherwise) and
// navigate if it has been changed.
function WB_wombat_checkLocationChange(wbLoc, isTop)
{
    var locType = (typeof wbLoc);

    var location = (isTop ? window.top.location : window.location);

    // String has been assigned to location, so assign it
    if (locType == "string") {
        WB_wombat_updateLoc(wbLoc, location.href, location);

    // typeof null is also "object", so guard against dereferencing null
    } else if (wbLoc && locType == "object") {
        WB_wombat_updateLoc(wbLoc.href, wbLoc._origHref, location);
    }
}
|
||||||
|
|
||||||
|
// Re-entrancy guard for WB_wombat_checkLocations
var wombat_updating = false;

// Check whether WB_wombat_location of this window (and of the top
// window, when framed) was changed, and navigate accordingly.
// Returns false if a check is already in progress.
function WB_wombat_checkLocations()
{
    if (wombat_updating) {
        return false;
    }

    wombat_updating = true;

    try {
        WB_wombat_checkLocationChange(window.WB_wombat_location, false);

        if (window.self.location != window.top.location) {
            WB_wombat_checkLocationChange(window.top.WB_wombat_location, true);
        }
    } finally {
        // always release the guard: if a check throws, a flag left set
        // would silently disable every later check
        wombat_updating = false;
    }
}
|
||||||
|
|
||||||
|
// Set up wombat for this page.
//   replayPrefix - url prefix of the replay
//   captureDate  - timestamp of the current capture
//   origHost     - host of the original (archived) page
// Stores the derived prefixes in the WB_wombat_* globals and installs
// WB_wombat_location / WB_wombat_domain stand-ins.
function WB_wombat_Init(replayPrefix, captureDate, origHost)
{
    var datePart = captureDate + "/";

    WB_wombat_replayPrefix = replayPrefix;
    WB_wombat_replayDatePrefix = replayPrefix + datePart;
    WB_wombat_captureDatePart = "/" + datePart;

    WB_wombat_origHost = "http://" + origHost;

    window.WB_wombat_location = WB_CopyLocationObj(window.self.location);

    // when framed, also provide a stand-in on the top window
    if (window.self.location != window.top.location) {
        window.top.WB_wombat_location = WB_CopyLocationObj(window.top.location);
    }

    // ...and on the window which opened this one, if any
    var opener = window.opener;
    if (opener) {
        opener.WB_wombat_location = WB_CopyLocationObj(opener.location);
    }

    document.WB_wombat_domain = origHost;
}
|
||||||
|
|
||||||
|
// Check quickly after page load
|
||||||
|
setTimeout(WB_wombat_checkLocations, 100);
|
||||||
|
|
||||||
|
|
||||||
|
// Check periodically, every 500 ms
|
||||||
|
setInterval(WB_wombat_checkLocations, 500);
|
@ -1,7 +1,14 @@
|
|||||||
<!-- WB Insert -->
|
<!-- WB Insert -->
|
||||||
|
{% if rule.js_rewrite_location %}
|
||||||
|
<script src='{{ wbrequest.host_prefix }}/static/default/wombat.js'> </script>
|
||||||
<script>
|
<script>
|
||||||
wbinfo = {}
|
WB_wombat_Init("{{wbrequest.wb_prefix}}", "{{cdx['timestamp']}}", "{{cdx['original'] | host}}");
|
||||||
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
|
</script>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
<script>
|
||||||
|
wbinfo = {}
|
||||||
|
wbinfo.capture_str = "{{ cdx['timestamp'] | format_ts }}";
|
||||||
</script>
|
</script>
|
||||||
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
|
<script src='{{ wbrequest.host_prefix }}/static/default/wb.js'> </script>
|
||||||
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
|
<link rel='stylesheet' href='{{ wbrequest.host_prefix }}/static/default/wb.css'/>
|
||||||
|
@ -3,8 +3,6 @@
|
|||||||
|
|
||||||
import surt
|
import surt
|
||||||
import urlparse
|
import urlparse
|
||||||
from cdxobject import CDXException
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class UrlCanonicalizer(object):
|
class UrlCanonicalizer(object):
|
||||||
@ -15,6 +13,12 @@ class UrlCanonicalizer(object):
|
|||||||
return canonicalize(url, self.surt_ordered)
|
return canonicalize(url, self.surt_ordered)
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class UrlCanonicalizeException(Exception):
    """
    Raised by canonicalize() when a url can not be canonicalized.
    """
    def status(self):
        """Return the HTTP status line to report for this error."""
        http_status = '400 Bad Request'
        return http_status
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def canonicalize(url, surt_ordered=True):
|
def canonicalize(url, surt_ordered=True):
|
||||||
"""
|
"""
|
||||||
@ -31,7 +35,7 @@ def canonicalize(url, surt_ordered=True):
|
|||||||
try:
|
try:
|
||||||
key = surt.surt(url)
|
key = surt.surt(url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise CDXException('Invalid Url: ' + url)
|
raise UrlCanonicalizeException('Invalid Url: ' + url)
|
||||||
|
|
||||||
# if not surt, unsurt the surt to get canonicalized non-surt url
|
# if not surt, unsurt the surt to get canonicalized non-surt url
|
||||||
if not surt_ordered:
|
if not surt_ordered:
|
98
pywb/utils/dsrules.py
Normal file
98
pywb/utils/dsrules.py
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
import yaml
|
||||||
|
import pkgutil
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
|
||||||
|
DEFAULT_RULES_FILE = 'rules.yaml'
|
||||||
|
DEFAULT_RULES_PKG = 'pywb'
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class RuleSet(object):
    """
    An ordered collection of domain specific rules, loaded from a
    yaml rules file.

    The rules file is expected to hold a top-level 'rules' list.
    Each entry of that list has a 'url_prefix' and may have a block
    named after ``fieldname``; every entry with such a block becomes
    one ``rule_cls(url_prefix, block)`` instance, in file order.
    """

    DEFAULT_KEY = ''

    def __init__(self, rule_cls, fieldname, **kwargs):
        """
        :param rule_cls: rule class, called as
            ``rule_cls(url_prefix, rule_config)``
        :param fieldname: name of the per-entry block this rule set
            loads from the rules file

        Optional keyword arguments:

        ds_rules_file -- name of the rules file; if not set, the
            default rules file is loaded.
        default_rule_config -- if not None, a rule for DEFAULT_KEY
            is always present, built from this config unless the
            rules file already defines one.
        """

        self.rules = []

        ds_rules_file = kwargs.get('ds_rules_file')
        default_rule_config = kwargs.get('default_rule_config')

        config = self.load_default_rules(ds_rules_file)

        rulesmap = config.get('rules') if config else None

        # nothing to iterate over: fall back to the default rule only
        # (or to no rules at all) instead of failing on a missing list
        if not rulesmap:
            if default_rule_config is not None:
                self.rules = [rule_cls(self.DEFAULT_KEY, default_rule_config)]
            return

        def_key_found = False

        # iterate over master rules file
        for value in rulesmap:
            url_prefix = value.get('url_prefix')
            rules_def = value.get(fieldname)
            if not rules_def:
                continue

            if url_prefix == self.DEFAULT_KEY:
                def_key_found = True

            self.rules.append(rule_cls(url_prefix, rules_def))

        # if default_rule_config provided, always init a default ruleset
        if not def_key_found and default_rule_config is not None:
            self.rules.append(rule_cls(self.DEFAULT_KEY, default_rule_config))

    @staticmethod
    def load_default_rules(filename=None, pkg=None):
        """
        Load and parse a yaml rules file stored as package data.

        :param filename: resource name inside the package,
            defaults to DEFAULT_RULES_FILE
        :param pkg: package to load from, defaults to DEFAULT_RULES_PKG
        :return: the parsed yaml document
        """
        if not filename:
            filename = DEFAULT_RULES_FILE

        if not pkg:
            pkg = DEFAULT_RULES_PKG

        yaml_str = pkgutil.get_data(pkg, filename)

        # NOTE(review): yaml.load() can construct arbitrary python
        # objects -- switch to yaml.safe_load() if the rules file may
        # ever come from an untrusted source
        return yaml.load(yaml_str)

    def __iter__(self):
        """
        Iterate over all rules, in order, so that a RuleSet can be
        used directly in a for loop
        """
        return iter(self.rules)

    def iter_matching(self, urlkey):
        """
        Iterate over all matching rules for given urlkey
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                yield rule

    def get_first_match(self, urlkey):
        """
        Return the first rule which applies to the given urlkey,
        or None if no rule applies
        """
        for rule in self.rules:
            if rule.applies(urlkey):
                return rule

        return None
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class BaseRule(object):
    """
    Common base for rule classes: stores the url prefix(es) a rule is
    keyed on and tests whether a urlkey falls under any of them.
    Subclasses interpret the rule definition itself.
    """
    def __init__(self, url_prefix, rules):
        # Prefixes are always kept as a list, a single value is wrapped
        if isinstance(url_prefix, list):
            self.url_prefix = url_prefix
        else:
            self.url_prefix = [url_prefix]

    def applies(self, urlkey):
        """
        Return True if urlkey starts with at least one stored prefix
        """
        for prefix in self.url_prefix:
            if urlkey.startswith(prefix):
                return True

        return False
|
@ -2,6 +2,7 @@ from wbexceptions import WbException, NotFoundException, InternalRedirect
|
|||||||
from wbrequestresponse import WbResponse, StatusAndHeaders
|
from wbrequestresponse import WbResponse, StatusAndHeaders
|
||||||
|
|
||||||
from pywb.cdx.cdxserver import CDXException
|
from pywb.cdx.cdxserver import CDXException
|
||||||
|
from pywb.utils.canonicalize import UrlCanonicalizeException
|
||||||
from pywb.warc.recordloader import ArchiveLoadFailed
|
from pywb.warc.recordloader import ArchiveLoadFailed
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@ -55,7 +56,8 @@ def create_wb_app(wb_router):
|
|||||||
except InternalRedirect as ir:
|
except InternalRedirect as ir:
|
||||||
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
|
response = WbResponse(StatusAndHeaders(ir.status, ir.httpHeaders))
|
||||||
|
|
||||||
except (WbException, CDXException, ArchiveLoadFailed) as e:
|
except (WbException, CDXException,
|
||||||
|
UrlCanonicalizeException, ArchiveLoadFailed) as e:
|
||||||
response = handle_exception(env, wb_router.error_view, e, False)
|
response = handle_exception(env, wb_router.error_view, e, False)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ setuptools.setup(name='pywb',
|
|||||||
license='GPL',
|
license='GPL',
|
||||||
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
packages=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||||
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
provides=['pywb','pywb.utils','pywb.cdx','pywb.warc','pywb.rewrite'],
|
||||||
package_data={'pywb': ['ui/*', 'static/*'], 'pywb.cdx': ['*.yaml']},
|
package_data={'pywb': ['ui/*', 'static/*'], 'pywb': ['*.yaml']},
|
||||||
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
data_files = [('sample_archive/cdx/', glob.glob('sample_archive/cdx/*')),
|
||||||
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
|
('sample_archive/warcs/', glob.glob('sample_archive/warcs/*')),
|
||||||
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
|
('sample_archive/text_content/', glob.glob('sample_archive/text_content/*'))],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user