diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index a9e06778..006dd88d 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -5,11 +5,11 @@ import pkgutil from pywb.utils.dsrules import BaseRule, RuleSet -from canonicalize import unsurt, UrlCanonicalizer +from pywb.utils.canonicalize import unsurt, UrlCanonicalizer #================================================================= -def load_domain_specific_cdx_rules(filename, surt_ordered): +def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): #fh = pkgutil.get_data(__package__, filename) #config = yaml.load(fh) @@ -17,7 +17,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered): fuzzy = None # Load Canonicalizer Rules - rules = RuleSet(CDXDomainSpecificRule, 'canonicalize') + rules = RuleSet(CDXDomainSpecificRule, 'canonicalize', + ds_rules_file=ds_rules_file) if not surt_ordered: for rule in rules: @@ -27,7 +28,8 @@ def load_domain_specific_cdx_rules(filename, surt_ordered): canon = CustomUrlCanonicalizer(rules, surt_ordered) # Load Fuzzy Lookup Rules - rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup') + rules = RuleSet(CDXDomainSpecificRule, 'fuzzy_lookup', + ds_rules_file=ds_rules_file) if not surt_ordered: for rule in rules: diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 1a68f7e4..7f548ec4 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,4 +1,4 @@ -from canonicalize import UrlCanonicalizer, calc_search_range +from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from cdxops import cdx_load from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource @@ -17,13 +17,13 @@ import urlparse #================================================================= class BaseCDXServer(object): def __init__(self, **kwargs): - ds_rules = kwargs.get('ds_rules') + ds_rules_file = kwargs.get('ds_rules_file') surt_ordered = kwargs.get('surt_ordered', True) # load from domain-specific rules - if ds_rules: + if ds_rules_file: self.url_canon, self.fuzzy_query = ( - load_domain_specific_cdx_rules(ds_rules, surt_ordered)) + load_domain_specific_cdx_rules(ds_rules_file, surt_ordered)) # or custom passed in canonicalizer else: self.url_canon = kwargs.get('url_canon') @@ -166,7 +166,7 @@ def create_cdx_server(config, ds_rules_file=None): return server_cls(paths, config=pass_config, surt_ordered=surt_ordered, - ds_rules=ds_rules_file, + ds_rules_file=ds_rules_file, perms_checker=perms_checker) diff --git a/pywb/config_utils.py b/pywb/config_utils.py index 672e8735..05844a2e 100644 --- a/pywb/config_utils.py +++ b/pywb/config_utils.py @@ -18,17 +18,19 @@ def load_template_file(file, desc = None, view_class = views.J2TemplateView): return file #================================================================= -def create_wb_handler(cdx_server, config): +def create_wb_handler(cdx_server, config, ds_rules_file=None): record_loader = ArcWarcRecordLoader(cookie_maker = config.get('cookie_maker')) paths = config.get('archive_paths') - resolving_loader = ResolvingLoader(paths = paths, cdx_server = cdx_server, record_loader = record_loader) + resolving_loader = ResolvingLoader(paths=paths, + cdx_server=cdx_server, + record_loader=record_loader) replayer = replay_views.ReplayView( content_loader = resolving_loader, - content_rewriter = RewriteContent(), + content_rewriter = RewriteContent(ds_rules_file=ds_rules_file), head_insert_view = load_template_file(config.get('head_insert_html'), 'Head Insert'), diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index be4bdded..bd63bfd5 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -57,12 +57,13 @@ def pywb_config_manual(passed_config = {}): route_config = DictChain(value, config) - ds_rules = route_config.get('domain_specific_rules', None) - cdx_server = IndexReader(route_config, ds_rules) + ds_rules_file = route_config.get('domain_specific_rules', None) + cdx_server = IndexReader(route_config, ds_rules_file) wb_handler = config_utils.create_wb_handler( - cdx_server = cdx_server, - config = route_config, + cdx_server=cdx_server, + config=route_config, + ds_rules_file=ds_rules_file, ) logging.debug('Adding Collection: ' + name) diff --git a/pywb/replay_views.py b/pywb/replay_views.py index 4c6907eb..9113ad5f 100644 --- a/pywb/replay_views.py +++ b/pywb/replay_views.py @@ -7,6 +7,7 @@ from wbrequestresponse import WbResponse from wbexceptions import CaptureException, InternalRedirect from pywb.warc.recordloader import ArchiveLoadFailed + #================================================================= class ReplayView: def __init__(self, content_loader, content_rewriter, head_insert_view = None, @@ -99,20 +100,34 @@ class ReplayView: def rewrite_content(self, wbrequest, cdx, status_headers, stream): urlrewriter = wbrequest.urlrewriter - (rewritten_headers, stream) = self.content_rewriter.rewrite_headers(urlrewriter, status_headers, stream) + result = self.content_rewriter.rewrite_headers(urlrewriter, + status_headers, + stream, + cdx['urlkey']) + (rewritten_headers, stream) = result # no rewriting needed! if rewritten_headers.text_type is None: response_iter = self.stream_to_iter(stream) return WbResponse(rewritten_headers.status_headers, response_iter) - # do head insert + def make_head_insert(rule): + return (self.head_insert_view.render_to_string(wbrequest=wbrequest, + cdx=cdx, + rule=rule)) + # do head insert if self.head_insert_view: - head_insert_str = self.head_insert_view.render_to_string(wbrequest = wbrequest, cdx = cdx) + head_insert_func = make_head_insert else: - head_insert_str = None + head_insert_func = None - (status_headers, response_gen) = self.content_rewriter.rewrite_content(urlrewriter, rewritten_headers, stream, head_insert_str) + result = self.content_rewriter.rewrite_content(urlrewriter, + rewritten_headers, + stream, + head_insert_func, + cdx['urlkey']) + + (status_headers, response_gen) = result if self.buffer_response: if wbrequest.wb_url.mod == 'id_': diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 80daf7e3..1ba3d321 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -11,9 +11,12 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.bufferedreaders import DecompressingBufferedReader, ChunkedDataReader +#================================================================= class RewriteContent: - def __init__(self, config=None): - self.ruleset = RuleSet(RewriteRules, 'rewrite', config, {}) + def __init__(self, ds_rules_file=None): + self.ruleset = RuleSet(RewriteRules, 'rewrite', + default_rule_config={}, + ds_rules_file=ds_rules_file) def rewrite_headers(self, urlrewriter, status_headers, stream, urlkey=''): header_rewriter_class = self.ruleset.get_first_match(urlkey).rewriters['header'] @@ -31,7 +34,7 @@ class RewriteContent: return (rewritten_headers, stream) - def rewrite_content(self, urlrewriter, headers, stream, head_insert_str=None, urlkey=''): + def rewrite_content(self, urlrewriter, headers, stream, head_insert_func=None, urlkey=''): # see if we've already rewritten headers if isinstance(headers, RewrittenStatusAndHeaders): @@ -65,7 +68,6 @@ class RewriteContent: text_type = rewritten_headers.text_type - #rewriter_class = self.rewriters.get(text_type) rule = self.ruleset.get_first_match(urlkey) try: @@ -74,10 +76,13 @@ class RewriteContent: raise Exception('Unknown Text Type for Rewrite: ' + text_type) #import sys - #sys.stderr.write(str(vars(self.ruleset.get_first_match(urlkey)))) + #sys.stderr.write(str(vars(rule))) if text_type == 'html': - head_insert_str = rule.create_head_inserts() + head_insert_str + head_insert_str = '' + + if head_insert_func: + head_insert_str = head_insert_func(rule) rewriter = rewriter_class(urlrewriter, outstream=None, diff --git a/pywb/rewrite/rewrite_live.py b/pywb/rewrite/rewrite_live.py index 9d752d10..63783234 100644 --- a/pywb/rewrite/rewrite_live.py +++ b/pywb/rewrite/rewrite_live.py @@ -7,11 +7,11 @@ import mimetypes from pywb.utils.loaders import is_http from pywb.utils.timeutils import datetime_to_timestamp from pywb.utils.statusandheaders import StatusAndHeaders +from pywb.utils.canonicalize import canonicalize from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.rewrite_content import RewriteContent -from pywb.cdx.canonicalize import canonicalize """ Fetch a url from live web and apply rewriting rules @@ -43,7 +43,7 @@ def get_local_file(uri): return (status_headers, stream) #================================================================= -def get_rewritten(url, urlrewriter, urlkey=None): +def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None): if is_http(url): (status_headers, stream) = get_status_and_stream(url) else: @@ -53,11 +53,15 @@ def get_rewritten(url, urlrewriter, urlkey=None): if not urlkey: urlkey = canonicalize(url) - status_headers, gen = RewriteContent().rewrite_content(urlrewriter, - status_headers, - stream, - head_insert_str='', - urlkey=urlkey) + rewriter = RewriteContent() + + result = rewriter.rewrite_content(urlrewriter, + status_headers, + stream, + head_insert_func=head_insert_func, + urlkey=urlkey) + + status_headers, gen = result buff = '' for x in gen: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py new file mode 100644 index 00000000..e1584162 --- /dev/null +++ b/pywb/rewrite/rewriterules.py @@ -0,0 +1,53 @@ +from pywb.utils.dsrules import BaseRule + +from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter +from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter +from html_rewriter import HTMLRewriter +from header_rewriter import HeaderRewriter + +import itertools + +class RewriteRules(BaseRule): + def __init__(self, url_prefix, config={}): + super(RewriteRules, self).__init__(url_prefix, config) + + self.rewriters = {} + + #self._script_head_inserts = config.get('script_head_inserts', {}) + + self.rewriters['header'] = config.get('header_class', HeaderRewriter) + self.rewriters['css'] = config.get('css_class', CSSRewriter) + self.rewriters['xml'] = config.get('xml_class', XMLRewriter) + self.rewriters['html'] = config.get('html_class', HTMLRewriter) + + # Custom handling for js rewriting, often the most complex + self.js_rewrite_location = config.get('js_rewrite_location', True) + self.js_rewrite_location = bool(self.js_rewrite_location) + + # ability to toggle rewriting + if self.js_rewrite_location: + js_default_class = JSLinkAndLocationRewriter + else: + js_default_class = JSLinkOnlyRewriter + + # set js class, using either default or override from config + self.rewriters['js'] = config.get('js_class', js_default_class) + + # add any regexs for js rewriter + self._add_custom_regexs('js', config) + + def _add_custom_regexs(self, field, config): + regexs = config.get(field + '_regexs') + if not regexs: + return + + rewriter_cls = self.rewriters[field] + + rule_def_tuples = RegexRewriter.parse_rules_from_config(regexs) + + def extend_rewriter_with_regex(urlrewriter): + #import sys + #sys.stderr.write('\n\nEXTEND: ' + str(rule_def_tuples)) + return rewriter_cls(urlrewriter, rule_def_tuples) + + self.rewriters[field] = extend_rewriter_with_regex diff --git a/pywb/rewrite/test/test_rewrite_live.py b/pywb/rewrite/test/test_rewrite_live.py index f3a7667a..36e74848 100644 --- a/pywb/rewrite/test/test_rewrite_live.py +++ b/pywb/rewrite/test/test_rewrite_live.py @@ -8,9 +8,18 @@ from pywb import get_test_dir urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/') +def head_insert_func(rule): + if rule.js_rewrite_location == True: + return '' + else: + return '' + def test_local_1(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'com,example,test)/') + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'com,example,test)/', + head_insert_func) # wombat insert added assert '
' in buff @@ -23,7 +32,10 @@ def test_local_1(): def test_local_2_no_js_location_rewrite(): - status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', urlrewriter, 'example,example,test)/nolocation_rewrite') + status_headers, buff = get_rewritten(get_test_dir() + 'text_content/sample.html', + urlrewriter, + 'example,example,test)/nolocation_rewrite', + head_insert_func) # no wombat insert assert '' not in buff @@ -55,6 +67,6 @@ def test_example_domain_specific_3(): status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter) # comment out bootloader - assert '/* Bootloader.configurePage' in buff, buff + assert '/* Bootloader.configurePage' in buff diff --git a/pywb/static/wb.js b/pywb/static/wb.js index a7b39370..c4798da8 100644 --- a/pywb/static/wb.js +++ b/pywb/static/wb.js @@ -1,18 +1,21 @@ +/* +Copyright(c) 2013-2014 Ilya Kreymer. Released under the GNU General Public License. +This file is part of pywb. -// Rewritten location and domain obj setup -window.WB_wombat_location = window.location + pywb is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. -if (window.top != window) { - window.top.WB_wombat_location = window.top.location -} - -if (window.opener) { - window.opener.WB_wombat_location = window.opener.location -} - -document.WB_wombat_domain = document.domain + pywb is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with pywb. If not, see