From 0495423e86204f461bcb0336e4f6cde0a7cda61b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 6 Dec 2014 17:16:35 -0800 Subject: [PATCH] rewrite: add per-collection rewrite options, settable in 'rewrite_opts' block in each collection. Added rewrite_base to disable rewriting <base> tag and rewrite_rel_canon to disable rewriting link rel=canon. Disabling <base> tag rewrite fixes #51 and new system addresses #50 as well. --- CHANGES.rst | 11 +++++++++++ pywb/framework/archivalrouter.py | 4 +++- pywb/framework/wbrequestresponse.py | 6 ++++-- pywb/rewrite/html_rewriter.py | 23 ++++++++++++++++------- pywb/rewrite/regex_rewriters.py | 2 +- pywb/rewrite/test/test_html_rewriter.py | 24 +++++++++++++++++++++--- pywb/rewrite/url_rewriter.py | 3 ++- 7 files changed, 58 insertions(+), 15 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 0fe9ae07..0be413f1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,14 @@ +pywb 0.6.6 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* Beginning of new rewrite options, settable per collection and stored in UrlRewriter. 
Available options: + + - `rewrite_base` - set to False to disable rewriting `<base>` tag + - `rewrite_rel_canon` - set to False to disable rewriting `<link rel="canonical">` + +* JS rewrite: Don't rewrite location if starting with '$' + + pywb 0.6.5 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py index 33230027..3b0b5a6d 100644 --- a/pywb/framework/archivalrouter.py +++ b/pywb/framework/archivalrouter.py @@ -62,7 +62,8 @@ class ArchivalRouter(object): use_abs_prefix=use_abs_prefix, wburl_class=route.handler.get_wburl_type(), urlrewriter_class=UrlRewriter, - cookie_scope=route.cookie_scope) + cookie_scope=route.cookie_scope, + rewrite_opts=route.rewrite_opts) # Allow for applying of additional filters route.apply_filters(wbrequest, matcher) @@ -101,6 +102,7 @@ class Route(object): # collection id from regex group (default 0) self.coll_group = coll_group self.cookie_scope = config.get('cookie_scope') + self.rewrite_opts = config.get('rewrite_opts', {}) self._custom_init(config) def is_handling(self, request_uri): diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 06970316..7c48dbb3 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -38,7 +38,8 @@ class WbRequest(object): wburl_class=None, urlrewriter_class=None, is_proxy=False, - cookie_scope=None): + cookie_scope=None, + rewrite_opts={}): self.env = env @@ -77,7 +78,8 @@ class WbRequest(object): host_prefix + rel_prefix, rel_prefix, env.get('SCRIPT_NAME', '/'), - cookie_scope) + cookie_scope, + rewrite_opts) self.urlrewriter.deprefix_url() else: diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 618c5191..cae65a89 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -92,6 +92,9 @@ class HTMLRewriterMixin(object): self.rewrite_tags = self._init_rewrite_tags(defmod) + # get opts from urlrewriter + self.opts = url_rewriter.rewrite_opts + # 
=========================== META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) @@ -174,9 +177,11 @@ class HTMLRewriterMixin(object): elif attr_name == 'crossorigin': attr_name = '_crossorigin' - # special case: link don't rewrite canonical + # special case: if rewrite_canon not set, + # don't rewrite rel=canonical elif tag == 'link' and attr_name == 'href': - if not self.has_attr(tag_attrs, ('rel', 'canonical')): + if (self.opts.get('rewrite_rel_canon', True) or + not self.has_attr(tag_attrs, ('rel', 'canonical'))): rw_mod = handler.get(attr_name) attr_value = self._rewrite_url(attr_value, rw_mod) @@ -191,17 +196,21 @@ class HTMLRewriterMixin(object): rw_mod = 'oe_' attr_value = self._rewrite_url(attr_value, rw_mod) + # special case: base tag + elif (tag == 'base') and (attr_name == 'href') and attr_value: + rw_mod = handler.get(attr_name) + base_value = self._rewrite_url(attr_value, rw_mod) + if self.opts.get('rewrite_base', True): + attr_value = base_value + self.url_rewriter = (self.url_rewriter. + rebase_rewriter(base_value)) + else: # rewrite url using tag handler rw_mod = handler.get(attr_name) if rw_mod is not None: attr_value = self._rewrite_url(attr_value, rw_mod) - # special case: base tag - if (tag == 'base') and (attr_name == 'href') and attr_value: - self.url_rewriter = (self.url_rewriter. - rebase_rewriter(attr_value)) - # write the attr! 
self._write_attr(attr_name, attr_value) diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 179e06fd..375bca08 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -130,7 +130,7 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter): def __init__(self, rewriter, rules=[], prefix='WB_wombat_'): rules = rules + [ - (r'(?>> parse('') # -# Base Tests +# Base Tests -- w/ rewrite (default) >>> parse('') >>> parse('') +# Base Tests -- no rewrite +>>> parse('', urlrewriter=no_base_canon_rewriter) + + +>>> parse('', urlrewriter=no_base_canon_rewriter) + + + + # HTML Entities >>> parse('›   > ?') ›   > ? @@ -102,8 +111,12 @@ ur""" >>> parse('
SomeTest
', head_insert = '')
SomeTest
-# don't rewrite rel=canonical +# rel=canonical: rewrite (default) >>> parse('') + + +# rel=canonical: no_rewrite +>>> parse('', urlrewriter=no_base_canon_rewriter) # doctype @@ -143,7 +156,12 @@ import pprint urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') -def parse(data, head_insert = None): +no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', + '/web/', + rewrite_opts=dict(rewrite_rel_canon=False, + rewrite_base=False)) + +def parse(data, head_insert=None, urlrewriter=urlrewriter): parser = HTMLRewriter(urlrewriter, head_insert = head_insert) #data = data.decode('utf-8') result = parser.rewrite(data) + parser.close() diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py index aa87260c..a5cc7952 100644 --- a/pywb/rewrite/url_rewriter.py +++ b/pywb/rewrite/url_rewriter.py @@ -20,13 +20,14 @@ class UrlRewriter(object): REL_SCHEME = ('//', r'\/\/', r'\\/\\/') def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None, - root_path=None, cookie_scope=None): + root_path=None, cookie_scope=None, rewrite_opts={}): self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl) self.prefix = prefix self.full_prefix = full_prefix self.rel_prefix = rel_prefix if rel_prefix else prefix self.root_path = root_path if root_path else '/' self.cookie_scope = cookie_scope + self.rewrite_opts = rewrite_opts def rewrite(self, url, mod=None): # if special protocol, no rewriting at all