diff --git a/CHANGES.rst b/CHANGES.rst
index 0fe9ae07..0be413f1 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,3 +1,14 @@
+pywb 0.6.6 changelist
+~~~~~~~~~~~~~~~~~~~~~
+
+* Beginning of new rewrite options, settable per collection and stored in UrlRewriter. Available options:
+
+ - `rewrite_base` - set to False to disable rewriting the `<base href="...">` tag
+ - `rewrite_rel_canon` - set to False to disable rewriting `<link rel="canonical" href="...">`
+
+* JS rewrite: Don't rewrite location if starting with '$'
+
+
pywb 0.6.5 changelist
~~~~~~~~~~~~~~~~~~~~~
diff --git a/pywb/framework/archivalrouter.py b/pywb/framework/archivalrouter.py
index 33230027..3b0b5a6d 100644
--- a/pywb/framework/archivalrouter.py
+++ b/pywb/framework/archivalrouter.py
@@ -62,7 +62,8 @@ class ArchivalRouter(object):
use_abs_prefix=use_abs_prefix,
wburl_class=route.handler.get_wburl_type(),
urlrewriter_class=UrlRewriter,
- cookie_scope=route.cookie_scope)
+ cookie_scope=route.cookie_scope,
+ rewrite_opts=route.rewrite_opts)
# Allow for applying of additional filters
route.apply_filters(wbrequest, matcher)
@@ -101,6 +102,7 @@ class Route(object):
# collection id from regex group (default 0)
self.coll_group = coll_group
self.cookie_scope = config.get('cookie_scope')
+ self.rewrite_opts = config.get('rewrite_opts', {})
self._custom_init(config)
def is_handling(self, request_uri):
diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py
index 06970316..7c48dbb3 100644
--- a/pywb/framework/wbrequestresponse.py
+++ b/pywb/framework/wbrequestresponse.py
@@ -38,7 +38,8 @@ class WbRequest(object):
wburl_class=None,
urlrewriter_class=None,
is_proxy=False,
- cookie_scope=None):
+ cookie_scope=None,
+ rewrite_opts={}):
self.env = env
@@ -77,7 +78,8 @@ class WbRequest(object):
host_prefix + rel_prefix,
rel_prefix,
env.get('SCRIPT_NAME', '/'),
- cookie_scope)
+ cookie_scope,
+ rewrite_opts)
self.urlrewriter.deprefix_url()
else:
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 618c5191..cae65a89 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -92,6 +92,9 @@ class HTMLRewriterMixin(object):
self.rewrite_tags = self._init_rewrite_tags(defmod)
+ # get opts from urlrewriter
+ self.opts = url_rewriter.rewrite_opts
+
# ===========================
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$',
re.IGNORECASE | re.MULTILINE)
@@ -174,9 +177,11 @@ class HTMLRewriterMixin(object):
elif attr_name == 'crossorigin':
attr_name = '_crossorigin'
- # special case: link don't rewrite canonical
+ # special case: if the rewrite_rel_canon option is set to False,
+ # don't rewrite rel=canonical (it is rewritten by default)
elif tag == 'link' and attr_name == 'href':
- if not self.has_attr(tag_attrs, ('rel', 'canonical')):
+ if (self.opts.get('rewrite_rel_canon', True) or
+ not self.has_attr(tag_attrs, ('rel', 'canonical'))):
rw_mod = handler.get(attr_name)
attr_value = self._rewrite_url(attr_value, rw_mod)
@@ -191,17 +196,21 @@ class HTMLRewriterMixin(object):
rw_mod = 'oe_'
attr_value = self._rewrite_url(attr_value, rw_mod)
+ # special case: base tag
+ elif (tag == 'base') and (attr_name == 'href') and attr_value:
+ rw_mod = handler.get(attr_name)
+ base_value = self._rewrite_url(attr_value, rw_mod)
+ if self.opts.get('rewrite_base', True):
+ attr_value = base_value
+ self.url_rewriter = (self.url_rewriter.
+ rebase_rewriter(base_value))
+
else:
# rewrite url using tag handler
rw_mod = handler.get(attr_name)
if rw_mod is not None:
attr_value = self._rewrite_url(attr_value, rw_mod)
- # special case: base tag
- if (tag == 'base') and (attr_name == 'href') and attr_value:
- self.url_rewriter = (self.url_rewriter.
- rebase_rewriter(attr_value))
-
# write the attr!
self._write_attr(attr_name, attr_value)
diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py
index 179e06fd..375bca08 100644
--- a/pywb/rewrite/regex_rewriters.py
+++ b/pywb/rewrite/regex_rewriters.py
@@ -130,7 +130,7 @@ class JSLinkAndLocationRewriter(JSLinkOnlyRewriter):
def __init__(self, rewriter, rules=[], prefix='WB_wombat_'):
rules = rules + [
- (r'(?>> parse('
')
#
-# Base Tests
+# Base Tests -- w/ rewrite (default)
>>> parse('
')
>>> parse('
')
+# Base Tests -- no rewrite
+>>> parse('', urlrewriter=no_base_canon_rewriter)
+
+
+>>> parse('
', urlrewriter=no_base_canon_rewriter)
+
+
+
+
# HTML Entities
>>> parse('› > ?')
› > ?
@@ -102,8 +111,12 @@ ur"""
>>> parse('SomeTest
', head_insert = '')
SomeTest
-# don't rewrite rel=canonical
+# rel=canonical: rewrite (default)
>>> parse('')
+
+
+# rel=canonical: no_rewrite
+>>> parse('', urlrewriter=no_base_canon_rewriter)
# doctype
@@ -143,7 +156,12 @@ import pprint
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
-def parse(data, head_insert = None):
+no_base_canon_rewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html',
+ '/web/',
+ rewrite_opts=dict(rewrite_rel_canon=False,
+ rewrite_base=False))
+
+def parse(data, head_insert=None, urlrewriter=urlrewriter):
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
#data = data.decode('utf-8')
result = parser.rewrite(data) + parser.close()
diff --git a/pywb/rewrite/url_rewriter.py b/pywb/rewrite/url_rewriter.py
index aa87260c..a5cc7952 100644
--- a/pywb/rewrite/url_rewriter.py
+++ b/pywb/rewrite/url_rewriter.py
@@ -20,13 +20,14 @@ class UrlRewriter(object):
REL_SCHEME = ('//', r'\/\/', r'\\/\\/')
def __init__(self, wburl, prefix, full_prefix=None, rel_prefix=None,
- root_path=None, cookie_scope=None):
+ root_path=None, cookie_scope=None, rewrite_opts={}):
self.wburl = wburl if isinstance(wburl, WbUrl) else WbUrl(wburl)
self.prefix = prefix
self.full_prefix = full_prefix
self.rel_prefix = rel_prefix if rel_prefix else prefix
self.root_path = root_path if root_path else '/'
self.cookie_scope = cookie_scope
+ self.rewrite_opts = rewrite_opts
def rewrite(self, url, mod=None):
# if special protocol, no rewriting at all