diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index 5a316016..f7575fa5 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -78,7 +78,8 @@ class HTMLRewriterMixin(object): head_insert=None, js_rewriter_class=JSRewriter, css_rewriter_class=CSSRewriter, - defmod=''): + defmod='', + parse_comments=False): self.url_rewriter = url_rewriter self._wb_parse_context = None @@ -87,6 +88,8 @@ class HTMLRewriterMixin(object): self.css_rewriter = css_rewriter_class(url_rewriter) self.head_insert = head_insert + self.parse_comments = parse_comments + self.rewrite_tags = self._init_rewrite_tags(defmod) # =========================== @@ -316,7 +319,11 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def handle_comment(self, data): self.out.write('') def handle_decl(self, data): diff --git a/pywb/rewrite/regex_rewriters.py b/pywb/rewrite/regex_rewriters.py index 3f440eee..df7a128e 100644 --- a/pywb/rewrite/regex_rewriters.py +++ b/pywb/rewrite/regex_rewriters.py @@ -25,7 +25,7 @@ class RegexRewriter(object): @staticmethod def archival_rewrite(rewriter): - return lambda string: rewriter.rewrite(string, 'em_') + return lambda string: rewriter.rewrite(string, 'mp_') #@staticmethod #def replacer(other): @@ -105,7 +105,7 @@ class JSLinkOnlyRewriter(RegexRewriter): JS Rewriter which rewrites absolute http://, https:// and // urls at the beginning of a string """ - JS_HTTPX = r'(?<="|\')(?:https?:)?\\{0,2}/\\{0,2}/[A-Za-z0-9:_@.-]+' + JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+' def __init__(self, rewriter, rules=[]): rules = rules + [ diff --git a/pywb/rewrite/rewrite_content.py b/pywb/rewrite/rewrite_content.py index 207d879e..c852633b 100644 --- a/pywb/rewrite/rewrite_content.py +++ b/pywb/rewrite/rewrite_content.py @@ -127,7 +127,8 @@ class RewriteContent: js_rewriter_class=rule.rewriters['js'], css_rewriter_class=rule.rewriters['css'], head_insert=head_insert_str, - defmod=self.defmod) + defmod=self.defmod, + parse_comments=rule.parse_comments) else: if wb_url.is_banner_only: diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index d70d2d08..226942c5 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -8,8 +8,6 @@ from html_rewriter import HTMLRewriter import itertools -HTML = HTMLRewriter - #================================================================= class RewriteRules(BaseRule): @@ -23,8 +21,9 @@ class RewriteRules(BaseRule): self.rewriters['header'] = config.get('header_class', HeaderRewriter) self.rewriters['css'] = config.get('css_class', CSSRewriter) self.rewriters['xml'] = config.get('xml_class', XMLRewriter) - self.rewriters['html'] = config.get('html_class', HTML) - #self.rewriters['html'] = config.get('html_class', HTMLRewriter) + self.rewriters['html'] = config.get('html_class', HTMLRewriter) + + self.parse_comments = config.get('parse_comments', False) # Custom handling for js rewriting, often the most complex self.js_rewrite_location = config.get('js_rewrite_location', True) diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py index 9ea8edc0..f3a5d38d 100644 --- a/pywb/rewrite/test/test_html_rewriter.py +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -62,7 +62,7 @@ ur""" # Script tag >>> parse('') - + # Script tag + crossorigin >>> parse('') @@ -70,21 +70,21 @@ ur""" # Unterminated script tag, handle and auto-terminate >>> parse(' + >>> parse('') - + >>> parse('
') - + # Style >>> parse('') - + # Unterminated style tag, handle and auto-terminate >>> parse(' + # Head Insertion >>> parse('Test', head_insert = '') diff --git a/pywb/rewrite/test/test_regex_rewriters.py b/pywb/rewrite/test/test_regex_rewriters.py index 3f3b4638..69a367a9 100644 --- a/pywb/rewrite/test/test_regex_rewriters.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -12,16 +12,16 @@ r""" #================================================================= >>> _test_js('location = "http://example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"' >>> _test_js(r'location = "http:\/\/example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http:\\/\\/example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http:\\/\\/example.com/abc.html"' >>> _test_js(r'location = "http:\\/\\/example.com/abc.html"') -'WB_wombat_location = "/web/20131010em_/http:\\\\/\\\\/example.com/abc.html"' +'WB_wombat_location = "/web/20131010mp_/http:\\\\/\\\\/example.com/abc.html"' >>> _test_js(r"location = 'http://example.com/abc.html/'") -"WB_wombat_location = '/web/20131010em_/http://example.com/abc.html/'" +"WB_wombat_location = '/web/20131010mp_/http://example.com/abc.html/'" >>> _test_js(r'location = http://example.com/abc.html/') 'WB_wombat_location = http://example.com/abc.html/' @@ -37,94 +37,98 @@ r""" '"/location" == some_location_val; locations = WB_wombat_location;' >>> _test_js('cool_Location = "http://example.com/abc.html"') -'cool_Location = "/web/20131010em_/http://example.com/abc.html"' +'cool_Location = "/web/20131010mp_/http://example.com/abc.html"' >>> _test_js('window.location = "http://example.com/abc.html" document.domain = "anotherdomain.com"') -'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' +'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html" document.WB_wombat_domain = "anotherdomain.com"' >>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"') 'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"' # custom rules added >>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)]) -'window.WB_wombat_location = "/web/20131010em_/http://example.com/abc.html"; /*some_func(); */' +'window.WB_wombat_location = "/web/20131010mp_/http://example.com/abc.html"; /*some_func(); */' # scheme-agnostic >>> _test_js('cool_Location = "//example.com/abc.html" //comment') -'cool_Location = "/web/20131010em_/http://example.com/abc.html" //comment' +'cool_Location = "/web/20131010mp_/http://example.com/abc.html" //comment' # document.cookie test >>> _test_js('document.cookie = "a=b; Path=/"') 'document.WB_wombat_cookie = "a=b; Path=/"' +# js-escaped +>>> _test_js('"http:\\/\\/www.example.com\\/some\\/path\\/?query=1"') +'"/web/20131010mp_/http:\\/\\/www.example.com\\/some\\/path\\/?query=1"' + #================================================================= # XML Rewriting #================================================================= >>> _test_xml('