From e4bcef1c8b3048b63414b0e472792e4d9c96d813 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 4 Nov 2014 12:12:32 -0800 Subject: [PATCH] rewrite: default HTMLParser entityref and charref are treated as plain data by HTMLRewriter, since they are never rewritten, and to avoid semicolon ambiguity, since there is no way to determine whether or not the original reference ended with a ';'. Addresses #43 --- pywb/rewrite/html_rewriter.py | 21 ++++++++++++++++----- pywb/rewrite/test/test_html_rewriter.py | 7 +++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f7575fa5..f0c904c2 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -263,10 +263,20 @@ class HTMLRewriterMixin(object): #================================================================= class HTMLRewriter(HTMLRewriterMixin, HTMLParser): + PARSETAG = re.compile('[<]') + def __init__(self, *args, **kwargs): HTMLParser.__init__(self) super(HTMLRewriter, self).__init__(*args, **kwargs) + def reset(self): + HTMLParser.reset(self) + self.interesting = self.PARSETAG + + def clear_cdata_mode(self): + HTMLParser.clear_cdata_mode(self) + self.interesting = self.PARSETAG + def feed(self, string): try: HTMLParser.feed(self, string) @@ -311,11 +321,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def handle_data(self, data): self.parse_data(data) - def handle_entityref(self, data): - self.out.write('&' + data + ';') - - def handle_charref(self, data): - self.out.write('&#' + data + ';') + # overriding regex so that these are no longer called + #def handle_entityref(self, data): + # self.out.write('&' + data + ';') + # + #def handle_charref(self, data): + # self.out.write('&#' + data + ';') def handle_comment(self, data): self.out.write('