diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f7575fa5..f0c904c2 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -263,10 +263,20 @@ class HTMLRewriterMixin(object): #================================================================= class HTMLRewriter(HTMLRewriterMixin, HTMLParser): + PARSETAG = re.compile('[<]') + def __init__(self, *args, **kwargs): HTMLParser.__init__(self) super(HTMLRewriter, self).__init__(*args, **kwargs) + def reset(self): + HTMLParser.reset(self) + self.interesting = self.PARSETAG + + def clear_cdata_mode(self): + HTMLParser.clear_cdata_mode(self) + self.interesting = self.PARSETAG + def feed(self, string): try: HTMLParser.feed(self, string) @@ -311,11 +321,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def handle_data(self, data): self.parse_data(data) - def handle_entityref(self, data): - self.out.write('&' + data + ';') - - def handle_charref(self, data): - self.out.write('&#' + data + ';') + # PARSETAG replaces the parser's `interesting` regex so it matches only '<'; as a result these handlers are no longer called + #def handle_entityref(self, data): + # self.out.write('&' + data + ';') + # + #def handle_charref(self, data): + # self.out.write('&#' + data + ';') def handle_comment(self, data): self.out.write('