diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py
index daeedc34..ac0eaf74 100644
--- a/pywb/cdx/cdxsource.py
+++ b/pywb/cdx/cdxsource.py
@@ -28,8 +28,17 @@ class CDXFile(CDXSource):
         self.filename = filename
 
     def load_cdx(self, query):
-        source = open(self.filename)
-        return iter_range(source, query.key, query.end_key)
+        def do_open():
+            try:
+                source = open(self.filename)
+                gen = iter_range(source, query.key, query.end_key)
+                for line in gen:
+                    yield line
+            finally:
+                source.close()
+
+        return do_open()
+        #return iter_range(do_open(), query.key, query.end_key)
 
     def __str__(self):
         return 'CDX File - ' + self.filename
diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py
index 4866fc5a..b3668521 100644
--- a/pywb/rewrite/html_rewriter.py
+++ b/pywb/rewrite/html_rewriter.py
@@ -274,10 +274,20 @@ class HTMLRewriterMixin(object):
 
 #=================================================================
 class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
+    PARSETAG = re.compile('[<]')
+
     def __init__(self, *args, **kwargs):
         HTMLParser.__init__(self)
         super(HTMLRewriter, self).__init__(*args, **kwargs)
 
+    def reset(self):
+        HTMLParser.reset(self)
+        self.interesting = self.PARSETAG
+
+    def clear_cdata_mode(self):
+        HTMLParser.clear_cdata_mode(self)
+        self.interesting = self.PARSETAG
+
     def feed(self, string):
         try:
             HTMLParser.feed(self, string)
@@ -322,11 +332,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
     def handle_data(self, data):
         self.parse_data(data)
 
-    def handle_entityref(self, data):
-        self.out.write('&' + data + ';')
-
-    def handle_charref(self, data):
-        self.out.write('&#' + data + ';')
+    # overriding regex so that these are no longer called
+    #def handle_entityref(self, data):
+    #    self.out.write('&' + data + ';')
+    #
+    #def handle_charref(self, data):
+    #    self.out.write('&#' + data + ';')
 
     def handle_comment(self, data):
         self.out.write('