From 5e4b830fa7cddabcc413e18d8b09a3baf7d1bd93 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 4 Nov 2014 09:42:53 -0800 Subject: [PATCH 1/2] cdx: ensure cdx file is closed when iterator is done, since cdx files are opened per-lookup, related to #45 --- pywb/cdx/cdxsource.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index daeedc34..ac0eaf74 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -28,8 +28,17 @@ class CDXFile(CDXSource): self.filename = filename def load_cdx(self, query): - source = open(self.filename) - return iter_range(source, query.key, query.end_key) + def do_open(): + try: + source = open(self.filename) + gen = iter_range(source, query.key, query.end_key) + for line in gen: + yield line + finally: + source.close() + + return do_open() + #return iter_range(do_open(), query.key, query.end_key) def __str__(self): return 'CDX File - ' + self.filename From e4bcef1c8b3048b63414b0e472792e4d9c96d813 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 4 Nov 2014 12:12:32 -0800 Subject: [PATCH 2/2] rewrite: default HTMLParser entityref and charref are treated as plain data for HTMLRewriter, since they are never rewritten, and to avoid semicolon ambiguity, since no way to determine if there is a ; or not at end. Addresses #43 --- pywb/rewrite/html_rewriter.py | 21 ++++++++++++++++----- pywb/rewrite/test/test_html_rewriter.py | 7 +++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f7575fa5..f0c904c2 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -263,10 +263,20 @@ class HTMLRewriterMixin(object): #================================================================= class HTMLRewriter(HTMLRewriterMixin, HTMLParser): + PARSETAG = re.compile('[<]') + def __init__(self, *args, **kwargs): HTMLParser.__init__(self) super(HTMLRewriter, self).__init__(*args, **kwargs) + def reset(self): + HTMLParser.reset(self) + self.interesting = self.PARSETAG + + def clear_cdata_mode(self): + HTMLParser.clear_cdata_mode(self) + self.interesting = self.PARSETAG + def feed(self, string): try: HTMLParser.feed(self, string) @@ -311,11 +321,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def handle_data(self, data): self.parse_data(data) - def handle_entityref(self, data): - self.out.write('&' + data + ';') - - def handle_charref(self, data): - self.out.write('&#' + data + ';') + # overriding regex so that these are no longer called + #def handle_entityref(self, data): + # self.out.write('&' + data + ';') + # + #def handle_charref(self, data): + # self.out.write('&#' + data + ';') def handle_comment(self, data): self.out.write('