diff --git a/CHANGES.rst b/CHANGES.rst index 25585716..1ddaeea2 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,3 +1,19 @@ +pywb 0.6.4 changelist +~~~~~~~~~~~~~~~~~~~~~ + +* Ignore bad multiline headers in warc. + +* Rewrite fix: Don't parse html entities in HTML rewriter. + +* Ensure cdx iterator closed when reading. + +* Rewrite fix: remove pywb prefix from any query params. + +* Rewrite fix: better JS rewriting, avoid // comments when matching protocol-relative urls. + +* WARC metadata and resource records included in cdx from cdx-indexer by default + + pywb 0.6.3 changelist ~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.rst b/README.rst index d427b09c..ffe28258 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,10 @@ -PyWb 0.6.3 +PyWb 0.6.4 ========== .. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master :target: https://travis-ci.org/ikreymer/pywb -.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master - :target: https://coveralls.io/r/ikreymer/pywb?branch=master +.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop + :target: https://coveralls.io/r/ikreymer/pywb?branch=develop .. 
image:: https://img.shields.io/gratipay/ikreymer.svg :target: https://www.gratipay.com/ikreymer/ diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index daeedc34..ac0eaf74 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -28,8 +28,17 @@ class CDXFile(CDXSource): self.filename = filename def load_cdx(self, query): - source = open(self.filename) - return iter_range(source, query.key, query.end_key) + def do_open(): + try: + source = open(self.filename) + gen = iter_range(source, query.key, query.end_key) + for line in gen: + yield line + finally: + source.close() + + return do_open() + #return iter_range(do_open(), query.key, query.end_key) def __str__(self): return 'CDX File - ' + self.filename diff --git a/pywb/framework/wbrequestresponse.py b/pywb/framework/wbrequestresponse.py index 8cbabc49..808563ea 100644 --- a/pywb/framework/wbrequestresponse.py +++ b/pywb/framework/wbrequestresponse.py @@ -78,6 +78,8 @@ class WbRequest(object): rel_prefix, env.get('SCRIPT_NAME', '/'), cookie_scope) + + self.urlrewriter.deprefix_url() else: # no wb_url, just store blank wb_url self.wb_url = None diff --git a/pywb/framework/wsgi_wrappers.py b/pywb/framework/wsgi_wrappers.py index 6ff68a25..cbd3825d 100644 --- a/pywb/framework/wsgi_wrappers.py +++ b/pywb/framework/wsgi_wrappers.py @@ -136,9 +136,9 @@ class WSGIApp(object): err_details = None if error_view: - if err_url: + if err_url and isinstance(err_url, str): err_url = err_url.decode('utf-8', 'ignore') - if err_msg: + if err_msg and isinstance(err_msg, str): err_msg = err_msg.decode('utf-8', 'ignore') return error_view.render_response(exc_type=type(exc).__name__, diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f7575fa5..f0c904c2 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -263,10 +263,20 @@ class HTMLRewriterMixin(object): #================================================================= class HTMLRewriter(HTMLRewriterMixin, 
HTMLParser): + PARSETAG = re.compile('[<]') + def __init__(self, *args, **kwargs): HTMLParser.__init__(self) super(HTMLRewriter, self).__init__(*args, **kwargs) + def reset(self): + HTMLParser.reset(self) + self.interesting = self.PARSETAG + + def clear_cdata_mode(self): + HTMLParser.clear_cdata_mode(self) + self.interesting = self.PARSETAG + def feed(self, string): try: HTMLParser.feed(self, string) @@ -311,11 +321,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser): def handle_data(self, data): self.parse_data(data) - def handle_entityref(self, data): - self.out.write('&' + data + ';') - - def handle_charref(self, data): - self.out.write('&#' + data + ';') + # overriding regex so that these are no longer called + #def handle_entityref(self, data): + # self.out.write('&' + data + ';') + # + #def handle_charref(self, data): + # self.out.write('&#' + data + ';') def handle_comment(self, data): self.out.write('