From 787dfc136e2fc4eafa2c881cbf9a9a1b3bb4e9c6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 24 Dec 2013 22:51:33 -0800 Subject: [PATCH] wbhtml: add script and style doctests override close() to handle open ') + >>> parse('') - >>> WBHtml(rewriter).feed('') + # Unterminated script tag auto-terminate + >>> parse(' + + >>> parse('') - >>> WBHtml(rewriter).feed('
') + >>> parse('
')
- >>> WBHtml(rewriter).feed('') + >>> parse('') - """ + + # Unterminated style tag auto-terminate + >>> parse(' + """ REWRITE_TAGS = { 'a': {'href': ''}, @@ -77,6 +85,14 @@ class WBHtml(HTMLParser): self.cssRewriter = CSSRewriter(rewriter) + def close(self): + if (self._wbParseContext): + self.feed('') + self._wbParseContext = None + + HTMLParser.close(self) + + # =========================== META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) @@ -217,4 +233,9 @@ if __name__ == "__main__": rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/') + def parse(data): + parser = WBHtml(rewriter) + parser.feed(data) + parser.close() + doctest.testmod() diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py index 41370ef4..751d3b4c 100644 --- a/pywb/wburlrewriter.py +++ b/pywb/wburlrewriter.py @@ -41,7 +41,7 @@ class ArchivalUrlRewriter: NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:'] - PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://'] + PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://'] def __init__(self, wburl_str, prefix): self.wburl = ArchivalUrl(wburl_str)