diff --git a/pywb/indexreader.py b/pywb/indexreader.py index 95453c9f..66bde98a 100644 --- a/pywb/indexreader.py +++ b/pywb/indexreader.py @@ -19,7 +19,8 @@ class RemoteCDXServer: 'statuscode': '200', 'timestamp': '20020120142510', 'urlkey': 'com,example)/'} - """ + + """ def __init__(self, serverUrl): self.serverUrl = serverUrl diff --git a/pywb/regexmatch.py b/pywb/regexmatch.py index 53102f21..9f3d4242 100644 --- a/pywb/regexmatch.py +++ b/pywb/regexmatch.py @@ -148,6 +148,9 @@ class CSSRewriter(RegexRewriter): """ + CSS_URL_REGEX = "url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)" + CSS_IMPORT_NO_URL_REGEX = "@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)" + def __init__(self, rewriter): rules = self._createRules(rewriter) @@ -156,8 +159,8 @@ class CSSRewriter(RegexRewriter): def _createRules(self, rewriter): return [ - ("url\\s*\\(\\s*[\\\\\"']*([^'\"]+)[\\\\\"']*\\s*\\)", RegexRewriter.archivalRewrite(rewriter), 1), - ("@import\\s+(?!url)\\(?\\s*['\"]?(?!url[\\s\\(])([\w.:/\\\\-]+)", RegexRewriter.archivalRewrite(rewriter), 1), + (CSSRewriter.CSS_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1), + (CSSRewriter.CSS_IMPORT_NO_URL_REGEX, RegexRewriter.archivalRewrite(rewriter), 1), ] diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py index 325b4892..946715da 100644 --- a/pywb/wbhtml.py +++ b/pywb/wbhtml.py @@ -10,30 +10,38 @@ from regexmatch import JSRewriter, CSSRewriter #================================================================= class WBHtml(HTMLParser): r""" - >>> WBHtml(rewriter).feed('Text') + >>> parse('Text') Text - >>> WBHtml(rewriter).feed('
') + >>> parse('
')
- >>> WBHtml(rewriter).feed('
') + >>> parse('
')
- >>> WBHtml(rewriter).feed('') + >>> parse('') - >>> WBHtml(rewriter).feed('') + >>> parse('') - >>> WBHtml(rewriter).feed('') + # Unterminated script tag auto-terminate + >>> parse(' + + >>> parse('') - >>> WBHtml(rewriter).feed('
') + >>> parse('
')
- >>> WBHtml(rewriter).feed('') + >>> parse('') - """ + + # Unterminated style tag auto-terminate + >>> parse(' + """ REWRITE_TAGS = { 'a': {'href': ''}, @@ -77,6 +85,14 @@ class WBHtml(HTMLParser): self.cssRewriter = CSSRewriter(rewriter) + def close(self): + if (self._wbParseContext): + self.feed('') + self._wbParseContext = None + + HTMLParser.close(self) + + # =========================== META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) @@ -217,4 +233,9 @@ if __name__ == "__main__": rewriter = ArchivalUrlRewriter('/20131226101010/http://example.com/some/path/index.html', '/web/') + def parse(data): + parser = WBHtml(rewriter) + parser.feed(data) + parser.close() + doctest.testmod() diff --git a/pywb/wburlrewriter.py b/pywb/wburlrewriter.py index 41370ef4..751d3b4c 100644 --- a/pywb/wburlrewriter.py +++ b/pywb/wburlrewriter.py @@ -41,7 +41,7 @@ class ArchivalUrlRewriter: NO_REWRITE_URI_PREFIX = ['javascript:', 'data:', 'mailto:', 'about:'] - PROTOCOLS = ['http://', 'https://', '//', 'mms://', 'rtsp://', 'wais://'] + PROTOCOLS = ['http://', 'https://', '//', 'ftp://', 'mms://', 'rtsp://', 'wais://'] def __init__(self, wburl_str, prefix): self.wburl = ArchivalUrl(wburl_str)