diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py
index d6b39550..93354754 100644
--- a/pywb/wbhtml.py
+++ b/pywb/wbhtml.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
import sys
import re
@@ -24,7 +27,11 @@ class WBHtml(HTMLParser):
>>> parse('
')
-
+
+ # Unicode
+ >>> parse('испытание')
+ испытание
+
# Meta tag
>>> parse('')
@@ -32,6 +39,10 @@ class WBHtml(HTMLParser):
>>> parse('')
+ >>> parse('')
+
+
+ # Script tag
>>> parse('')
@@ -124,6 +135,9 @@ class WBHtml(HTMLParser):
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
def _rewriteMetaRefresh(self, metaRefresh):
+ if not metaRefresh:
+ return None
+
m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
if not m:
return metaRefresh
@@ -154,7 +168,6 @@ class WBHtml(HTMLParser):
return False
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
-
# special case: script or style parse context
if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
self._wbParseContext = tag
@@ -178,7 +191,7 @@ class WBHtml(HTMLParser):
attrName, attrValue = attr
# special case: inline JS/event handler
- if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith("on"):
+ if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'):
attrValue = self._rewriteScript(attrValue)
# special case: inline CSS/style attribute
@@ -199,8 +212,8 @@ class WBHtml(HTMLParser):
if rwMod is not None:
attrValue = self._rewriteURL(attrValue, rwMod)
- #self.out.write(' {0}="{1}"'.format(attrName, attrValue))
if attrValue:
+ #self.out.write(' {0}="{1}"'.format(attrName, attrValue))
self.out.write(' ' + attrName + '="' + attrValue + '"')
else:
self.out.write(' ' + attrName)
@@ -208,7 +221,7 @@ class WBHtml(HTMLParser):
self.out.write('/>' if isStartEnd else '>')
# special case: head tag
- if (self.headInsert) and (self._wbParseContext == None) and (tag == "head"):
+ if (self.headInsert) and (self._wbParseContext == None) and (tag == 'head'):
self.out.write(self.headInsert)
self.headInsert = None