From b8c4a453c9f23d99b13c4339930ed48ddc9e6e9d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 29 Dec 2013 22:42:29 -0800 Subject: [PATCH] wbhtml: add utf-8 tests --- pywb/wbhtml.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/pywb/wbhtml.py b/pywb/wbhtml.py index d6b39550..93354754 100644 --- a/pywb/wbhtml.py +++ b/pywb/wbhtml.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + import sys import re @@ -24,7 +27,11 @@ class WBHtml(HTMLParser): >>> parse('') - + + # Unicode + >>> parse('испытание') + испытание + # Meta tag >>> parse('') @@ -32,6 +39,10 @@ class WBHtml(HTMLParser): >>> parse('') + >>> parse('') + + + # Script tag >>> parse('') @@ -124,6 +135,9 @@ class WBHtml(HTMLParser): META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE) def _rewriteMetaRefresh(self, metaRefresh): + if not metaRefresh: + return None + m = WBHtml.META_REFRESH_REGEX.match(metaRefresh) if not m: return metaRefresh @@ -154,7 +168,6 @@ class WBHtml(HTMLParser): return False def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd): - # special case: script or style parse context if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None): self._wbParseContext = tag @@ -178,7 +191,7 @@ class WBHtml(HTMLParser): attrName, attrValue = attr # special case: inline JS/event handler - if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith("on"): + if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'): attrValue = self._rewriteScript(attrValue) # special case: inline CSS/style attribute @@ -199,8 +212,8 @@ class WBHtml(HTMLParser): if rwMod is not None: attrValue = self._rewriteURL(attrValue, rwMod) - #self.out.write(' {0}="{1}"'.format(attrName, attrValue)) if attrValue: + #self.out.write(' {0}="{1}"'.format(attrName, attrValue)) self.out.write(' ' + attrName + '="' + attrValue + '"') else: self.out.write(' ' + attrName) @@ -208,7 +221,7 @@ class WBHtml(HTMLParser): self.out.write('/>' if isStartEnd else '>') # special case: head tag - if (self.headInsert) and (self._wbParseContext == None) and (tag == "head"): + if (self.headInsert) and (self._wbParseContext == None) and (tag == 'head'): self.out.write(self.headInsert) self.headInsert = None