mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
wbhtml: add utf-8 tests
This commit is contained in:
parent
997dc5df0f
commit
b8c4a453c9
@ -1,3 +1,6 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@ -24,7 +27,11 @@ class WBHtml(HTMLParser):
|
|||||||
|
|
||||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||||
|
|
||||||
|
# Unicode
|
||||||
|
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||||
|
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||||
|
|
||||||
# Meta tag
|
# Meta tag
|
||||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||||
@ -32,6 +39,10 @@ class WBHtml(HTMLParser):
|
|||||||
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||||
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
||||||
|
|
||||||
|
>>> parse('<META http-equiv="refresh" content>')
|
||||||
|
<meta http-equiv="refresh" content>
|
||||||
|
|
||||||
|
# Script tag
|
||||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||||
|
|
||||||
@ -124,6 +135,9 @@ class WBHtml(HTMLParser):
|
|||||||
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
|
META_REFRESH_REGEX = re.compile('^[\\d.]+\\s*;\\s*url\\s*=\\s*(.+?)\\s*$', re.IGNORECASE | re.MULTILINE)
|
||||||
|
|
||||||
def _rewriteMetaRefresh(self, metaRefresh):
|
def _rewriteMetaRefresh(self, metaRefresh):
|
||||||
|
if not metaRefresh:
|
||||||
|
return None
|
||||||
|
|
||||||
m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
|
m = WBHtml.META_REFRESH_REGEX.match(metaRefresh)
|
||||||
if not m:
|
if not m:
|
||||||
return metaRefresh
|
return metaRefresh
|
||||||
@ -154,7 +168,6 @@ class WBHtml(HTMLParser):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
def rewriteTagAttrs(self, tag, tagAttrs, isStartEnd):
|
||||||
|
|
||||||
# special case: script or style parse context
|
# special case: script or style parse context
|
||||||
if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
|
if (tag in WBHtml.STATE_TAGS) and (self._wbParseContext == None):
|
||||||
self._wbParseContext = tag
|
self._wbParseContext = tag
|
||||||
@ -178,7 +191,7 @@ class WBHtml(HTMLParser):
|
|||||||
attrName, attrValue = attr
|
attrName, attrValue = attr
|
||||||
|
|
||||||
# special case: inline JS/event handler
|
# special case: inline JS/event handler
|
||||||
if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith("on"):
|
if (attrValue and attrValue.startswith('javascript:')) or attrName.startswith('on'):
|
||||||
attrValue = self._rewriteScript(attrValue)
|
attrValue = self._rewriteScript(attrValue)
|
||||||
|
|
||||||
# special case: inline CSS/style attribute
|
# special case: inline CSS/style attribute
|
||||||
@ -199,8 +212,8 @@ class WBHtml(HTMLParser):
|
|||||||
if rwMod is not None:
|
if rwMod is not None:
|
||||||
attrValue = self._rewriteURL(attrValue, rwMod)
|
attrValue = self._rewriteURL(attrValue, rwMod)
|
||||||
|
|
||||||
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
|
||||||
if attrValue:
|
if attrValue:
|
||||||
|
#self.out.write(' {0}="{1}"'.format(attrName, attrValue))
|
||||||
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
self.out.write(' ' + attrName + '="' + attrValue + '"')
|
||||||
else:
|
else:
|
||||||
self.out.write(' ' + attrName)
|
self.out.write(' ' + attrName)
|
||||||
@ -208,7 +221,7 @@ class WBHtml(HTMLParser):
|
|||||||
self.out.write('/>' if isStartEnd else '>')
|
self.out.write('/>' if isStartEnd else '>')
|
||||||
|
|
||||||
# special case: head tag
|
# special case: head tag
|
||||||
if (self.headInsert) and (self._wbParseContext == None) and (tag == "head"):
|
if (self.headInsert) and (self._wbParseContext == None) and (tag == 'head'):
|
||||||
self.out.write(self.headInsert)
|
self.out.write(self.headInsert)
|
||||||
self.headInsert = None
|
self.headInsert = None
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user