mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: default HTMLParser entityref and charref are now treated as plain
data by HTMLRewriter, since they are never rewritten. This also avoids semicolon ambiguity, since there is no way to determine whether the original reference ended with a ';'. Addresses #43
This commit is contained in:
parent
5e4b830fa7
commit
e4bcef1c8b
@ -263,10 +263,20 @@ class HTMLRewriterMixin(object):
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
PARSETAG = re.compile('[<]')
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
HTMLParser.__init__(self)
|
||||
super(HTMLRewriter, self).__init__(*args, **kwargs)
|
||||
|
||||
def reset(self):
|
||||
HTMLParser.reset(self)
|
||||
self.interesting = self.PARSETAG
|
||||
|
||||
def clear_cdata_mode(self):
|
||||
HTMLParser.clear_cdata_mode(self)
|
||||
self.interesting = self.PARSETAG
|
||||
|
||||
def feed(self, string):
|
||||
try:
|
||||
HTMLParser.feed(self, string)
|
||||
@ -311,11 +321,12 @@ class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
def handle_data(self, data):
|
||||
self.parse_data(data)
|
||||
|
||||
def handle_entityref(self, data):
|
||||
self.out.write('&' + data + ';')
|
||||
|
||||
def handle_charref(self, data):
|
||||
self.out.write('&#' + data + ';')
|
||||
# the parser's 'interesting' regex is overridden with PARSETAG (matches only '<'), so these handlers are no longer called
|
||||
#def handle_entityref(self, data):
|
||||
# self.out.write('&' + data + ';')
|
||||
#
|
||||
#def handle_charref(self, data):
|
||||
# self.out.write('&#' + data + ';')
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.out.write('<!--')
|
||||
|
@ -28,8 +28,11 @@ ur"""
|
||||
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
|
||||
|
||||
# HTML Entities
|
||||
>>> parse('<a href="">› ></div>')
|
||||
<a href="">› ></div>
|
||||
>>> parse('<a href="">› > ?</div>')
|
||||
<a href="">› > ?</div>
|
||||
|
||||
>>> parse('<div>X&Y</div> </div>X&Y;</div>')
|
||||
<div>X&Y</div> </div>X&Y;</div>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
|
Loading…
x
Reference in New Issue
Block a user