mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: add doctype rewriting, more tests on various markup edge cases
This commit is contained in:
parent
742df6238e
commit
9654c22bed
@ -77,8 +77,18 @@ class RewriterTarget(object):
|
||||
self.rewriter.parse_data(data)
|
||||
self.rewriter.out.write(u'-->')
|
||||
|
||||
def pi(self, data):
    """Write a processing instruction back to the rewriter's output.

    The instruction text is wrapped in ``<?`` and ``>`` and written
    unchanged.

    :param data: full text of the processing instruction
    """
    markup = u'<?' + data + u'>'
    self.rewriter.out.write(markup)
|
||||
def doctype(self, root_tag, public_id, system_id):
    """Write a ``<!doctype ...>`` declaration to the rewriter's output.

    The declaration is rebuilt from its components; any component that
    is empty or None is left out.  Identifiers are written as quoted
    literals, and a system identifier that accompanies a public one
    follows it directly (``PUBLIC "pub" "sys"``); the ``SYSTEM`` keyword
    is used only when there is no public identifier.

    :param root_tag: name of the root element, e.g. ``html``
    :param public_id: public identifier, expected without surrounding
        quotes
    :param system_id: system identifier, expected without surrounding
        quotes
    """
    def quoted(value):
        # Identifier literals must be delimited; fall back to single
        # quotes when the value itself contains a double quote.
        mark = u"'" if u'"' in value else u'"'
        return mark + value + mark

    parts = [u'<!doctype']

    if root_tag:
        parts.append(u' ' + root_tag)

    if public_id:
        parts.append(u' PUBLIC ' + quoted(public_id))
        if system_id:
            parts.append(u' ' + quoted(system_id))
    elif system_id:
        parts.append(u' SYSTEM ' + quoted(system_id))

    parts.append(u'>')
    self.rewriter.out.write(u''.join(parts))
|
||||
|
||||
def pi(self, target, data):
    """Write a processing instruction back to the rewriter's output.

    Emits ``<?target data>``.  When ``data`` is None only ``<?target>``
    is written, since concatenating None would raise a TypeError.

    :param target: name of the processing instruction
    :param data: remaining instruction text, or None
        (NOTE(review): presumably how the parser reports an instruction
        without content -- confirm against the parser in use)
    """
    markup = u'<?' + target

    if data is not None:
        markup += u' ' + data

    self.rewriter.out.write(markup + u'>')
|
||||
|
||||
def close(self):
    """Finish handling the document.

    Nothing is written here; an empty string is returned as the result.
    """
    return ''
|
||||
|
@ -46,6 +46,9 @@ ur"""
|
||||
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
||||
|
||||
>>> parse('<meta http-equiv="refresh" content="text/html; charset=utf-8" />')
|
||||
<meta http-equiv="refresh" content="text/html; charset=utf-8"/>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<meta http-equiv="refresh" content="">
|
||||
|
||||
@ -63,6 +66,7 @@ ur"""
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
# Style
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
|
||||
|
||||
@ -77,11 +81,29 @@ ur"""
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
|
||||
/* Insert */<body><div>SomeTest</div>
|
||||
>>> parse('<body><div style="">SomeTest</div>', head_insert = '/* Insert */')
|
||||
/* Insert */<body><div style="">SomeTest</div>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
||||
|
||||
>>> parse('<!doctype html PUBLIC "public">')
|
||||
<!doctype html PUBLIC "public">
|
||||
|
||||
# uncommon markup
|
||||
>>> parse('<?test content?>')
|
||||
<?test content?>
|
||||
|
||||
# no special cdata treatment, preserved in <script>
|
||||
>>> parse('<script><![CDATA[ <a href="path.html"></a> ]]></script>')
|
||||
<script><![CDATA[ <a href="path.html"></a> ]]></script>
|
||||
|
||||
# CDATA outside of <script> parsed and *not* rewritten
|
||||
>>> parse('<?test content><![CDATA[ <a href="http://example.com"></a> ]]>')
|
||||
<?test content><![CDATA[ <a href="http://example.com"></a> ]>
|
||||
|
||||
>>> parse('<!-- <a href="http://example.com"></a> -->')
|
||||
<!-- <a href="http://example.com"></a> -->
|
||||
"""
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
@ -86,13 +86,30 @@ ur"""
|
||||
>>> parse('<body>abc</body></html><input type="hidden" value="def"/>')
|
||||
<html><body>abc</body><input type="hidden" value="def"></input></html>
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html><div>abcdef</div>')
|
||||
<html><body><div>abcdef</div></body></html>
|
||||
|
||||
# no attr value
|
||||
>>> parse('<checkbox selected></checkbox')
|
||||
<html><body><checkbox selected=""></checkbox></body></html>
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html><div>abcdef</div>')
|
||||
<!doctype html><html><body><div>abcdef</div></body></html>
|
||||
|
||||
>>> parse('<!doctype html PUBLIC "public"><div>abcdef</div>')
|
||||
<!doctype html PUBLIC public><html><body><div>abcdef</div></body></html>
|
||||
|
||||
>>> parse('<!doctype html SYSTEM "system"><div>abcdef</div>')
|
||||
<!doctype html SYSTEM system><html><body><div>abcdef</div></body></html>
|
||||
|
||||
# uncommon markup
|
||||
>>> parse('<?test content?>')
|
||||
<?test content?>
|
||||
|
||||
# no special cdata treatment, preserved in <script>
|
||||
>>> parse('<script><![CDATA[ <a href="path.html"></a> ]]></script>')
|
||||
<html><head><script><![CDATA[ <a href="path.html"></a> ]]></script></head></html>
|
||||
|
||||
>>> parse('<!-- <a href="http://example.com"></a> -->')
|
||||
<!-- <a href="http://example.com"></a> -->
|
||||
"""
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
Loading…
x
Reference in New Issue
Block a user