1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: add doctype rewriting, more tests on various markup edge cases

This commit is contained in:
Ilya Kreymer 2014-03-23 23:46:49 -07:00
parent 742df6238e
commit 9654c22bed
3 changed files with 57 additions and 8 deletions

View File

@ -77,8 +77,18 @@ class RewriterTarget(object):
self.rewriter.parse_data(data)
self.rewriter.out.write(u'-->')
def pi(self, data):
    """Write a processing instruction to the rewriter's output.

    The text in ``data`` is wrapped as ``<?`` + data + ``>`` and written
    as-is; no closing ``?`` is added here.
    """
    sink = self.rewriter.out
    markup = u'<?' + data + u'>'
    sink.write(markup)
def doctype(self, root_tag, public_id, system_id):
    """Write a ``<!doctype ...>`` declaration to the rewriter's output.

    Each of ``root_tag``, ``public_id`` and ``system_id`` is emitted only
    when it is truthy, in that order, prefixed by ``' '``, ``' PUBLIC '``
    and ``' SYSTEM '`` respectively.

    NOTE(review): the public and system identifiers are written exactly as
    received, without surrounding quotes -- confirm this is the intended
    output format.
    """
    write = self.rewriter.out.write
    write(u'<!doctype')

    # (prefix, value) pairs in the order they appear in the declaration.
    parts = (
        (' ', root_tag),
        (' PUBLIC ', public_id),
        (' SYSTEM ', system_id),
    )
    for prefix, value in parts:
        if value:
            write(prefix + value)

    write(u'>')
def pi(self, target, data):
    """Write a processing instruction to the rewriter's output.

    The output is ``<?`` + target + one space + data + ``>``; the space is
    always written, even when ``data`` is empty.
    """
    body = target + ' ' + data
    self.rewriter.out.write(u'<?' + body + u'>')
def close(self):
    """Return an empty string; nothing is written to the output here."""
    nothing = ''
    return nothing

View File

@ -46,6 +46,9 @@ ur"""
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
>>> parse('<meta http-equiv="refresh" content="text/html; charset=utf-8" />')
<meta http-equiv="refresh" content="text/html; charset=utf-8"/>
>>> parse('<META http-equiv="refresh" content>')
<meta http-equiv="refresh" content="">
@ -63,6 +66,7 @@ ur"""
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
# Style
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style>
@ -77,11 +81,29 @@ ur"""
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script></head><body>Test</body></html>
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div>SomeTest</div>
>>> parse('<body><div style="">SomeTest</div>', head_insert = '/* Insert */')
/* Insert */<body><div style="">SomeTest</div>
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
>>> parse('<!doctype html PUBLIC "public">')
<!doctype html PUBLIC "public">
# uncommon markup
>>> parse('<?test content?>')
<?test content?>
# no special cdata treatment, preserved in <script>
>>> parse('<script><![CDATA[ <a href="path.html"></a> ]]></script>')
<script><![CDATA[ <a href="path.html"></a> ]]></script>
# CDATA outside of <script> parsed and *not* rewritten
>>> parse('<?test content><![CDATA[ <a href="http://example.com"></a> ]]>')
<?test content><![CDATA[ <a href="http://example.com"></a> ]>
>>> parse('<!-- <a href="http://example.com"></a> -->')
<!-- <a href="http://example.com"></a> -->
"""
from pywb.rewrite.url_rewriter import UrlRewriter

View File

@ -86,13 +86,30 @@ ur"""
>>> parse('<body>abc</body></html><input type="hidden" value="def"/>')
<html><body>abc</body><input type="hidden" value="def"></input></html>
# doctype
>>> parse('<!doctype html><div>abcdef</div>')
<html><body><div>abcdef</div></body></html>
# no attr value
>>> parse('<checkbox selected></checkbox')
<html><body><checkbox selected=""></checkbox></body></html>
# doctype
>>> parse('<!doctype html><div>abcdef</div>')
<!doctype html><html><body><div>abcdef</div></body></html>
>>> parse('<!doctype html PUBLIC "public"><div>abcdef</div>')
<!doctype html PUBLIC public><html><body><div>abcdef</div></body></html>
>>> parse('<!doctype html SYSTEM "system"><div>abcdef</div>')
<!doctype html SYSTEM system><html><body><div>abcdef</div></body></html>
# uncommon markup
>>> parse('<?test content?>')
<?test content?>
# no special cdata treatment, preserved in <script>
>>> parse('<script><![CDATA[ <a href="path.html"></a> ]]></script>')
<html><head><script><![CDATA[ <a href="path.html"></a> ]]></script></head></html>
>>> parse('<!-- <a href="http://example.com"></a> -->')
<!-- <a href="http://example.com"></a> -->
"""
from pywb.rewrite.url_rewriter import UrlRewriter