mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cleanup: move lxml tests to separate test dir; separate html, lxml html and regex
tests into separate files; fix lxml toggle in rewriterrules
This commit is contained in:
parent
f35e82a4d5
commit
2e7b17ed56
94
pywb/rewrite/lxml_html_rewriter.py
Normal file
94
pywb/rewrite/lxml_html_rewriter.py
Normal file
@ -0,0 +1,94 @@
|
||||
import lxml.etree
|
||||
import cgi
|
||||
import re
|
||||
|
||||
from regex_rewriters import JSRewriter, CSSRewriter
|
||||
from url_rewriter import UrlRewriter
|
||||
from html_rewriter import HTMLRewriterMixin
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LXMLHTMLRewriter(HTMLRewriterMixin):
    """HTML rewriter driven by an lxml feed parser.

    The parser is created with a RewriterTarget as its target, so parse
    events are handed to that object, which writes the output through
    this rewriter.
    """

    # Closing </html> tag: any letter case, optional whitespace inside
    # the angle brackets.
    END_HTML = re.compile(r'</\s*html\s*>', re.IGNORECASE)

    def __init__(self, url_rewriter,
                 head_insert=None,
                 js_rewriter_class=JSRewriter,
                 css_rewriter_class=CSSRewriter):
        """Set up the mixin state, then build the target and the parser.

        All four arguments are forwarded unchanged to the parent
        initialiser.
        """
        parent = super(LXMLHTMLRewriter, self)
        parent.__init__(url_rewriter,
                        head_insert,
                        js_rewriter_class,
                        css_rewriter_class)

        self.target = RewriterTarget(self)

        # Keep processing instructions, blank text, comments and CDATA,
        # and let the parser recover from malformed markup.
        parser_options = {
            'remove_pis': False,
            'remove_blank_text': False,
            'remove_comments': False,
            'strip_cdata': False,
            'compact': True,
            'target': self.target,
            'recover': True,
        }
        self.parser = lxml.etree.HTMLParser(**parser_options)

    def feed(self, string):
        """Feed ``string`` to the parser with every </html> tag removed.

        NOTE(review): presumably the end tag is dropped so that markup
        following it is still emitted inside the document -- confirm.
        """
        without_end_tag = self.END_HTML.sub(u'', string)
        self.parser.feed(without_end_tag)

    def close(self):
        """Close the parser and return everything accumulated in ``out``.

        ``out``, ``AccumBuff`` and ``is_closing`` are not defined in this
        class; presumably they come from HTMLRewriterMixin.
        """
        if not self.out:
            self.out = self.AccumBuff()

        self.is_closing = True
        self.parser.close()

        output = self.out.getvalue()

        # Drop the buffer so a new one is created for the next rewrite()
        self.out = None
        return output
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriterTarget(object):
    """Parser target that serialises lxml parse events through a rewriter.

    Every callback writes to ``rewriter.out`` or delegates to one of the
    rewriter's helpers (``_rewrite_tag_attrs``, ``_write_attr``,
    ``_rewrite_head``, ``parse_data``).
    """

    def __init__(self, rewriter):
        """Remember the rewriter that owns the output buffer."""
        self.rewriter = rewriter

    def start(self, tag, attrs):
        """Write the opening tag for ``tag`` with its attributes.

        ``attrs`` is the attribute mapping passed by the parser; its
        items are handed to the rewriter.
        """
        attrs = attrs.items()

        if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
            # The rewriter did not handle this tag: write it as-is.
            self.rewriter.out.write(u'<' + tag)

            for name, value in attrs:
                self.rewriter._write_attr(name, value, escape=True)
        elif tag == u'head' and self.rewriter._rewrite_head(False):
            # _rewrite_head() returned a true value; skip the closing
            # '>' below (presumably it has already been written).
            return

        self.rewriter.out.write(u'>')

    def end(self, tag):
        """Write the closing tag, leaving the parse context if it matches."""
        if tag == self.rewriter._wb_parse_context:
            self.rewriter._wb_parse_context = None

        self.rewriter.out.write(u'</' + tag + u'>')

    def data(self, data):
        """Pass text to the rewriter, escaping it outside a parse context."""
        # NOTE(review): cgi.escape() no longer exists on Python 3.8+;
        # this module appears to target Python 2.
        if not self.rewriter._wb_parse_context:
            data = cgi.escape(data, quote=True)

        self.rewriter.parse_data(data)

    def comment(self, data):
        """Write a comment, running its text through ``parse_data``."""
        self.rewriter.out.write(u'<!--')
        self.rewriter.parse_data(data)
        self.rewriter.out.write(u'-->')

    def pi(self, target, data=None):
        """Write a processing instruction.

        lxml invokes a parser target's ``pi`` with two arguments, the
        instruction target and its data, so a one-argument signature
        fails with TypeError as soon as a processing instruction is
        parsed. A single-argument call is still accepted and writes that
        argument verbatim between ``<?`` and ``>``.
        """
        if data is None:
            content = target
        else:
            content = target + u' ' + data

        self.rewriter.out.write(u'<?' + content + u'>')

    def close(self):
        """Return an empty string as the parser's close() result."""
        return ''
|
@ -1,202 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import lxml.etree
|
||||
import cgi
|
||||
import re
|
||||
|
||||
from regex_rewriters import JSRewriter, CSSRewriter
|
||||
from url_rewriter import UrlRewriter
|
||||
from html_rewriter import HTMLRewriterMixin
|
||||
|
||||
|
||||
#=================================================================
|
||||
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
ur"""
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<html><body><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></body></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"></img><br></br></body></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"></img><br></br></body></html>
|
||||
|
||||
# malformed html -- "selected" attrib dropped
|
||||
>>> parse('<input "selected"><img src></div>')
|
||||
<html><body><input></input><img src=""></img></body></html>
|
||||
|
||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"></base></head></html>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
<html><body><a href="#abc">Text</a></body></html>
|
||||
|
||||
# Ensure attr values are not unescaped
|
||||
>>> parse('<p data-value="&quot;X&quot;">data</p>')
|
||||
<html><body><p data-value="&quot;X&quot;">data</p></body></html>
|
||||
|
||||
# text moved out of input
|
||||
>>> parse('<input value="val">data</input>')
|
||||
<html><body><input value="val"></input>data</body></html>
|
||||
|
||||
>>> parse('<script src="abc.js"></script>')
|
||||
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
||||
|
||||
# Unicode
|
||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
||||
|
||||
# Meta tag
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
<html><head><meta content="10; URL=/web/20131226101010/http://example.com/abc/def.html" http-equiv="refresh"></meta></head></html>
|
||||
|
||||
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||
<html><head><meta content="text/html; charset=utf-8" http-equiv="Content-type"></meta></head></html>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<html><head><meta content="" http-equiv="refresh"></meta></head></html>
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script></head></html>
|
||||
|
||||
# Unterminated script tag, will auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script></head></html>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<html><body><div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<html><head><style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style></head></html>
|
||||
|
||||
# Unterminated style tag, will auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<html><head><style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style></head></html>
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
|
||||
<html>/* Insert */<body><div>SomeTest</div></body></html>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<html><head><script>load_stuff();</script><link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"></link></head><body><div>SomeTest</div></body></html>
|
||||
|
||||
|
||||
# content after </html>
|
||||
>>> parse('<body>abc</body></html><input type="hidden" value="def"/>')
|
||||
<html><body>abc</body><input type="hidden" value="def"></input></html>
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html><div>abcdef</div>')
|
||||
<html><body><div>abcdef</div></body></html>
|
||||
|
||||
# no attr value
|
||||
>>> parse('<checkbox selected></checkbox')
|
||||
<html><body><checkbox selected=""></checkbox></body></html>
|
||||
"""
|
||||
|
||||
END_HTML = re.compile(r'</\s*html\s*>', re.IGNORECASE)
|
||||
|
||||
def __init__(self, url_rewriter,
             head_insert=None,
             js_rewriter_class=JSRewriter,
             css_rewriter_class=CSSRewriter):
    """Initialise the parent class, then create the target and parser.

    The four arguments are passed through to the parent initialiser
    unchanged.
    """
    super(LXMLHTMLRewriter, self).__init__(
        url_rewriter, head_insert, js_rewriter_class, css_rewriter_class)

    self.target = RewriterTarget(self)

    # Nothing is stripped from the input (PIs, blank text, comments,
    # CDATA) and malformed markup is recovered from.
    options = dict(remove_pis=False,
                   remove_blank_text=False,
                   remove_comments=False,
                   strip_cdata=False,
                   compact=True,
                   target=self.target,
                   recover=True)
    self.parser = lxml.etree.HTMLParser(**options)
|
||||
|
||||
def feed(self, string):
    """Remove every closing </html> tag from ``string`` and parse the rest."""
    stripped = self.END_HTML.sub(u'', string)
    self.parser.feed(stripped)
|
||||
|
||||
def close(self):
    """Finish parsing and return the buffered output.

    A buffer is created first when none is active, and the buffer is
    discarded afterwards.
    """
    if not self.out:
        self.out = self.AccumBuff()

    self.is_closing = True
    self.parser.close()

    rewritten = self.out.getvalue()

    # Reset so that the next rewrite() starts with a new buffer
    self.out = None
    return rewritten
|
||||
|
||||
|
||||
#=================================================================
|
||||
class RewriterTarget(object):
    """lxml parser target that writes parse events out via a rewriter.

    Output goes to ``rewriter.out``; tag, attribute and text rewriting is
    delegated to the rewriter's own helper methods.
    """

    def __init__(self, rewriter):
        """Store the rewriter used by every callback."""
        self.rewriter = rewriter

    def start(self, tag, attrs):
        """Emit the start tag for ``tag`` and the items of ``attrs``."""
        attrs = attrs.items()

        if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
            # Not handled by the rewriter: emit the tag unchanged.
            self.rewriter.out.write(u'<' + tag)

            for name, value in attrs:
                self.rewriter._write_attr(name, value, escape=True)
        elif tag == u'head' and self.rewriter._rewrite_head(False):
            # A true result from _rewrite_head() means the closing '>'
            # must not be written here (presumably already emitted).
            return

        self.rewriter.out.write(u'>')

    def end(self, tag):
        """Emit the end tag and clear the parse context when it matches."""
        if tag == self.rewriter._wb_parse_context:
            self.rewriter._wb_parse_context = None

        self.rewriter.out.write(u'</' + tag + u'>')

    def data(self, data):
        """Forward text to ``parse_data``; escape it when no context is set."""
        # NOTE(review): cgi.escape() was removed in Python 3.8; the
        # surrounding module is Python 2 code.
        if not self.rewriter._wb_parse_context:
            data = cgi.escape(data, quote=True)

        self.rewriter.parse_data(data)

    def comment(self, data):
        """Emit a comment whose text is processed by ``parse_data``."""
        self.rewriter.out.write(u'<!--')
        self.rewriter.parse_data(data)
        self.rewriter.out.write(u'-->')

    def pi(self, target, data=None):
        """Emit a processing instruction.

        The lxml parser passes the instruction target and its data as two
        separate arguments; with only one parameter this callback raised
        TypeError for any processing instruction in the input. Calling it
        with a single argument remains supported and emits that argument
        unchanged between ``<?`` and ``>``.
        """
        if data is None:
            content = target
        else:
            content = target + u' ' + data

        self.rewriter.out.write(u'<?' + content + u'>')

    def close(self):
        """Return an empty string as the result of closing the parser."""
        return ''
|
||||
|
||||
# URL rewriter shared by the doctests in this module.
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')


def parse(data, head_insert=None):
    """Rewrite ``data`` with a new LXMLHTMLRewriter and print the result.

    ``data`` is decoded from UTF-8 first; the printed text is the output
    of rewrite() followed by the output of close().
    """
    rewriter = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
    text = data.decode('utf-8')
    rewritten = rewriter.rewrite(text)
    # A single parenthesised expression prints the same way under the
    # Python 2 print statement.
    print(rewritten + rewriter.close())


if __name__ == "__main__":
    import doctest
    doctest.testmod()
|
@ -15,7 +15,8 @@ HTML = HTMLRewriter
|
||||
def use_lxml_parser():
|
||||
try:
|
||||
import logging
|
||||
from lxml_parser import LXMLHTMLRewriter
|
||||
from lxml_html_rewriter import LXMLHTMLRewriter
|
||||
global HTML
|
||||
HTML = LXMLHTMLRewriter
|
||||
logging.debug('Using LXML Parser')
|
||||
except ImportError:
|
||||
|
101
pywb/rewrite/test/test_html_rewriter.py
Normal file
101
pywb/rewrite/test/test_html_rewriter.py
Normal file
@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
ur"""
|
||||
|
||||
#=================================================================
|
||||
# HTML Rewriting (using native HTMLParser)
|
||||
#=================================================================
|
||||
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
|
||||
|
||||
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||
|
||||
# malformed html -- (2.6 parser raises exception)
|
||||
#>>> parse('<input "selected"><img src></div>')
|
||||
#<input "selected"=""><img src=""></div>
|
||||
|
||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||
|
||||
# HTML Entities
|
||||
>>> parse('<a href="">› ></div>')
|
||||
<a href="">› ></div>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
<HTML><a href="#abc">Text</a></html>
|
||||
|
||||
# Ensure attr values are not unescaped
|
||||
>>> parse('<input value="&quot;X&quot;">X</input>')
|
||||
<input value="&quot;X&quot;">X</input>
|
||||
|
||||
# Unicode
|
||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||
|
||||
# Meta tag
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||
|
||||
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<meta http-equiv="refresh" content="">
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||
|
||||
# Unterminated script tag, handle but don't auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag, handle but don't auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
|
||||
/* Insert */<body><div>SomeTest</div>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
||||
"""
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.html_rewriter import HTMLRewriter
|
||||
|
||||
import pprint
|
||||
|
||||
# URL rewriter shared by the doctests in this module.
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')


def parse(data, head_insert=None):
    """Rewrite ``data`` with a new HTMLRewriter and print the result.

    ``data`` is decoded from UTF-8 first; the printed text is the output
    of rewrite() followed by the output of close().
    """
    rewriter = HTMLRewriter(urlrewriter, head_insert=head_insert)
    text = data.decode('utf-8')
    rewritten = rewriter.rewrite(text)
    # A single parenthesised expression prints the same way under the
    # Python 2 print statement.
    print(rewritten + rewriter.close())


if __name__ == "__main__":
    import doctest
    doctest.testmod()
|
114
pywb/rewrite/test/test_lxml_html_rewriter.py
Normal file
114
pywb/rewrite/test/test_lxml_html_rewriter.py
Normal file
@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
ur"""
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<html><body><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></body></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"></img><br></br></body></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"></img><br></br></body></html>
|
||||
|
||||
# malformed html -- "selected" attrib dropped
|
||||
>>> parse('<input "selected"><img src></div>')
|
||||
<html><body><input></input><img src=""></img></body></html>
|
||||
|
||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"></base></head></html>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
<html><body><a href="#abc">Text</a></body></html>
|
||||
|
||||
# Ensure attr values are not unescaped
|
||||
>>> parse('<p data-value="&quot;X&quot;">data</p>')
|
||||
<html><body><p data-value="&quot;X&quot;">data</p></body></html>
|
||||
|
||||
# text moved out of input
|
||||
>>> parse('<input value="val">data</input>')
|
||||
<html><body><input value="val"></input>data</body></html>
|
||||
|
||||
>>> parse('<script src="abc.js"></script>')
|
||||
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
||||
|
||||
# Unicode
|
||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
||||
|
||||
# Meta tag
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
<html><head><meta content="10; URL=/web/20131226101010/http://example.com/abc/def.html" http-equiv="refresh"></meta></head></html>
|
||||
|
||||
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||
<html><head><meta content="text/html; charset=utf-8" http-equiv="Content-type"></meta></head></html>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<html><head><meta content="" http-equiv="refresh"></meta></head></html>
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script></head></html>
|
||||
|
||||
# Unterminated script tag, will auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script></head></html>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<html><body><div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<html><head><style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style></head></html>
|
||||
|
||||
# Unterminated style tag, will auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<html><head><style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style></head></html>
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
|
||||
<html>/* Insert */<body><div>SomeTest</div></body></html>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<html><head><script>load_stuff();</script><link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"></link></head><body><div>SomeTest</div></body></html>
|
||||
|
||||
|
||||
# content after </html>
|
||||
>>> parse('<body>abc</body></html><input type="hidden" value="def"/>')
|
||||
<html><body>abc</body><input type="hidden" value="def"></input></html>
|
||||
|
||||
# doctype
|
||||
>>> parse('<!doctype html><div>abcdef</div>')
|
||||
<html><body><div>abcdef</div></body></html>
|
||||
|
||||
# no attr value
|
||||
>>> parse('<checkbox selected></checkbox')
|
||||
<html><body><checkbox selected=""></checkbox></body></html>
|
||||
"""
|
||||
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
|
||||
try:
|
||||
from pywb.rewrite.lxml_html_rewriter import LXMLHTMLRewriter
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# URL rewriter shared by the doctests in this module.
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')


def parse(data, head_insert=None):
    """Rewrite ``data`` with a new LXMLHTMLRewriter and print the result.

    ``data`` is decoded from UTF-8 first; the printed text is the output
    of rewrite() followed by the output of close().
    """
    rewriter = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
    text = data.decode('utf-8')
    rewritten = rewriter.rewrite(text)
    # A single parenthesised expression prints the same way under the
    # Python 2 print statement.
    print(rewritten + rewriter.close())


if __name__ == "__main__":
    import doctest
    doctest.testmod()
|
@ -1,90 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
ur"""
|
||||
|
||||
#=================================================================
|
||||
# HTML Rewriting
|
||||
#=================================================================
|
||||
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<HTML><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body>
|
||||
|
||||
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
<body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body>
|
||||
|
||||
# malformed html -- (2.6 parser raises exception)
|
||||
#>>> parse('<input "selected"><img src></div>')
|
||||
#<input "selected"=""><img src=""></div>
|
||||
|
||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/>
|
||||
|
||||
# HTML Entities
|
||||
>>> parse('<a href="">› ></div>')
|
||||
<a href="">› ></div>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
<HTML><a href="#abc">Text</a></html>
|
||||
|
||||
# Ensure attr values are not unescaped
|
||||
>>> parse('<input value="&quot;X&quot;">X</input>')
|
||||
<input value="&quot;X&quot;">X</input>
|
||||
|
||||
# Unicode
|
||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||
<a href="/web/20131226101010/http://испытание.испытание/">испытание</a>
|
||||
|
||||
# Meta tag
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
<meta http-equiv="refresh" content="10; URL=/web/20131226101010/http://example.com/abc/def.html">
|
||||
|
||||
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||
<meta http-equiv="Content-type" content="text/html; charset=utf-8"/>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<meta http-equiv="refresh" content="">
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
|
||||
|
||||
# Unterminated script tag, handle but don't auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style>
|
||||
|
||||
# Unterminated style tag, handle but don't auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
|
||||
/* Insert */<body><div>SomeTest</div>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
|
||||
|
||||
r"""
|
||||
#=================================================================
|
||||
# Custom Regex
|
||||
#=================================================================
|
||||
# Test https->http converter (other tests below in subclasses)
|
||||
>>> RegexRewriter([(RegexRewriter.HTTPX_MATCH_STR, RegexRewriter.remove_https, 0)]).rewrite('a = https://example.com; b = http://example.com; c = https://some-url/path/https://embedded.example.com')
|
||||
'a = http://example.com; b = http://example.com; c = http://some-url/path/http://embedded.example.com'
|
||||
@ -217,7 +134,7 @@ HTTP Headers Rewriting
|
||||
{'charset': None,
|
||||
'removed_header_dict': {},
|
||||
'status_headers': StatusAndHeaders(protocol = '', statusline = '302 Redirect', headers = [ ('X-Archive-Orig-Connection', 'close'),
|
||||
('Location', '/web/20131226101010/http://example.com/other.html')]),
|
||||
('Location', '/web/20131010im_/http://example.com/other.html')]),
|
||||
'text_type': None}
|
||||
|
||||
# gzip
|
||||
@ -248,7 +165,6 @@ Removing Transfer-Encoding always, Was:
|
||||
|
||||
#=================================================================
|
||||
from pywb.rewrite.url_rewriter import UrlRewriter
|
||||
from pywb.rewrite.html_rewriter import HTMLRewriter
|
||||
from pywb.rewrite.regex_rewriters import RegexRewriter, JSRewriter, CSSRewriter, XMLRewriter
|
||||
from pywb.rewrite.header_rewriter import HeaderRewriter
|
||||
|
||||
@ -256,24 +172,17 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
|
||||
import pprint
|
||||
|
||||
# URL rewriter used by parse() below.
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')


def parse(data, head_insert=None):
    """Rewrite ``data`` with a new HTMLRewriter and print the result.

    ``data`` is decoded from UTF-8 first; the printed text is the output
    of rewrite() followed by the output of close().
    """
    rewriter = HTMLRewriter(urlrewriter, head_insert=head_insert)
    text = data.decode('utf-8')
    rewritten = rewriter.rewrite(text)
    # A single parenthesised expression prints the same way under the
    # Python 2 print statement.
    print(rewritten + rewriter.close())
|
||||
|
||||
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
|
||||
urlrewriter = UrlRewriter('20131010im_/http://example.com/', '/web/')
|
||||
|
||||
|
||||
def _test_js(string, extra=None):
    """Return ``string`` rewritten by a JSRewriter.

    The rewriter is built from the module-level ``urlrewriter``;
    ``extra`` is passed as its second argument, and an empty list is
    used when it is omitted.
    """
    # A None default avoids sharing one mutable list between calls.
    if extra is None:
        extra = []

    return JSRewriter(urlrewriter, extra).rewrite(string)
|
||||
|
||||
def _test_xml(string):
    """Return ``string`` rewritten by an XMLRewriter built from ``urlrewriter``."""
    return XMLRewriter(urlrewriter).rewrite(string)
|
||||
|
||||
def _test_css(string):
    """Return ``string`` rewritten by a CSSRewriter built from ``urlrewriter``."""
    return CSSRewriter(urlrewriter).rewrite(string)
|
||||
|
||||
headerrewriter = HeaderRewriter()
|
||||
|
Loading…
x
Reference in New Issue
Block a user