mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
first pass -- lxml parser!
This commit is contained in:
parent
b0a7cafe6d
commit
bd10c6c2d2
@ -9,9 +9,10 @@ from HTMLParser import HTMLParser, HTMLParseError
|
||||
from url_rewriter import UrlRewriter
|
||||
from regex_rewriters import JSRewriter, CSSRewriter
|
||||
|
||||
import cgi
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriter(HTMLParser):
|
||||
class HTMLRewriterMixin(object):
|
||||
"""
|
||||
HTML-Parsing Rewriter for custom rewriting, also delegates
|
||||
to rewriters for script and css
|
||||
@ -56,10 +57,13 @@ class HTMLRewriter(HTMLParser):
|
||||
# ===========================
|
||||
class AccumBuff:
|
||||
def __init__(self):
|
||||
self.buff = ''
|
||||
self.ls = []
|
||||
|
||||
def write(self, string):
|
||||
self.buff += string
|
||||
self.ls.append(string)
|
||||
|
||||
def getvalue(self):
|
||||
return ''.join(self.ls)
|
||||
|
||||
# ===========================
|
||||
def __init__(self, url_rewriter,
|
||||
@ -67,8 +71,6 @@ class HTMLRewriter(HTMLParser):
|
||||
js_rewriter_class=JSRewriter,
|
||||
css_rewriter_class=CSSRewriter):
|
||||
|
||||
HTMLParser.__init__(self)
|
||||
|
||||
self.url_rewriter = url_rewriter
|
||||
self._wb_parse_context = None
|
||||
#self.out = outstream if outstream else self.AccumBuff()
|
||||
@ -126,7 +128,7 @@ class HTMLRewriter(HTMLParser):
|
||||
return value.lower() == attr_value.lower()
|
||||
return False
|
||||
|
||||
def rewrite_tag_attrs(self, tag, tag_attrs, is_start_end):
|
||||
def _rewrite_tag_attrs(self, tag, tag_attrs, escape=False):
|
||||
# special case: script or style parse context
|
||||
if ((tag in self.STATE_TAGS) and not self._wb_parse_context):
|
||||
self._wb_parse_context = tag
|
||||
@ -148,8 +150,7 @@ class HTMLRewriter(HTMLParser):
|
||||
|
||||
self.out.write('<' + tag)
|
||||
|
||||
for attr in tag_attrs:
|
||||
attr_name, attr_value = attr
|
||||
for attr_name, attr_value in tag_attrs:
|
||||
|
||||
# special case: inline JS/event handler
|
||||
if ((attr_value and attr_value.startswith('javascript:'))
|
||||
@ -174,24 +175,38 @@ class HTMLRewriter(HTMLParser):
|
||||
if rw_mod is not None:
|
||||
attr_value = self._rewrite_url(attr_value, rw_mod)
|
||||
|
||||
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
||||
# 'attr=""' is more common, so use that form
|
||||
if attr_value:
|
||||
self.out.write(' ' + attr_name + '="' + attr_value + '"')
|
||||
else:
|
||||
self.out.write(' ' + attr_name + '=""')
|
||||
|
||||
self.out.write('/>' if is_start_end else '>')
|
||||
|
||||
# special case: head tag
|
||||
if (self.head_insert and
|
||||
not self._wb_parse_context and
|
||||
(tag == 'head')):
|
||||
self.out.write(self.head_insert)
|
||||
self.head_insert = None
|
||||
# write the attr!
|
||||
self._write_attr(attr_name, attr_value, escape=escape)
|
||||
|
||||
return True
|
||||
|
||||
def _rewrite_head(self, start_end):
|
||||
# special case: head tag
|
||||
|
||||
# if no insert or in context, no rewrite
|
||||
if not self.head_insert or self._wb_parse_context:
|
||||
return False
|
||||
|
||||
self.out.write('>')
|
||||
self.out.write(self.head_insert)
|
||||
self.head_insert = None
|
||||
|
||||
if start_end:
|
||||
self.out.write('</head>')
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _write_attr(self, name, value, escape=False):
|
||||
# parser doesn't differentiate between 'attr=""' and just 'attr'
|
||||
# 'attr=""' is more common, so use that form
|
||||
if value:
|
||||
if escape:
|
||||
value = cgi.escape(value, quote=True)
|
||||
self.out.write(' ' + name + '="' + value + '"')
|
||||
else:
|
||||
self.out.write(' ' + name + '=""')
|
||||
|
||||
def parse_data(self, data):
|
||||
if self._wb_parse_context == 'script':
|
||||
data = self._rewrite_script(data)
|
||||
@ -204,18 +219,35 @@ class HTMLRewriter(HTMLParser):
|
||||
if not self.out:
|
||||
self.out = self.AccumBuff()
|
||||
|
||||
try:
|
||||
self.feed(string)
|
||||
except HTMLParseError:
|
||||
self.out.write(string)
|
||||
self.feed(string)
|
||||
|
||||
result = self.out.buff
|
||||
result = self.out.getvalue()
|
||||
# Clear buffer to create new one for next rewrite()
|
||||
self.out = None
|
||||
|
||||
return result
|
||||
|
||||
|
||||
#=================================================================
|
||||
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
|
||||
def __init__(self, url_rewriter,
             head_insert=None,
             js_rewriter_class=JSRewriter,
             css_rewriter_class=CSSRewriter):
    """
    Initialise the HTMLParser base explicitly, then forward all
    arguments unchanged to the next __init__ in the MRO via super().
    """

    # HTMLParser.__init__ is called directly rather than through super()
    # NOTE(review): presumably because HTMLParser does not take part in
    # cooperative super() calls -- confirm
    HTMLParser.__init__(self)
    super(HTMLRewriter, self).__init__(url_rewriter,
                                       head_insert,
                                       js_rewriter_class,
                                       css_rewriter_class)
|
||||
|
||||
# HTMLParser overrides below
|
||||
def feed(self, string):
    """
    Feed a chunk of markup to the underlying HTMLParser.

    If the parser raises HTMLParseError, the chunk is written to the
    output buffer unmodified instead of propagating the error.
    """
    try:
        HTMLParser.feed(self, string)
    except HTMLParseError:
        # Fall back to passing the input through as-is
        self.out.write(string)
|
||||
|
||||
def close(self):
|
||||
if (self._wb_parse_context):
|
||||
end_tag = '</' + self._wb_parse_context + '>'
|
||||
@ -238,12 +270,17 @@ class HTMLRewriter(HTMLParser):
|
||||
return s
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if not self.rewrite_tag_attrs(tag, attrs, False):
|
||||
if not self._rewrite_tag_attrs(tag, attrs):
|
||||
self.out.write(self.get_starttag_text())
|
||||
elif tag != 'head' or not self._rewrite_head(False):
|
||||
self.out.write('>')
|
||||
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
if not self.rewrite_tag_attrs(tag, attrs, True):
|
||||
if not self._rewrite_tag_attrs(tag, attrs):
|
||||
self.out.write(self.get_starttag_text())
|
||||
elif tag != 'head' or not self._rewrite_head(True):
|
||||
self.out.write('/>')
|
||||
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if (tag == self._wb_parse_context):
|
||||
|
214
pywb/rewrite/lxml_parser.py
Normal file
214
pywb/rewrite/lxml_parser.py
Normal file
@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import lxml.html
|
||||
import lxml.etree
|
||||
import cgi
|
||||
|
||||
from regex_rewriters import JSRewriter, CSSRewriter
|
||||
from url_rewriter import UrlRewriter
|
||||
from html_rewriter import HTMLRewriterMixin
|
||||
from StringIO import StringIO
|
||||
|
||||
|
||||
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||
r"""
|
||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||
<html><body><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></body></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
|
||||
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"/><br/></body></html>
|
||||
|
||||
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
|
||||
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"/><br/></body></html>
|
||||
|
||||
# malformed html -- "selected" attrib dropped
|
||||
>>> parse('<input "selected"><img src></div>')
|
||||
<html><body><input/><img src=""/></body></html>
|
||||
|
||||
>>> parse('<html><head><base href="http://example.com/some/path/index.html"/>')
|
||||
<html><head><base href="/web/20131226101010/http://example.com/some/path/index.html"/></head></html>
|
||||
|
||||
# Don't rewrite anchors
|
||||
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
|
||||
<html><body><a href="#abc">Text</a></body></html>
|
||||
|
||||
# Ensure attr values are not unescaped
|
||||
>>> parse('<p data-value=""X"">data</p>')
|
||||
<html><body><p data-value=""X"">data</p></body></html>
|
||||
|
||||
# text moved out of input
|
||||
>>> parse('<input value="val">data</input>')
|
||||
<html><body><input value="val"/>data</body></html>
|
||||
|
||||
>>> parse('<script src="abc.js"></script>')
|
||||
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
||||
|
||||
# Unicode
|
||||
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
||||
|
||||
# Meta tag
|
||||
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
|
||||
<html><head><meta content="10; URL=/web/20131226101010/http://example.com/abc/def.html" http-equiv="refresh"/></head></html>
|
||||
|
||||
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
|
||||
<html><head><meta content="text/html; charset=utf-8" http-equiv="Content-type"/></head></html>
|
||||
|
||||
>>> parse('<META http-equiv="refresh" content>')
|
||||
<html><head><meta content="" http-equiv="refresh"/></head></html>
|
||||
|
||||
# Script tag
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script></head></html>
|
||||
|
||||
# Unterminated script tag, will auto-terminate
|
||||
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
|
||||
<html><head><script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</sc></script></head></html>
|
||||
|
||||
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
|
||||
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
|
||||
|
||||
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
|
||||
<html><body><div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"/></body></html>
|
||||
|
||||
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
|
||||
<html><head><style>@import "/web/20131226101010/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010/http://example.com/some/path/myfont.ttf') }</style></head></html>
|
||||
|
||||
# Unterminated style tag, handle but don't auto-terminate
|
||||
>>> parse('<style>@import url(styles.css)')
|
||||
<html><head><style>@import url(/web/20131226101010/http://example.com/some/path/styles.css)</style></head></html>
|
||||
|
||||
# Head Insertion
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
|
||||
<html>/* Insert */<body><div>SomeTest</div></body></html>
|
||||
|
||||
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
|
||||
<html><head><script>load_stuff();</script><link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"/></head><body><div>SomeTest</div></body></html>
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, url_rewriter,
             head_insert=None,
             js_rewriter_class=JSRewriter,
             css_rewriter_class=CSSRewriter):
    """
    Set up the shared rewriter state, then build an lxml HTMLParser that
    reports its parse events to a RewriterTarget bound to this rewriter.
    """
    super(LXMLHTMLRewriter, self).__init__(
        url_rewriter,
        head_insert,
        js_rewriter_class,
        css_rewriter_class)

    # Parse events are delivered to this object, not built into a tree
    self.target = RewriterTarget(self)

    # Every remove_*/strip_* option is switched off
    parser_options = dict(
        remove_pis=False,
        remove_blank_text=False,
        remove_comments=False,
        strip_cdata=False,
        compact=True,
        target=self.target,
        #encoding='utf-8'
    )

    self.parser = lxml.etree.HTMLParser(**parser_options)
|
||||
|
||||
|
||||
def feed(self, string):
    """Pass a chunk of markup straight to the lxml parser."""
    self.parser.feed(string)
|
||||
|
||||
def close(self):
    """
    Close the lxml parser and return whatever output accumulated while
    doing so. The output buffer is reset to None afterwards.
    """
    # Closing the parser may still write output, so make sure a buffer
    # exists before it runs
    self.out = self.out or self.AccumBuff()

    self.parser.close()

    # Hand back the buffered text and clear the buffer so that the next
    # rewrite() creates a new one
    flushed, self.out = self.out.getvalue(), None
    return flushed
|
||||
|
||||
|
||||
class RewriterTarget(object):
    """
    Parser target that turns lxml parse events back into markup.

    Each event (start, end, data, comment, close) is forwarded to the
    owning rewriter, which performs the rewriting and owns the output
    buffer. A start tag is written without its closing '>' and remembered
    in curr_tag, so that an immediately following end event can emit '/>'.
    """

    def __init__(self, rewriter):
        self.rewriter = rewriter
        # Name of the start tag whose '>' has not been written yet, if any
        self.curr_tag = None

    def _close_tag(self):
        """Write the deferred '>' of the pending start tag, if there is one."""
        if not self.curr_tag:
            return

        self.rewriter.out.write('>')
        self.curr_tag = None

    def start(self, tag, attrs):
        """Handle a start tag; its closing '>' is left pending."""
        self._close_tag()
        attrs = attrs.items()

        self.curr_tag = tag
        rewriter = self.rewriter

        if not rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
            # The rewriter did not emit the tag: write it here
            rewriter.out.write('<' + tag)

            for name, value in attrs:
                rewriter._write_attr(name, value, escape=True)

            return

        # The rewriter emitted the tag; a successful head insert has also
        # written the '>' already, so nothing is left pending
        if tag == 'head' and rewriter._rewrite_head(False):
            self.curr_tag = None

    def end(self, tag):
        """Handle an end tag, collapsing an empty element to '/>'."""
        rewriter = self.rewriter

        # Leaving the element that opened the current parse context
        if tag == rewriter._wb_parse_context:
            rewriter._wb_parse_context = None

        # True when the start tag for this same name is still pending
        closes_pending_tag = (self.curr_tag == tag)

        # script is never collapsed to the self-closing form
        if closes_pending_tag and tag != 'script':
            rewriter.out.write('/>')
            self.curr_tag = None
            return

        self._close_tag()
        rewriter.out.write('</' + tag + '>')

    def data(self, data):
        """Handle text content, escaping it outside a parse context."""
        self._close_tag()

        # Content inside a parse context is passed on unescaped
        if self.rewriter._wb_parse_context:
            text = data
        else:
            text = cgi.escape(data, quote=True)

        self.rewriter.parse_data(text)

    def comment(self, text):
        """Handle a comment, routing its body through parse_data."""
        self._close_tag()

        write = self.rewriter.out.write
        write('<!--')
        self.rewriter.parse_data(text)
        write('-->')

    def close(self):
        """Flush any pending start tag; always returns an empty string."""
        self._close_tag()
        return ''
|
||||
|
||||
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
|
||||
|
||||
def parse(data, head_insert=None):
|
||||
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
|
||||
print parser.rewrite(data) + parser.close()
|
||||
#return parser.rewrite(data) + parser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
import sys
|
||||
if len(sys.argv) == 1:
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
else:
|
||||
parser = LXMLHTMLRewriter(urlrewriter)
|
||||
x = open(sys.argv[1])
|
||||
b = x.read()
|
||||
while b:
|
||||
print parser.rewrite(b)
|
||||
b = x.read()
|
||||
print parser.close()
|
@ -3,6 +3,7 @@ from pywb.utils.dsrules import BaseRule
|
||||
from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter
|
||||
from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter
|
||||
from html_rewriter import HTMLRewriter
|
||||
from lxml_parser import LXMLHTMLRewriter
|
||||
from header_rewriter import HeaderRewriter
|
||||
|
||||
import itertools
|
||||
@ -20,7 +21,8 @@ class RewriteRules(BaseRule):
|
||||
self.rewriters['header'] = config.get('header_class', HeaderRewriter)
|
||||
self.rewriters['css'] = config.get('css_class', CSSRewriter)
|
||||
self.rewriters['xml'] = config.get('xml_class', XMLRewriter)
|
||||
self.rewriters['html'] = config.get('html_class', HTMLRewriter)
|
||||
self.rewriters['html'] = config.get('html_class', LXMLHTMLRewriter)
|
||||
#self.rewriters['html'] = config.get('html_class', HTMLRewriter)
|
||||
|
||||
# Custom handling for js rewriting, often the most complex
|
||||
self.js_rewrite_location = config.get('js_rewrite_location', True)
|
||||
|
@ -74,6 +74,9 @@ r"""
|
||||
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
|
||||
<html><head><script src="cool.js"></script></head><body>Test</body></html>
|
||||
|
||||
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
|
||||
/* Insert */<body><div>SomeTest</div>
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user