diff --git a/pywb/rewrite/lxml_html_rewriter.py b/pywb/rewrite/lxml_html_rewriter.py new file mode 100644 index 00000000..b245d055 --- /dev/null +++ b/pywb/rewrite/lxml_html_rewriter.py @@ -0,0 +1,94 @@ +import lxml.etree +import cgi +import re + +from regex_rewriters import JSRewriter, CSSRewriter +from url_rewriter import UrlRewriter +from html_rewriter import HTMLRewriterMixin + + +#================================================================= +class LXMLHTMLRewriter(HTMLRewriterMixin): + END_HTML = re.compile(r'', re.IGNORECASE) + + def __init__(self, url_rewriter, + head_insert=None, + js_rewriter_class=JSRewriter, + css_rewriter_class=CSSRewriter): + + super(LXMLHTMLRewriter, self).__init__(url_rewriter, + head_insert, + js_rewriter_class, + css_rewriter_class) + + self.target = RewriterTarget(self) + self.parser = lxml.etree.HTMLParser(remove_pis=False, + remove_blank_text=False, + remove_comments=False, + strip_cdata=False, + compact=True, + target=self.target, + recover=True, + ) + + def feed(self, string): + string = self.END_HTML.sub(u'', string) + #string = string.replace(u'', u'') + self.parser.feed(string) + + def close(self): + if not self.out: + self.out = self.AccumBuff() + + self.is_closing = True + self.parser.close() + + result = self.out.getvalue() + # Clear buffer to create new one for next rewrite() + self.out = None + + return result + + +#================================================================= +class RewriterTarget(object): + def __init__(self, rewriter): + self.rewriter = rewriter + + def start(self, tag, attrs): + attrs = attrs.items() + + if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): + self.rewriter.out.write(u'<' + tag) + + for name, value in attrs: + self.rewriter._write_attr(name, value, escape=True) + else: + if tag == u'head': + if (self.rewriter._rewrite_head(False)): + return + + self.rewriter.out.write(u'>') + + def end(self, tag): + if (tag == self.rewriter._wb_parse_context): + self.rewriter._wb_parse_context = None + + self.rewriter.out.write(u'') + + def data(self, data): + if not self.rewriter._wb_parse_context: + data = cgi.escape(data, quote=True) + + self.rewriter.parse_data(data) + + def comment(self, data): + self.rewriter.out.write(u'') + + def pi(self, data): + self.rewriter.out.write(u'') + + def close(self): + return '' diff --git a/pywb/rewrite/lxml_parser.py b/pywb/rewrite/lxml_parser.py deleted file mode 100644 index 5137d74e..00000000 --- a/pywb/rewrite/lxml_parser.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import lxml.etree -import cgi -import re - -from regex_rewriters import JSRewriter, CSSRewriter -from url_rewriter import UrlRewriter -from html_rewriter import HTMLRewriterMixin - - -#================================================================= -class LXMLHTMLRewriter(HTMLRewriterMixin): - ur""" - >>> parse('Text') - Text - - >>> parse('
') -

- - >>> parse('
') -

- - # malformed html -- "selected" attrib dropped - >>> parse('') - - - >>> parse('') - - - # Don't rewrite anchors - >>> parse('Text') - Text - - # Ensure attr values are not unescaped - >>> parse('

data

') -

data

- - # text moved out of input - >>> parse('data') - data - - >>> parse('') - - - # Unicode - >>> parse('испытание') - испытание - - # Meta tag - >>> parse('') - - - >>> parse('') - - - >>> parse('') - - - # Script tag - >>> parse('') - - - # Unterminated script tag, will auto-terminate - >>> parse(' - - >>> parse('') - - - >>> parse('
') -
- - >>> parse('') - - - # Unterminated style tag, handle but don't auto-terminate - >>> parse(' - - # Head Insertion - >>> parse('Test', head_insert = '') - Test - - >>> parse('Test', head_insert = '') - Test - - >>> parse('
SomeTest
', head_insert = '/* Insert */') - /* Insert */
SomeTest
- - >>> parse('
SomeTest
', head_insert = '') -
SomeTest
- - - # content after - >>> parse('abc') - abc - - # doctype - >>> parse('
abcdef
') -
abcdef
- - # no attr value - >>> parse(' - """ - - END_HTML = re.compile(r'', re.IGNORECASE) - - def __init__(self, url_rewriter, - head_insert=None, - js_rewriter_class=JSRewriter, - css_rewriter_class=CSSRewriter): - - super(LXMLHTMLRewriter, self).__init__(url_rewriter, - head_insert, - js_rewriter_class, - css_rewriter_class) - - self.target = RewriterTarget(self) - self.parser = lxml.etree.HTMLParser(remove_pis=False, - remove_blank_text=False, - remove_comments=False, - strip_cdata=False, - compact=True, - target=self.target, - recover=True, - ) - - def feed(self, string): - string = self.END_HTML.sub(u'', string) - #string = string.replace(u'', u'') - self.parser.feed(string) - - def close(self): - if not self.out: - self.out = self.AccumBuff() - - self.is_closing = True - self.parser.close() - - result = self.out.getvalue() - # Clear buffer to create new one for next rewrite() - self.out = None - - return result - - -#================================================================= -class RewriterTarget(object): - def __init__(self, rewriter): - self.rewriter = rewriter - - def start(self, tag, attrs): - attrs = attrs.items() - - if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): - self.rewriter.out.write(u'<' + tag) - - for name, value in attrs: - self.rewriter._write_attr(name, value, escape=True) - else: - if tag == u'head': - if (self.rewriter._rewrite_head(False)): - return - - self.rewriter.out.write(u'>') - - def end(self, tag): - if (tag == self.rewriter._wb_parse_context): - self.rewriter._wb_parse_context = None - - self.rewriter.out.write(u'') - - def data(self, data): - if not self.rewriter._wb_parse_context: - data = cgi.escape(data, quote=True) - - self.rewriter.parse_data(data) - - def comment(self, data): - self.rewriter.out.write(u'') - - def pi(self, data): - self.rewriter.out.write(u'') - - def close(self): - return '' - -urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') - -def parse(data, head_insert=None): - parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert) - data = data.decode('utf-8') - print parser.rewrite(data) + parser.close() - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index 8d98313b..007e925a 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -15,7 +15,8 @@ HTML = HTMLRewriter def use_lxml_parser(): try: import logging - from lxml_parser import LXMLHTMLRewriter + from lxml_html_rewriter import LXMLHTMLRewriter + global HTML HTML = LXMLHTMLRewriter logging.debug('Using LXML Parser') except ImportError: diff --git a/pywb/rewrite/test/test_html_rewriter.py b/pywb/rewrite/test/test_html_rewriter.py new file mode 100644 index 00000000..ed117e9e --- /dev/null +++ b/pywb/rewrite/test/test_html_rewriter.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +ur""" + +#================================================================= +# HTML Rewriting (using native HTMLParser) +#================================================================= + +>>> parse('Text') +Text + +>>> parse('
') +
+ +>>> parse('
') +
+ +# malformed html -- (2.6 parser raises exception) +#>>> parse('') +# + +>>> parse('') + + +# HTML Entities +>>> parse('›   >') +›   > + +# Don't rewrite anchors +>>> parse('Text') +Text + +# Ensure attr values are not unescaped +>>> parse('X') +X + +# Unicode +>>> parse('испытание') +испытание + +# Meta tag +>>> parse('') + + +>>> parse('') + + +>>> parse('') + + +# Script tag +>>> parse('') + + +# Unterminated script tag, handle but don't auto-terminate +>>> parse('') + + +>>> parse('
') +
+ +>>> parse('') + + +# Unterminated style tag, handle but don't auto-terminate +>>> parse('') + + +# Unterminated style tag, handle but don't auto-terminate +>>> parse(' + +# Head Insertion +>>> parse('Test', head_insert = '') +Test + +>>> parse('Test', head_insert = '') +Test + +>>> parse('
SomeTest
', head_insert = '/* Insert */') +/* Insert */
SomeTest
+ +>>> parse('
SomeTest
', head_insert = '') +
SomeTest
+ + +# content after +>>> parse('abc') +abc + +# doctype +>>> parse('
abcdef
') +
abcdef
+ +# no attr value +>>> parse(' +""" + +from pywb.rewrite.url_rewriter import UrlRewriter + +try: + from pywb.rewrite.lxml_html_rewriter import LXMLHTMLRewriter +except ImportError: + pass + +urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') + +def parse(data, head_insert=None): + parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert) + data = data.decode('utf-8') + print parser.rewrite(data) + parser.close() + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/rewrite/test/test_rewrite.py b/pywb/rewrite/test/test_regex_rewriters.py similarity index 63% rename from pywb/rewrite/test/test_rewrite.py rename to pywb/rewrite/test/test_regex_rewriters.py index 9e3d0337..e4a104f2 100644 --- a/pywb/rewrite/test/test_rewrite.py +++ b/pywb/rewrite/test/test_regex_rewriters.py @@ -1,90 +1,7 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -ur""" - -#================================================================= -# HTML Rewriting -#================================================================= - ->>> parse('Text') -Text - ->>> parse('
') -
- ->>> parse('
') -
- -# malformed html -- (2.6 parser raises exception) -#>>> parse('') -# - ->>> parse('') - - -# HTML Entities ->>> parse('›   >') -›   > - -# Don't rewrite anchors ->>> parse('Text') -Text - -# Ensure attr values are not unescaped ->>> parse('X') -X - -# Unicode ->>> parse('испытание') -испытание - -# Meta tag ->>> parse('') - - ->>> parse('') - - ->>> parse('') - - -# Script tag ->>> parse('') - - -# Unterminated script tag, handle but don't auto-terminate ->>> parse('') - - ->>> parse('
') -
- ->>> parse('') - - -# Unterminated style tag, handle but don't auto-terminate ->>> parse('