From bd10c6c2d2cbb9c87818648193f90ad71a1b4b3f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 16 Mar 2014 23:12:04 -0700 Subject: [PATCH] first pass -- lxml parser! --- pywb/rewrite/html_rewriter.py | 97 +++++++++----- pywb/rewrite/lxml_parser.py | 214 ++++++++++++++++++++++++++++++ pywb/rewrite/rewriterules.py | 4 +- pywb/rewrite/test/test_rewrite.py | 3 + 4 files changed, 287 insertions(+), 31 deletions(-) create mode 100644 pywb/rewrite/lxml_parser.py diff --git a/pywb/rewrite/html_rewriter.py b/pywb/rewrite/html_rewriter.py index f94e2b48..4ec3e530 100644 --- a/pywb/rewrite/html_rewriter.py +++ b/pywb/rewrite/html_rewriter.py @@ -9,9 +9,10 @@ from HTMLParser import HTMLParser, HTMLParseError from url_rewriter import UrlRewriter from regex_rewriters import JSRewriter, CSSRewriter +import cgi #================================================================= -class HTMLRewriter(HTMLParser): +class HTMLRewriterMixin(object): """ HTML-Parsing Rewriter for custom rewriting, also delegates to rewriters for script and css @@ -56,10 +57,13 @@ class HTMLRewriter(HTMLParser): # =========================== class AccumBuff: def __init__(self): - self.buff = '' + self.ls = [] def write(self, string): - self.buff += string + self.ls.append(string) + + def getvalue(self): + return ''.join(self.ls) # =========================== def __init__(self, url_rewriter, @@ -67,8 +71,6 @@ class HTMLRewriter(HTMLParser): js_rewriter_class=JSRewriter, css_rewriter_class=CSSRewriter): - HTMLParser.__init__(self) - self.url_rewriter = url_rewriter self._wb_parse_context = None #self.out = outstream if outstream else self.AccumBuff() @@ -126,7 +128,7 @@ class HTMLRewriter(HTMLParser): return value.lower() == attr_value.lower() return False - def rewrite_tag_attrs(self, tag, tag_attrs, is_start_end): + def _rewrite_tag_attrs(self, tag, tag_attrs, escape=False): # special case: script or style parse context if ((tag in self.STATE_TAGS) and not self._wb_parse_context): self._wb_parse_context = tag @@ -148,8 +150,7 @@ class HTMLRewriter(HTMLParser): self.out.write('<' + tag) - for attr in tag_attrs: - attr_name, attr_value = attr + for attr_name, attr_value in tag_attrs: # special case: inline JS/event handler if ((attr_value and attr_value.startswith('javascript:')) @@ -174,24 +175,38 @@ class HTMLRewriter(HTMLParser): if rw_mod is not None: attr_value = self._rewrite_url(attr_value, rw_mod) - # parser doesn't differentiate between 'attr=""' and just 'attr' - # 'attr=""' is more common, so use that form - if attr_value: - self.out.write(' ' + attr_name + '="' + attr_value + '"') - else: - self.out.write(' ' + attr_name + '=""') - - self.out.write('/>' if is_start_end else '>') - - # special case: head tag - if (self.head_insert and - not self._wb_parse_context and - (tag == 'head')): - self.out.write(self.head_insert) - self.head_insert = None + # write the attr! + self._write_attr(attr_name, attr_value, escape=escape) return True + def _rewrite_head(self, start_end): + # special case: head tag + + # if no insert or in context, no rewrite + if not self.head_insert or self._wb_parse_context: + return False + + self.out.write('>') + self.out.write(self.head_insert) + self.head_insert = None + + if start_end: + self.out.write('') + + return True + + + def _write_attr(self, name, value, escape=False): + # parser doesn't differentiate between 'attr=""' and just 'attr' + # 'attr=""' is more common, so use that form + if value: + if escape: + value = cgi.escape(value, quote=True) + self.out.write(' ' + name + '="' + value + '"') + else: + self.out.write(' ' + name + '=""') + def parse_data(self, data): if self._wb_parse_context == 'script': data = self._rewrite_script(data) @@ -204,18 +219,35 @@ class HTMLRewriter(HTMLParser): if not self.out: self.out = self.AccumBuff() - try: - self.feed(string) - except HTMLParseError: - self.out.write(string) + self.feed(string) - result = self.out.buff + result = self.out.getvalue() # Clear buffer to create new one for next rewrite() self.out = None return result + +#================================================================= +class HTMLRewriter(HTMLRewriterMixin, HTMLParser): + def __init__(self, url_rewriter, + head_insert=None, + js_rewriter_class=JSRewriter, + css_rewriter_class=CSSRewriter): + + HTMLParser.__init__(self) + super(HTMLRewriter, self).__init__(url_rewriter, + head_insert, + js_rewriter_class, + css_rewriter_class) + # HTMLParser overrides below + def feed(self, string): + try: + HTMLParser.feed(self, string) + except HTMLParseError: + self.out.write(string) + def close(self): if (self._wb_parse_context): end_tag = '' @@ -238,12 +270,17 @@ class HTMLRewriter(HTMLParser): return s def handle_starttag(self, tag, attrs): - if not self.rewrite_tag_attrs(tag, attrs, False): + if not self._rewrite_tag_attrs(tag, attrs): self.out.write(self.get_starttag_text()) + elif tag != 'head' or not self._rewrite_head(False): + self.out.write('>') def handle_startendtag(self, tag, attrs): - if not self.rewrite_tag_attrs(tag, attrs, True): + if not self._rewrite_tag_attrs(tag, attrs): self.out.write(self.get_starttag_text()) + elif tag != 'head' or not self._rewrite_head(True): + self.out.write('/>') + def handle_endtag(self, tag): if (tag == self._wb_parse_context): diff --git a/pywb/rewrite/lxml_parser.py b/pywb/rewrite/lxml_parser.py new file mode 100644 index 00000000..3fb3c1fd --- /dev/null +++ b/pywb/rewrite/lxml_parser.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import lxml.html +import lxml.etree +import cgi + +from regex_rewriters import JSRewriter, CSSRewriter +from url_rewriter import UrlRewriter +from html_rewriter import HTMLRewriterMixin +from StringIO import StringIO + + +class LXMLHTMLRewriter(HTMLRewriterMixin): + r""" + >>> parse('Text') + Text + + >>> parse('
') +
+ + >>> parse('
') +
+ + # malformed html -- "selected" attrib dropped + >>> parse('') + + + >>> parse('') + + + # Don't rewrite anchors + >>> parse('Text') + Text + + # Ensure attr values are not unescaped + >>> parse('

data

') +

data

+ + # text moved out of input + >>> parse('data') + data + + >>> parse('') + + + # Unicode + >>> parse('испытание') + испытание + + # Meta tag + >>> parse('') + + + >>> parse('') + + + >>> parse('') + + + # Script tag + >>> parse('') + + + # Unterminated script tag, will auto-terminate + >>> parse(' + + >>> parse('') + + + >>> parse('
') +
+ + >>> parse('') + + + # Unterminated style tag, handle but don't auto-terminate + >>> parse(' + + # Head Insertion + >>> parse('Test', head_insert = '') + Test + + >>> parse('Test', head_insert = '') + Test + + >>> parse('
SomeTest
', head_insert = '/* Insert */') + /* Insert */
SomeTest
+ + >>> parse('
SomeTest
', head_insert = '') +
SomeTest
+ + + """ + + def __init__(self, url_rewriter, + head_insert=None, + js_rewriter_class=JSRewriter, + css_rewriter_class=CSSRewriter): + + super(LXMLHTMLRewriter, self).__init__(url_rewriter, + head_insert, + js_rewriter_class, + css_rewriter_class) + + + self.target = RewriterTarget(self) + self.parser = lxml.etree.HTMLParser(remove_pis=False, + remove_blank_text=False, + remove_comments=False, + strip_cdata=False, + compact=True, + target=self.target, + #encoding='utf-8' + ) + + + def feed(self, string): + self.parser.feed(string) + + def close(self): + if not self.out: + self.out = self.AccumBuff() + + self.parser.close() + + result = self.out.getvalue() + # Clear buffer to create new one for next rewrite() + self.out = None + + return result + + +class RewriterTarget(object): + def __init__(self, rewriter): + self.rewriter = rewriter + self.curr_tag = None + + def _close_tag(self): + if self.curr_tag: + self.rewriter.out.write('>') + self.curr_tag = None + + def start(self, tag, attrs): + self._close_tag() + attrs = attrs.items() + + self.curr_tag = tag + + if self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): + if tag == 'head' and self.rewriter._rewrite_head(False): + self.curr_tag = None + return + + self.rewriter.out.write('<' + tag) + + for name, value in attrs: + self.rewriter._write_attr(name, value, escape=True) + + + def end(self, tag): + if (tag == self.rewriter._wb_parse_context): + self.rewriter._wb_parse_context = None + + if (self.curr_tag == tag) and (tag != 'script'): + self.rewriter.out.write('/>') + self.curr_tag = None + else: + self._close_tag() + self.rewriter.out.write('') + + def data(self, data): + self._close_tag() + + if not self.rewriter._wb_parse_context: + data = cgi.escape(data, quote=True) + + self.rewriter.parse_data(data) + + def comment(self, text): + self._close_tag() + + self.rewriter.out.write('') + + def close(self): + self._close_tag() + return '' + +urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') + +def parse(data, head_insert=None): + parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert) + print parser.rewrite(data) + parser.close() + #return parser.rewrite(data) + parser.close() + + +if __name__ == "__main__": + + import sys + if len(sys.argv) == 1: + import doctest + doctest.testmod() + else: + parser = LXMLHTMLRewriter(urlrewriter) + x = open(sys.argv[1]) + b = x.read() + while b: + print parser.rewrite(b) + b = x.read() + print parser.close() diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index e9d124c5..ac52773d 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -3,6 +3,7 @@ from pywb.utils.dsrules import BaseRule from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter from html_rewriter import HTMLRewriter +from lxml_parser import LXMLHTMLRewriter from header_rewriter import HeaderRewriter import itertools @@ -20,7 +21,8 @@ class RewriteRules(BaseRule): self.rewriters['header'] = config.get('header_class', HeaderRewriter) self.rewriters['css'] = config.get('css_class', CSSRewriter) self.rewriters['xml'] = config.get('xml_class', XMLRewriter) - self.rewriters['html'] = config.get('html_class', HTMLRewriter) + self.rewriters['html'] = config.get('html_class', LXMLHTMLRewriter) + #self.rewriters['html'] = config.get('html_class', HTMLRewriter) # Custom handling for js rewriting, often the most complex self.js_rewrite_location = config.get('js_rewrite_location', True) diff --git a/pywb/rewrite/test/test_rewrite.py b/pywb/rewrite/test/test_rewrite.py index 7164b81f..423490b1 100644 --- a/pywb/rewrite/test/test_rewrite.py +++ b/pywb/rewrite/test/test_rewrite.py @@ -74,6 +74,9 @@ r""" >>> parse('Test', head_insert = '') Test +>>> parse('Test', head_insert = '') +Test + >>> parse('
SomeTest
', head_insert = '/* Insert */') /* Insert */
SomeTest