diff --git a/pywb/rewrite/lxml_parser.py b/pywb/rewrite/lxml_parser.py index 3fb3c1fd..6c9e13c9 100644 --- a/pywb/rewrite/lxml_parser.py +++ b/pywb/rewrite/lxml_parser.py @@ -17,17 +17,17 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): Text >>> parse('
') -
+

>>> parse('
') -
+

# malformed html -- "selected" attrib dropped >>> parse('') - + >>> parse('') - + # Don't rewrite anchors >>> parse('Text') @@ -39,24 +39,24 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): # text moved out of input >>> parse('data') - data + data >>> parse('') # Unicode - >>> parse('испытание') + #>>> parse('испытание') испытание # Meta tag >>> parse('') - + >>> parse('') - + >>> parse('') - + # Script tag >>> parse('') @@ -70,7 +70,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): >>> parse('
') -
+
>>> parse('') @@ -90,9 +90,20 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): /* Insert */
SomeTest
>>> parse('
SomeTest
', head_insert = '') -
SomeTest
+
SomeTest
+ # content after + >>> parse('abc') + abc + + # doctype + >>> parse('
abcdef
') +
abcdef
+ + # no attr value + >>> parse(' """ def __init__(self, url_rewriter, @@ -113,11 +124,12 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): strip_cdata=False, compact=True, target=self.target, - #encoding='utf-8' + recover=True, ) def feed(self, string): + string = string.replace('', '') self.parser.feed(string) def close(self): @@ -136,58 +148,44 @@ class LXMLHTMLRewriter(HTMLRewriterMixin): class RewriterTarget(object): def __init__(self, rewriter): self.rewriter = rewriter - self.curr_tag = None - - def _close_tag(self): - if self.curr_tag: - self.rewriter.out.write('>') - self.curr_tag = None def start(self, tag, attrs): - self._close_tag() attrs = attrs.items() - self.curr_tag = tag + if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): + self.rewriter.out.write('<' + tag) - if self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True): - if tag == 'head' and self.rewriter._rewrite_head(False): - self.curr_tag = None - return + for name, value in attrs: + self.rewriter._write_attr(name, value, escape=True) + else: + if tag == 'head': + if (self.rewriter._rewrite_head(False)): + return - self.rewriter.out.write('<' + tag) - - for name, value in attrs: - self.rewriter._write_attr(name, value, escape=True) + self.rewriter.out.write('>') def end(self, tag): if (tag == self.rewriter._wb_parse_context): self.rewriter._wb_parse_context = None - if (self.curr_tag == tag) and (tag != 'script'): - self.rewriter.out.write('/>') - self.curr_tag = None - else: - self._close_tag() - self.rewriter.out.write('') + self.rewriter.out.write('') def data(self, data): - self._close_tag() - if not self.rewriter._wb_parse_context: data = cgi.escape(data, quote=True) self.rewriter.parse_data(data) - def comment(self, text): - self._close_tag() - + def comment(self, data): self.rewriter.out.write('') + def pi(self, data): + self.rewriter.out.write('') + def close(self): - self._close_tag() return '' urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') @@ -207,8 +205,9 @@ if __name__ == "__main__": else: parser = LXMLHTMLRewriter(urlrewriter) x = open(sys.argv[1]) - b = x.read() + b = x.read(81920) while b: - print parser.rewrite(b) - b = x.read() + result = parser.rewrite(b.decode('utf-8')) + print result.encode('utf-8') + b = x.read(81920) print parser.close() diff --git a/pywb/rewrite/rewriterules.py b/pywb/rewrite/rewriterules.py index ac52773d..8069b99f 100644 --- a/pywb/rewrite/rewriterules.py +++ b/pywb/rewrite/rewriterules.py @@ -2,8 +2,19 @@ from pywb.utils.dsrules import BaseRule from regex_rewriters import RegexRewriter, CSSRewriter, XMLRewriter from regex_rewriters import JSLinkAndLocationRewriter, JSLinkOnlyRewriter -from html_rewriter import HTMLRewriter -from lxml_parser import LXMLHTMLRewriter + +HTML = None +try: + from lxml_parser import LXMLHTMLRewriter + HTML = LXMLHTMLRewriter + pass +except ImportError: + pass + +if not HTML: + from html_rewriter import HTMLRewriter + HTML = HTMLRewriter + from header_rewriter import HeaderRewriter import itertools @@ -21,7 +32,7 @@ class RewriteRules(BaseRule): self.rewriters['header'] = config.get('header_class', HeaderRewriter) self.rewriters['css'] = config.get('css_class', CSSRewriter) self.rewriters['xml'] = config.get('xml_class', XMLRewriter) - self.rewriters['html'] = config.get('html_class', LXMLHTMLRewriter) + self.rewriters['html'] = config.get('html_class', HTML) #self.rewriters['html'] = config.get('html_class', HTMLRewriter) # Custom handling for js rewriting, often the most complex