#!/usr/bin/env python
# -*- coding: utf-8 -*-

ur"""
>>> parse('Text')
Text
>>> parse('
')

>>> parse('
')

# malformed html -- "selected" attrib dropped >>> parse('') >>> parse('') # Don't rewrite anchors >>> parse('Text') Text # Ensure attr values are not unescaped >>> parse('

data

')

data

# text moved out of input >>> parse('data') data >>> parse('') # Unicode >>> parse('испытание') испытание # Meta tag >>> parse('') >>> parse('') >>> parse('') # Script tag >>> parse('') # Script tag + crossorigin >>> parse('') # Unterminated script tag, will auto-terminate >>> parse(' >>> parse('') >>> parse('
')
>>> parse('') # Unterminated style tag, handle but don't auto-terminate >>> parse(' # Head Insertion >>> parse('Test', head_insert = '') Test >>> parse('Test', head_insert = '') Test >>> parse('
SomeTest
', head_insert = '/* Insert */') /* Insert */
SomeTest
>>> parse('
SomeTest
', head_insert = '')
SomeTest
# content after >>> parse('abc') abc # no attr value >>> parse(' # doctype >>> parse('
abcdef
')
abcdef
>>> parse('
abcdef
')
abcdef
>>> parse('
abcdef
')
abcdef
# uncommon markup >>> parse('') # no special cdata treatment, preserved in ') >>> parse('') # Test blank >>> parse('') # Test no parsing at all >>> p = LXMLHTMLRewriter(urlrewriter) >>> p.close() '' # test   >>> parse(' ')

 

# test multiple rewrites:   extra >, split comment >>> p = LXMLHTMLRewriter(urlrewriter) >>> p.rewrite('
    >
') + p.close() u'
    >
' """ from pywb.rewrite.url_rewriter import UrlRewriter from pywb.rewrite.lxml_html_rewriter import LXMLHTMLRewriter, LXML_SUPPORTED urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/') def parse(data, head_insert=None): parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert) data = data.decode('utf-8') print parser.rewrite(data) + parser.close() if __name__ == "__main__": if LXML_SUPPORTED: import doctest doctest.testmod() else: # skip if not supported and lxml not available if not LXML_SUPPORTED: import pytest lxml = pytest.importorskip('lxml.etree')