mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
Fixes for Unicode handling (doctests)
Remove any explicit </html> from the input before parsing, since lxml does not parse past the </html> tag and adds one itself anyway (not ideal, but the only workaround for HTML that appears after the closing tag)
This commit is contained in:
parent
23d60b0bb8
commit
1404177c6f
@ -169,7 +169,9 @@ class HTMLRewriterMixin(object):
|
|||||||
else:
|
else:
|
||||||
# special case: base tag
|
# special case: base tag
|
||||||
if (tag == 'base') and (attr_name == 'href') and attr_value:
|
if (tag == 'base') and (attr_name == 'href') and attr_value:
|
||||||
self.url_rewriter.set_base_url(attr_value)
|
#self.url_rewriter.set_base_url(attr_value)
|
||||||
|
self.url_rewriter = (self.url_rewriter.
|
||||||
|
rebase_rewriter(attr_value))
|
||||||
|
|
||||||
rw_mod = handler.get(attr_name)
|
rw_mod = handler.get(attr_name)
|
||||||
if rw_mod is not None:
|
if rw_mod is not None:
|
||||||
|
@ -1,18 +1,18 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import lxml.html
|
#import lxml.html
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
import cgi
|
import cgi
|
||||||
|
import re
|
||||||
|
|
||||||
from regex_rewriters import JSRewriter, CSSRewriter
|
from regex_rewriters import JSRewriter, CSSRewriter
|
||||||
from url_rewriter import UrlRewriter
|
from url_rewriter import UrlRewriter
|
||||||
from html_rewriter import HTMLRewriterMixin
|
from html_rewriter import HTMLRewriterMixin
|
||||||
from StringIO import StringIO
|
|
||||||
|
|
||||||
|
|
||||||
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
class LXMLHTMLRewriter(HTMLRewriterMixin):
|
||||||
r"""
|
ur"""
|
||||||
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
|
||||||
<html><body><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></body></html>
|
<html><body><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></body></html>
|
||||||
|
|
||||||
@ -45,7 +45,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
|
||||||
|
|
||||||
# Unicode
|
# Unicode
|
||||||
#>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
|
||||||
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
|
||||||
|
|
||||||
# Meta tag
|
# Meta tag
|
||||||
@ -106,6 +106,8 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
<html><body><checkbox selected=""></checkbox></body></html>
|
<html><body><checkbox selected=""></checkbox></body></html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
END_HTML = re.compile(r'</\s*html\s*>', re.IGNORECASE)
|
||||||
|
|
||||||
def __init__(self, url_rewriter,
|
def __init__(self, url_rewriter,
|
||||||
head_insert=None,
|
head_insert=None,
|
||||||
js_rewriter_class=JSRewriter,
|
js_rewriter_class=JSRewriter,
|
||||||
@ -116,7 +118,6 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
js_rewriter_class,
|
js_rewriter_class,
|
||||||
css_rewriter_class)
|
css_rewriter_class)
|
||||||
|
|
||||||
|
|
||||||
self.target = RewriterTarget(self)
|
self.target = RewriterTarget(self)
|
||||||
self.parser = lxml.etree.HTMLParser(remove_pis=False,
|
self.parser = lxml.etree.HTMLParser(remove_pis=False,
|
||||||
remove_blank_text=False,
|
remove_blank_text=False,
|
||||||
@ -127,15 +128,19 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
|
|||||||
recover=True,
|
recover=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.is_closing = False
|
||||||
|
|
||||||
|
|
||||||
def feed(self, string):
|
def feed(self, string):
|
||||||
string = string.replace('</html>', '')
|
string = self.END_HTML.sub(u'', string)
|
||||||
|
#string = string.replace(u'</html>', u'')
|
||||||
self.parser.feed(string)
|
self.parser.feed(string)
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if not self.out:
|
if not self.out:
|
||||||
self.out = self.AccumBuff()
|
self.out = self.AccumBuff()
|
||||||
|
|
||||||
|
self.is_closing = True
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
|
|
||||||
result = self.out.getvalue()
|
result = self.out.getvalue()
|
||||||
@ -153,23 +158,26 @@ class RewriterTarget(object):
|
|||||||
attrs = attrs.items()
|
attrs = attrs.items()
|
||||||
|
|
||||||
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
|
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
|
||||||
self.rewriter.out.write('<' + tag)
|
self.rewriter.out.write(u'<' + tag)
|
||||||
|
|
||||||
for name, value in attrs:
|
for name, value in attrs:
|
||||||
self.rewriter._write_attr(name, value, escape=True)
|
self.rewriter._write_attr(name, value, escape=True)
|
||||||
else:
|
else:
|
||||||
if tag == 'head':
|
if tag == u'head':
|
||||||
if (self.rewriter._rewrite_head(False)):
|
if (self.rewriter._rewrite_head(False)):
|
||||||
return
|
return
|
||||||
|
|
||||||
self.rewriter.out.write('>')
|
self.rewriter.out.write(u'>')
|
||||||
|
|
||||||
|
|
||||||
def end(self, tag):
|
def end(self, tag):
|
||||||
|
#if tag == 'html' and not self.rewriter.is_closing:
|
||||||
|
# raise lxml.etree.LxmlError('test')
|
||||||
|
|
||||||
if (tag == self.rewriter._wb_parse_context):
|
if (tag == self.rewriter._wb_parse_context):
|
||||||
self.rewriter._wb_parse_context = None
|
self.rewriter._wb_parse_context = None
|
||||||
|
|
||||||
self.rewriter.out.write('</' + tag + '>')
|
self.rewriter.out.write(u'</' + tag + u'>')
|
||||||
|
|
||||||
def data(self, data):
|
def data(self, data):
|
||||||
if not self.rewriter._wb_parse_context:
|
if not self.rewriter._wb_parse_context:
|
||||||
@ -178,12 +186,12 @@ class RewriterTarget(object):
|
|||||||
self.rewriter.parse_data(data)
|
self.rewriter.parse_data(data)
|
||||||
|
|
||||||
def comment(self, data):
|
def comment(self, data):
|
||||||
self.rewriter.out.write('<!--')
|
self.rewriter.out.write(u'<!--')
|
||||||
self.rewriter.parse_data(data)
|
self.rewriter.parse_data(data)
|
||||||
self.rewriter.out.write('-->')
|
self.rewriter.out.write(u'-->')
|
||||||
|
|
||||||
def pi(self, data):
|
def pi(self, data):
|
||||||
self.rewriter.out.write('<?' + data + '>')
|
self.rewriter.out.write(u'<?' + data + u'>')
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
return ''
|
return ''
|
||||||
@ -192,6 +200,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
|
|||||||
|
|
||||||
def parse(data, head_insert=None):
|
def parse(data, head_insert=None):
|
||||||
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
|
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
|
||||||
|
data = data.decode('utf-8')
|
||||||
print parser.rewrite(data) + parser.close()
|
print parser.rewrite(data) + parser.close()
|
||||||
#return parser.rewrite(data) + parser.close()
|
#return parser.rewrite(data) + parser.close()
|
||||||
|
|
||||||
|
@ -66,11 +66,9 @@ def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
|
|||||||
|
|
||||||
status_headers, gen = result
|
status_headers, gen = result
|
||||||
|
|
||||||
buff = ''
|
#buff = u''.join(gen)
|
||||||
for x in gen:
|
|
||||||
buff += x
|
|
||||||
|
|
||||||
return (status_headers, buff)
|
return (status_headers, gen)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
r"""
|
ur"""
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# HTML Rewriting
|
# HTML Rewriting
|
||||||
@ -260,6 +260,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
|
|||||||
|
|
||||||
def parse(data, head_insert = None):
|
def parse(data, head_insert = None):
|
||||||
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
|
||||||
|
data = data.decode('utf-8')
|
||||||
print parser.rewrite(data) + parser.close()
|
print parser.rewrite(data) + parser.close()
|
||||||
|
|
||||||
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
|
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')
|
||||||
|
@ -64,7 +64,8 @@ def test_example_2():
|
|||||||
|
|
||||||
|
|
||||||
def test_example_domain_specific_3():
|
def test_example_domain_specific_3():
|
||||||
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
|
urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
|
||||||
|
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2)
|
||||||
|
|
||||||
# comment out bootloader
|
# comment out bootloader
|
||||||
assert '/* Bootloader.configurePage' in buff
|
assert '/* Bootloader.configurePage' in buff
|
||||||
|
@ -61,8 +61,11 @@ class UrlRewriter(object):
|
|||||||
|
|
||||||
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
|
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
|
||||||
|
|
||||||
def set_base_url(self, newUrl):
|
def rebase_rewriter(self, new_url):
|
||||||
self.wburl.url = newUrl
|
#self.wburl.url = newUrl
|
||||||
|
new_wburl = copy.copy(self.wburl)
|
||||||
|
new_wburl.url = new_url
|
||||||
|
return UrlRewriter(new_wburl, self.prefix)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
|
||||||
@ -94,5 +97,5 @@ class HttpsUrlRewriter(object):
|
|||||||
def get_abs_url(self, url=''):
|
def get_abs_url(self, url=''):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def set_base_url(self, newUrl):
|
def rebase_rewriter(self, new_url):
|
||||||
pass
|
return self
|
||||||
|
Loading…
x
Reference in New Issue
Block a user