1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

fixes for unicode (doctests)

remove the explicit </html> from the input, since lxml does not parse anything past the </html>
tag and adds one anyway (not ideal, but the only workaround for html that appears after the closing tag)
This commit is contained in:
Ilya Kreymer 2014-03-17 11:55:45 -07:00
parent 23d60b0bb8
commit 1404177c6f
6 changed files with 38 additions and 24 deletions

View File

@ -169,7 +169,9 @@ class HTMLRewriterMixin(object):
else:
# special case: base tag
if (tag == 'base') and (attr_name == 'href') and attr_value:
self.url_rewriter.set_base_url(attr_value)
#self.url_rewriter.set_base_url(attr_value)
self.url_rewriter = (self.url_rewriter.
rebase_rewriter(attr_value))
rw_mod = handler.get(attr_name)
if rw_mod is not None:

View File

@ -1,18 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import lxml.html
#import lxml.html
import lxml.etree
import cgi
import re
from regex_rewriters import JSRewriter, CSSRewriter
from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriterMixin
from StringIO import StringIO
class LXMLHTMLRewriter(HTMLRewriterMixin):
r"""
ur"""
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<html><body><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></body></html>
@ -45,7 +45,7 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
# Unicode
#>>> parse('<a href="http://испытание.испытание/">испытание</a>')
>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
# Meta tag
@ -106,6 +106,8 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
<html><body><checkbox selected=""></checkbox></body></html>
"""
END_HTML = re.compile(r'</\s*html\s*>', re.IGNORECASE)
def __init__(self, url_rewriter,
head_insert=None,
js_rewriter_class=JSRewriter,
@ -116,7 +118,6 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
js_rewriter_class,
css_rewriter_class)
self.target = RewriterTarget(self)
self.parser = lxml.etree.HTMLParser(remove_pis=False,
remove_blank_text=False,
@ -127,15 +128,19 @@ class LXMLHTMLRewriter(HTMLRewriterMixin):
recover=True,
)
self.is_closing = False
def feed(self, string):
string = string.replace('</html>', '')
string = self.END_HTML.sub(u'', string)
#string = string.replace(u'</html>', u'')
self.parser.feed(string)
def close(self):
if not self.out:
self.out = self.AccumBuff()
self.is_closing = True
self.parser.close()
result = self.out.getvalue()
@ -153,23 +158,26 @@ class RewriterTarget(object):
attrs = attrs.items()
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
self.rewriter.out.write('<' + tag)
self.rewriter.out.write(u'<' + tag)
for name, value in attrs:
self.rewriter._write_attr(name, value, escape=True)
else:
if tag == 'head':
if tag == u'head':
if (self.rewriter._rewrite_head(False)):
return
self.rewriter.out.write('>')
self.rewriter.out.write(u'>')
def end(self, tag):
#if tag == 'html' and not self.rewriter.is_closing:
# raise lxml.etree.LxmlError('test')
if (tag == self.rewriter._wb_parse_context):
self.rewriter._wb_parse_context = None
self.rewriter.out.write('</' + tag + '>')
self.rewriter.out.write(u'</' + tag + u'>')
def data(self, data):
if not self.rewriter._wb_parse_context:
@ -178,12 +186,12 @@ class RewriterTarget(object):
self.rewriter.parse_data(data)
def comment(self, data):
self.rewriter.out.write('<!--')
self.rewriter.out.write(u'<!--')
self.rewriter.parse_data(data)
self.rewriter.out.write('-->')
self.rewriter.out.write(u'-->')
def pi(self, data):
self.rewriter.out.write('<?' + data + '>')
self.rewriter.out.write(u'<?' + data + u'>')
def close(self):
return ''
@ -192,6 +200,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
def parse(data, head_insert=None):
parser = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
data = data.decode('utf-8')
print parser.rewrite(data) + parser.close()
#return parser.rewrite(data) + parser.close()

View File

@ -66,11 +66,9 @@ def get_rewritten(url, urlrewriter, urlkey=None, head_insert_func=None):
status_headers, gen = result
buff = ''
for x in gen:
buff += x
#buff = u''.join(gen)
return (status_headers, buff)
return (status_headers, gen)
#=================================================================

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
r"""
ur"""
#=================================================================
# HTML Rewriting
@ -260,6 +260,7 @@ urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.htm
def parse(data, head_insert = None):
parser = HTMLRewriter(urlrewriter, head_insert = head_insert)
data = data.decode('utf-8')
print parser.rewrite(data) + parser.close()
arcrw = UrlRewriter('20131010im_/http://example.com/', '/web/')

View File

@ -64,7 +64,8 @@ def test_example_2():
def test_example_domain_specific_3():
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter)
urlrewriter2 = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
status_headers, buff = get_rewritten('http://facebook.com/digitalpreservation', urlrewriter2)
# comment out bootloader
assert '/* Bootloader.configurePage' in buff

View File

@ -61,8 +61,11 @@ class UrlRewriter(object):
return self.prefix + self.wburl.to_str(timestamp=timestamp, url=url)
def set_base_url(self, newUrl):
self.wburl.url = newUrl
def rebase_rewriter(self, new_url):
#self.wburl.url = newUrl
new_wburl = copy.copy(self.wburl)
new_wburl.url = new_url
return UrlRewriter(new_wburl, self.prefix)
def __repr__(self):
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
@ -94,5 +97,5 @@ class HttpsUrlRewriter(object):
def get_abs_url(self, url=''):
return url
def set_base_url(self, newUrl):
pass
def rebase_rewriter(self, new_url):
return self