1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

pass raw bytes to htmlparser, assuming ascii-compatibility

(todo: add tests for non-ascii-compatible encodings)
improves rendering of certain pages; needs more testing

lxml: remove lxml and the complexity associated with having the parser,
as it is too unpredictable for older html and does its own decoding.
This commit is contained in:
Ilya Kreymer 2014-06-27 19:03:06 -07:00
parent dd9f138bab
commit 70b7e29b36
10 changed files with 33 additions and 358 deletions

View File

@ -101,8 +101,5 @@ enable_cdx_api: true
# Memento support, enable
enable_memento: true
# Use lxml parser, if available
use_lxml_parser: false
# Replay content in an iframe
framed_replay: true

View File

@ -1,111 +0,0 @@
try:
import lxml.etree
LXML_SUPPORTED = True
except ImportError:
LXML_SUPPORTED = False
pass
import cgi
import re
from regex_rewriters import JSRewriter, CSSRewriter
from url_rewriter import UrlRewriter
from html_rewriter import HTMLRewriterMixin
#=================================================================
class LXMLHTMLRewriter(HTMLRewriterMixin):
    """HTML rewriter that drives ``lxml.etree.HTMLParser`` in target mode.

    Parser events are delivered to a ``RewriterTarget`` bound to this
    rewriter; the target writes the rewritten markup to ``self.out``.
    """

    # Matches a closing </html> tag, case-insensitively, allowing
    # whitespace inside the tag.
    END_HTML = re.compile(r'</\s*html\s*>', re.IGNORECASE)

    def __init__(self, *args, **kwargs):
        """Pass all arguments to the mixin, then build the lxml parser."""
        super(LXMLHTMLRewriter, self).__init__(*args, **kwargs)

        self.target = RewriterTarget(self)
        self.parser = lxml.etree.HTMLParser(remove_pis=False,
                                            remove_blank_text=False,
                                            remove_comments=False,
                                            strip_cdata=False,
                                            compact=True,
                                            target=self.target,
                                            recover=True,
                                            )

        # Becomes True on the first feed(); _internal_close() only closes
        # the parser when something was actually fed to it.
        self.started = False

    def feed(self, string):
        """Feed one chunk of markup to the incremental parser.

        Closing ``</html>`` tags are removed from the chunk before it is
        handed to lxml.
        NOTE(review): presumably so that markup following ``</html>`` is
        still emitted inside the document -- confirm.
        """
        self.started = True
        string = self.END_HTML.sub(b'', string)
        self.parser.feed(string)

    def parse(self, stream):
        """Parse ``stream`` in a single pass and return the rewritten output.

        A fresh ``AccumBuff`` is installed as ``self.out`` for the duration
        of the parse.
        """
        self.out = self.AccumBuff()
        try:
            lxml.etree.parse(stream, self.parser)
            return self.out.getvalue()
        finally:
            # Always drop the buffer, even when parsing raises, so that a
            # later call starts from a new buffer instead of a stale one.
            self.out = None

    def _internal_close(self):
        """Close the lxml parser, but only if feed() was ever called."""
        if self.started:
            self.parser.close()
#=================================================================
# Parser target handed to lxml.etree.HTMLParser (target=...): each callback
# below writes the corresponding markup through the owning rewriter.
# NOTE(review): indentation was lost in this view, so the exact nesting of
# the statements inside the methods below cannot be confirmed from here.
class RewriterTarget(object):
def __init__(self, rewriter):
# rewriter: object providing `out`, `parse_data`, `_rewrite_tag_attrs`,
# `_write_attr`, `_rewrite_head` and `_wb_parse_context`, all used below.
self.rewriter = rewriter
# Start-tag callback: `attrs` is converted to (name, value) pairs.
def start(self, tag, attrs):
attrs = attrs.items()
# When _rewrite_tag_attrs() returns a false value, the tag and its
# attributes are written here via _write_attr().
if not self.rewriter._rewrite_tag_attrs(tag, attrs, escape=True):
self.rewriter.out.write(b'<' + tag)
for name, value in attrs:
self.rewriter._write_attr(name, value, escape=True)
else:
# For <head>, a true result from _rewrite_head(False) returns early,
# skipping the closing '>' written at the end of this method.
if tag == b'head':
if (self.rewriter._rewrite_head(False)):
return
self.rewriter.out.write(b'>')
# End-tag callback: clears the rewriter's parse context when this tag
# matches it, and writes the closing tag.
def end(self, tag):
if (tag == self.rewriter._wb_parse_context):
self.rewriter._wb_parse_context = None
self.rewriter.out.write(b'</' + tag + b'>')
# Text callback: text is HTML-escaped (quotes included) when no parse
# context is set; unicode text has U+00A0 replaced by the '&nbsp;' entity
# and is encoded as UTF-8 before being passed to parse_data().
# NOTE(review): whether the unicode handling is nested inside the
# "no parse context" branch is not recoverable from this view -- confirm.
def data(self, data):
if not self.rewriter._wb_parse_context:
data = cgi.escape(data, quote=True)
if isinstance(data, unicode):
data = data.replace(u'\xa0', '&nbsp;')
data = data.encode('utf-8')
self.rewriter.parse_data(data)
# Comment callback: the comment body goes through parse_data() between
# the literal '<!--' and '-->' delimiters.
def comment(self, data):
self.rewriter.out.write(b'<!--')
self.rewriter.parse_data(data)
self.rewriter.out.write(b'-->')
# Doctype callback: writes '<!doctype', then each non-empty part.
# NOTE(review): public_id and system_id are written without surrounding
# quotes, so the emitted doctype differs from the quoted source form.
def doctype(self, root_tag, public_id, system_id):
self.rewriter.out.write(b'<!doctype')
if root_tag:
self.rewriter.out.write(' ' + root_tag)
if public_id:
self.rewriter.out.write(' PUBLIC ' + public_id)
if system_id:
self.rewriter.out.write(' SYSTEM ' + system_id)
self.rewriter.out.write(b'>')
# Processing-instruction callback: written as '<?target data>'.
def pi(self, target, data):
self.rewriter.out.write(b'<?' + target + ' ' + data + b'>')
# Called when the parser is closed; the target has no result of its own,
# so an empty string is returned.
def close(self):
return ''

View File

@ -6,7 +6,7 @@ from io import BytesIO
from header_rewriter import RewrittenStatusAndHeaders
from rewriterules import RewriteRules, is_lxml
from rewriterules import RewriteRules
from pywb.utils.dsrules import RuleSet
from pywb.utils.statusandheaders import StatusAndHeaders
@ -21,7 +21,6 @@ class RewriteContent:
default_rule_config={},
ds_rules_file=ds_rules_file)
self.defmod = defmod
self.decode_stream = False
def sanitize_content(self, status_headers, stream):
# remove transfer encoding chunked and wrap in a dechunking stream
@ -97,17 +96,15 @@ class RewriteContent:
else:
stream = DecompressingBufferedReader(stream)
if self.decode_stream:
if rewritten_headers.charset:
encoding = rewritten_headers.charset
elif is_lxml() and text_type == 'html':
stream_raw = True
else:
(encoding, first_buff) = self._detect_charset(stream)
#if self.decode_stream:
# if rewritten_headers.charset:
# encoding = rewritten_headers.charset
# else:
# (encoding, first_buff) = self._detect_charset(stream)
# if encoding not set or chardet thinks its ascii, use utf-8
if not encoding or encoding == 'ascii':
encoding = 'utf-8'
# if not encoding or encoding == 'ascii':
# encoding = 'utf-8'
rule = self.ruleset.get_first_match(urlkey)
@ -132,37 +129,23 @@ class RewriteContent:
rewriter = rewriter_class(urlrewriter)
# Create rewriting generator
gen = self._rewriting_stream_gen(rewriter, encoding, stream_raw,
gen = self._rewriting_stream_gen(rewriter, encoding,
stream, first_buff)
return (status_headers, gen, True)
def _parse_full_gen(self, rewriter, encoding, stream):
buff = rewriter.parse(stream)
buff = buff.encode(encoding)
yield buff
# Create rewrite stream, may even be chunked by front-end
def _rewriting_stream_gen(self, rewriter, encoding, stream_raw,
def _rewriting_stream_gen(self, rewriter, encoding,
stream, first_buff=None):
if stream_raw:
return self._parse_full_gen(rewriter, encoding, stream)
def do_enc_rewrite(buff):
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
buff = buff.encode(encoding)
return buff
def do_rewrite(buff):
if encoding:
buff = self._decode_buff(buff, stream, encoding)
buff = rewriter.rewrite(buff)
return buff
if encoding:
buff = buff.encode(encoding)
if encoding:
rewrite_func = do_enc_rewrite
else:
rewrite_func = do_rewrite
return buff
def do_finish():
result = rewriter.close()
@ -172,12 +155,12 @@ class RewriteContent:
return result
return self.stream_to_gen(stream,
rewrite_func=rewrite_func,
rewrite_func=do_rewrite,
final_read_func=do_finish,
first_buff=first_buff)
@staticmethod
def _decode_buff(buff, stream, encoding):
def _decode_buff(buff, stream, encoding): # pragma: no coverage
try:
buff = buff.decode(encoding)
except UnicodeDecodeError, e:
@ -194,7 +177,7 @@ class RewriteContent:
return buff
def _detect_charset(self, stream):
def _detect_charset(self, stream): # pragma: no coverage
full_buff = stream.read(8192)
io_buff = BytesIO(full_buff)
@ -224,17 +207,17 @@ class RewriteContent:
buff = first_buff
else:
buff = stream.read()
# if buff and (not hasattr(stream, 'closed') or
# not stream.closed):
# buff += stream.readline()
if buff and (not hasattr(stream, 'closed') or
not stream.closed):
buff += stream.readline()
while buff:
if rewrite_func:
buff = rewrite_func(buff)
yield buff
buff = stream.read()
# if buff:
# buff += stream.readline()
if buff:
buff += stream.readline()
# For adding a tail/handling final buffer
if final_read_func:

View File

@ -9,30 +9,6 @@ from html_rewriter import HTMLRewriter
import itertools
HTML = HTMLRewriter
_is_lxml = False
#=================================================================
def use_lxml_parser():
    """Try to switch the module-level ``HTML`` rewriter class to lxml.

    When ``LXML_SUPPORTED`` is true, ``HTML`` is rebound to
    ``LXMLHTMLRewriter``. Either way the outcome is recorded in the
    module-level ``_is_lxml`` flag, which is also the return value.
    """
    global HTML
    global _is_lxml

    import logging
    from lxml_html_rewriter import LXMLHTMLRewriter, LXML_SUPPORTED

    if not LXML_SUPPORTED:  # pragma: no cover
        logging.debug('LXML Parser not available')
        _is_lxml = False
        return _is_lxml

    HTML = LXMLHTMLRewriter
    logging.debug('Using LXML Parser')
    _is_lxml = True
    return _is_lxml
#=================================================================
# Return the module-level `_is_lxml` flag, which use_lxml_parser() sets to
# True once the lxml rewriter has been selected and to False otherwise.
def is_lxml():
return _is_lxml
#=================================================================

View File

@ -1,166 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
ur"""
>>> parse('<HTML><A Href="page.html">Text</a></hTmL>')
<html><body><a href="/web/20131226101010/http://example.com/some/path/page.html">Text</a></body></html>
>>> parse('<body x="y"><img src="../img.gif"/><br/></body>')
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/some/img.gif"></img><br></br></body></html>
>>> parse('<body x="y"><img src="/img.gif"/><br/></body>')
<html><body x="y"><img src="/web/20131226101010im_/http://example.com/img.gif"></img><br></br></body></html>
# malformed html -- "selected" attrib dropped
>>> parse('<input "selected"><img src></div>')
<html><body><input></input><img src=""></img></body></html>
# Base Tests
>>> parse('<html><head><base href="http://example.com/diff/path/file.html"/>')
<html><head><base href="/web/20131226101010/http://example.com/diff/path/file.html"></base></head></html>
>>> parse('<base href="static/"/><img src="image.gif"/>')
<html><head><base href="/web/20131226101010/http://example.com/some/path/static/"></base><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"></img></head></html>
# Don't rewrite anchors
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')
<html><body><a href="#abc">Text</a></body></html>
# Ensure attr values are not unescaped
>>> parse('<p data-value="&quot;X&quot;">data</p>')
<html><body><p data-value="&quot;X&quot;">data</p></body></html>
# text moved out of input
>>> parse('<input value="val">data</input>')
<html><body><input value="val"></input>data</body></html>
>>> parse('<script src="abc.js"></script>')
<html><head><script src="/web/20131226101010js_/http://example.com/some/path/abc.js"></script></head></html>
# Unicode
#>>> parse('<a href="http://испытание.испытание/">испытание</a>')
<html><body><a href="/web/20131226101010/http://испытание.испытание/">испытание</a></body></html>
# Meta tag
>>> parse('<META http-equiv="refresh" content="10; URL=/abc/def.html">')
<html><head><meta content="10; URL=/web/20131226101010/http://example.com/abc/def.html" http-equiv="refresh"></meta></head></html>
>>> parse('<meta http-equiv="Content-type" content="text/html; charset=utf-8" />')
<html><head><meta content="text/html; charset=utf-8" http-equiv="Content-type"></meta></head></html>
>>> parse('<META http-equiv="refresh" content>')
<html><head><meta content="" http-equiv="refresh"></meta></head></html>
# Custom -data attribs
>>> parse('<div data-url="http://example.com/a/b/c.html" data-some-other-value="http://example.com/img.gif">')
<html><body><div data-url="/web/20131226101010oe_/http://example.com/a/b/c.html" data-some-other-value="/web/20131226101010oe_/http://example.com/img.gif"></div></body></html>
# Script tag
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
<html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</script></head></html>
# Script tag + crossorigin
>>> parse('<script src="/js/scripts.js" crossorigin="anonymous"></script>')
<html><head><script src="/web/20131226101010js_/http://example.com/js/scripts.js" _crossorigin="anonymous"></script></head></html>
# Unterminated script tag, will auto-terminate
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</sc>')
<html><head><script>window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html"</sc></script></head></html>
>>> parse('<script>/*<![CDATA[*/window.location = "http://example.com/a/b/c.html;/*]]>*/"</script>')
<html><head><script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010em_/http://example.com/a/b/c.html;/*]]>*/"</script></head></html>
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
<html><body><div style="background: url('/web/20131226101010em_/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div></body></html>
>>> parse('<style>@import "styles.css" .a { font-face: url(\'myfont.ttf\') }</style>')
<html><head><style>@import "/web/20131226101010em_/http://example.com/some/path/styles.css" .a { font-face: url('/web/20131226101010em_/http://example.com/some/path/myfont.ttf') }</style></head></html>
# Unterminated style tag, handle but don't auto-terminate
>>> parse('<style>@import url(styles.css)')
<html><head><style>@import url(/web/20131226101010em_/http://example.com/some/path/styles.css)</style></head></html>
# Head Insertion
>>> parse('<html><head><script src="other.js"></script></head><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script><script src="/web/20131226101010js_/http://example.com/some/path/other.js"></script></head><body>Test</body></html>
>>> parse('<html><head/><body>Test</body></html>', head_insert = '<script src="cool.js"></script>')
<html><head><script src="cool.js"></script></head><body>Test</body></html>
>>> parse('<body><div>SomeTest</div>', head_insert = '/* Insert */')
<html>/* Insert */<body><div>SomeTest</div></body></html>
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
<html><head><script>load_stuff();</script><link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"></link></head><body><div>SomeTest</div></body></html>
# content after </html>
>>> parse('<body>abc</body></html><input type="hidden" value="def"/>')
<html><body>abc</body><input type="hidden" value="def"></input></html>
# no attr value
>>> parse('<checkbox selected></checkbox')
<html><body><checkbox selected=""></checkbox></body></html>
# doctype
>>> parse('<!doctype html><div>abcdef</div>')
<!doctype html><html><body><div>abcdef</div></body></html>
>>> parse('<!doctype html PUBLIC "public"><div>abcdef</div>')
<!doctype html PUBLIC public><html><body><div>abcdef</div></body></html>
>>> parse('<!doctype html SYSTEM "system"><div>abcdef</div>')
<!doctype html SYSTEM system><html><body><div>abcdef</div></body></html>
# uncommon markup
>>> parse('<?test content?>')
<?test content?>
# no special cdata treatment, preserved in <script>
>>> parse('<script><![CDATA[ <a href="path.html"></a> ]]></script>')
<html><head><script><![CDATA[ <a href="path.html"></a> ]]></script></head></html>
>>> parse('<!-- <a href="http://example.com"></a> -->')
<!-- <a href="http://example.com"></a> -->
# Test blank
>>> parse('')
<BLANKLINE>
# Test no parsing at all
>>> p = LXMLHTMLRewriter(urlrewriter)
>>> p.close()
''
# test &nbsp;
>>> parse('&nbsp;')
<html><body><p>&nbsp;</p></body></html>
# test multiple rewrites: &nbsp; extra >, split comment
>>> p = LXMLHTMLRewriter(urlrewriter)
>>> p.rewrite('<div>&nbsp; &nbsp; > <!-- a') + p.rewrite('b --></div>') + p.close()
'<html><body><div>&nbsp; &nbsp; &gt; <!-- ab --></div></body></html>'
"""
from pywb.rewrite.url_rewriter import UrlRewriter
from pywb.rewrite.lxml_html_rewriter import LXMLHTMLRewriter, LXML_SUPPORTED
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/web/')
def parse(data, head_insert=None):
    """Doctest helper: rewrite ``data`` and print the complete result.

    ``data`` is decoded as UTF-8, passed through a fresh
    ``LXMLHTMLRewriter`` (with the optional ``head_insert``), and the
    rewritten text plus whatever ``close()`` returns is printed.
    """
    rewriter = LXMLHTMLRewriter(urlrewriter, head_insert=head_insert)
    rewritten = rewriter.rewrite(data.decode('utf-8'))
    print(rewritten + rewriter.close())
# Run the module doctests when executed as a script and lxml is available.
# NOTE(review): indentation was lost in this view; the `else:` presumably
# pairs with the `__name__` check, so that importing this module (e.g. when
# collected by pytest) without lxml skips it via importorskip -- confirm.
if __name__ == "__main__":
if LXML_SUPPORTED:
import doctest
doctest.testmod()
else:
# skip if not supported and lxml not available
if not LXML_SUPPORTED:
import pytest
lxml = pytest.importorskip('lxml.etree')

View File

@ -1,7 +1,6 @@
from pywb.framework.basehandlers import WbUrlHandler
from pywb.framework.wbrequestresponse import WbResponse
from pywb.framework.archivalrouter import ArchivalRouter, Route
from pywb.rewrite.rewriterules import use_lxml_parser
from handlers import StaticHandler
@ -23,5 +22,4 @@ def create_live_rewriter_app():
Route('static/default', StaticHandler('pywb/static/'))
]
# use_lxml_parser()
return ArchivalRouter(routes, hostpaths=['http://localhost:8080'])

View File

@ -9,8 +9,6 @@ from pywb.framework.basehandlers import BaseHandler
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.warc.resolvingloader import ResolvingLoader
from pywb.rewrite.rewriterules import use_lxml_parser
from views import J2TemplateView, add_env_globals
from views import J2HtmlCapturesView, HeadInsertView
@ -45,9 +43,7 @@ DEFAULTS = {
'domain_specific_rules': DEFAULT_RULES_FILE,
'enable_memento': True,
'use_lxml_parser': True,
'enable_memento': True
}
@ -177,8 +173,8 @@ def create_wb_router(passed_config={}):
else:
request_class = WbRequest
if config.get('use_lxml_parser', False):
use_lxml_parser()
#if config.get('use_lxml_parser', False):
# use_lxml_parser()
for name, value in collections.iteritems():

View File

@ -66,17 +66,19 @@ class BaseContentView(object):
# render top level frame if in frame mode
# (not supported in proxy mode)
if (self.is_frame_mode and
not wbrequest.is_proxy and
not wbrequest.wb_url.mod):
not wbrequest.is_proxy and
not wbrequest.wb_url.mod):
embed_url = wbrequest.wb_url.to_str(mod=self._mp_mod)
timestamp = datetime_to_timestamp(datetime.datetime.utcnow())
url = wbrequest.wb_url.url
ctype='text/html'
return self.frame_insert_view.render_response(embed_url=embed_url,
wbrequest=wbrequest,
timestamp=timestamp,
url=url)
url=url,
content_type=ctype)
return self.render_content(wbrequest, *args)

View File

@ -98,7 +98,7 @@ class J2TemplateView(object):
def render_response(self, **kwargs):
template_result = self.render_to_string(**kwargs)
status = kwargs.get('status', '200 OK')
content_type = 'text/html; charset=utf-8'
content_type = kwargs.get('content_type', 'text/html; charset=utf-8')
return WbResponse.text_response(template_result.encode('utf-8'),
status=status,
content_type=content_type)

View File

@ -110,7 +110,7 @@ reporter: !!python/object/new:tests.fixture.PrintReporter []
#domain_specific_rules: rules.yaml
# Use lxml parser, if available
use_lxml_parser: true
# use_lxml_parser: true
# Replay content in an iframe
framed_replay: true