mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Edge-case and HTML Rewrite Fixes (#441)
* recorder fix: ensure Transfer-Encoding header is not passed through by RecorderApp, as it may result in a duplicate Transfer-Encoding header in py2.7, fixes #432 * html rewriter fixes: - html detection: allow for UTF-8 BOM when detecting if text is html - html decl parsing: modify base parser regex to allow IE conditional declaration to also end with -->, e.g. support '<![endif]-->' in addition to '<![endif]>', fixes #425 * travis: add allow failure for integration tests (for now)
This commit is contained in:
parent
100c7f5509
commit
38c1b1cc3e
@ -36,6 +36,9 @@ after_success:
|
|||||||
- codecov
|
- codecov
|
||||||
|
|
||||||
matrix:
|
matrix:
|
||||||
|
allow_failures:
|
||||||
|
- env: WR_TEST=yes
|
||||||
|
|
||||||
exclude:
|
exclude:
|
||||||
- env: WR_TEST=yes
|
- env: WR_TEST=yes
|
||||||
python: "2.7"
|
python: "2.7"
|
||||||
|
@ -347,7 +347,7 @@ class StreamingRewriter(object):
|
|||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class RewriteInfo(object):
|
class RewriteInfo(object):
|
||||||
TAG_REGEX = re.compile(b'^\s*\<')
|
TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
|
||||||
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
|
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
|
||||||
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
||||||
|
|
||||||
|
@ -13,6 +13,8 @@ from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
|
|||||||
|
|
||||||
from pywb.rewrite.content_rewriter import StreamingRewriter
|
from pywb.rewrite.content_rewriter import StreamingRewriter
|
||||||
|
|
||||||
|
from six import text_type
|
||||||
|
|
||||||
import six.moves.html_parser
|
import six.moves.html_parser
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -21,7 +23,15 @@ try:
|
|||||||
except:
|
except:
|
||||||
orig_unescape = None
|
orig_unescape = None
|
||||||
|
|
||||||
from six import text_type
|
|
||||||
|
try:
|
||||||
|
import _markupbase as markupbase
|
||||||
|
except:
|
||||||
|
import markupbase as markupbase
|
||||||
|
|
||||||
|
# ensure invalid cond ending ']-->' closing decl
|
||||||
|
# is treated same as ']>'
|
||||||
|
markupbase._msmarkedsectionclose = re.compile(r']\s*-{0,2}>')
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -286,6 +286,10 @@ r"""
|
|||||||
>>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
|
>>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
|
||||||
<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
|
<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
|
||||||
|
|
||||||
|
# UTF-8 BOM
|
||||||
|
>>> parse('\ufeff<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
|
||||||
|
\ufeff<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
|
||||||
|
|
||||||
# no parse comments
|
# no parse comments
|
||||||
>>> parse('<html><!-- <a href="/foo.html"> --></html>')
|
>>> parse('<html><!-- <a href="/foo.html"> --></html>')
|
||||||
<html><!-- <a href="/foo.html"> --></html>
|
<html><!-- <a href="/foo.html"> --></html>
|
||||||
@ -395,6 +399,13 @@ r"""
|
|||||||
>>> parse('<html><a href="javascript:alert()"></a></html>', js_proxy=True)
|
>>> parse('<html><a href="javascript:alert()"></a></html>', js_proxy=True)
|
||||||
<html><a href="javascript:alert()"></a></html>
|
<html><a href="javascript:alert()"></a></html>
|
||||||
|
|
||||||
|
# IE conditional
|
||||||
|
>>> parse('<!--[if !IE]><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
|
||||||
|
<!--[if !IE]><html><![endif]--><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
|
||||||
|
|
||||||
|
# IE conditional with invalid ']-->' ending, rewritten as ']>'
|
||||||
|
>>> parse('<!--[if !IE]> --><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
|
||||||
|
<!--[if !IE]> --><html><![endif]><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
|
||||||
|
|
||||||
|
|
||||||
# Test blank
|
# Test blank
|
||||||
|
Loading…
x
Reference in New Issue
Block a user