1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

Edge-case and HTML Rewrite Fixes (#441)

* recorder fix: ensure the Transfer-Encoding header is not passed through by RecorderApp,
as it may result in a duplicate Transfer-Encoding header in py2.7, fixes #432

* html rewriter fixes:
- html detection: allow for UTF-8 BOM when detecting if text is html
- html decl parsing: modify the base parser regex to allow an IE conditional declaration to also
end with ']-->', e.g. support '<![endif]-->' in addition to '<![endif]>', fixes #425

* travis: add allow failure for integration tests (for now)
This commit is contained in:
Ilya Kreymer 2019-02-18 10:11:29 -08:00 committed by GitHub
parent 100c7f5509
commit 38c1b1cc3e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 26 additions and 2 deletions

View File

@ -36,6 +36,9 @@ after_success:
- codecov - codecov
matrix: matrix:
allow_failures:
- env: WR_TEST=yes
exclude: exclude:
- env: WR_TEST=yes - env: WR_TEST=yes
python: "2.7" python: "2.7"

View File

@ -347,7 +347,7 @@ class StreamingRewriter(object):
# ============================================================================ # ============================================================================
class RewriteInfo(object): class RewriteInfo(object):
TAG_REGEX = re.compile(b'^\s*\<') TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]') TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML

View File

@ -13,6 +13,8 @@ from pywb.rewrite.regex_rewriters import JSRewriter, CSSRewriter
from pywb.rewrite.content_rewriter import StreamingRewriter from pywb.rewrite.content_rewriter import StreamingRewriter
from six import text_type
import six.moves.html_parser import six.moves.html_parser
try: try:
@ -21,7 +23,15 @@ try:
except: except:
orig_unescape = None orig_unescape = None
from six import text_type
try:
import _markupbase as markupbase
except:
import markupbase as markupbase
# ensure that an invalid conditional ending ']-->' that closes a declaration
# is treated the same as ']>'
markupbase._msmarkedsectionclose = re.compile(r']\s*-{0,2}>')
#================================================================= #=================================================================

View File

@ -286,6 +286,10 @@ r"""
>>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>') >>> parse('<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script> <!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
# UTF-8 BOM
>>> parse('\ufeff<!DOCTYPE html>Some Text without any tags <!-- comments -->', head_insert = '<script>load_stuff();</script>')
\ufeff<!DOCTYPE html>Some Text without any tags <!-- comments --><script>load_stuff();</script>
# no parse comments # no parse comments
>>> parse('<html><!-- <a href="/foo.html"> --></html>') >>> parse('<html><!-- <a href="/foo.html"> --></html>')
<html><!-- <a href="/foo.html"> --></html> <html><!-- <a href="/foo.html"> --></html>
@ -395,6 +399,13 @@ r"""
>>> parse('<html><a href="javascript:alert()"></a></html>', js_proxy=True) >>> parse('<html><a href="javascript:alert()"></a></html>', js_proxy=True)
<html><a href="javascript:alert()"></a></html> <html><a href="javascript:alert()"></a></html>
# IE conditional
>>> parse('<!--[if !IE]><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
<!--[if !IE]><html><![endif]--><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
# IE conditional with invalid ']-->' ending, rewritten as ']>'
>>> parse('<!--[if !IE]> --><html><![endif]--><a href="http://example.com/"><!--[if IE]><![endif]--><a href="http://example.com/"></html>')
<!--[if !IE]> --><html><![endif]><a href="/web/20131226101010/http://example.com/"><!--[if IE]><![endif]--><a href="/web/20131226101010/http://example.com/"></html>
# Test blank # Test blank