1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

rewrite: detect edge-case where html starts with BOM characters followed by <!DOCTYPE html> as html (#758)

tests: add test that now results in correct html rewriting
fixes #756
This commit is contained in:
Ilya Kreymer 2022-08-31 16:51:41 -07:00 committed by GitHub
parent 2ccd8eb2c3
commit 1249b41dba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 1 deletions

View File

@ -391,7 +391,7 @@ class StreamingRewriter(object):
# ============================================================================
class RewriteInfo(object):
TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
TAG_REGEX2 = re.compile(b'^.*<[!]?\w+[\s>]')
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
JSONP_CONTAINS = ['callback=jQuery',

View File

@ -141,6 +141,17 @@ class TestContentRewriter(object):
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
assert b''.join(gen).decode('utf-8') == exp
# Regression test: a body that begins with several U+FEFF (BOM) characters
# followed by <!DOCTYPE html> must still be rewritten as HTML.
def test_rewrite_html_ignore_bom(self):
# Content-Type is plain text/html with no charset parameter.
headers = {'Content-Type': 'text/html'}
# Three BOM characters precede the doctype; the anchor's href is the URL expected to be rewritten.
content = u'\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://example.com"></a></body></html>'
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
# Expected output keeps the leading BOMs untouched and prefixes the href with
# http://localhost:8080/prefix/201701/ (the 'mp_' suffix of ts does not appear in the link).
exp = '\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://localhost:8080/prefix/201701/http://example.com"></a></body></html>'
# The record must be reported as rewritten.
assert is_rw
# The Content-Type header is expected back unchanged.
assert ('Content-Type', 'text/html') in headers.headers
# The generator yields bytes chunks; join and decode as UTF-8 before comparing.
assert b''.join(gen).decode('utf-8') == exp
def test_rewrite_html_utf_8_anchor(self):
headers = {'Content-Type': 'text/html; charset=utf-8'}
content = u'<html><body><a href="#éxample-tésté"></a></body></html>'