mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rewrite: detect edge-case where html starts with BOM characters followed by <!DOCTYPE html> as html (#758)
tests: add test that now results in correct html rewriting; fixes #756
This commit is contained in:
parent
2ccd8eb2c3
commit
1249b41dba
@ -391,7 +391,7 @@ class StreamingRewriter(object):
|
||||
# ============================================================================
|
||||
class RewriteInfo(object):
|
||||
TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
|
||||
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
|
||||
TAG_REGEX2 = re.compile(b'^.*<[!]?\w+[\s>]')
|
||||
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
|
||||
|
||||
JSONP_CONTAINS = ['callback=jQuery',
|
||||
|
@ -141,6 +141,17 @@ class TestContentRewriter(object):
|
||||
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_html_ignore_bom(self):
|
||||
headers = {'Content-Type': 'text/html'}
|
||||
content = u'\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://example.com"></a></body></html>'
|
||||
|
||||
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
|
||||
|
||||
exp = '\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://localhost:8080/prefix/201701/http://example.com"></a></body></html>'
|
||||
assert is_rw
|
||||
assert ('Content-Type', 'text/html') in headers.headers
|
||||
assert b''.join(gen).decode('utf-8') == exp
|
||||
|
||||
def test_rewrite_html_utf_8_anchor(self):
|
||||
headers = {'Content-Type': 'text/html; charset=utf-8'}
|
||||
content = u'<html><body><a href="#éxample-tésté"></a></body></html>'
|
||||
|
Loading…
x
Reference in New Issue
Block a user