1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

indexing: py2 fix: if a decoding error occurs while writing a utf-8 encoded url, try decoding it as utf-8. Avoids an indexing error in py2 when a warc has non-ascii urls, fix for #312

test: add test for decoding utf-8 url
This commit is contained in:
Ilya Kreymer 2018-04-27 18:36:52 -07:00
parent bef63b4c6c
commit 9acad27801
2 changed files with 35 additions and 2 deletions

View File

@ -99,7 +99,10 @@ class CDX09(object):
out.write(' ')
out.write(entry['timestamp'])
out.write(' ')
out.write(entry['url'])
try:
out.write(entry['url'])
except UnicodeDecodeError:
out.write(entry['url'].decode('utf-8'))
out.write(' ')
out.write(entry['mime'])
out.write(' ')
@ -123,7 +126,10 @@ class CDX11(object):
out.write(' ')
out.write(entry['timestamp'])
out.write(' ')
out.write(entry['url'])
try:
out.write(entry['url'])
except UnicodeDecodeError:
out.write(entry['url'].decode('utf-8'))
out.write(' ')
out.write(entry['mime'])
out.write(' ')

View File

@ -395,6 +395,33 @@ def test_cdxj_middle_empty_records():
assert len(lines) == 2, lines
def test_invalid_decoding_uri_py2():
    """Index a WARC record whose target URI contains non-ASCII bytes.

    Builds a single in-memory WARC ``resource`` record, runs it through
    ``write_cdx_index`` with ``include_all=True`` and checks the exact
    CDX output that is written to the buffer.
    """
    # Raw WARC record; the WARC-Target-URI header carries the non-ASCII
    # byte sequence c3 83 c2 a9 after "http://example.com/".
    test_data = b'\
WARC/1.0\r\n\
WARC-Type: resource\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\xc3\x83\xc2\xa9\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
Content-Type: text/plain\r\n\
Content-Length: 4\r\n\
\r\n\
ABCD\r\n\
\r\n'
    options = dict(include_all=True)
    # Output buffer that receives the generated index.
    buff = BytesIO()
    # Input stream wrapping the raw record bytes.
    test_record = BytesIO(test_data)
    write_cdx_index(buff, test_record, 'test.warc.gz', **options)
    # The expected index percent-encodes the bytes in the first field and
    # keeps them verbatim in the original-URL field.
    # NOTE(review): this listing was taken from a rendered diff, so leading
    # whitespace inside the expected literal may have been stripped --
    # confirm the header line against the repository before relying on it.
    assert buff.getvalue() == b"""\
CDX N b a m s k r M S V g
com,example)/%c3%83%c2%a9 20000101000000 http://example.com/\xc3\x83\xc2\xa9 text/plain - 7MXYLSEFM7Z4RTU3PGOHYVDEFUGHWQPW - - 222 0 test.warc.gz
"""
# When this module is executed directly as a script, run its doctests.
if __name__ == "__main__":
    import doctest
    doctest.testmod()