mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
indexing: py2 fix: if a decoding error occurs while writing a utf-8 encoded url, try decoding it as utf-8. Avoids an indexing error in py2 when a warc has non-ascii urls; fix for #312
test: add test for decoding utf-8 url
This commit is contained in:
parent
bef63b4c6c
commit
9acad27801
@ -99,7 +99,10 @@ class CDX09(object):
|
||||
out.write(' ')
|
||||
out.write(entry['timestamp'])
|
||||
out.write(' ')
|
||||
out.write(entry['url'])
|
||||
try:
|
||||
out.write(entry['url'])
|
||||
except UnicodeDecodeError:
|
||||
out.write(entry['url'].decode('utf-8'))
|
||||
out.write(' ')
|
||||
out.write(entry['mime'])
|
||||
out.write(' ')
|
||||
@ -123,7 +126,10 @@ class CDX11(object):
|
||||
out.write(' ')
|
||||
out.write(entry['timestamp'])
|
||||
out.write(' ')
|
||||
out.write(entry['url'])
|
||||
try:
|
||||
out.write(entry['url'])
|
||||
except UnicodeDecodeError:
|
||||
out.write(entry['url'].decode('utf-8'))
|
||||
out.write(' ')
|
||||
out.write(entry['mime'])
|
||||
out.write(' ')
|
||||
|
@ -395,6 +395,33 @@ def test_cdxj_middle_empty_records():
|
||||
assert len(lines) == 2, lines
|
||||
|
||||
|
||||
def test_invalid_decoding_uri_py2():
    """Index a WARC resource record whose target URI holds non-ASCII bytes.

    The record's WARC-Target-URI ends in raw UTF-8 encoded bytes; the test
    runs it through ``write_cdx_index`` and compares the produced CDX text
    against the expected header and index line.
    """
    # A single 'resource' record with a 4-byte payload; the URI path carries
    # the raw bytes \xc3\x83\xc2\xa9.
    warc_bytes = (
        b'WARC/1.0\r\n'
        b'WARC-Type: resource\r\n'
        b'WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n'
        b'WARC-Target-URI: http://example.com/\xc3\x83\xc2\xa9\r\n'
        b'WARC-Date: 2000-01-01T00:00:00Z\r\n'
        b'Content-Type: text/plain\r\n'
        b'Content-Length: 4\r\n'
        b'\r\n'
        b'ABCD\r\n'
        b'\r\n'
    )

    # NOTE(review): leading whitespace inside these literals cannot be
    # recovered from the rendered diff -- confirm the header line against
    # the indexer's real output before relying on it.
    expected_cdx = (
        b'CDX N b a m s k r M S V g\n'
        b'com,example)/%c3%83%c2%a9 20000101000000 '
        b'http://example.com/\xc3\x83\xc2\xa9 text/plain - '
        b'7MXYLSEFM7Z4RTU3PGOHYVDEFUGHWQPW - - 222 0 test.warc.gz\n'
    )

    cdx_out = BytesIO()
    warc_in = BytesIO(warc_bytes)

    write_cdx_index(cdx_out, warc_in, 'test.warc.gz', include_all=True)

    assert cdx_out.getvalue() == expected_cdx
|
||||
|
||||
|
||||
# When executed as a script, run the doctests embedded in this module.
if __name__ == "__main__":
    import doctest

    doctest.testmod()
|
||||
|
Loading…
x
Reference in New Issue
Block a user