mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
indexing: py2 fix: if a UnicodeDecodeError is raised while writing a UTF-8 encoded URL, retry by decoding the URL as UTF-8 first. This avoids an indexing error in py2 when a WARC has non-ASCII URLs; fix for #312
test: add test for decoding utf-8 url
This commit is contained in:
parent
bef63b4c6c
commit
9acad27801
@ -99,7 +99,10 @@ class CDX09(object):
|
|||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['timestamp'])
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['url'])
|
try:
|
||||||
|
out.write(entry['url'])
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
out.write(entry['url'].decode('utf-8'))
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['mime'])
|
out.write(entry['mime'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
@ -123,7 +126,10 @@ class CDX11(object):
|
|||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['timestamp'])
|
out.write(entry['timestamp'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['url'])
|
try:
|
||||||
|
out.write(entry['url'])
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
out.write(entry['url'].decode('utf-8'))
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
out.write(entry['mime'])
|
out.write(entry['mime'])
|
||||||
out.write(' ')
|
out.write(' ')
|
||||||
|
@ -395,6 +395,33 @@ def test_cdxj_middle_empty_records():
|
|||||||
assert len(lines) == 2, lines
|
assert len(lines) == 2, lines
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_decoding_uri_py2():
|
||||||
|
test_data = b'\
|
||||||
|
WARC/1.0\r\n\
|
||||||
|
WARC-Type: resource\r\n\
|
||||||
|
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
|
||||||
|
WARC-Target-URI: http://example.com/\xc3\x83\xc2\xa9\r\n\
|
||||||
|
WARC-Date: 2000-01-01T00:00:00Z\r\n\
|
||||||
|
Content-Type: text/plain\r\n\
|
||||||
|
Content-Length: 4\r\n\
|
||||||
|
\r\n\
|
||||||
|
ABCD\r\n\
|
||||||
|
\r\n'
|
||||||
|
|
||||||
|
options = dict(include_all=True)
|
||||||
|
|
||||||
|
buff = BytesIO()
|
||||||
|
|
||||||
|
test_record = BytesIO(test_data)
|
||||||
|
|
||||||
|
write_cdx_index(buff, test_record, 'test.warc.gz', **options)
|
||||||
|
|
||||||
|
assert buff.getvalue() == b"""\
|
||||||
|
CDX N b a m s k r M S V g
|
||||||
|
com,example)/%c3%83%c2%a9 20000101000000 http://example.com/\xc3\x83\xc2\xa9 text/plain - 7MXYLSEFM7Z4RTU3PGOHYVDEFUGHWQPW - - 222 0 test.warc.gz
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
doctest.testmod()
|
doctest.testmod()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user