From 9acad278011e7ec6e2b4e2044e137d5d5382c840 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 27 Apr 2018 18:36:52 -0700 Subject: [PATCH] indexing: py2 fix: if decoding error while writing utf-8 encoded url, try decoding as utf-8. avoids indexing error in py2 when a warc has non-ascii urls, fix for #312 test: add test for decoding utf-8 url --- pywb/indexer/cdxindexer.py | 10 ++++++++-- pywb/indexer/test/test_indexing.py | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/pywb/indexer/cdxindexer.py b/pywb/indexer/cdxindexer.py index 4939b9fb..325b2279 100644 --- a/pywb/indexer/cdxindexer.py +++ b/pywb/indexer/cdxindexer.py @@ -99,7 +99,10 @@ class CDX09(object): out.write(' ') out.write(entry['timestamp']) out.write(' ') - out.write(entry['url']) + try: + out.write(entry['url']) + except UnicodeDecodeError: + out.write(entry['url'].decode('utf-8')) out.write(' ') out.write(entry['mime']) out.write(' ') @@ -123,7 +126,10 @@ class CDX11(object): out.write(' ') out.write(entry['timestamp']) out.write(' ') - out.write(entry['url']) + try: + out.write(entry['url']) + except UnicodeDecodeError: + out.write(entry['url'].decode('utf-8')) out.write(' ') out.write(entry['mime']) out.write(' ') diff --git a/pywb/indexer/test/test_indexing.py b/pywb/indexer/test/test_indexing.py index de79f2e9..80a9f8d9 100644 --- a/pywb/indexer/test/test_indexing.py +++ b/pywb/indexer/test/test_indexing.py @@ -395,6 +395,33 @@ def test_cdxj_middle_empty_records(): assert len(lines) == 2, lines +def test_invalid_decoding_uri_py2(): + test_data = b'\ +WARC/1.0\r\n\ +WARC-Type: resource\r\n\ +WARC-Record-ID: \r\n\ +WARC-Target-URI: http://example.com/\xc3\x83\xc2\xa9\r\n\ +WARC-Date: 2000-01-01T00:00:00Z\r\n\ +Content-Type: text/plain\r\n\ +Content-Length: 4\r\n\ +\r\n\ +ABCD\r\n\ +\r\n' + + options = dict(include_all=True) + + buff = BytesIO() + + test_record = BytesIO(test_data) + + write_cdx_index(buff, test_record, 'test.warc.gz', 
**options) + + assert buff.getvalue() == b"""\ + CDX N b a m s k r M S V g +com,example)/%c3%83%c2%a9 20000101000000 http://example.com/\xc3\x83\xc2\xa9 text/plain - 7MXYLSEFM7Z4RTU3PGOHYVDEFUGHWQPW - - 222 0 test.warc.gz +""" + + if __name__ == "__main__": import doctest doctest.testmod()