1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx: when reading cdxj, if a url contains non-ascii chars, utf-8 encode and %-encode it

This commit is contained in:
Ilya Kreymer 2015-03-29 09:21:50 -07:00
parent fc9d659b5d
commit 273176bce5
2 changed files with 19 additions and 2 deletions

View File

@ -5,7 +5,7 @@ except ImportError: # pragma: no cover
import itertools
from urllib import urlencode
from urllib import urlencode, quote
from urlparse import parse_qs
from pywb.utils.wbexception import WbException
@ -111,7 +111,15 @@ class CDXObject(OrderedDict):
json_fields = json_decode(fields[-1])
for n, v in json_fields.iteritems():
n = self.CDX_ALT_FIELDS.get(n, n)
self[n] = str(v)
try:
self[n] = str(v)
except UnicodeEncodeError:
v = v.encode('utf-8')
parts = v.split('//', 1)
v = parts[0] + '//' + quote(parts[1])
self[n] = v
self.cdxline = cdxline
self._from_json = True
return

View File

@ -1,3 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException
from pytest import raises
@ -25,6 +28,12 @@ def test_valid_cdx_formats():
_make_line(11)
_make_line(14)
def test_unicode_url():
    """A non-ASCII ``url`` in the cdxj JSON block comes back UTF-8 %-encoded."""
    cdxj_line = 'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'
    cdx = CDXObject(cdxj_line)

    # The leading urlkey and timestamp fields are read back unchanged.
    assert cdx['urlkey'] == 'com,example,cafe)/'
    assert cdx['timestamp'] == '123'
    # The accented character is expected as percent-encoded UTF-8 bytes.
    assert cdx['url'] == 'http://example.com/caf%C3%A9/path'
def test_invalid_idx_format():
with raises(CDXException):
x = IDXObject('a b c')