mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
cdx: when reading cdxj, if non-ascii chars are encountered in a url, utf-8 encode and %-encode them
This commit is contained in:
parent
fc9d659b5d
commit
273176bce5
@ -5,7 +5,7 @@ except ImportError: # pragma: no cover
|
||||
|
||||
import itertools
|
||||
|
||||
from urllib import urlencode
|
||||
from urllib import urlencode, quote
|
||||
from urlparse import parse_qs
|
||||
|
||||
from pywb.utils.wbexception import WbException
|
||||
@ -111,7 +111,15 @@ class CDXObject(OrderedDict):
|
||||
json_fields = json_decode(fields[-1])
|
||||
for n, v in json_fields.iteritems():
|
||||
n = self.CDX_ALT_FIELDS.get(n, n)
|
||||
self[n] = str(v)
|
||||
|
||||
try:
|
||||
self[n] = str(v)
|
||||
except UnicodeEncodeError:
|
||||
v = v.encode('utf-8')
|
||||
parts = v.split('//', 1)
|
||||
v = parts[0] + '//' + quote(parts[1])
|
||||
self[n] = v
|
||||
|
||||
self.cdxline = cdxline
|
||||
self._from_json = True
|
||||
return
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException
|
||||
from pytest import raises
|
||||
|
||||
@ -25,6 +28,12 @@ def test_valid_cdx_formats():
|
||||
_make_line(11)
|
||||
_make_line(14)
|
||||
|
||||
def test_unicode_url():
    """A cdxj line whose url holds a non-ascii char yields a %-encoded utf-8 url."""
    line = 'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'
    cdx = CDXObject(line)

    # field name -> value the parsed object is expected to expose
    expected = (
        ('urlkey', 'com,example,cafe)/'),
        ('timestamp', '123'),
        ('url', 'http://example.com/caf%C3%A9/path'),
    )
    for field, value in expected:
        assert cdx[field] == value
|
||||
|
||||
def test_invalid_idx_format():
    """Constructing an IDXObject from the line 'a b c' must raise CDXException."""
    with raises(CDXException):
        # Only the raised exception matters, so the result is not bound to a name.
        IDXObject('a b c')
|
||||
|
Loading…
x
Reference in New Issue
Block a user