mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
cdx: when reading cdxj and encountering non-ascii chars in a url, utf-8 encode and %-encode them
This commit is contained in:
parent
fc9d659b5d
commit
273176bce5
@ -5,7 +5,7 @@ except ImportError: # pragma: no cover
|
|||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
from urllib import urlencode
|
from urllib import urlencode, quote
|
||||||
from urlparse import parse_qs
|
from urlparse import parse_qs
|
||||||
|
|
||||||
from pywb.utils.wbexception import WbException
|
from pywb.utils.wbexception import WbException
|
||||||
@ -111,7 +111,15 @@ class CDXObject(OrderedDict):
|
|||||||
json_fields = json_decode(fields[-1])
|
json_fields = json_decode(fields[-1])
|
||||||
for n, v in json_fields.iteritems():
|
for n, v in json_fields.iteritems():
|
||||||
n = self.CDX_ALT_FIELDS.get(n, n)
|
n = self.CDX_ALT_FIELDS.get(n, n)
|
||||||
self[n] = str(v)
|
|
||||||
|
try:
|
||||||
|
self[n] = str(v)
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
v = v.encode('utf-8')
|
||||||
|
parts = v.split('//', 1)
|
||||||
|
v = parts[0] + '//' + quote(parts[1])
|
||||||
|
self[n] = v
|
||||||
|
|
||||||
self.cdxline = cdxline
|
self.cdxline = cdxline
|
||||||
self._from_json = True
|
self._from_json = True
|
||||||
return
|
return
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException
|
from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException
|
||||||
from pytest import raises
|
from pytest import raises
|
||||||
|
|
||||||
@ -25,6 +28,12 @@ def test_valid_cdx_formats():
|
|||||||
_make_line(11)
|
_make_line(11)
|
||||||
_make_line(14)
|
_make_line(14)
|
||||||
|
|
||||||
|
def test_unicode_url():
    """Check that a non-ascii char in a cdxj ``url`` value is returned
    utf-8 encoded and %-encoded, while the plain fields are unchanged.
    """
    cdxj_line = 'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'
    parsed = CDXObject(cdxj_line)

    # (field name, expected value) pairs, checked in order
    expectations = (
        ('urlkey', 'com,example,cafe)/'),
        ('timestamp', '123'),
        ('url', 'http://example.com/caf%C3%A9/path'),
    )
    for field, expected in expectations:
        assert parsed[field] == expected
|
||||||
|
|
||||||
def test_invalid_idx_format():
|
def test_invalid_idx_format():
|
||||||
with raises(CDXException):
|
with raises(CDXException):
|
||||||
x = IDXObject('a b c')
|
x = IDXObject('a b c')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user