1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

encoding: ensure CDX fields are in the native string encoding, except filename, which should stay as unicode in Python 2 for further use

This commit is contained in:
Ilya Kreymer 2016-04-30 16:08:43 -07:00
parent e8c77c0538
commit dd8ac42f2c

View File

@@ -10,6 +10,7 @@ from six.moves.urllib.parse import urlencode, quote
from six.moves.urllib.parse import parse_qs
from pywb.utils.wbexception import WbException
from pywb.utils.loaders import to_native_str
from json import loads as json_decode
from json import dumps as json_encode
@@ -117,10 +118,11 @@ class CDXObject(OrderedDict):
fields = cdxline.split(b' ' , 2)
# Check for CDX JSON
if fields[-1].startswith(b'{'):
self[URLKEY] = fields[0].decode('utf-8')
self[TIMESTAMP] = fields[1].decode('utf-8')
json_fields = json_decode(fields[-1].decode('utf-8'))
self[URLKEY] = to_native_str(fields[0], 'utf-8')
self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
for n, v in six.iteritems(json_fields):
n = to_native_str(n, 'utf-8')
n = self.CDX_ALT_FIELDS.get(n, n)
if n == 'url':
@@ -129,6 +131,9 @@ class CDXObject(OrderedDict):
except UnicodeEncodeError:
v = quote(v.encode('utf-8'), safe=':/')
if n != 'filename':
v = to_native_str(v, 'utf-8')
self[n] = v
self.cdxline = cdxline