encoding: ensure cdx fields are in the native encoding, except filename, which should stay as unicode in py2 for further use

2025-03-15 00:03:28 +01:00 · 2016-04-30 16:08:43 -07:00 · 2016-04-30 16:08:43 -07:00 · dd8ac42f2c
commit dd8ac42f2c
parent e8c77c0538
1 changed file with 8 additions and 3 deletions
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@@ -10,6 +10,7 @@ from six.moves.urllib.parse import urlencode, quote
 from six.moves.urllib.parse import parse_qs

 from pywb.utils.wbexception import WbException
+from pywb.utils.loaders import to_native_str

 from json import loads as json_decode
 from json import dumps as json_encode
@@ -117,10 +118,11 @@ class CDXObject(OrderedDict):
        fields = cdxline.split(b' ' , 2)
        # Check for CDX JSON
        if fields[-1].startswith(b'{'):
-            self[URLKEY] = fields[0].decode('utf-8')
-            self[TIMESTAMP] = fields[1].decode('utf-8')
-            json_fields = json_decode(fields[-1].decode('utf-8'))
+            self[URLKEY] = to_native_str(fields[0], 'utf-8')
+            self[TIMESTAMP] = to_native_str(fields[1], 'utf-8')
+            json_fields = json_decode(to_native_str(fields[-1], 'utf-8'))
            for n, v in six.iteritems(json_fields):
+                n = to_native_str(n, 'utf-8')
                n = self.CDX_ALT_FIELDS.get(n, n)

                if n == 'url':
@@ -129,6 +131,9 @@ class CDXObject(OrderedDict):
                    except UnicodeEncodeError:
                        v = quote(v.encode('utf-8'), safe=':/')

+                if n != 'filename':
+                    v = to_native_str(v, 'utf-8')
+
                self[n] = v

            self.cdxline = cdxline