From ab8b4efaec73e5b644f443cb42c12c08c3e6e13b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 30 Apr 2016 14:38:48 -0700 Subject: [PATCH] encoding: cdx: only quote-encode 'url' warc: ensure path index loads are utf-8 decoded --- pywb/cdx/cdxobject.py | 10 +++++----- pywb/warc/pathresolvers.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index f933d77a..e9174a37 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -123,11 +123,11 @@ class CDXObject(OrderedDict): for n, v in six.iteritems(json_fields): n = self.CDX_ALT_FIELDS.get(n, n) - try: - v.encode('ascii') - except UnicodeEncodeError: - parts = v.encode('utf-8').split(b'//', 1) - v = parts[0].decode('utf-8') + '//' + quote(parts[1]) + if n == 'url': + try: + v.encode('ascii') + except UnicodeEncodeError: + v = quote(v, safe=':/') self[n] = v diff --git a/pywb/warc/pathresolvers.py b/pywb/warc/pathresolvers.py index ea9d2119..cc6510b4 100644 --- a/pywb/warc/pathresolvers.py +++ b/pywb/warc/pathresolvers.py @@ -51,7 +51,7 @@ class RedisResolver(object): def __call__(self, filename, cdx=None): redis_val = self.redis.hget(self.key_prefix + filename, 'path') - return [to_native_str(redis_val)] if redis_val else [] + return [to_native_str(redis_val, 'utf-8')] if redis_val else [] def __repr__(self): return "RedisResolver('{0}')".format(self.redis_url) @@ -69,7 +69,7 @@ class PathIndexResolver(object): for pathline in result: paths = pathline.split(b'\t')[1:] for path in paths: - yield to_native_str(path) + yield to_native_str(path, 'utf-8') def __repr__(self): # pragma: no cover return "PathIndexResolver('{0}')".format(self.pathindex_file)