1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

encoding: cdx: only quote-encode 'url'

warc: ensure path index loads are utf-8 decoded
This commit is contained in:
Ilya Kreymer 2016-04-30 14:38:48 -07:00
parent 67a02613e7
commit ab8b4efaec
2 changed files with 7 additions and 7 deletions

View File

@ -123,11 +123,11 @@ class CDXObject(OrderedDict):
for n, v in six.iteritems(json_fields):
n = self.CDX_ALT_FIELDS.get(n, n)
try:
v.encode('ascii')
except UnicodeEncodeError:
parts = v.encode('utf-8').split(b'//', 1)
v = parts[0].decode('utf-8') + '//' + quote(parts[1])
if n == 'url':
try:
v.encode('ascii')
except UnicodeEncodeError:
v = quote(v, safe=':/')
self[n] = v

View File

@ -51,7 +51,7 @@ class RedisResolver(object):
def __call__(self, filename, cdx=None):
    """Look up the stored path for ``filename`` in redis.

    Reads the ``'path'`` field of the redis hash stored under the key
    ``self.key_prefix + filename``.

    :param filename: name appended to ``self.key_prefix`` to form the key
    :param cdx: accepted for interface compatibility; not used here
    :return: a one-element list holding the value converted with
        ``to_native_str(value, 'utf-8')``, or an empty list when redis
        returns a falsy value (e.g. the key or field is missing)
    """
    redis_val = self.redis.hget(self.key_prefix + filename, 'path')
    if not redis_val:
        return []
    # Name the encoding explicitly rather than relying on the default of
    # to_native_str; the earlier encoding-less return made this unreachable.
    return [to_native_str(redis_val, 'utf-8')]
def __repr__(self):
    """Return a debug representation that embeds ``self.redis_url``."""
    template = "RedisResolver('{0}')"
    return template.format(self.redis_url)
@ -69,7 +69,7 @@ class PathIndexResolver(object):
for pathline in result:
paths = pathline.split(b'\t')[1:]
for path in paths:
yield to_native_str(path)
yield to_native_str(path, 'utf-8')
def __repr__(self):  # pragma: no cover
    """Return a debug representation that embeds ``self.pathindex_file``."""
    shown_path = self.pathindex_file
    return "PathIndexResolver('{0}')".format(shown_path)