mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
encoding: cdx: only quote-encode 'url'
warc: ensure path index loads are utf-8 decoded
This commit is contained in:
parent
67a02613e7
commit
ab8b4efaec
@@ -123,11 +123,11 @@ class CDXObject(OrderedDict):
|
||||
for n, v in six.iteritems(json_fields):
|
||||
n = self.CDX_ALT_FIELDS.get(n, n)
|
||||
|
||||
try:
|
||||
v.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
parts = v.encode('utf-8').split(b'//', 1)
|
||||
v = parts[0].decode('utf-8') + '//' + quote(parts[1])
|
||||
if n == 'url':
|
||||
try:
|
||||
v.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
v = quote(v, safe=':/')
|
||||
|
||||
self[n] = v
|
||||
|
||||
|
@@ -51,7 +51,7 @@ class RedisResolver(object):
|
||||
|
||||
def __call__(self, filename, cdx=None):
|
||||
redis_val = self.redis.hget(self.key_prefix + filename, 'path')
|
||||
return [to_native_str(redis_val)] if redis_val else []
|
||||
return [to_native_str(redis_val, 'utf-8')] if redis_val else []
|
||||
|
||||
def __repr__(self):
|
||||
return "RedisResolver('{0}')".format(self.redis_url)
|
||||
@@ -69,7 +69,7 @@ class PathIndexResolver(object):
|
||||
for pathline in result:
|
||||
paths = pathline.split(b'\t')[1:]
|
||||
for path in paths:
|
||||
yield to_native_str(path)
|
||||
yield to_native_str(path, 'utf-8')
|
||||
|
||||
def __repr__(self): # pragma: no cover
|
||||
return "PathIndexResolver('{0}')".format(self.pathindex_file)
|
||||
|
Loading…
x
Reference in New Issue
Block a user