From 273176bce5b8d814682c1c058084f82a5351748c Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sun, 29 Mar 2015 09:21:50 -0700
Subject: [PATCH] cdx: when reading cdxj, and run into non-ascii chars in url, utf-8 encode and %-encode

---
 pywb/cdx/cdxobject.py           | 12 ++++++++++--
 pywb/cdx/test/test_cdxobject.py |  9 +++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py
index 30bce587..00b1feed 100644
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@@ -5,7 +5,7 @@ except ImportError: # pragma: no cover
 
 import itertools
 
-from urllib import urlencode
+from urllib import urlencode, quote
 from urlparse import parse_qs
 
 from pywb.utils.wbexception import WbException
@@ -111,7 +111,15 @@ class CDXObject(OrderedDict):
             json_fields = json_decode(fields[-1])
             for n, v in json_fields.iteritems():
                 n = self.CDX_ALT_FIELDS.get(n, n)
-                self[n] = str(v)
+
+                try:
+                    self[n] = str(v)
+                except UnicodeEncodeError:
+                    v = v.encode('utf-8')
+                    parts = v.split('//', 1)
+                    v = parts[0] + '//' + quote(parts[1])
+                    self[n] = v
+
             self.cdxline = cdxline
             self._from_json = True
             return
diff --git a/pywb/cdx/test/test_cdxobject.py b/pywb/cdx/test/test_cdxobject.py
index 13e242a7..a2e73cbe 100644
--- a/pywb/cdx/test/test_cdxobject.py
+++ b/pywb/cdx/test/test_cdxobject.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
 from pywb.cdx.cdxobject import CDXObject, IDXObject, CDXException
 from pytest import raises
 
@@ -25,6 +28,12 @@ def test_valid_cdx_formats():
     _make_line(11)
     _make_line(14)
 
+def test_unicode_url():
+    x = CDXObject('com,example,cafe)/ 123 {"url": "http://example.com/café/path"}')
+    assert x['urlkey'] == 'com,example,cafe)/'
+    assert x['timestamp'] == '123'
+    assert x['url'] == 'http://example.com/caf%C3%A9/path'
+
 def test_invalid_idx_format():
     with raises(CDXException):
         x = IDXObject('a b c')