diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 6e76252a..7eb57180 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -107,6 +107,7 @@ class CDXObject(OrderedDict): cdxline = cdxline.rstrip() self._from_json = False + self._cached_json = None # Allows for filling the fields later or in a custom way if not cdxline: @@ -157,6 +158,9 @@ class CDXObject(OrderedDict): # force regen on next __str__ call self.cdxline = None + # force regen on next to_json() call + self._cached_json = None + def is_revisit(self): """return ``True`` if this record is a revisit record.""" return (self.get(MIMETYPE) == 'warc/revisit' or @@ -174,7 +178,7 @@ class CDXObject(OrderedDict): return str(self) + '\n' try: - result = ' '.join(self[x] for x in fields) + '\n' + result = ' '.join(str(self[x]) for x in fields) + '\n' except KeyError as ke: msg = 'Invalid field "{0}" found in fields= argument' msg = msg.format(ke.message) @@ -182,7 +186,6 @@ class CDXObject(OrderedDict): return result - def to_json(self, fields=None): return self.conv_to_json(self, fields) @@ -213,7 +216,7 @@ class CDXObject(OrderedDict): return self.cdxline.decode('utf-8') if not self._from_json: - return ' '.join(val for n, val in six.iteritems(self)) + return ' '.join(str(val) for val in six.itervalues(self)) else: return json_encode(self) @@ -223,7 +226,13 @@ class CDXObject(OrderedDict): return prefix + self.conv_to_json(dupe, fields) def __lt__(self, other): - return str(self) < str(other) + if not self._cached_json: + self._cached_json = self.to_json() + + if not other._cached_json: + other._cached_json = other.to_json() + + return self._cached_json < other._cached_json #================================================================= diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 49cf48e6..bfdf5741 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -69,7 +69,7 @@ class BaseCDXServer(object): def load_cdx(self, **params): params['_url_canon'] = self.url_canon - query = CDXQuery(**params) + query = CDXQuery(params) #key, end_key = self._calc_search_keys(query) #query.set_key(key, end_key) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index 272d3c41..95fc2616 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -60,8 +60,8 @@ class RemoteCDXSource(CDXSource): remote_query = query else: # Only send url and matchType to remote - remote_query = CDXQuery(url=query.url, - matchType=query.match_type) + remote_query = CDXQuery(dict(url=query.url, + matchType=query.match_type)) urlparams = remote_query.urlencode() diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index 3058630c..4e82a5de 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -5,8 +5,8 @@ from pywb.utils.canonicalize import calc_search_range #================================================================= class CDXQuery(object): - def __init__(self, **kwargs): - self.params = kwargs + def __init__(self, params): + self.params = params url = self.url if not self.params.get('matchType'): if url.startswith('*.'): @@ -18,7 +18,7 @@ class CDXQuery(object): else: self.params['matchType'] = 'exact' - start, end = calc_search_range(url=url, + start, end = calc_search_range(url=self.url, match_type=self.params['matchType'], url_canon=self.params.get('_url_canon')) diff --git a/pywb/cdx/test/test_lazy_ops.py b/pywb/cdx/test/test_lazy_ops.py index e5c64ea5..d31f27b6 100644 --- a/pywb/cdx/test/test_lazy_ops.py +++ b/pywb/cdx/test/test_lazy_ops.py @@ -29,7 +29,7 @@ def lazy_cdx_load(**params): params['custom_ops'] = [raise_access_exception] cdx_iter = cdx_load(['bogus ignored'], - CDXQuery(**params), + CDXQuery(params), process=True) # exception happens on first access attempt