1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdx: CDXQuery takes a params dict, not **params

CDXObject comparison now uses the cached to_json() output
This commit is contained in:
Ilya Kreymer 2016-02-23 01:36:39 -08:00
parent 57991fd0cf
commit 0dff388e4e
5 changed files with 20 additions and 11 deletions

View File

@ -107,6 +107,7 @@ class CDXObject(OrderedDict):
cdxline = cdxline.rstrip() cdxline = cdxline.rstrip()
self._from_json = False self._from_json = False
self._cached_json = None
# Allows for filling the fields later or in a custom way # Allows for filling the fields later or in a custom way
if not cdxline: if not cdxline:
@ -157,6 +158,9 @@ class CDXObject(OrderedDict):
# force regen on next __str__ call # force regen on next __str__ call
self.cdxline = None self.cdxline = None
# force regen on next to_json() call
self._cached_json = None
def is_revisit(self): def is_revisit(self):
"""return ``True`` if this record is a revisit record.""" """return ``True`` if this record is a revisit record."""
return (self.get(MIMETYPE) == 'warc/revisit' or return (self.get(MIMETYPE) == 'warc/revisit' or
@ -174,7 +178,7 @@ class CDXObject(OrderedDict):
return str(self) + '\n' return str(self) + '\n'
try: try:
result = ' '.join(self[x] for x in fields) + '\n' result = ' '.join(str(self[x]) for x in fields) + '\n'
except KeyError as ke: except KeyError as ke:
msg = 'Invalid field "{0}" found in fields= argument' msg = 'Invalid field "{0}" found in fields= argument'
msg = msg.format(ke.message) msg = msg.format(ke.message)
@ -182,7 +186,6 @@ class CDXObject(OrderedDict):
return result return result
def to_json(self, fields=None): def to_json(self, fields=None):
return self.conv_to_json(self, fields) return self.conv_to_json(self, fields)
@ -213,7 +216,7 @@ class CDXObject(OrderedDict):
return self.cdxline.decode('utf-8') return self.cdxline.decode('utf-8')
if not self._from_json: if not self._from_json:
return ' '.join(val for n, val in six.iteritems(self)) return ' '.join(str(val) for val in six.itervalues(self))
else: else:
return json_encode(self) return json_encode(self)
@ -223,7 +226,13 @@ class CDXObject(OrderedDict):
return prefix + self.conv_to_json(dupe, fields) return prefix + self.conv_to_json(dupe, fields)
def __lt__(self, other): def __lt__(self, other):
return str(self) < str(other) if not self._cached_json:
self._cached_json = self.to_json()
if not other._cached_json:
other._cached_json = other.to_json()
return self._cached_json < other._cached_json
#================================================================= #=================================================================

View File

@ -69,7 +69,7 @@ class BaseCDXServer(object):
def load_cdx(self, **params): def load_cdx(self, **params):
params['_url_canon'] = self.url_canon params['_url_canon'] = self.url_canon
query = CDXQuery(**params) query = CDXQuery(params)
#key, end_key = self._calc_search_keys(query) #key, end_key = self._calc_search_keys(query)
#query.set_key(key, end_key) #query.set_key(key, end_key)

View File

@ -60,8 +60,8 @@ class RemoteCDXSource(CDXSource):
remote_query = query remote_query = query
else: else:
# Only send url and matchType to remote # Only send url and matchType to remote
remote_query = CDXQuery(url=query.url, remote_query = CDXQuery(dict(url=query.url,
matchType=query.match_type) matchType=query.match_type))
urlparams = remote_query.urlencode() urlparams = remote_query.urlencode()

View File

@ -5,8 +5,8 @@ from pywb.utils.canonicalize import calc_search_range
#================================================================= #=================================================================
class CDXQuery(object): class CDXQuery(object):
def __init__(self, **kwargs): def __init__(self, params):
self.params = kwargs self.params = params
url = self.url url = self.url
if not self.params.get('matchType'): if not self.params.get('matchType'):
if url.startswith('*.'): if url.startswith('*.'):
@ -18,7 +18,7 @@ class CDXQuery(object):
else: else:
self.params['matchType'] = 'exact' self.params['matchType'] = 'exact'
start, end = calc_search_range(url=url, start, end = calc_search_range(url=self.url,
match_type=self.params['matchType'], match_type=self.params['matchType'],
url_canon=self.params.get('_url_canon')) url_canon=self.params.get('_url_canon'))

View File

@ -29,7 +29,7 @@ def lazy_cdx_load(**params):
params['custom_ops'] = [raise_access_exception] params['custom_ops'] = [raise_access_exception]
cdx_iter = cdx_load(['bogus ignored'], cdx_iter = cdx_load(['bogus ignored'],
CDXQuery(**params), CDXQuery(params),
process=True) process=True)
# exception happens on first access attempt # exception happens on first access attempt