diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 2efca76c..ef332ec4 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -8,8 +8,7 @@ from six.moves.urllib.parse import urlsplit from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.canonicalize import unsurt, UrlCanonicalizer - -from pywb.cdx.query import CDXQuery +from pywb.utils.loaders import to_native_str #================================================================= @@ -65,14 +64,14 @@ class CustomUrlCanonicalizer(UrlCanonicalizer): #================================================================= -class FuzzyQuery: +class FuzzyQuery(object): def __init__(self, rules): self.rules = rules def __call__(self, query): matched_rule = None - urlkey = query.key + urlkey = to_native_str(query.key, 'utf-8') url = query.url filter_ = query.filters output = query.output @@ -149,7 +148,7 @@ class CDXDomainSpecificRule(BaseRule): In the case of non-surt format, this method is called to desurt any urls """ - self.url_prefix = map(unsurt, self.url_prefix) + self.url_prefix = list(map(unsurt, self.url_prefix)) if self.regex: self.regex = re.compile(unsurt(self.regex.pattern)) @@ -181,6 +180,6 @@ class CDXDomainSpecificRule(BaseRule): def conv(value): return '[?&]({0}=[^&]+)'.format(re.escape(value)) - params_list = map(conv, params_list) + params_list = list(map(conv, params_list)) final_str = '.*'.join(params_list) return final_str diff --git a/pywb/cdx/cdxobject.py b/pywb/cdx/cdxobject.py index 9d1b6f38..6e76252a 100644 --- a/pywb/cdx/cdxobject.py +++ b/pywb/cdx/cdxobject.py @@ -182,7 +182,12 @@ class CDXObject(OrderedDict): return result + def to_json(self, fields=None): + return self.conv_to_json(self, fields) + + @staticmethod + def conv_to_json(obj, fields=None): """ return cdx as json dictionary string if ``fields`` is ``None``, output will include all fields @@ -192,10 +197,10 @@ class CDXObject(OrderedDict): :param fields: list of field names to output """ if fields is None: - return json_encode(self) + '\n' + return json_encode(obj) + '\n' try: - result = json_encode(OrderedDict((x, self[x]) for x in fields)) + '\n' + result = json_encode(OrderedDict([(x, obj[x]) for x in fields if x in obj])) + '\n' except KeyError as ke: msg = 'Invalid field "{0}" found in fields= argument' msg = msg.format(ke.message) @@ -212,6 +217,14 @@ class CDXObject(OrderedDict): else: return json_encode(self) + def to_cdxj(self, fields=None): + prefix = self['urlkey'] + ' ' + self['timestamp'] + ' ' + dupe = OrderedDict(list(self.items())[2:]) + return prefix + self.conv_to_json(dupe, fields) + + def __lt__(self, other): + return str(self) < str(other) + #================================================================= class IDXObject(OrderedDict): diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index 790e5ac9..18c420c5 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -11,8 +11,6 @@ import bisect from six.moves import zip, range, map import re - - from heapq import merge from collections import deque diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index a9644283..49cf48e6 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -1,4 +1,4 @@ -from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range +from pywb.utils.canonicalize import UrlCanonicalizer from pywb.utils.wbexception import NotFoundException from pywb.cdx.cdxops import cdx_load @@ -62,17 +62,17 @@ class BaseCDXServer(object): raise NotFoundException(msg, url=query.url) - def _calc_search_keys(self, query): - return calc_search_range(url=query.url, - match_type=query.match_type, - url_canon=self.url_canon) + #def _calc_search_keys(self, query): + # return calc_search_range(url=query.url, + # match_type=query.match_type, + # url_canon=self.url_canon) def load_cdx(self, **params): + params['_url_canon'] = self.url_canon query = CDXQuery(**params) - key, end_key = self._calc_search_keys(query) - - query.set_key(key, end_key) + #key, end_key = self._calc_search_keys(query) + #query.set_key(key, end_key) cdx_iter = self._load_cdx_query(query) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index b5245dbb..272d3c41 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -33,8 +33,8 @@ class CDXFile(CDXSource): @staticmethod def _do_load_file(filename, query): with open(filename, 'rb') as source: - gen = iter_range(source, query.key.encode('utf-8'), - query.end_key.encode('utf-8')) + gen = iter_range(source, query.key, + query.end_key) for line in gen: yield line @@ -61,7 +61,7 @@ class RemoteCDXSource(CDXSource): else: # Only send url and matchType to remote remote_query = CDXQuery(url=query.url, - match_type=query.match_type) + matchType=query.match_type) urlparams = remote_query.urlencode() @@ -127,7 +127,7 @@ class RedisCDXSource(CDXSource): if self.cdx_key: return self.load_sorted_range(query, self.cdx_key) else: - return self.load_single_key(query.key.encode('utf-8')) + return self.load_single_key(query.key) def load_sorted_range(self, query, cdx_key): cdx_list = self.redis.zrangebylex(cdx_key, diff --git a/pywb/cdx/query.py b/pywb/cdx/query.py index dd6144c6..95ac0534 100644 --- a/pywb/cdx/query.py +++ b/pywb/cdx/query.py @@ -1,5 +1,6 @@ from six.moves.urllib.parse import urlencode from pywb.cdx.cdxobject import CDXException +from pywb.utils.canonicalize import calc_search_range #================================================================= @@ -14,6 +15,15 @@ class CDXQuery(object): elif url.endswith('*'): self.params['url'] = url[:-1] self.params['matchType'] = 'prefix' + else: + self.params['matchType'] = 'exact' + + start, end = calc_search_range(url=self.params['url'], + match_type=self.params['matchType'], + url_canon=self.params.get('_url_canon')) + + self.params['key'] = start.encode('utf-8') + self.params['end_key'] = end.encode('utf-8') @property def key(self): diff --git a/pywb/cdx/test/test_cdxserver.py b/pywb/cdx/test/test_cdxserver.py index 23772ced..4febb1fc 100644 --- a/pywb/cdx/test/test_cdxserver.py +++ b/pywb/cdx/test/test_cdxserver.py @@ -55,6 +55,7 @@ def mock_urlopen_err(err): # Second time expect a 200 for fuzzy match def mock_urlopen_fuzzy(req): status = 200 + print(req.get_full_url()) if 'exact' in req.get_full_url(): status = 404 diff --git a/pywb/cdx/test/test_lazy_ops.py b/pywb/cdx/test/test_lazy_ops.py index 5fdad2a4..e5c64ea5 100644 --- a/pywb/cdx/test/test_lazy_ops.py +++ b/pywb/cdx/test/test_lazy_ops.py @@ -6,11 +6,13 @@ from pytest import raises import six -KEY = 'com,example)/' + +URL = 'http://example.com/' + #================================================================ def raise_access_exception(cdx_iter, query): - if query.key == KEY: + if query.url == URL: raise AccessException for cdx in cdx_iter: @@ -36,22 +38,22 @@ def lazy_cdx_load(**params): def test_no_process(): - lazy_cdx_load(key=KEY) + lazy_cdx_load(url=URL) def test_reverse(): - lazy_cdx_load(key=KEY, reverse=True) + lazy_cdx_load(url=URL, reverse=True) def test_closest(): - lazy_cdx_load(key=KEY, closest='2013') + lazy_cdx_load(url=URL, closest='2013') def test_limit(): - lazy_cdx_load(key=KEY, limit=10) + lazy_cdx_load(url=URL, limit=10) def test_limit_1_reverse(): - lazy_cdx_load(key=KEY, limit=1, reverse=True) + lazy_cdx_load(url=URL, limit=1, reverse=True) def test_multi_ops(): - lazy_cdx_load(key=KEY, + lazy_cdx_load(url=URL, resolveRevisits=True, filters=['=filename:A'], collapseTime=10, diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 94b31f8f..f44a6b6a 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -173,11 +173,8 @@ class ZipNumCluster(CDXSource): last_line = None - start_key = query.key.encode('utf-8') - end_key = query.end_key.encode('utf-8') - # Get End - end_iter = search(reader, end_key, prev_size=1) + end_iter = search(reader, query.end_key, prev_size=1) try: end_line = six.next(end_iter) @@ -187,14 +184,14 @@ class ZipNumCluster(CDXSource): # Get Start first_iter = iter_range(reader, - start_key, - end_key, + query.key, + query.end_key, prev_size=1) try: first_line = six.next(first_iter) except StopIteration: - if end_line == last_line and start_key >= last_line: + if end_line == last_line and query.key >= last_line: first_line = last_line else: reader.close() @@ -336,11 +333,10 @@ class ZipNumCluster(CDXSource): iter_ = itertools.chain(*map(decompress_block, ranges)) # start bound - iter_ = linearsearch(iter_, query.key.encode('utf-8')) + iter_ = linearsearch(iter_, query.key) # end bound - end = query.end_key.encode('utf-8') - iter_ = itertools.takewhile(lambda line: line < end, iter_) + iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_) return iter_ def __str__(self):