1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdx: ensure CDXQuery computes key and end_key automatically

key and end_key encoded as utf-8 by default
This commit is contained in:
Ilya Kreymer 2016-02-22 13:39:47 -08:00
parent 7513011cac
commit af7c876263
9 changed files with 59 additions and 40 deletions

View File

@ -8,8 +8,7 @@ from six.moves.urllib.parse import urlsplit
from pywb.utils.dsrules import BaseRule, RuleSet from pywb.utils.dsrules import BaseRule, RuleSet
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from pywb.utils.loaders import to_native_str
from pywb.cdx.query import CDXQuery
#================================================================= #=================================================================
@ -65,14 +64,14 @@ class CustomUrlCanonicalizer(UrlCanonicalizer):
#================================================================= #=================================================================
class FuzzyQuery: class FuzzyQuery(object):
def __init__(self, rules): def __init__(self, rules):
self.rules = rules self.rules = rules
def __call__(self, query): def __call__(self, query):
matched_rule = None matched_rule = None
urlkey = query.key urlkey = to_native_str(query.key, 'utf-8')
url = query.url url = query.url
filter_ = query.filters filter_ = query.filters
output = query.output output = query.output
@ -149,7 +148,7 @@ class CDXDomainSpecificRule(BaseRule):
In the case of non-surt format, this method is called In the case of non-surt format, this method is called
to desurt any urls to desurt any urls
""" """
self.url_prefix = map(unsurt, self.url_prefix) self.url_prefix = list(map(unsurt, self.url_prefix))
if self.regex: if self.regex:
self.regex = re.compile(unsurt(self.regex.pattern)) self.regex = re.compile(unsurt(self.regex.pattern))
@ -181,6 +180,6 @@ class CDXDomainSpecificRule(BaseRule):
def conv(value): def conv(value):
return '[?&]({0}=[^&]+)'.format(re.escape(value)) return '[?&]({0}=[^&]+)'.format(re.escape(value))
params_list = map(conv, params_list) params_list = list(map(conv, params_list))
final_str = '.*'.join(params_list) final_str = '.*'.join(params_list)
return final_str return final_str

View File

@ -182,7 +182,12 @@ class CDXObject(OrderedDict):
return result return result
def to_json(self, fields=None): def to_json(self, fields=None):
return self.conv_to_json(self, fields)
@staticmethod
def conv_to_json(obj, fields=None):
""" """
return cdx as json dictionary string return cdx as json dictionary string
if ``fields`` is ``None``, output will include all fields if ``fields`` is ``None``, output will include all fields
@ -192,10 +197,10 @@ class CDXObject(OrderedDict):
:param fields: list of field names to output :param fields: list of field names to output
""" """
if fields is None: if fields is None:
return json_encode(self) + '\n' return json_encode(obj) + '\n'
try: try:
result = json_encode(OrderedDict((x, self[x]) for x in fields)) + '\n' result = json_encode(OrderedDict([(x, obj[x]) for x in fields if x in obj])) + '\n'
except KeyError as ke: except KeyError as ke:
msg = 'Invalid field "{0}" found in fields= argument' msg = 'Invalid field "{0}" found in fields= argument'
msg = msg.format(ke.message) msg = msg.format(ke.message)
@ -212,6 +217,14 @@ class CDXObject(OrderedDict):
else: else:
return json_encode(self) return json_encode(self)
def to_cdxj(self, fields=None):
prefix = self['urlkey'] + ' ' + self['timestamp'] + ' '
dupe = OrderedDict(list(self.items())[2:])
return prefix + self.conv_to_json(dupe, fields)
def __lt__(self, other):
return str(self) < str(other)
#================================================================= #=================================================================
class IDXObject(OrderedDict): class IDXObject(OrderedDict):

View File

@ -11,8 +11,6 @@ import bisect
from six.moves import zip, range, map from six.moves import zip, range, map
import re import re
from heapq import merge from heapq import merge
from collections import deque from collections import deque

View File

@ -1,4 +1,4 @@
from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxops import cdx_load from pywb.cdx.cdxops import cdx_load
@ -62,17 +62,17 @@ class BaseCDXServer(object):
raise NotFoundException(msg, url=query.url) raise NotFoundException(msg, url=query.url)
def _calc_search_keys(self, query): #def _calc_search_keys(self, query):
return calc_search_range(url=query.url, # return calc_search_range(url=query.url,
match_type=query.match_type, # match_type=query.match_type,
url_canon=self.url_canon) # url_canon=self.url_canon)
def load_cdx(self, **params): def load_cdx(self, **params):
params['_url_canon'] = self.url_canon
query = CDXQuery(**params) query = CDXQuery(**params)
key, end_key = self._calc_search_keys(query) #key, end_key = self._calc_search_keys(query)
#query.set_key(key, end_key)
query.set_key(key, end_key)
cdx_iter = self._load_cdx_query(query) cdx_iter = self._load_cdx_query(query)

View File

@ -33,8 +33,8 @@ class CDXFile(CDXSource):
@staticmethod @staticmethod
def _do_load_file(filename, query): def _do_load_file(filename, query):
with open(filename, 'rb') as source: with open(filename, 'rb') as source:
gen = iter_range(source, query.key.encode('utf-8'), gen = iter_range(source, query.key,
query.end_key.encode('utf-8')) query.end_key)
for line in gen: for line in gen:
yield line yield line
@ -61,7 +61,7 @@ class RemoteCDXSource(CDXSource):
else: else:
# Only send url and matchType to remote # Only send url and matchType to remote
remote_query = CDXQuery(url=query.url, remote_query = CDXQuery(url=query.url,
match_type=query.match_type) matchType=query.match_type)
urlparams = remote_query.urlencode() urlparams = remote_query.urlencode()
@ -127,7 +127,7 @@ class RedisCDXSource(CDXSource):
if self.cdx_key: if self.cdx_key:
return self.load_sorted_range(query, self.cdx_key) return self.load_sorted_range(query, self.cdx_key)
else: else:
return self.load_single_key(query.key.encode('utf-8')) return self.load_single_key(query.key)
def load_sorted_range(self, query, cdx_key): def load_sorted_range(self, query, cdx_key):
cdx_list = self.redis.zrangebylex(cdx_key, cdx_list = self.redis.zrangebylex(cdx_key,

View File

@ -1,5 +1,6 @@
from six.moves.urllib.parse import urlencode from six.moves.urllib.parse import urlencode
from pywb.cdx.cdxobject import CDXException from pywb.cdx.cdxobject import CDXException
from pywb.utils.canonicalize import calc_search_range
#================================================================= #=================================================================
@ -14,6 +15,15 @@ class CDXQuery(object):
elif url.endswith('*'): elif url.endswith('*'):
self.params['url'] = url[:-1] self.params['url'] = url[:-1]
self.params['matchType'] = 'prefix' self.params['matchType'] = 'prefix'
else:
self.params['matchType'] = 'exact'
start, end = calc_search_range(url=self.params['url'],
match_type=self.params['matchType'],
url_canon=self.params.get('_url_canon'))
self.params['key'] = start.encode('utf-8')
self.params['end_key'] = end.encode('utf-8')
@property @property
def key(self): def key(self):

View File

@ -55,6 +55,7 @@ def mock_urlopen_err(err):
# Second time expect a 200 for fuzzy match # Second time expect a 200 for fuzzy match
def mock_urlopen_fuzzy(req): def mock_urlopen_fuzzy(req):
status = 200 status = 200
print(req.get_full_url())
if 'exact' in req.get_full_url(): if 'exact' in req.get_full_url():
status = 404 status = 404

View File

@ -6,11 +6,13 @@ from pytest import raises
import six import six
KEY = 'com,example)/'
URL = 'http://example.com/'
#================================================================ #================================================================
def raise_access_exception(cdx_iter, query): def raise_access_exception(cdx_iter, query):
if query.key == KEY: if query.url == URL:
raise AccessException raise AccessException
for cdx in cdx_iter: for cdx in cdx_iter:
@ -36,22 +38,22 @@ def lazy_cdx_load(**params):
def test_no_process(): def test_no_process():
lazy_cdx_load(key=KEY) lazy_cdx_load(url=URL)
def test_reverse(): def test_reverse():
lazy_cdx_load(key=KEY, reverse=True) lazy_cdx_load(url=URL, reverse=True)
def test_closest(): def test_closest():
lazy_cdx_load(key=KEY, closest='2013') lazy_cdx_load(url=URL, closest='2013')
def test_limit(): def test_limit():
lazy_cdx_load(key=KEY, limit=10) lazy_cdx_load(url=URL, limit=10)
def test_limit_1_reverse(): def test_limit_1_reverse():
lazy_cdx_load(key=KEY, limit=1, reverse=True) lazy_cdx_load(url=URL, limit=1, reverse=True)
def test_multi_ops(): def test_multi_ops():
lazy_cdx_load(key=KEY, lazy_cdx_load(url=URL,
resolveRevisits=True, resolveRevisits=True,
filters=['=filename:A'], filters=['=filename:A'],
collapseTime=10, collapseTime=10,

View File

@ -173,11 +173,8 @@ class ZipNumCluster(CDXSource):
last_line = None last_line = None
start_key = query.key.encode('utf-8')
end_key = query.end_key.encode('utf-8')
# Get End # Get End
end_iter = search(reader, end_key, prev_size=1) end_iter = search(reader, query.end_key, prev_size=1)
try: try:
end_line = six.next(end_iter) end_line = six.next(end_iter)
@ -187,14 +184,14 @@ class ZipNumCluster(CDXSource):
# Get Start # Get Start
first_iter = iter_range(reader, first_iter = iter_range(reader,
start_key, query.key,
end_key, query.end_key,
prev_size=1) prev_size=1)
try: try:
first_line = six.next(first_iter) first_line = six.next(first_iter)
except StopIteration: except StopIteration:
if end_line == last_line and start_key >= last_line: if end_line == last_line and query.key >= last_line:
first_line = last_line first_line = last_line
else: else:
reader.close() reader.close()
@ -336,11 +333,10 @@ class ZipNumCluster(CDXSource):
iter_ = itertools.chain(*map(decompress_block, ranges)) iter_ = itertools.chain(*map(decompress_block, ranges))
# start bound # start bound
iter_ = linearsearch(iter_, query.key.encode('utf-8')) iter_ = linearsearch(iter_, query.key)
# end bound # end bound
end = query.end_key.encode('utf-8') iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_)
iter_ = itertools.takewhile(lambda line: line < end, iter_)
return iter_ return iter_
def __str__(self): def __str__(self):