1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx: ensure CDXQuery computes key and end_key automatically

key and end_key encoded as utf-8 by default
This commit is contained in:
Ilya Kreymer 2016-02-22 13:39:47 -08:00
parent 7513011cac
commit af7c876263
9 changed files with 59 additions and 40 deletions

View File

@ -8,8 +8,7 @@ from six.moves.urllib.parse import urlsplit
from pywb.utils.dsrules import BaseRule, RuleSet
from pywb.utils.canonicalize import unsurt, UrlCanonicalizer
from pywb.cdx.query import CDXQuery
from pywb.utils.loaders import to_native_str
#=================================================================
@ -65,14 +64,14 @@ class CustomUrlCanonicalizer(UrlCanonicalizer):
#=================================================================
class FuzzyQuery:
class FuzzyQuery(object):
def __init__(self, rules):
self.rules = rules
def __call__(self, query):
matched_rule = None
urlkey = query.key
urlkey = to_native_str(query.key, 'utf-8')
url = query.url
filter_ = query.filters
output = query.output
@ -149,7 +148,7 @@ class CDXDomainSpecificRule(BaseRule):
In the case of non-surt format, this method is called
to desurt any urls
"""
self.url_prefix = map(unsurt, self.url_prefix)
self.url_prefix = list(map(unsurt, self.url_prefix))
if self.regex:
self.regex = re.compile(unsurt(self.regex.pattern))
@ -181,6 +180,6 @@ class CDXDomainSpecificRule(BaseRule):
# Build the regex fragment for a single query-parameter name: the name is
# regex-escaped, must be preceded by '?' or '&', and the captured group spans
# 'name=value' up to (not including) the next '&'.
def conv(value):
return '[?&]({0}=[^&]+)'.format(re.escape(value))
params_list = map(conv, params_list)
params_list = list(map(conv, params_list))
final_str = '.*'.join(params_list)
return final_str

View File

@ -182,7 +182,12 @@ class CDXObject(OrderedDict):
return result
# Serialize this object by delegating to conv_to_json, passing the object
# itself as the thing to encode; ``fields`` is forwarded unchanged.
def to_json(self, fields=None):
return self.conv_to_json(self, fields)
@staticmethod
def conv_to_json(obj, fields=None):
"""
return cdx as json dictionary string
if ``fields`` is ``None``, output will include all fields
@ -192,10 +197,10 @@ class CDXObject(OrderedDict):
:param fields: list of field names to output
"""
if fields is None:
return json_encode(self) + '\n'
return json_encode(obj) + '\n'
try:
result = json_encode(OrderedDict((x, self[x]) for x in fields)) + '\n'
result = json_encode(OrderedDict([(x, obj[x]) for x in fields if x in obj])) + '\n'
except KeyError as ke:
msg = 'Invalid field "{0}" found in fields= argument'
msg = msg.format(ke.message)
@ -212,6 +217,14 @@ class CDXObject(OrderedDict):
else:
return json_encode(self)
# Render this record as '<urlkey> <timestamp> ' followed by whatever
# conv_to_json returns for the remaining entries.
def to_cdxj(self, fields=None):
prefix = self['urlkey'] + ' ' + self['timestamp'] + ' '
# Drop the first two items before encoding the rest.
# NOTE(review): presumably those two items are 'urlkey' and 'timestamp', so
# they are not repeated in the JSON part -- confirm the field order.
dupe = OrderedDict(list(self.items())[2:])
return prefix + self.conv_to_json(dupe, fields)
# Order objects by comparing their str() representations.
def __lt__(self, other):
return str(self) < str(other)
#=================================================================
class IDXObject(OrderedDict):

View File

@ -11,8 +11,6 @@ import bisect
from six.moves import zip, range, map
import re
from heapq import merge
from collections import deque

View File

@ -1,4 +1,4 @@
from pywb.utils.canonicalize import UrlCanonicalizer, calc_search_range
from pywb.utils.canonicalize import UrlCanonicalizer
from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxops import cdx_load
@ -62,17 +62,17 @@ class BaseCDXServer(object):
raise NotFoundException(msg, url=query.url)
def _calc_search_keys(self, query):
return calc_search_range(url=query.url,
match_type=query.match_type,
url_canon=self.url_canon)
#def _calc_search_keys(self, query):
# return calc_search_range(url=query.url,
# match_type=query.match_type,
# url_canon=self.url_canon)
def load_cdx(self, **params):
params['_url_canon'] = self.url_canon
query = CDXQuery(**params)
key, end_key = self._calc_search_keys(query)
query.set_key(key, end_key)
#key, end_key = self._calc_search_keys(query)
#query.set_key(key, end_key)
cdx_iter = self._load_cdx_query(query)

View File

@ -33,8 +33,8 @@ class CDXFile(CDXSource):
@staticmethod
def _do_load_file(filename, query):
with open(filename, 'rb') as source:
gen = iter_range(source, query.key.encode('utf-8'),
query.end_key.encode('utf-8'))
gen = iter_range(source, query.key,
query.end_key)
for line in gen:
yield line
@ -61,7 +61,7 @@ class RemoteCDXSource(CDXSource):
else:
# Only send url and matchType to remote
remote_query = CDXQuery(url=query.url,
match_type=query.match_type)
matchType=query.match_type)
urlparams = remote_query.urlencode()
@ -127,7 +127,7 @@ class RedisCDXSource(CDXSource):
if self.cdx_key:
return self.load_sorted_range(query, self.cdx_key)
else:
return self.load_single_key(query.key.encode('utf-8'))
return self.load_single_key(query.key)
def load_sorted_range(self, query, cdx_key):
cdx_list = self.redis.zrangebylex(cdx_key,

View File

@ -1,5 +1,6 @@
from six.moves.urllib.parse import urlencode
from pywb.cdx.cdxobject import CDXException
from pywb.utils.canonicalize import calc_search_range
#=================================================================
@ -14,6 +15,15 @@ class CDXQuery(object):
elif url.endswith('*'):
self.params['url'] = url[:-1]
self.params['matchType'] = 'prefix'
else:
self.params['matchType'] = 'exact'
start, end = calc_search_range(url=self.params['url'],
match_type=self.params['matchType'],
url_canon=self.params.get('_url_canon'))
self.params['key'] = start.encode('utf-8')
self.params['end_key'] = end.encode('utf-8')
@property
def key(self):

View File

@ -55,6 +55,7 @@ def mock_urlopen_err(err):
# Second time expect a 200 for fuzzy match
def mock_urlopen_fuzzy(req):
status = 200
print(req.get_full_url())
if 'exact' in req.get_full_url():
status = 404

View File

@ -6,11 +6,13 @@ from pytest import raises
import six
KEY = 'com,example)/'
URL = 'http://example.com/'
#================================================================
def raise_access_exception(cdx_iter, query):
if query.key == KEY:
if query.url == URL:
raise AccessException
for cdx in cdx_iter:
@ -36,22 +38,22 @@ def lazy_cdx_load(**params):
def test_no_process():
lazy_cdx_load(key=KEY)
lazy_cdx_load(url=URL)
def test_reverse():
lazy_cdx_load(key=KEY, reverse=True)
lazy_cdx_load(url=URL, reverse=True)
def test_closest():
lazy_cdx_load(key=KEY, closest='2013')
lazy_cdx_load(url=URL, closest='2013')
def test_limit():
lazy_cdx_load(key=KEY, limit=10)
lazy_cdx_load(url=URL, limit=10)
def test_limit_1_reverse():
lazy_cdx_load(key=KEY, limit=1, reverse=True)
lazy_cdx_load(url=URL, limit=1, reverse=True)
def test_multi_ops():
lazy_cdx_load(key=KEY,
lazy_cdx_load(url=URL,
resolveRevisits=True,
filters=['=filename:A'],
collapseTime=10,

View File

@ -173,11 +173,8 @@ class ZipNumCluster(CDXSource):
last_line = None
start_key = query.key.encode('utf-8')
end_key = query.end_key.encode('utf-8')
# Get End
end_iter = search(reader, end_key, prev_size=1)
end_iter = search(reader, query.end_key, prev_size=1)
try:
end_line = six.next(end_iter)
@ -187,14 +184,14 @@ class ZipNumCluster(CDXSource):
# Get Start
first_iter = iter_range(reader,
start_key,
end_key,
query.key,
query.end_key,
prev_size=1)
try:
first_line = six.next(first_iter)
except StopIteration:
if end_line == last_line and start_key >= last_line:
if end_line == last_line and query.key >= last_line:
first_line = last_line
else:
reader.close()
@ -336,11 +333,10 @@ class ZipNumCluster(CDXSource):
iter_ = itertools.chain(*map(decompress_block, ranges))
# start bound
iter_ = linearsearch(iter_, query.key.encode('utf-8'))
iter_ = linearsearch(iter_, query.key)
# end bound
end = query.end_key.encode('utf-8')
iter_ = itertools.takewhile(lambda line: line < end, iter_)
iter_ = itertools.takewhile(lambda line: line < query.end_key, iter_)
return iter_
def __str__(self):