mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
When given a redis path of the form redis://<host>/<db>/<key>, use <key> as a
sorted cdx file, queried with zrangebylex. Tests have been modified, but zrangebylex() support is still needed in fakeredis to finish them.
This commit is contained in:
parent
e4262502b0
commit
2b8bea616e
@ -94,22 +94,43 @@ class RedisCDXSource(CDXSource):
|
|||||||
|
|
||||||
def __init__(self, redis_url, config=None):
|
def __init__(self, redis_url, config=None):
|
||||||
import redis
|
import redis
|
||||||
|
|
||||||
|
parts = redis_url.split('/')
|
||||||
|
if len(parts) > 4:
|
||||||
|
self.cdx_key = parts[4]
|
||||||
|
else:
|
||||||
|
self.cdx_key = None
|
||||||
|
|
||||||
self.redis_url = redis_url
|
self.redis_url = redis_url
|
||||||
self.redis = redis.StrictRedis.from_url(redis_url)
|
self.redis = redis.StrictRedis.from_url(redis_url)
|
||||||
|
|
||||||
self.key_prefix = self.DEFAULT_KEY_PREFIX
|
self.key_prefix = self.DEFAULT_KEY_PREFIX
|
||||||
if config:
|
|
||||||
self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
|
|
||||||
|
|
||||||
def load_cdx(self, query):
|
def load_cdx(self, query):
|
||||||
"""
|
"""
|
||||||
Load cdx from redis cache, from an ordered list
|
Load cdx from redis cache, from an ordered list
|
||||||
|
|
||||||
Currently, there is no support for range queries
|
If cdx_key is set, treat it as cdx file and load use
|
||||||
Only 'exact' matchType is supported
|
zrangebylex! (Supports all match types!)
|
||||||
"""
|
|
||||||
key = query.key
|
|
||||||
|
|
||||||
|
Otherwise, assume a key per-url and load all entries for that key.
|
||||||
|
(Only exact match supported)
|
||||||
|
"""
|
||||||
|
|
||||||
|
if self.cdx_key:
|
||||||
|
return self.load_sorted_range(query)
|
||||||
|
else:
|
||||||
|
return self.load_single_key(query.key)
|
||||||
|
|
||||||
|
def load_sorted_range(self, query):
|
||||||
|
cdx_list = self.redis.zrangebylex(self.cdx_key,
|
||||||
|
'[' + query.key,
|
||||||
|
'(' + query.end_key)
|
||||||
|
|
||||||
|
return cdx_list
|
||||||
|
|
||||||
|
|
||||||
|
def load_single_key(self, key):
|
||||||
# ensure only url/surt is part of key
|
# ensure only url/surt is part of key
|
||||||
key = key.split(' ')[0]
|
key = key.split(' ')[0]
|
||||||
cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
|
cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
"""
|
"""
|
||||||
>>> redis_cdx('http://example.com')
|
>>> redis_cdx(redis_cdx_server, 'http://example.com')
|
||||||
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
|
||||||
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
|
||||||
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
|
||||||
|
|
||||||
|
|
||||||
|
>>> redis_cdx(redis_cdx_server_key, 'http://example.com')
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from fakeredis import FakeStrictRedis
|
from fakeredis import FakeStrictRedis
|
||||||
@ -21,13 +24,17 @@ import os
|
|||||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||||
|
|
||||||
|
|
||||||
def load_cdx_into_redis(source, filename):
|
def load_cdx_into_redis(source, filename, key=None):
|
||||||
# load a cdx into mock redis
|
# load a cdx into mock redis
|
||||||
with open(test_cdx_dir + filename) as fh:
|
with open(test_cdx_dir + filename) as fh:
|
||||||
for line in fh:
|
for line in fh:
|
||||||
zadd_cdx(source, line)
|
zadd_cdx(source, line, key)
|
||||||
|
|
||||||
|
def zadd_cdx(source, cdx, key):
|
||||||
|
if key:
|
||||||
|
source.redis.zadd(key, 0, cdx)
|
||||||
|
return
|
||||||
|
|
||||||
def zadd_cdx(source, cdx):
|
|
||||||
parts = cdx.split(' ', 2)
|
parts = cdx.split(' ', 2)
|
||||||
|
|
||||||
key = parts[0]
|
key = parts[0]
|
||||||
@ -49,9 +56,22 @@ def init_redis_server():
|
|||||||
|
|
||||||
return CDXServer([source])
|
return CDXServer([source])
|
||||||
|
|
||||||
def redis_cdx(url, **params):
|
@patch('redis.StrictRedis', FakeStrictRedis)
|
||||||
|
def init_redis_server_key_file():
|
||||||
|
source = RedisCDXSource('redis://127.0.0.1:6379/0/key')
|
||||||
|
|
||||||
|
for f in os.listdir(test_cdx_dir):
|
||||||
|
if f.endswith('.cdx'):
|
||||||
|
load_cdx_into_redis(source, f, source.cdx_key)
|
||||||
|
|
||||||
|
return CDXServer([source])
|
||||||
|
|
||||||
|
|
||||||
|
def redis_cdx(cdx_server, url, **params):
|
||||||
cdx_iter = cdx_server.load_cdx(url=url, **params)
|
cdx_iter = cdx_server.load_cdx(url=url, **params)
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
sys.stdout.write(cdx)
|
sys.stdout.write(cdx)
|
||||||
|
|
||||||
cdx_server = init_redis_server()
|
redis_cdx_server = init_redis_server()
|
||||||
|
redis_cdx_server_key = init_redis_server_key_file()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user