1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

When given a redis path of redis://<host>/<db>/<key>, use <key> as a sorted cdx file queried with zrangebylex!

modified tests but need zrangebylex() support in fakeredis to finish
This commit is contained in:
Ilya Kreymer 2014-04-25 10:52:35 -07:00
parent e4262502b0
commit 2b8bea616e
2 changed files with 53 additions and 12 deletions

View File

@ -94,22 +94,43 @@ class RedisCDXSource(CDXSource):
def __init__(self, redis_url, config=None):
    """
    Connect to redis and decide how cdx entries are looked up.

    :param redis_url: either ``redis://<host>/<db>`` or
        ``redis://<host>/<db>/<key>``. When the optional ``<key>``
        segment is present it is stored as ``self.cdx_key``;
        otherwise ``self.cdx_key`` is None.
    :param config: optional dict; its ``redis_key_prefix`` entry,
        if present, overrides ``DEFAULT_KEY_PREFIX``.
    """
    import redis

    parts = redis_url.split('/')
    if len(parts) > 4:
        self.cdx_key = parts[4]
        # redis-py's from_url() derives the db number from the url
        # path, so the trailing <key> segment must not be passed
        # along: connect using redis://<host>/<db> only.
        connect_url = '/'.join(parts[:4])
    else:
        self.cdx_key = None
        connect_url = redis_url

    # Keep the url exactly as it was supplied by the caller.
    self.redis_url = redis_url
    self.redis = redis.StrictRedis.from_url(connect_url)

    self.key_prefix = self.DEFAULT_KEY_PREFIX
    if config:
        self.key_prefix = config.get('redis_key_prefix', self.key_prefix)
def load_cdx(self, query):
"""
Load cdx from redis cache, from an ordered list
Currently, there is no support for range queries
Only 'exact' matchType is supported
"""
key = query.key
If cdx_key is set, treat it as a sorted cdx file and load it using
zrangebylex! (Supports all match types!)
Otherwise, assume a key per-url and load all entries for that key.
(Only exact match supported)
"""
if self.cdx_key:
return self.load_sorted_range(query)
else:
return self.load_single_key(query.key)
def load_sorted_range(self, query):
    """
    Fetch a range of cdx lines from the sorted set ``self.cdx_key``.

    The range is expressed in zrangebylex syntax: it starts at
    ``query.key`` (prefixed with '[') and stops at ``query.end_key``
    (prefixed with '(').

    :param query: object providing ``key`` and ``end_key`` strings
    :return: whatever ``self.redis.zrangebylex`` returns for the range
    """
    range_start = '[' + query.key
    range_end = '(' + query.end_key
    return self.redis.zrangebylex(self.cdx_key, range_start, range_end)
def load_single_key(self, key):
# ensure only url/surt is part of key
key = key.split(' ')[0]
cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)

View File

@ -1,9 +1,12 @@
"""
>>> redis_cdx('http://example.com')
>>> redis_cdx(redis_cdx_server, 'http://example.com')
com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz
com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz
com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz
>>> redis_cdx(redis_cdx_server_key, 'http://example.com')
"""
from fakeredis import FakeStrictRedis
@ -21,13 +24,17 @@ import os
test_cdx_dir = get_test_dir() + 'cdx/'
def load_cdx_into_redis(source, filename):
def load_cdx_into_redis(source, filename, key=None):
# load a cdx into mock redis
with open(test_cdx_dir + filename) as fh:
for line in fh:
zadd_cdx(source, line)
zadd_cdx(source, line, key)
def zadd_cdx(source, cdx, key):
if key:
source.redis.zadd(key, 0, cdx)
return
def zadd_cdx(source, cdx):
parts = cdx.split(' ', 2)
key = parts[0]
@ -49,9 +56,22 @@ def init_redis_server():
return CDXServer([source])
def redis_cdx(url, **params):
@patch('redis.StrictRedis', FakeStrictRedis)
def init_redis_server_key_file():
    """
    Build a CDXServer backed by a RedisCDXSource whose url carries a
    key segment, after loading every '.cdx' file found in
    test_cdx_dir into that source under source.cdx_key.
    """
    source = RedisCDXSource('redis://127.0.0.1:6379/0/key')

    cdx_files = (name for name in os.listdir(test_cdx_dir)
                 if name.endswith('.cdx'))
    for name in cdx_files:
        load_cdx_into_redis(source, name, source.cdx_key)

    return CDXServer([source])
def redis_cdx(cdx_server, url, **params):
    """
    Query cdx_server for url (forwarding any extra keyword params)
    and write each returned cdx line to stdout as it is produced.
    """
    write = sys.stdout.write
    results = cdx_server.load_cdx(url=url, **params)
    for line in results:
        write(line)
cdx_server = init_redis_server()
redis_cdx_server = init_redis_server()
redis_cdx_server_key = init_redis_server_key_file()