From 2b8bea616e6dbd58bbd13e151953f10638e3e4c8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 25 Apr 2014 10:52:35 -0700 Subject: [PATCH] when given a redis path of redis://HOST/DB/KEY, use KEY as a sorted cdx file with zrangebylex! modified tests but need zrangebylex() support in fakeredis to finish --- pywb/cdx/cdxsource.py | 33 ++++++++++++++++++++++++------ pywb/cdx/test/test_redis_source.py | 32 +++++++++++++++++++++++------ 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/pywb/cdx/cdxsource.py b/pywb/cdx/cdxsource.py index bf57209d..1bd3c158 100644 --- a/pywb/cdx/cdxsource.py +++ b/pywb/cdx/cdxsource.py @@ -94,22 +94,43 @@ class RedisCDXSource(CDXSource): def __init__(self, redis_url, config=None): import redis + + parts = redis_url.split('/') + if len(parts) > 4: + self.cdx_key = parts[4] + else: + self.cdx_key = None + self.redis_url = redis_url self.redis = redis.StrictRedis.from_url(redis_url) self.key_prefix = self.DEFAULT_KEY_PREFIX - if config: - self.key_prefix = config.get('redis_key_prefix', self.key_prefix) def load_cdx(self, query): """ Load cdx from redis cache, from an ordered list - Currently, there is no support for range queries - Only 'exact' matchType is supported - """ - key = query.key + If cdx_key is set, treat it as cdx file and load use + zrangebylex! (Supports all match types!) + Otherwise, assume a key per-url and load all entries for that key. 
+ (Only exact match supported) + """ + + if self.cdx_key: + return self.load_sorted_range(query) + else: + return self.load_single_key(query.key) + + def load_sorted_range(self, query): + cdx_list = self.redis.zrangebylex(self.cdx_key, + '[' + query.key, + '(' + query.end_key) + + return cdx_list + + + def load_single_key(self, key): # ensure only url/surt is part of key key = key.split(' ')[0] cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1) diff --git a/pywb/cdx/test/test_redis_source.py b/pywb/cdx/test/test_redis_source.py index e620811c..34abddf1 100644 --- a/pywb/cdx/test/test_redis_source.py +++ b/pywb/cdx/test/test_redis_source.py @@ -1,9 +1,12 @@ """ ->>> redis_cdx('http://example.com') +>>> redis_cdx(redis_cdx_server, 'http://example.com') com,example)/ 20130729195151 http://test@example.com/ warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 591 355 example-url-agnostic-revisit.warc.gz com,example)/ 20140127171200 http://example.com text/html 200 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 1046 334 dupes.warc.gz com,example)/ 20140127171251 http://example.com warc/revisit - B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A - - 553 11875 dupes.warc.gz + +>>> redis_cdx(redis_cdx_server_key, 'http://example.com') + """ from fakeredis import FakeStrictRedis @@ -21,13 +24,17 @@ import os test_cdx_dir = get_test_dir() + 'cdx/' -def load_cdx_into_redis(source, filename): +def load_cdx_into_redis(source, filename, key=None): # load a cdx into mock redis with open(test_cdx_dir + filename) as fh: for line in fh: - zadd_cdx(source, line) + zadd_cdx(source, line, key) + +def zadd_cdx(source, cdx, key): + if key: + source.redis.zadd(key, 0, cdx) + return -def zadd_cdx(source, cdx): parts = cdx.split(' ', 2) key = parts[0] @@ -49,9 +56,22 @@ def init_redis_server(): return CDXServer([source]) -def redis_cdx(url, **params): +@patch('redis.StrictRedis', FakeStrictRedis) +def init_redis_server_key_file(): + source = RedisCDXSource('redis://127.0.0.1:6379/0/key') + + for f in 
os.listdir(test_cdx_dir): + if f.endswith('.cdx'): + load_cdx_into_redis(source, f, source.cdx_key) + + return CDXServer([source]) + + +def redis_cdx(cdx_server, url, **params): cdx_iter = cdx_server.load_cdx(url=url, **params) for cdx in cdx_iter: sys.stdout.write(cdx) -cdx_server = init_redis_server() +redis_cdx_server = init_redis_server() +redis_cdx_server_key = init_redis_server_key_file() +