1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

cdx: add prototype support for redis cdx source (needs testing)

This commit is contained in:
Ilya Kreymer 2014-02-24 11:02:28 -08:00
parent 9194e867ea
commit ef062fee7b
2 changed files with 35 additions and 4 deletions

View File

@ -1,7 +1,7 @@
from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource
from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules
@ -206,6 +206,9 @@ def create_cdx_source(filename, config):
if is_http(filename):
return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'):
return CDXFile(filename)
@ -213,9 +216,6 @@ def create_cdx_source(filename, config):
return ZipNumCluster(filename, config)
return None
#TODO: support zipnum
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#=================================================================

View File

@ -3,6 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader
import urllib
import urllib2
import redis
#=================================================================
@ -80,3 +81,33 @@ class RemoteCDXSource(CDXSource):
def __str__(self):
return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
    """
    CDX source which reads cdx lines from redis.

    The lines for a given url key are read from the redis sorted set
    stored under ``key_prefix + key``.
    """
    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        """
        :param redis_url: redis connection url, passed as-is to
            ``redis.StrictRedis.from_url()``
        :param config: optional dict-like settings; if it contains
            'redis_key_prefix', that value replaces DEFAULT_KEY_PREFIX
        """
        self.redis = redis.StrictRedis.from_url(redis_url)

        # Always initialize the prefix: load_cdx() reads self.key_prefix
        # unconditionally, so it must exist even when no config is given.
        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix',
                                         self.DEFAULT_KEY_PREFIX)

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported

        :param params: query params; only params['key'] is used
        :return: lazy iterator of cdx lines, each prefixed with the key
        """
        # ensure only url/surt is part of key
        key = params['key'].split(' ')[0]

        # 0, -1 selects every member of the sorted set
        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)

        # key is not part of list, so prepend to each line
        line_prefix = key + ' '

        # generator expression keeps the result lazy without needing
        # an itertools import in this module
        return (line_prefix + cdx for cdx in cdx_list)