1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

cdx: add prototype support for redis cdx source (needs testing)

This commit is contained in:
Ilya Kreymer 2014-02-24 11:02:28 -08:00
parent 9194e867ea
commit ef062fee7b
2 changed files with 35 additions and 4 deletions

View File

@ -1,7 +1,7 @@
from canonicalize import UrlCanonicalizer, calc_search_range from canonicalize import UrlCanonicalizer, calc_search_range
from cdxops import cdx_load from cdxops import cdx_load
from cdxsource import CDXSource, CDXFile, RemoteCDXSource from cdxsource import CDXSource, CDXFile, RemoteCDXSource, RedisCDXSource
from zipnum import ZipNumCluster from zipnum import ZipNumCluster
from cdxobject import CDXObject, CaptureNotFoundException, CDXException from cdxobject import CDXObject, CaptureNotFoundException, CDXException
from cdxdomainspecific import load_domain_specific_cdx_rules from cdxdomainspecific import load_domain_specific_cdx_rules
@ -206,6 +206,9 @@ def create_cdx_source(filename, config):
if is_http(filename): if is_http(filename):
return RemoteCDXSource(filename) return RemoteCDXSource(filename)
if filename.startswith('redis://'):
return RedisCDXSource(filename, config)
if filename.endswith('.cdx'): if filename.endswith('.cdx'):
return CDXFile(filename) return CDXFile(filename)
@ -213,9 +216,6 @@ def create_cdx_source(filename, config):
return ZipNumCluster(filename, config) return ZipNumCluster(filename, config)
return None return None
#TODO: support zipnum
#elif filename.startswith('redis://')
# return RedisCDXSource(filename)
#================================================================= #=================================================================

View File

@ -3,6 +3,7 @@ from pywb.utils.loaders import SeekableTextFileReader
import urllib import urllib
import urllib2 import urllib2
import redis
#================================================================= #=================================================================
@ -80,3 +81,33 @@ class RemoteCDXSource(CDXSource):
def __str__(self): def __str__(self):
return 'Remote CDX Server: ' + self.remote_url return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
    """
    CDX source which reads cdx lines from redis.

    The lines for a given url key are read with ZRANGE from the redis key
    formed by ``key_prefix + <url key>``. The stored members do not
    include the url key itself; it is prepended when loading.
    """
    # Prefix used for redis keys when the config does not override it
    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        """
        :param redis_url: redis connection url, passed to
            ``redis.StrictRedis.from_url``
        :param config: optional dict-like settings; ``redis_key_prefix``
            overrides DEFAULT_KEY_PREFIX when present
        """
        self.redis = redis.StrictRedis.from_url(redis_url)

        # Always set the attribute, so that load_cdx() also works when
        # no config (or an empty config) is supplied
        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix',
                                         self.key_prefix)

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported

        :param params: query params, ``params['key']`` is the lookup key
        :return: lazy iterator of cdx lines, each prefixed with the
            url key and a single space
        """
        # ensure only url/surt is part of key
        key = params['key'].split(' ')[0]

        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)

        # key is not part of list, so prepend to each line
        # (generator expression keeps this lazy without needing itertools)
        line_prefix = key + ' '
        return (line_prefix + line for line in cdx_list)