1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
pywb/pywb/cdx/cdxsource.py

114 lines
3.3 KiB
Python

from pywb.utils.binsearch import iter_range
from pywb.utils.loaders import SeekableTextFileReader
import urllib
import urllib2
import redis
#=================================================================
class CDXSource(object):
    """
    Abstract base for any cdx index source.

    Concrete sources override load_cdx() to supply cdx lines.
    """

    def load_cdx(self, params):
        """
        Placeholder that every subclass is expected to override.

        Always raises NotImplementedError in this base class.
        """
        message = 'Implement in subclass'
        raise NotImplementedError(message)
#=================================================================
class CDXFile(CDXSource):
    """
    A cdx index source backed by a local plain-text .cdx file
    """

    def __init__(self, filename):
        # Path of the .cdx file; it is opened on every load_cdx() call.
        self.filename = filename

    def load_cdx(self, params):
        """
        Open the cdx file and return the result of iter_range() over it,
        bounded by the 'key' and 'end_key' entries of params
        (either may be absent, in which case None is passed).
        """
        reader = SeekableTextFileReader(self.filename)
        start_key = params.get('key')
        end_key = params.get('end_key')
        return iter_range(reader, start_key, end_key)

    def __str__(self):
        label = 'CDX File - '
        return label + self.filename
#=================================================================
class RemoteCDXSource(CDXSource):
    """
    Represents a remote cdx server, to which requests will be proxied.

    When proxy_all is set (the default), every param is forwarded to the
    remote server together with a 'proxyAll' flag.
    Otherwise only the url and matchType params are sent, and the caller
    is expected to apply any other filters to the returned stream locally.
    """

    def __init__(self, filename, cookie=None, proxy_all=True):
        """
        filename  -- url of the remote cdx server
        cookie    -- optional value sent as the 'Cookie' request header
        proxy_all -- if true, forward all params instead of url/matchType only
        """
        self.remote_url = filename
        self.cookie = cookie
        self.proxy_all = proxy_all

    def load_cdx(self, proxy_params):
        """
        Query the remote cdx server and return an iterator over the lines
        of its response.

        Note: when proxy_all is set, proxy_params itself is updated in place
        with 'proxyAll' = True, so the caller can observe the flag.

        Raises AccessException if the remote server answers 403;
        any other HTTPError is re-raised unchanged.
        Raises KeyError if proxy_all is not set and 'url' is missing.
        """
        if self.proxy_all:
            # Deliberately the same dict object: the 'proxyAll' flag is
            # visible to the caller after this call.
            params = proxy_params
            params['proxyAll'] = True
        else:
            # Only send url and matchType params to remote
            params = {'url': proxy_params['url']}
            match_type = proxy_params.get('matchType')
            if match_type:
                # Must go into the outgoing params; writing it back to
                # proxy_params would be a no-op and matchType would never
                # reach the remote server.
                params['matchType'] = match_type

        # doseq=True: sequence values are encoded as repeated params
        urlparams = urllib.urlencode(params, True)

        try:
            # Passing a data argument makes this a POST request
            request = urllib2.Request(self.remote_url, urlparams)

            if self.cookie:
                request.add_header('Cookie', self.cookie)

            response = urllib2.urlopen(request)

        except urllib2.HTTPError as e:
            if e.code == 403:
                exc_msg = e.read()
                msg = ('Blocked By Robots' if 'Blocked By Robots' in exc_msg
                       else 'Excluded')
                # NOTE(review): AccessException is not imported in the
                # visible import block of this file -- confirm it is in scope.
                raise AccessException(msg)
            else:
                raise

        return iter(response)

    def __str__(self):
        return 'Remote CDX Server: ' + self.remote_url
#=================================================================
class RedisCDXSource(CDXSource):
    """
    Represents a cdx index stored in redis, one sorted set per url key.
    """

    # Prefix prepended to the url key to form the redis key,
    # unless overridden via config['redis_key_prefix'].
    DEFAULT_KEY_PREFIX = 'c:'

    def __init__(self, redis_url, config=None):
        """
        redis_url -- url passed to redis.StrictRedis.from_url()
        config    -- optional dict; 'redis_key_prefix' overrides the
                     default key prefix
        """
        self.redis = redis.StrictRedis.from_url(redis_url)

        # Always set the prefix: previously it was only assigned when a
        # config was supplied, so load_cdx() failed with AttributeError
        # for the default config=None.
        self.key_prefix = self.DEFAULT_KEY_PREFIX
        if config:
            self.key_prefix = config.get('redis_key_prefix',
                                         self.DEFAULT_KEY_PREFIX)

    def load_cdx(self, params):
        """
        Load cdx from redis cache, from an ordered list

        Currently, there is no support for range queries
        Only 'exact' matchType is supported

        Returns a lazy iterator of cdx lines, each prefixed with the key
        and a single space.
        """
        key = params['key']

        # ensure only url/surt is part of key
        key = key.split(' ')[0]

        # 0 .. -1 requests every member of the sorted set
        cdx_list = self.redis.zrange(self.key_prefix + key, 0, -1)

        # key is not part of list, so prepend to each line
        # (generator expression: lazy like itertools.imap, but needs no
        # itertools import, which the visible import block lacks)
        line_prefix = key + ' '
        return (line_prefix + line for line in cdx_list)