1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-21 11:02:10 +01:00
pywb/indexloader.py

160 lines
5.2 KiB
Python
Raw Normal View History

import redis
from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
from pywb.utils.canonicalize import calc_search_range
from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.cdxops import cdx_sort_closest, cdx_limit
import requests
from utils import MementoUtils
#=============================================================================
class BaseIndexSource(object):
    """Base class for index sources located through a string template.

    ``index_template`` is a ``str.format`` template which is filled in with
    the ``coll`` entry of the query params to obtain a concrete index
    location (a path, url or key, depending on the subclass).
    """

    def __init__(self, index_template=''):
        """Store the template later expanded by :meth:`get_index`.

        :param index_template: ``str.format`` template. The collection may
            be referenced positionally (``{0}`` / ``{}``) or by name
            (``{coll}``).
        """
        self.index_template = index_template

    def get_index(self, params):
        """Return the index location for the collection named in ``params``.

        :param params: mapping of query parameters; only ``coll`` is read.
            When it is absent, ``None`` is substituted into the template.
        :returns: ``index_template`` with the collection filled in.
        """
        coll = params.get('coll')
        # Supply the collection both positionally and by keyword so that
        # '{0}'-style templates keep working and '{coll}'-style templates
        # (the form MementoIndexSource formats its urls with) resolve too.
        return self.index_template.format(coll, coll=coll)
#=============================================================================
class FileIndexSource(BaseIndexSource):
    """Index source that reads entries from a local file.

    The file name is obtained by expanding the index template through
    ``get_index``.
    """

    def load_index(self, params):
        """Lazily yield one ``CDXObject`` per line returned by ``iter_range``.

        :param params: query parameters. ``start_key`` and ``end_key`` are
            handed to ``iter_range`` as the range bounds; the file name is
            resolved from ``params`` via ``get_index``.
        :returns: a generator of ``CDXObject`` instances.
        """
        index_path = self.get_index(params)

        # This is a generator, so the file remains open for as long as the
        # caller keeps consuming it; it is closed when the ``with`` exits.
        with open(index_path, 'rb') as index_file:
            for raw_line in iter_range(index_file,
                                       params['start_key'],
                                       params['end_key']):
                yield CDXObject(raw_line)
#=============================================================================
class RemoteIndexSource(BaseIndexSource):
    """Index source that queries a remote cdx endpoint over HTTP.

    Each returned line is parsed into a ``CDXObject`` and annotated with a
    ``load_url`` built from ``replay_url``.
    """

    def __init__(self, cdx_url, replay_url):
        """Configure the remote endpoint.

        :param cdx_url: template for the cdx query endpoint; expanded with
            the collection through ``get_index``.
        :param replay_url: ``str.format`` template receiving ``timestamp``
            and ``url`` keyword arguments for every returned entry.
        """
        self.index_template = cdx_url
        self.replay_url = replay_url

    def load_index(self, params):
        """Fetch the index for ``params['url']`` and yield its entries.

        :param params: query parameters; ``url`` is sent to the remote
            endpoint as the ``url`` query argument.
        :returns: a generator of ``CDXObject`` instances, each with an added
            ``load_url`` field.
        :raises KeyError: if ``params`` has no ``url`` entry.
        """
        api_url = self.get_index(params)

        # Let requests build the query string: the looked-up url routinely
        # contains '&', '#', '?' or spaces, which would corrupt the request
        # if it were concatenated verbatim onto the endpoint url.
        response = requests.get(api_url, params={'url': params['url']})

        for line in response.content.strip().split(b'\n'):
            # Splitting an empty body produces a single empty line; skip
            # blank lines instead of passing them on to CDXObject.
            if not line:
                continue

            cdx = CDXObject(line)
            cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'],
                                                     url=cdx['url'])
            yield cdx
#=============================================================================
class LiveIndexSource(BaseIndexSource):
    """Index source that fabricates a single entry for the requested url."""

    def load_index(self, params):
        """Return a generator yielding one ``CDXObject`` built from ``params``.

        The entry is assembled when this method is called, not when the
        generator is consumed: its ``timestamp`` is ``timestamp_now()`` at
        call time, its ``urlkey`` is the utf-8 decoded ``start_key`` and both
        ``url`` and ``load_url`` are ``params['url']``.

        :param params: query parameters; ``start_key`` (bytes) and ``url``
            are read.
        :returns: a generator producing exactly one entry.
        """
        entry = CDXObject()
        entry['urlkey'] = params.get('start_key').decode('utf-8')
        entry['timestamp'] = timestamp_now()
        # 'url' is assigned before 'load_url', same order as the fields
        # were originally populated.
        entry['url'] = entry['load_url'] = params['url']

        return (item for item in (entry,))
#=============================================================================
class RedisIndexSource(BaseIndexSource):
    """Index source backed by a redis sorted set queried by lexical range."""

    def __init__(self, redis_url):
        """Connect to redis and derive the sorted-set key template.

        :param redis_url: url of the form ``redis://host:port/db`` optionally
            followed by ``/key_template``. Everything after the db segment
            becomes the index template expanded by ``get_index``; the
            remaining part is the connection url given to redis.
        """
        # 'redis://host:port/db/key' -> ['redis:', '', 'host:port', 'db', 'key']
        # Split at most four times so that a key template which itself
        # contains '/' is kept whole instead of being cut at its first slash.
        parts = redis_url.split('/', 4)

        key_prefix = ''
        if len(parts) > 4:
            key_prefix = parts[4]
            # NOTE(review): the scheme is rebuilt as 'redis://' regardless of
            # the scheme that was passed in -- confirm whether other schemes
            # (e.g. 'rediss://') are expected to be supported here.
            redis_url = 'redis://' + parts[2] + '/' + parts[3]

        self.redis_url = redis_url
        self.index_template = key_prefix
        self.redis = redis.StrictRedis.from_url(redis_url)

    def load_index(self, params):
        """Yield a ``CDXObject`` for every member in the requested key range.

        :param params: query parameters; ``start_key`` and ``end_key`` must
            be bytes, as they are concatenated with bytes range markers.
        :returns: a generator of ``CDXObject`` instances.
        """
        z_key = self.get_index(params)

        # zrangebylex bounds: '[' marks an inclusive bound, '(' an exclusive
        # one, so the range is start_key <= member < end_key.
        index_list = self.redis.zrangebylex(z_key,
                                            b'[' + params['start_key'],
                                            b'(' + params['end_key'])

        for line in index_list:
            yield CDXObject(line)
#=============================================================================
class MementoIndexSource(BaseIndexSource):
    """Index source that queries a remote Memento timegate or timemap.

    A timegate is queried when the request carries a ``closest`` timestamp,
    otherwise the full timemap is fetched.
    """

    def __init__(self, timegate_url, timemap_url, replay_url):
        """Configure the remote endpoints.

        :param timegate_url: timegate url prefix; may contain a ``{coll}``
            placeholder. The requested url is appended to it.
        :param timemap_url: timemap url prefix; may contain a ``{coll}``
            placeholder. The requested url is appended to it.
        :param replay_url: ``str.format`` template receiving ``timestamp``
            and ``url`` keyword arguments for every returned entry.
        """
        self.timegate_url = timegate_url
        self.timemap_url = timemap_url
        self.replay_url = replay_url

    def make_iter(self, links, def_name):
        """Yield the entries parsed from ``links``, adding ``load_url`` to each.

        :param links: link data handed to ``MementoUtils.links_to_json``.
        :param def_name: name passed through to ``links_to_json``.
        """
        original, link_iter = MementoUtils.links_to_json(links, def_name)

        for cdx in link_iter():
            cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'],
                                                     url=original)
            yield cdx

    def load_timegate(self, params, closest):
        """Query the timegate for the capture closest to ``closest``.

        :param params: query parameters; ``coll`` and ``url`` are read.
        :param closest: timestamp converted with ``timestamp_to_http_date``
            and sent as the ``Accept-Datetime`` header.
        :returns: generator over the entries found in the ``Link`` header.
        """
        url = self.timegate_url.format(coll=params.get('coll')) + params['url']
        accept_dt = timestamp_to_http_date(closest)
        res = requests.head(url, headers={'Accept-Datetime': accept_dt})
        # NOTE(review): headers.get() gives None when the response has no
        # Link header -- confirm links_to_json copes with that.
        return self.make_iter(res.headers.get('Link'), 'timegate')

    def load_timemap(self, params):
        """Fetch the timemap for ``params['url']``.

        :param params: query parameters; ``coll`` and ``url`` are read.
        :returns: generator over the entries listed in the timemap body.
        """
        # Expand '{coll}' exactly as load_timegate does: from_timegate_url
        # derives the timemap url from the timegate url, so any placeholder
        # in one is present in the other and must not be sent literally.
        url = self.timemap_url.format(coll=params.get('coll')) + params['url']
        r = requests.get(url)
        return self.make_iter(r.text, 'timemap')

    def load_index(self, params):
        """Dispatch to the timegate or the timemap depending on ``closest``.

        :param params: query parameters; a truthy ``closest`` selects the
            timegate, otherwise the timemap is loaded.
        """
        closest = params.get('closest')
        if closest:
            return self.load_timegate(params, closest)

        return self.load_timemap(params)

    @staticmethod
    def from_timegate_url(timegate_url, type_='link'):
        """Build a source whose timemap and replay urls derive from the timegate.

        :param timegate_url: timegate url prefix, used as the base for the
            two other urls.
        :param type_: timemap type inserted into the timemap path.
        :returns: a ``MementoIndexSource`` using
            ``<timegate_url>timemap/<type_>/`` as timemap and
            ``<timegate_url>{timestamp}id_/{url}`` as replay template.
        """
        return MementoIndexSource(timegate_url,
                                  timegate_url + 'timemap/' + type_ + '/',
                                  timegate_url + '{timestamp}id_/{url}')
def query_index(source, params):
    """Run an index query against ``source`` and post-process the result.

    ``params`` is modified in place: a missing ``matchType`` is inferred
    from wildcards in ``url`` (and the wildcard is removed from ``url``),
    and ``start_key`` / ``end_key`` are set to the utf-8 encoded bounds
    computed by ``calc_search_range``.

    :param source: object exposing ``load_index(params)``.
    :param params: mutable mapping of query parameters. ``url``,
        ``matchType``, ``limit`` (default 10) and ``closest`` are read.
    :returns: the iterable from ``source.load_index``, passed through
        ``cdx_sort_closest`` when ``closest`` is set, else through
        ``cdx_limit`` when the limit is non-zero, else unchanged.
    """
    if not params.get('matchType'):
        requested = params.get('url', '')

        if requested.startswith('*.'):
            # '*.example.com' -> domain match on 'example.com'
            params['url'] = requested[2:]
            inferred_type = 'domain'
        elif requested.endswith('*'):
            # 'example.com/path*' -> prefix match on 'example.com/path'
            params['url'] = requested[:-1]
            inferred_type = 'prefix'
        else:
            inferred_type = 'exact'

        params['matchType'] = inferred_type

    range_start, range_end = calc_search_range(url=params['url'],
                                               match_type=params['matchType'])

    # Index sources work with bytes keys
    params['start_key'] = range_start.encode('utf-8')
    params['end_key'] = range_end.encode('utf-8')

    results = source.load_index(params)

    max_results = int(params.get('limit', 10))
    closest_ts = params.get('closest')

    if closest_ts:
        return cdx_sort_closest(closest_ts, results, max_results)

    if max_results:
        return cdx_limit(results, max_results)

    return results