# Mirror of https://github.com/webrecorder/pywb.git
# Synced 2025-03-21 11:02:10 +01:00
import redis

from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
from pywb.utils.canonicalize import calc_search_range

from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.cdxops import cdx_sort_closest, cdx_limit

import requests

from utils import MementoUtils


#=============================================================================
class BaseIndexSource(object):
    def __init__(self, index_template=''):
        self.index_template = index_template

    def get_index(self, params):
        # Resolve the index location by formatting the template with the
        # requested collection name.
        return self.index_template.format(params.get('coll'))
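
# Usage note (sketch, hypothetical template): with an index_template such as
# 'collections/{0}/index.cdxj', get_index({'coll': 'pywb'}) would resolve to
# 'collections/pywb/index.cdxj'.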


#=============================================================================
class FileIndexSource(BaseIndexSource):
    def load_index(self, params):
        filename = self.get_index(params)

        # Binary search the sorted CDX(J) file for the requested key range.
        with open(filename, 'rb') as fh:
            gen = iter_range(fh, params['start_key'], params['end_key'])
            for line in gen:
                yield CDXObject(line)
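
# Example (sketch): reading from a local, sorted CDXJ index; the template path
# and query URL below are hypothetical.
#
#   source = FileIndexSource('collections/{0}/indexes/index.cdxj')
#   for cdx in query_index(source, {'url': 'http://example.com/', 'coll': 'test'}):
#       print(cdx)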


#=============================================================================
class RemoteIndexSource(BaseIndexSource):
    def __init__(self, cdx_url, replay_url):
        self.index_template = cdx_url
        self.replay_url = replay_url

    def load_index(self, params):
        url = self.get_index(params)
        url += '?url=' + params['url']
        r = requests.get(url)
        lines = r.content.strip().split(b'\n')
        for line in lines:
            cdx = CDXObject(line)
            # Point each result at the remote replay endpoint.
            cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'], url=cdx['url'])
            yield cdx
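
# Example (sketch): querying a remote CDX server and replaying through its
# 'id_' endpoint; the archive URLs below are illustrative, not part of this
# module.
#
#   source = RemoteIndexSource('http://web.archive.org/cdx/search/cdx',
#                              'http://web.archive.org/web/{timestamp}id_/{url}')
#   for cdx in query_index(source, {'url': 'http://example.com/'}):
#       print(cdx['load_url'])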


#=============================================================================
class LiveIndexSource(BaseIndexSource):
    def load_index(self, params):
        # Synthesize a single CDX entry pointing at the live URL itself.
        cdx = CDXObject()
        cdx['urlkey'] = params.get('start_key').decode('utf-8')
        cdx['timestamp'] = timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = params['url']

        def live():
            yield cdx

        return live()
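
# Example (sketch): a live source always yields exactly one entry whose
# load_url is the requested URL, stamped with the current time.
#
#   source = LiveIndexSource()
#   cdx = next(iter(query_index(source, {'url': 'http://example.com/'})))
#   assert cdx['load_url'] == 'http://example.com/'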


#=============================================================================
class RedisIndexSource(BaseIndexSource):
    def __init__(self, redis_url):
        # Split a 'redis://host:port/db/key_template' url into the connection
        # url and the sorted-set key template.
        parts = redis_url.split('/')
        key_prefix = ''
        if len(parts) > 4:
            key_prefix = parts[4]
            redis_url = 'redis://' + parts[2] + '/' + parts[3]

        self.redis_url = redis_url
        self.index_template = key_prefix
        self.redis = redis.StrictRedis.from_url(redis_url)

    def load_index(self, params):
        z_key = self.get_index(params)
        # Lexicographic range query over the sorted set: start key inclusive,
        # end key exclusive.
        index_list = self.redis.zrangebylex(z_key,
                                            b'[' + params['start_key'],
                                            b'(' + params['end_key'])

        for line in index_list:
            yield CDXObject(line)
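
# Example (sketch): reading CDX lines from a redis sorted set; the redis url
# and key template 'cdxj:{0}' are hypothetical.
#
#   source = RedisIndexSource('redis://localhost:6379/0/cdxj:{0}')
#   for cdx in query_index(source, {'url': 'http://example.com/', 'coll': 'test'}):
#       print(cdx)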


#=============================================================================
class MementoIndexSource(BaseIndexSource):
    def __init__(self, timegate_url, timemap_url, replay_url):
        self.timegate_url = timegate_url
        self.timemap_url = timemap_url
        self.replay_url = replay_url

    def make_iter(self, links, def_name):
        original, link_iter = MementoUtils.links_to_json(links, def_name)

        for cdx in link_iter():
            cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'], url=original)
            yield cdx

    def load_timegate(self, params, closest):
        url = self.timegate_url.format(coll=params.get('coll')) + params['url']
        accept_dt = timestamp_to_http_date(closest)
        res = requests.head(url, headers={'Accept-Datetime': accept_dt})
        return self.make_iter(res.headers.get('Link'), 'timegate')

    def load_timemap(self, params):
        url = self.timemap_url + params['url']
        r = requests.get(url)
        return self.make_iter(r.text, 'timemap')

    def load_index(self, params):
        # With a 'closest' timestamp, negotiate via the timegate; otherwise
        # fetch the full timemap.
        closest = params.get('closest')
        if not closest:
            return self.load_timemap(params)
        else:
            return self.load_timegate(params, closest)

    @staticmethod
    def from_timegate_url(timegate_url, type_='link'):
        return MementoIndexSource(timegate_url,
                                  timegate_url + 'timemap/' + type_ + '/',
                                  timegate_url + '{timestamp}id_/{url}')
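
# Example (sketch): building a Memento source from a timegate endpoint; the
# archive URL below is hypothetical.
#
#   source = MementoIndexSource.from_timegate_url('http://archive.example.org/wayback/')
#   for cdx in query_index(source, {'url': 'http://example.com/', 'closest': '20150101000000'}):
#       print(cdx['timestamp'], cdx['load_url'])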


def query_index(source, params):
    url = params.get('url', '')

    # Infer matchType from wildcard syntax if not given explicitly:
    # '*.example.com' -> domain query, 'http://example.com/path*' -> prefix.
    if not params.get('matchType'):
        if url.startswith('*.'):
            params['url'] = url[2:]
            params['matchType'] = 'domain'
        elif url.endswith('*'):
            params['url'] = url[:-1]
            params['matchType'] = 'prefix'
        else:
            params['matchType'] = 'exact'

    start, end = calc_search_range(url=params['url'],
                                   match_type=params['matchType'])

    params['start_key'] = start.encode('utf-8')
    params['end_key'] = end.encode('utf-8')

    res = source.load_index(params)

    limit = int(params.get('limit', 10))
    closest = params.get('closest')
    if closest:
        res = cdx_sort_closest(closest, res, limit)
    elif limit:
        res = cdx_limit(res, limit)

    return res
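

# Minimal demo sketch, not part of the original module: synthesize a single
# 'live' index entry for a hypothetical URL and print it. LiveIndexSource
# needs no network or disk access, so this is safe to run as-is.
if __name__ == '__main__':
    live_source = LiveIndexSource()
    for cdx in query_index(live_source, {'url': 'http://example.com/'}):
        print(cdx)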