1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-21 19:12:10 +01:00
pywb/indexsource.py

198 lines
6.1 KiB
Python
Raw Normal View History

import redis
from pywb.utils.binsearch import iter_range
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
2016-02-22 13:30:12 -08:00
from pywb.utils.canonicalize import canonicalize, calc_search_range
from pywb.utils.wbexception import NotFoundException
from pywb.cdx.cdxobject import CDXObject
2016-02-22 13:30:12 -08:00
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxops import process_cdx
import requests
from utils import MementoUtils
#=============================================================================
class BaseIndexSource(object):
    """Common base for index sources.

    Subclasses provide ``load_index(params)``, which returns an iterable of
    index records.  Calling an instance runs a full query: the params are
    wrapped in a ``CDXQuery``, the index is loaded, and the records are passed
    through ``process_cdx``.
    """

    def __init__(self, index_template=''):
        # Format string expanded by get_index() with the collection name as
        # its single positional argument.
        self.index_template = index_template

    def get_index(self, params):
        """Return the index template expanded with ``params.get('coll')``."""
        coll = params.get('coll')
        return self.index_template.format(coll)

    def __call__(self, params):
        """Run a query described by ``params`` and return the processed records.

        A ``NotFoundException`` raised by ``load_index`` is treated as an
        empty index rather than an error.
        """
        query = CDXQuery(**params)

        try:
            records = self.load_index(query.params)
        except NotFoundException:
            records = iter([])

        return process_cdx(records, query)
#=============================================================================
class FileIndexSource(BaseIndexSource):
    """Index source that reads records from a local index file."""

    def load_index(self, params):
        """Yield a ``CDXObject`` for each line in the requested key range.

        The file path comes from ``get_index(params)``; the file is opened in
        binary mode and ``iter_range`` selects the lines between
        ``params['key']`` and ``params['end_key']``.  This is a generator, so
        the file is only opened once iteration starts and stays open until
        the generator is exhausted or closed.
        """
        path = self.get_index(params)

        with open(path, 'rb') as index_file:
            matching = iter_range(index_file, params['key'], params['end_key'])
            for raw_line in matching:
                yield CDXObject(raw_line)
#=============================================================================
class RemoteIndexSource(BaseIndexSource):
    """Index source backed by a remote index endpoint queried over HTTP.

    ``cdx_url`` is stored as the index template (expanded by ``get_index``).
    ``replay_url`` is a format string with ``timestamp`` and ``url`` keyword
    fields, used to compute the ``load_url`` of every returned record.
    """

    def __init__(self, cdx_url, replay_url):
        self.index_template = cdx_url
        self.replay_url = replay_url

    def load_index(self, params):
        """Fetch the index records for ``params['url']`` from the remote endpoint.

        Returns a generator of ``CDXObject`` records, each with ``load_url``
        filled in from the replay template.

        Raises ``NotFoundException`` when the endpoint answers with a status
        code of 400 or above.
        """
        endpoint = self.get_index(params)

        # Have requests build the query string: the looked-up url is then
        # percent-encoded, so '&', '#' or '?' inside it can no longer be
        # mistaken for part of the index request itself.
        r = requests.get(endpoint, params={'url': params['url']})

        if r.status_code >= 400:
            raise NotFoundException(r.url)

        lines = r.content.strip().split(b'\n')

        def do_load(lines):
            for line in lines:
                # An empty body splits into a single empty line; there is no
                # record to build from it.
                if not line:
                    continue

                cdx = CDXObject(line)
                cdx['load_url'] = self.replay_url.format(
                    timestamp=cdx['timestamp'],
                    url=cdx['url'])
                yield cdx

        return do_load(lines)
#=============================================================================
class LiveIndexSource(BaseIndexSource):
    """Index source that synthesizes one record for the requested url."""

    def load_index(self, params):
        """Return a generator yielding a single record built from ``params``.

        The record is stamped with ``timestamp_now()``, uses ``params['url']``
        as both ``url`` and ``load_url``, and is flagged with ``is_live``.
        The record is built eagerly; only the yielding is deferred.
        """
        record = CDXObject()

        # Insertion order is kept as-is in case the record type is ordered.
        fields = (
            ('urlkey', params.get('key').decode('utf-8')),
            ('timestamp', timestamp_now()),
            ('url', params['url']),
            ('load_url', params['url']),
            ('is_live', True),
        )
        for name, value in fields:
            record[name] = value

        return (item for item in (record,))
#=============================================================================
class RedisIndexSource(BaseIndexSource):
    """Index source that reads records from a Redis sorted set.

    ``redis_url`` has the form ``<scheme>://<host>/<db>[/<key template>]``.
    Everything after the database segment is used as the key template
    (expanded by ``get_index``) and is stripped from the url handed to the
    Redis client.
    """

    def __init__(self, redis_url):
        # Split at most four times so that a key template which itself
        # contains '/' is kept whole instead of being cut at its first slash.
        parts = redis_url.split('/', 4)

        key_prefix = ''
        if len(parts) > 4:
            key_prefix = parts[4]
            # Rebuild the connection url with the scheme the caller supplied
            # (parts[0] is e.g. 'redis:' or 'rediss:') rather than forcing
            # 'redis://', which would silently drop a TLS scheme.
            redis_url = parts[0] + '//' + parts[2] + '/' + parts[3]

        self.redis_url = redis_url
        self.index_template = key_prefix
        self.redis = redis.StrictRedis.from_url(redis_url)

    def load_index(self, params):
        """Yield a ``CDXObject`` for each sorted-set member in the key range.

        The range is lexicographic: inclusive of ``params['key']`` and
        exclusive of ``params['end_key']``.  Both are concatenated with bytes
        literals, so they must be bytes.
        """
        z_key = self.get_index(params)

        index_list = self.redis.zrangebylex(z_key,
                                            b'[' + params['key'],
                                            b'(' + params['end_key'])

        for line in index_list:
            yield CDXObject(line)
#=============================================================================
class MementoIndexSource(BaseIndexSource):
    """Index source that builds records from timegate / timemap link data.

    ``timegate_url`` and ``timemap_url`` are url prefixes to which the
    requested url is appended; a ``{coll}`` placeholder in either is replaced
    with ``params.get('coll')``.  ``replay_url`` is a format string with
    ``timestamp`` and ``url`` keyword fields used to compute each record's
    ``load_url``.
    """

    def __init__(self, timegate_url, timemap_url, replay_url):
        self.timegate_url = timegate_url
        self.timemap_url = timemap_url
        self.replay_url = replay_url

    def links_to_cdxobject(self, link_header, def_name, sort=False):
        """Yield one ``CDXObject`` per dated memento found in ``link_header``.

        ``link_header`` and ``def_name`` are passed straight to
        ``MementoUtils.parse_links``.  Entries without a ``datetime`` value
        are skipped.  With ``sort=True`` the records are yielded ordered by
        their converted timestamp.
        """
        results = MementoUtils.parse_links(link_header, def_name)

        original = results['original']['url']
        key = canonicalize(original)

        dated = ((http_date_to_timestamp(val['datetime']), val)
                 for val in results['mementos']
                 if val.get('datetime'))

        if sort:
            # Sort on the converted timestamp: the entries are dict-like and
            # have no ordering of their own on Python 3.
            # NOTE(review): assumes timestamps from http_date_to_timestamp
            # compare chronologically -- confirm.
            dated = sorted(dated, key=lambda pair: pair[0])

        for ts, val in dated:
            cdx = CDXObject()
            cdx['urlkey'] = key
            cdx['timestamp'] = ts
            cdx['url'] = original
            cdx['mem_rel'] = val.get('rel', '')
            cdx['memento_url'] = val['url']

            cdx['load_url'] = self.replay_url.format(timestamp=ts,
                                                     url=original)
            yield cdx

    def get_timegate_links(self, params, closest):
        """HEAD the timegate for ``params['url']`` and return its Link header.

        ``closest`` is converted to an HTTP date and sent as
        ``Accept-Datetime``.  Returns ``None`` when the response carries no
        ``Link`` header.

        Raises ``NotFoundException`` on a status code of 400 or above.
        """
        url = self.timegate_url.format(coll=params.get('coll')) + params['url']
        accept_dt = timestamp_to_http_date(closest)
        res = requests.head(url, headers={'Accept-Datetime': accept_dt})

        if res.status_code >= 400:
            raise NotFoundException(url)

        return res.headers.get('Link')

    def get_timemap_links(self, params):
        """GET the timemap for ``params['url']`` and return the response text.

        Raises ``NotFoundException`` on a status code of 400 or above.
        """
        # Expand '{coll}' exactly as get_timegate_links does; a timemap url
        # derived by from_timegate_url carries the same placeholder.
        url = self.timemap_url.format(coll=params.get('coll')) + params['url']

        res = requests.get(url)
        if res.status_code >= 400:
            raise NotFoundException(url)

        return res.text

    def load_index(self, params):
        """Return an iterator of records for ``params['url']``.

        When ``params`` has a truthy ``closest`` value the timegate is
        queried, otherwise the timemap is fetched.  The HTTP request happens
        eagerly, so ``NotFoundException`` is raised from this call rather
        than during iteration.
        """
        closest = params.get('closest')

        if not closest:
            links = self.get_timemap_links(params)
            def_name = 'timemap'
        else:
            links = self.get_timegate_links(params, closest)
            def_name = 'timegate'

        # No Link header / empty timemap body: there is nothing to parse, so
        # report an empty index instead of handing None or '' to the parser.
        if not links:
            return iter([])

        return self.links_to_cdxobject(links, def_name)

    @staticmethod
    def from_timegate_url(timegate_url, path='link'):
        """Build a source whose timemap and replay urls derive from the timegate.

        The timemap url is ``timegate_url + 'timemap/' + path + '/'`` and the
        replay template is ``timegate_url + '{timestamp}id_/{url}'``.
        """
        return MementoIndexSource(timegate_url,
                                  timegate_url + 'timemap/' + path + '/',
                                  timegate_url + '{timestamp}id_/{url}')