2016-02-19 17:25:54 -08:00
|
|
|
import redis
|
|
|
|
|
|
|
|
from pywb.utils.binsearch import iter_range
|
|
|
|
from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
|
2016-03-08 10:27:13 -08:00
|
|
|
from pywb.utils.timeutils import timestamp_now
|
|
|
|
from pywb.utils.canonicalize import canonicalize
|
2016-02-22 13:30:12 -08:00
|
|
|
from pywb.utils.wbexception import NotFoundException
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
from pywb.cdx.cdxobject import CDXObject
|
|
|
|
|
2016-03-03 11:55:43 -08:00
|
|
|
from webagg.liverec import patched_requests as requests
|
2016-02-19 17:25:54 -08:00
|
|
|
|
2016-03-05 16:49:26 -08:00
|
|
|
from webagg.utils import ParamFormatter, res_template
|
2016-03-03 11:55:43 -08:00
|
|
|
from webagg.utils import MementoUtils
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
|
2016-02-29 12:34:06 -08:00
|
|
|
# Template suffix appended to a base URL to build a replay URL; the
# `{timestamp}` and `{url}` placeholders are filled in later via str.format().
# NOTE(review): `id_` is presumably the wayback modifier for the original,
# unrewritten capture -- confirm against the target server.
WAYBACK_ORIG_SUFFIX = '{timestamp}id_/{url}'
|
|
|
|
|
|
|
|
|
2016-02-19 17:25:54 -08:00
|
|
|
#=============================================================================
|
|
|
|
class BaseIndexSource(object):
    """Base class for index sources.

    Subclasses are expected to override :meth:`load_index`.
    """

    def load_index(self, params): #pragma: no cover
        """Load index entries for the query described by ``params``.

        :param params: query parameters (a dict-like object)
        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is a comparison sentinel, not an exception, and is
        # not callable; NotImplementedError is the exception meant here.
        raise NotImplementedError()
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class FileIndexSource(BaseIndexSource):
    """Index source that reads entries from a local index file.

    The filename given to the constructor is treated as a template and is
    expanded with ``res_template`` against the query params on every load.
    """

    def __init__(self, filename):
        self.filename_template = filename

    @staticmethod
    def _iter_records(index_file, params):
        """Yield one CDXObject per line between ``key`` and ``end_key``.

        The file is closed when iteration ends or the generator is closed.
        """
        with index_file:
            key_range = iter_range(index_file,
                                   params['key'],
                                   params['end_key'])
            for raw_line in key_range:
                yield CDXObject(raw_line)

    def load_index(self, params):
        """Open the expanded filename and return a lazy iterator of records.

        :param params: query params; ``key`` and ``end_key`` bound the range
        :raises NotFoundException: if the file can not be opened
        """
        path = res_template(self.filename_template, params)

        # Open eagerly so a missing file is reported at call time rather
        # than on first iteration.
        try:
            index_file = open(path, 'rb')
        except IOError:
            raise NotFoundException(path)

        return self._iter_records(index_file, params)

    def __str__(self):
        return 'file'
|
2016-02-24 14:22:29 -08:00
|
|
|
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class RemoteIndexSource(BaseIndexSource):
    """Index source that queries a remote index API over HTTP.

    :param api_url: URL template for the index query, expanded with
        ``res_template`` against the query params
    :param replay_url: format string with ``{timestamp}`` and ``{url}``
        placeholders, used to build the per-record load url
    :param url_field: name of the record field that receives the load url
    """

    def __init__(self, api_url, replay_url, url_field='load_url'):
        self.api_url_template = api_url
        self.replay_url = replay_url
        self.url_field = url_field

    def load_index(self, params):
        """Fetch the remote index and return a lazy iterator of records.

        :param params: query params; ``_timeout`` (optional) is passed as
            the request timeout
        :raises NotFoundException: if the response status is 400 or above
        """
        api_url = res_template(self.api_url_template, params)
        r = requests.get(api_url, timeout=params.get('_timeout'))
        if r.status_code >= 400:
            raise NotFoundException(api_url)

        lines = r.content.strip().split(b'\n')

        def do_load(lines):
            for line in lines:
                # An empty body splits into a single empty line; skip blank
                # lines so no record is built from them.
                if not line.strip():
                    continue

                cdx = CDXObject(line)
                self._set_load_url(cdx)
                yield cdx

        return do_load(lines)

    def _set_load_url(self, cdx):
        """Store the replay url for ``cdx`` in its ``url_field`` entry."""
        cdx[self.url_field] = self.replay_url.format(
                                 timestamp=cdx['timestamp'],
                                 url=cdx['url'])

    def __str__(self):
        return 'remote'
|
|
|
|
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class LiveIndexSource(BaseIndexSource):
    """Index source that produces a single record marked as live.

    :param proxy_url: template for the record's ``load_url``, expanded with
        ``res_template`` against the query params
    """

    def __init__(self, proxy_url='{url}'):
        self.proxy_url = proxy_url

    def load_index(self, params):
        """Return an iterator yielding one record stamped with the current time.

        :param params: query params; ``key`` (bytes) and ``url`` are read
        """
        record = CDXObject()

        # Assigned in this order so the field order of the record matches
        # the order of the pairs below.
        field_values = (
            ('urlkey', params.get('key').decode('utf-8')),
            ('timestamp', timestamp_now()),
            ('url', params['url']),
            ('load_url', res_template(self.proxy_url, params)),
            ('is_live', 'true'),
        )
        for field, value in field_values:
            record[field] = value

        def single():
            yield record

        return single()

    def __str__(self):
        return 'live'
|
|
|
|
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class RedisIndexSource(BaseIndexSource):
    """Index source that reads entries from a redis sorted set.

    ``redis_url`` is split on ``/``. When it has more than four parts, the
    fifth part becomes the key template and the connection url is reduced
    to ``redis://<part 3>/<part 4>``; otherwise the url is used unchanged
    and the key template is empty.
    """

    def __init__(self, redis_url):
        segments = redis_url.split('/')
        has_key = len(segments) > 4

        key_template = segments[4] if has_key else ''
        if has_key:
            redis_url = 'redis://{0}/{1}'.format(segments[2], segments[3])

        self.redis_url = redis_url
        self.redis_key_template = key_template
        self.redis = redis.StrictRedis.from_url(redis_url)

    @staticmethod
    def _to_cdx(raw_lines):
        """Lazily wrap each raw index line in a CDXObject."""
        for raw in raw_lines:
            yield CDXObject(raw)

    def load_index(self, params):
        """Query the sorted set by lexical range and return a record iterator.

        :param params: query params; ``key`` and ``end_key`` (bytes) bound
            the range, prefixed with ``[`` and ``(`` respectively
        """
        z_key = res_template(self.redis_key_template, params)

        lower_bound = b'[' + params['key']
        upper_bound = b'(' + params['end_key']

        raw_lines = self.redis.zrangebylex(z_key, lower_bound, upper_bound)
        return self._to_cdx(raw_lines)

    def __str__(self):
        return 'redis'
|
|
|
|
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
#=============================================================================
|
|
|
|
class MementoIndexSource(BaseIndexSource):
    """Index source backed by a Memento timegate / timemap endpoint.

    :param timegate_url: URL template for the timegate request
    :param timemap_url: URL template for the timemap request
    :param replay_url: format string with ``{timestamp}`` and ``{url}``
        placeholders, used to build each record's ``load_url``
    """

    def __init__(self, timegate_url, timemap_url, replay_url):
        self.timegate_url = timegate_url
        self.timemap_url = timemap_url
        self.replay_url = replay_url

    def links_to_cdxobject(self, link_header, def_name):
        """Yield a CDXObject for every memento in the parsed links.

        :param link_header: link-format text (header value or timemap body)
        :param def_name: default name passed to ``MementoUtils.parse_links``
        """
        results = MementoUtils.parse_links(link_header, def_name)

        original = results['original']['url']
        key = canonicalize(original)

        for val in results['mementos']:
            ts = http_date_to_timestamp(val['datetime'])

            cdx = CDXObject()
            cdx['urlkey'] = key
            cdx['timestamp'] = ts
            cdx['url'] = original
            cdx['mem_rel'] = val.get('rel', '')
            cdx['memento_url'] = val['url']
            cdx['load_url'] = self.replay_url.format(timestamp=ts,
                                                     url=original)
            yield cdx

    def get_timegate_links(self, params, closest):
        """Issue a HEAD request to the timegate and return its Link header.

        :param params: query params; ``_timeout`` (optional) is passed as
            the request timeout
        :param closest: timestamp sent as the ``Accept-Datetime`` header
        :raises NotFoundException: if the response status is 400 or above,
            or the response carries no ``Link`` header
        """
        url = res_template(self.timegate_url, params)
        accept_dt = timestamp_to_http_date(closest)

        # Honour the caller's timeout, as the timemap request does.
        res = requests.head(url,
                            headers={'Accept-Datetime': accept_dt},
                            timeout=params.get('_timeout'))
        if res.status_code >= 400:
            raise NotFoundException(url)

        links = res.headers.get('Link')
        # Without a Link header there is nothing to parse; report it the
        # same way as the other failure paths instead of returning None.
        if not links:
            raise NotFoundException(url)

        return links

    def get_timemap_links(self, params):
        """Fetch the timemap and return the response body as text.

        :param params: query params; ``_timeout`` (optional) is passed as
            the request timeout
        :raises NotFoundException: if the response status is 400 or above
        """
        url = res_template(self.timemap_url, params)
        res = requests.get(url, timeout=params.get('_timeout'))
        if res.status_code >= 400:
            raise NotFoundException(url)

        return res.text

    def load_index(self, params):
        """Return a lazy iterator of records from the timegate or timemap.

        The timegate is used when ``params['closest']`` is set; otherwise
        the full timemap is fetched.
        """
        closest = params.get('closest')

        if closest:
            links = self.get_timegate_links(params, closest)
            def_name = 'timegate'
        else:
            links = self.get_timemap_links(params)
            def_name = 'timemap'

        return self.links_to_cdxobject(links, def_name)

    @staticmethod
    def from_timegate_url(timegate_url, path='link'):
        """Build a MementoIndexSource from a single base url.

        The timegate, timemap and replay templates are all derived by
        appending to ``timegate_url``.
        """
        return MementoIndexSource(timegate_url + '{url}',
                                  timegate_url + 'timemap/' + path + '/{url}',
                                  timegate_url + WAYBACK_ORIG_SUFFIX)

    def __str__(self):
        return 'memento'
|
2016-02-19 17:25:54 -08:00
|
|
|
|
|
|
|
|