import redis

from pywb.utils.binsearch import iter_range

from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
from pywb.utils.timeutils import timestamp_to_sec, timestamp_now

from pywb.utils.canonicalize import canonicalize, calc_search_range
from pywb.utils.wbexception import NotFoundException

from pywb.cdx.cdxobject import CDXObject
from pywb.cdx.query import CDXQuery
from pywb.cdx.cdxops import process_cdx

import requests

from utils import MementoUtils


#=============================================================================
class BaseIndexSource(object):
    """Base class for index sources.

    Subclasses implement ``load_index()``. Calling an instance wraps the
    params in a ``CDXQuery``, loads the index and passes the resulting
    iterator through ``process_cdx()``.
    """

    def __init__(self, index_template=''):
        # str.format() template, filled from the query params in get_index()
        self.index_template = index_template

    def get_index(self, params):
        """Return ``index_template`` formatted with ``params`` as keyword
        arguments."""
        return self.index_template.format(**params)

    def load_index(self, params):
        """Return an iterator of CDX objects for ``params``.

        Must be overridden by subclasses.
        """
        # NotImplemented is a non-callable constant; the exception type
        # intended here is NotImplementedError.
        raise NotImplementedError()

    def __call__(self, params):
        """Run the query described by ``params`` and return the iterator
        produced by ``process_cdx()``.

        A ``NotFoundException`` raised by the ``load_index()`` call is
        treated as an empty index. Generator-based ``load_index()``
        implementations run lazily, so exceptions they raise during
        iteration are not caught here.
        """
        query = CDXQuery(params)

        try:
            cdx_iter = self.load_index(query.params)
        except NotFoundException:
            cdx_iter = iter([])

        return process_cdx(cdx_iter, query)

    def _include_post_query(self, params):
        """Rewrite ``params['url']`` via the ``_input_req`` object, if any.

        Returns True only if the url was changed (``params`` is updated
        in place); False when there is no ``_input_req`` or the url is
        unchanged.
        """
        input_req = params.get('_input_req')
        if not input_req:
            return False

        orig_url = params['url']
        params['url'] = input_req.include_post_query(orig_url)
        return params['url'] != orig_url


#=============================================================================
class FileIndexSource(BaseIndexSource):
    """Index source that reads CDX lines from a local file."""

    def load_index(self, params):
        """Yield a ``CDXObject`` for each line that ``iter_range()``
        returns between ``params['key']`` and ``params['end_key']``.

        This is a generator: the file is opened only once iteration
        starts and is closed when iteration finishes.
        """
        if self._include_post_query(params):
            # url changed, so rebuild the query params from the new url
            params = CDXQuery(params).params

        filename = self.get_index(params)

        with open(filename, 'rb') as fh:
            for line in iter_range(fh, params['key'], params['end_key']):
                yield CDXObject(line)


#=============================================================================
class RemoteIndexSource(BaseIndexSource):
    """Index source that fetches CDX lines from a remote HTTP endpoint."""

    def __init__(self, cdx_url, replay_url):
        super(RemoteIndexSource, self).__init__(cdx_url)
        # str.format() template taking 'timestamp' and 'url' fields
        self.replay_url = replay_url

    def load_index(self, params):
        """Request ``<index url>?url=<params['url']>`` and return an
        iterator of ``CDXObject`` items, each with ``load_url`` set from
        the ``replay_url`` template.

        Raises ``NotFoundException`` if the response status is >= 400.
        """
        if self._include_post_query(params):
            # Same positional form as the other CDXQuery(...) calls in
            # this module (previously CDXQuery(**params)).
            params = CDXQuery(params).params

        api_url = self.get_index(params)
        api_url += '?url=' + params['url']

        r = requests.get(api_url, timeout=params.get('_timeout'))
        if r.status_code >= 400:
            raise NotFoundException(api_url)

        lines = r.content.strip().split(b'\n')

        def do_load(lines):
            for line in lines:
                # An empty body splits into a single empty line, which
                # carries no record.
                if not line:
                    continue

                cdx = CDXObject(line)
                cdx['load_url'] = self.replay_url.format(
                    timestamp=cdx['timestamp'],
                    url=cdx['url'])
                yield cdx

        return do_load(lines)


#=============================================================================
class LiveIndexSource(BaseIndexSource):
    """Index source that produces one synthetic record for the live url."""

    def load_index(self, params):
        """Return an iterator yielding a single ``CDXObject`` whose
        ``url`` and ``load_url`` are ``params['url']``, timestamped with
        ``timestamp_now()`` and flagged ``is_live``."""
        cdx = CDXObject()
        # params['key'] is decoded here, so it is expected to be bytes
        cdx['urlkey'] = params.get('key').decode('utf-8')
        cdx['timestamp'] = timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = params['url']
        cdx['is_live'] = True

        def live():
            yield cdx

        return live()


#=============================================================================
class RedisIndexSource(BaseIndexSource):
    """Index source that reads CDX lines from a Redis sorted set."""

    def __init__(self, redis_url):
        """Split an optional key template off the end of ``redis_url``.

        For ``scheme://host/db/key`` the fifth '/'-separated part
        (``key``) becomes ``index_template`` and the connection url is
        rebuilt as ``redis://host/db``. Urls with fewer parts are passed
        to redis unchanged and the key template is empty.
        """
        parts = redis_url.split('/')
        key_prefix = ''
        if len(parts) > 4:
            key_prefix = parts[4]
            redis_url = 'redis://' + parts[2] + '/' + parts[3]

        self.redis_url = redis_url
        self.index_template = key_prefix
        self.redis = redis.StrictRedis.from_url(redis_url)

    def load_index(self, params):
        """Return an iterator of ``CDXObject`` items for the sorted-set
        members returned by ZRANGEBYLEX with min ``[key`` and max
        ``(end_key``."""
        z_key = self.get_index(params)

        # params['key'] / params['end_key'] are concatenated with bytes
        # prefixes, so they are expected to be bytes.
        index_list = self.redis.zrangebylex(z_key,
                                            b'[' + params['key'],
                                            b'(' + params['end_key'])

        def do_load(index_list):
            for line in index_list:
                yield CDXObject(line)

        return do_load(index_list)


#=============================================================================
class MementoIndexSource(BaseIndexSource):
    """Index source backed by a Memento timegate / timemap endpoint."""

    def __init__(self, timegate_url, timemap_url, replay_url):
        self.timegate_url = timegate_url
        self.timemap_url = timemap_url
        # str.format() template taking 'timestamp' and 'url' fields
        self.replay_url = replay_url

    def links_to_cdxobject(self, link_header, def_name, sort=False):
        """Yield a ``CDXObject`` for each memento parsed from
        ``link_header`` by ``MementoUtils.parse_links()``.

        Entries without a ``datetime`` value are skipped. Only memento
        entries are emitted; no records are produced for the timegate,
        timemap or original links themselves.
        """
        results = MementoUtils.parse_links(link_header, def_name)

        original = results['original']['url']
        key = canonicalize(original)

        mementos = results['mementos']
        if sort:
            # NOTE(review): sorts the memento entries themselves; if they
            # are plain dicts this raises TypeError on Python 3 -- confirm
            # against MementoUtils.parse_links.
            mementos = sorted(mementos)

        for val in mementos:
            dt = val.get('datetime')
            if not dt:
                continue

            ts = http_date_to_timestamp(dt)

            cdx = CDXObject()
            cdx['urlkey'] = key
            cdx['timestamp'] = ts
            cdx['url'] = original
            cdx['mem_rel'] = val.get('rel', '')
            cdx['memento_url'] = val['url']
            cdx['load_url'] = self.replay_url.format(timestamp=ts,
                                                     url=original)
            yield cdx

    def get_timegate_links(self, params, closest):
        """HEAD the timegate for ``params['url']`` with an
        ``Accept-Datetime`` header derived from ``closest`` and return
        the response's ``Link`` header (None if absent).

        Raises ``NotFoundException`` if the response status is >= 400.
        """
        url = self.timegate_url.format(coll=params.get('coll')) + params['url']
        accept_dt = timestamp_to_http_date(closest)

        # Honour the per-query timeout, as the other requests calls in
        # this module do; None (key absent) keeps the previous behaviour.
        res = requests.head(url,
                            headers={'Accept-Datetime': accept_dt},
                            timeout=params.get('_timeout'))
        if res.status_code >= 400:
            raise NotFoundException(url)

        return res.headers.get('Link')

    def get_timemap_links(self, params):
        """GET the timemap for ``params['url']`` and return the response
        body as text.

        Raises ``NotFoundException`` if the response status is >= 400.
        """
        url = self.timemap_url + params['url']

        res = requests.get(url, timeout=params.get('_timeout'))
        if res.status_code >= 400:
            raise NotFoundException(url)

        return res.text

    def load_index(self, params):
        """Return an iterator of ``CDXObject`` items, using the timegate
        when ``params['closest']`` is set and the timemap otherwise."""
        closest = params.get('closest')

        if not closest:
            links = self.get_timemap_links(params)
            def_name = 'timemap'
        else:
            links = self.get_timegate_links(params, closest)
            def_name = 'timegate'

        return self.links_to_cdxobject(links, def_name)

    @staticmethod
    def from_timegate_url(timegate_url, path='link'):
        """Build a ``MementoIndexSource`` whose timemap and replay urls
        are derived from ``timegate_url``."""
        return MementoIndexSource(timegate_url,
                                  timegate_url + 'timemap/' + path + '/',
                                  timegate_url + '{timestamp}id_/{url}')