Mirror of https://github.com/webrecorder/pywb.git, synced 2025-03-24 06:59:52 +01:00
add aggregate index source and tests!
This commit is contained in:
parent 37198767ed
commit 1a0b2fba17

aggindexsource.py (new file, 141 lines added)
@@ -0,0 +1,141 @@
+from gevent.pool import Pool
+import gevent
+import json
+import time
+
+from heapq import merge
+from collections import deque
+
+from indexsource import BaseIndexSource
+from pywb.utils.wbexception import NotFoundException
+
+
+#=============================================================================
+class BaseAggIndexSource(BaseIndexSource):
+    def __init__(self, sources):
+        self.sources = sources
+
+    def do_query(self, name, source, params):
+        try:
+            cdx_iter = source.load_index(params)
+        except NotFoundException as nf:
+            print('Not found in ' + name)
+            cdx_iter = iter([])
+
+        def add_name(cdx_iter):
+            for cdx in cdx_iter:
+                cdx['source_name'] = name
+                yield cdx
+
+        return add_name(cdx_iter)
+
+    def load_index(self, params):
+        iter_list = self._load_all(params)
+
+        cdx_iter = merge(*(iter_list))
+
+        return cdx_iter
+
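Note: heapq.merge interleaves already-sorted iterators into one lazily-evaluated sorted stream; since each CDXJ source yields entries ordered by urlkey and timestamp, the merged result stays globally ordered without buffering. A standalone sketch of the pattern (hypothetical keys, not part of this commit):

    from heapq import merge

    # two index streams, each already sorted by (urlkey, timestamp)
    ia = [('com,example)/', '20100513'), ('com,example)/', '20140126')]
    local = [('com,example)/', '20120101')]

    # merge() consumes lazily and preserves the global sort order
    for entry in merge(iter(ia), iter(local)):
        print(entry)
    # ('com,example)/', '20100513')
    # ('com,example)/', '20120101')
    # ('com,example)/', '20140126')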
+
+#=============================================================================
+class TimingOutMixin(object):
+    def __init__(self, *args, **kwargs):
+        super(TimingOutMixin, self).__init__(*args, **kwargs)
+        self.t_count = kwargs.get('t_count', 3)
+        self.t_dura = kwargs.get('t_duration', 20)
+        self.timeouts = {}
+
+    def is_timed_out(self, name):
+        timeout_deq = self.timeouts.get(name)
+        if not timeout_deq:
+            return False
+
+        the_time = time.time()
+        for t in list(timeout_deq):
+            if (the_time - t) > self.t_dura:
+                timeout_deq.popleft()
+
+        if len(timeout_deq) >= self.t_count:
+            print('Skipping {0}, {1} timeouts in {2} seconds'.
+                  format(name, self.t_count, self.t_dura))
+            return True
+
+        return False
+
+    def get_valid_sources(self, sources):
+        for name in sources.keys():
+            if not self.is_timed_out(name):
+                yield name
+
+    def track_source_error(self, name):
+        the_time = time.time()
+        if name not in self.timeouts:
+            self.timeouts[name] = deque()
+
+        self.timeouts[name].append(the_time)
+        print(name + ' timed out!')
+
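Note: the mixin keeps a per-source deque of error timestamps and treats a source as unavailable once t_count errors fall within the last t_duration seconds; stale entries age out of the window on each check. A minimal sketch of that behavior, reusing the class above with a stub base:

    from aggindexsource import TimingOutMixin

    class _Base(object):
        def __init__(self, *args, **kwargs):
            pass

    class _Timing(TimingOutMixin, _Base):
        pass

    t = _Timing()
    for _ in range(3):
        t.track_source_error('ait')       # three errors in quick succession

    assert t.is_timed_out('ait')          # >= t_count (3) errors within t_duration (20s)
    assert list(t.get_valid_sources({'ait': None, 'ia': None})) == ['ia']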
+
+#=============================================================================
+class GeventAggIndexSource(BaseAggIndexSource):
+    def __init__(self, sources, timeout=5.0, size=None):
+        super(GeventAggIndexSource, self).__init__(sources)
+        self.pool = Pool(size=size)
+        self.timeout = timeout
+
+    def get_valid_sources(self, sources):
+        return sources.keys()
+
+    def track_source_error(self, name):
+        pass
+
+    def _load_all(self, params):
+        def do_spawn(n):
+            return self.pool.spawn(self.do_query, n, self.sources[n], params)
+
+        jobs = [do_spawn(src) for src in self.get_valid_sources(self.sources)]
+
+        gevent.joinall(jobs, timeout=self.timeout)
+
+        res = []
+        for name, job in zip(self.sources.keys(), jobs):
+            if job.value:
+                res.append(job.value)
+            else:
+                self.track_source_error(name)
+
+        return res
+
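Note: gevent.joinall with a timeout returns once the deadline passes; unfinished greenlets simply still have job.value set to None, which is how the loop above detects a timed-out source. One caveat: jobs is built from get_valid_sources(), but the result loop zips against the full sources dict, so when TimingOutMixin filters a source out, names and jobs can fall out of step. The joinall contract in isolation (hypothetical sleep times):

    import gevent

    fast = gevent.spawn(lambda: 'done')
    slow = gevent.spawn(lambda: gevent.sleep(10) or 'late')

    gevent.joinall([fast, slow], timeout=0.5)

    assert fast.value == 'done'    # finished within the deadline
    assert slow.value is None      # still running; treated as an error/timeout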
+
+#=============================================================================
+class AggIndexSource(TimingOutMixin, GeventAggIndexSource):
+    pass
+
+
+#=============================================================================
+class SimpleAggIndexSource(BaseAggIndexSource):
+    def _load_all(self, params):
+        return list(map(lambda n: self.do_query(n, self.sources[n], params),
+                        self.sources))
+
+
+#=============================================================================
+class ResourceLoadAgg(object):
+    def __init__(self, load_index, load_resource):
+        self.load_index = load_index
+        self.load_resource = load_resource
+
+    def __call__(self, params):
+        cdx_iter = self.load_index(params)
+        for cdx in cdx_iter:
+            for loader in self.load_resource:
+                try:
+                    resp = loader(cdx)
+                    if resp:
+                        return resp
+                except Exception:
+                    pass
+
+        raise Exception('Not Found')
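Note: ResourceLoadAgg chains an index lookup with an ordered list of resource loaders: for each CDX entry, the first loader that returns a truthy response wins, and loader exceptions merely advance to the next loader. A self-contained sketch of the contract with stub callables (the real loaders appear in the response-loader changes below):

    from aggindexsource import ResourceLoadAgg

    def index(params):                  # stand-in for an index source
        yield {'url': params['url'], 'load_url': params['url']}

    def loader_a(cdx):                  # fails for every entry
        raise Exception('no warc here')

    def loader_b(cdx):                  # succeeds -> returned to the caller
        return 'response for ' + cdx['load_url']

    fetch = ResourceLoadAgg(index, [loader_a, loader_b])
    print(fetch(dict(url='http://example.com/')))
    # response for http://example.com/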
indexsource.py
@@ -3,10 +3,12 @@ import redis
 from pywb.utils.binsearch import iter_range
 from pywb.utils.timeutils import timestamp_to_http_date, http_date_to_timestamp
 from pywb.utils.timeutils import timestamp_to_sec, timestamp_now
-from pywb.utils.canonicalize import calc_search_range
+from pywb.utils.canonicalize import canonicalize, calc_search_range
+from pywb.utils.wbexception import NotFoundException
 
 from pywb.cdx.cdxobject import CDXObject
-from pywb.cdx.cdxops import cdx_sort_closest, cdx_limit
+from pywb.cdx.query import CDXQuery
+from pywb.cdx.cdxops import process_cdx
 
 import requests
@@ -21,6 +23,17 @@ class BaseIndexSource(object):
     def get_index(self, params):
         return self.index_template.format(params.get('coll'))
 
+    def __call__(self, params):
+        query = CDXQuery(**params)
+
+        try:
+            cdx_iter = self.load_index(query.params)
+        except NotFoundException as nf:
+            cdx_iter = iter([])
+
+        cdx_iter = process_cdx(cdx_iter, query)
+        return cdx_iter
+
 
 #=============================================================================
 class FileIndexSource(BaseIndexSource):
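Note: with __call__ in place, every index source becomes a callable that takes raw query params and returns processed CDX results, which is what lets the tests below drop the query_index() helper (removed at the end of this file). CDXQuery is expected to canonicalize the url into the key/end_key params the sources now read. Usage, mirroring the updated tests (sample.cdxj is the test fixture):

    from indexsource import FileIndexSource

    source = FileIndexSource('sample.cdxj')

    # process_cdx applies closest-sorting and the limit to the raw iterator
    for cdx in source(dict(url='http://iana.org/', limit=3)):
        print(cdx)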
@@ -28,7 +41,7 @@ class FileIndexSource(BaseIndexSource):
         filename = self.get_index(params)
 
         with open(filename, 'rb') as fh:
-            gen = iter_range(fh, params['start_key'], params['end_key'])
+            gen = iter_range(fh, params['key'], params['end_key'])
             for line in gen:
                 yield CDXObject(line)
@@ -43,21 +56,28 @@ class RemoteIndexSource(BaseIndexSource):
         url = self.get_index(params)
         url += '?url=' + params['url']
         r = requests.get(url)
+        if r.status_code >= 400:
+            raise NotFoundException(url)
+
         lines = r.content.strip().split(b'\n')
-        for line in lines:
-            cdx = CDXObject(line)
-            cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'], url=cdx['url'])
-            yield cdx
+        def do_load(lines):
+            for line in lines:
+                cdx = CDXObject(line)
+                cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'], url=cdx['url'])
+                yield cdx
+
+        return do_load(lines)
 
 
 #=============================================================================
 class LiveIndexSource(BaseIndexSource):
     def load_index(self, params):
         cdx = CDXObject()
-        cdx['urlkey'] = params.get('start_key').decode('utf-8')
+        cdx['urlkey'] = params.get('key').decode('utf-8')
         cdx['timestamp'] = timestamp_now()
         cdx['url'] = params['url']
         cdx['load_url'] = params['url']
+        cdx['is_live'] = True
 
         def live():
             yield cdx
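Note: wrapping the loop in an inner do_load() generator is what lets the status check raise NotFoundException at call time: a function containing yield runs no code until first iterated, so an eager check must live outside the generator. The same pattern in miniature:

    def load(ok):
        if not ok:
            raise ValueError('eager: raised at call time')  # runs immediately

        def gen():
            yield 'lazy: produced on iteration'

        return gen()

    try:
        load(False)
    except ValueError as e:
        print(e)                    # eager: raised at call time

    print(next(load(True)))         # lazy: produced on iteration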
@@ -80,7 +100,7 @@ class RedisIndexSource(BaseIndexSource):
     def load_index(self, params):
         z_key = self.get_index(params)
         index_list = self.redis.zrangebylex(z_key,
-                                            b'[' + params['start_key'],
+                                            b'[' + params['key'],
                                             b'(' + params['end_key'])
 
         for line in index_list:
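Note: ZRANGEBYLEX treats '[' as an inclusive and '(' as an exclusive bound, so the call returns every member from the canonicalized key up to, but not including, end_key. A quick redis-py illustration (hypothetical sorted-set name, requires a running redis):

    import redis

    r = redis.StrictRedis()
    r.zadd('cdxj:sample', {b'org,iana)/ 20140126': 0,
                           b'org,iana)/_css 20140126': 0,
                           b'org,iana,example)/ 20140126': 0})

    # '[' start is inclusive, '(' end is exclusive
    members = r.zrangebylex('cdxj:sample', b'[org,iana)/', b'(org,iana)0')
    print(members)   # both org,iana)/ keys; org,iana,example)/ is out of range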
@@ -94,66 +114,84 @@ class MementoIndexSource(BaseIndexSource):
         self.timemap_url = timemap_url
         self.replay_url = replay_url
 
-    def make_iter(self, links, def_name):
-        original, link_iter = MementoUtils.links_to_json(links, def_name)
-
-        for cdx in link_iter():
-            cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'], url=original)
+    def links_to_cdxobject(self, link_header, def_name, sort=False):
+        results = MementoUtils.parse_links(link_header, def_name)
+
+        #meta = MementoUtils.meta_field('timegate', results)
+        #if meta:
+        #    yield meta
+
+        #meta = MementoUtils.meta_field('timemap', results)
+        #if meta:
+        #    yield meta
+
+        #meta = MementoUtils.meta_field('original', results)
+        #if meta:
+        #    yield meta
+
+        original = results['original']['url']
+        key = canonicalize(original)
+
+        mementos = results['mementos']
+        if sort:
+            mementos = sorted(mementos)
+
+        for val in mementos:
+            dt = val.get('datetime')
+            if not dt:
+                continue
+
+            ts = http_date_to_timestamp(dt)
+            cdx = CDXObject()
+            cdx['urlkey'] = key
+            cdx['timestamp'] = ts
+            cdx['url'] = original
+            cdx['mem_rel'] = val.get('rel', '')
+            cdx['memento_url'] = val['url']
+
+            load_url = self.replay_url.format(timestamp=cdx['timestamp'],
+                                              url=original)
+
+            cdx['load_url'] = load_url
             yield cdx
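Note: links_to_cdxobject turns a Memento Link header (RFC 7089) into CDX entries: each rel="memento" link carries a url plus an HTTP datetime that becomes the 14-digit timestamp, while the rel="original" url supplies the canonicalized key. A representative header of the kind being parsed (values illustrative):

    Link: <http://iana.org/>; rel="original",
          <http://web.archive.org/web/timemap/link/http://iana.org/>; rel="timemap"; type="application/link-format",
          <http://web.archive.org/web/20140126093743/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"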
 
-    def load_timegate(self, params, closest):
+    def get_timegate_links(self, params, closest):
         url = self.timegate_url.format(coll=params.get('coll')) + params['url']
         accept_dt = timestamp_to_http_date(closest)
         res = requests.head(url, headers={'Accept-Datetime': accept_dt})
-        return self.make_iter(res.headers.get('Link'), 'timegate')
-
-    def load_timemap(self, params):
+        if res.status_code >= 400:
+            raise NotFoundException(url)
+
+        return res.headers.get('Link')
+
+    def get_timemap_links(self, params):
         url = self.timemap_url + params['url']
-        r = requests.get(url)
-        return self.make_iter(r.text, 'timemap')
+        res = requests.get(url)
+        if res.status_code >= 400:
+            raise NotFoundException(url)
+
+        return res.text
 
     def load_index(self, params):
         closest = params.get('closest')
 
         if not closest:
-            return self.load_timemap(params)
+            links = self.get_timemap_links(params)
+            def_name = 'timemap'
         else:
-            return self.load_timegate(params, closest)
+            links = self.get_timegate_links(params, closest)
+            def_name = 'timegate'
+
+        #if not links:
+        #    return iter([])
+
+        return self.links_to_cdxobject(links, def_name)
 
     @staticmethod
-    def from_timegate_url(timegate_url, type_='link'):
+    def from_timegate_url(timegate_url, path='link'):
         return MementoIndexSource(timegate_url,
-                                  timegate_url + 'timemap/' + type_ + '/',
+                                  timegate_url + 'timemap/' + path + '/',
                                   timegate_url + '{timestamp}id_/{url}')
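Note: a timegate is queried with a HEAD request whose Accept-Datetime header asks for the capture nearest the requested time; the answer arrives in the Link response header rather than the body, which is why only headers are fetched. A sketch of the negotiation using the same helpers:

    import requests
    from pywb.utils.timeutils import timestamp_to_http_date

    url = 'http://web.archive.org/web/' + 'http://iana.org/'
    accept_dt = timestamp_to_http_date('20140126000000')
    # e.g. 'Sun, 26 Jan 2014 00:00:00 GMT'

    res = requests.head(url, headers={'Accept-Datetime': accept_dt})
    print(res.headers.get('Link'))   # original / timemap / memento relations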
 
 
-
-def query_index(source, params):
-    url = params.get('url', '')
-
-    if not params.get('matchType'):
-        if url.startswith('*.'):
-            params['url'] = url[2:]
-            params['matchType'] = 'domain'
-        elif url.endswith('*'):
-            params['url'] = url[:-1]
-            params['matchType'] = 'prefix'
-        else:
-            params['matchType'] = 'exact'
-
-    start, end = calc_search_range(url=params['url'],
-                                   match_type=params['matchType'])
-
-    params['start_key'] = start.encode('utf-8')
-    params['end_key'] = end.encode('utf-8')
-
-    res = source.load_index(params)
-
-    limit = int(params.get('limit', 10))
-    closest = params.get('closest')
-    if closest:
-        res = cdx_sort_closest(closest, res, limit)
-    elif limit:
-        res = cdx_limit(res, limit)
-
-    return res

liverec.py
@@ -157,6 +157,7 @@ class BaseRecorder(object):
     def finish_response(self, incomplete=False):
         pass
 
+
 #=================================================================
 class ReadFullyStream(object):
     def __init__(self, stream):

@@ -2,7 +2,8 @@ from liverec import BaseRecorder
 from liverec import request as remote_request
 
 from pywb.warc.recordloader import ArcWarcRecordLoader, ArchiveLoadFailed
-from pywb.utils.timeutils import timestamp_to_datetime
+from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date
+from pywb.warc.resolvingloader import ResolvingLoader
 
 from io import BytesIO
 from bottle import response
@@ -25,22 +26,26 @@ def incr_reader(stream, header=None, size=8192):
 
 #=============================================================================
 class WARCPathPrefixLoader(object):
-    def __init__(self, prefix):
+    def __init__(self, prefix, cdx_loader):
         self.prefix = prefix
-        self.record_loader = ArcWarcRecordLoader()
+
+        def add_prefix(filename, cdx):
+            return [self.prefix + filename]
+
+        self.resolve_loader = ResolvingLoader([add_prefix], no_record_parse=True)
+        self.cdx_loader = cdx_loader
 
     def __call__(self, cdx):
-        filename = cdx.get('filename')
-        offset = cdx.get('offset')
-        length = cdx.get('length', -1)
-
-        if filename is None or offset is None:
-            raise Exception
-
-        record = self.record_loader.load(self.prefix + filename,
-                                         offset,
-                                         length,
-                                         no_record_parse=True)
+        if not cdx.get('filename') or cdx.get('offset') is None:
+            return None
+
+        failed_files = []
+        headers, payload = self.resolve_loader.load_headers_and_payload(cdx, failed_files, self.cdx_loader)
+
+        if headers != payload:
+            headers.stream.close()
+
+        record = payload
 
         for n, v in record.rec_headers.headers:
             response.headers[n] = v
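Note: ResolvingLoader resolves a CDX entry's filename through a list of path resolvers and returns a (headers record, payload record) pair; when the two differ (a revisit pointing at an earlier capture), the redundant headers stream is closed before the payload is replayed. The resolver contract is simply 'filename in, candidate paths out', which the add_prefix closure satisfies:

    # a path resolver maps (filename, cdx) -> a list of candidate full paths
    def add_prefix(filename, cdx):
        return ['./warcs/' + filename]

    print(add_prefix('iana.warc.gz', None))   # ['./warcs/iana.warc.gz']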
@@ -75,40 +80,50 @@ class LiveWebLoader(object):
     SKIP_HEADERS = (b'link',
                     b'memento-datetime',
                     b'content-location',
-                    b'x-archive',
-                    b'set-cookie')
+                    b'x-archive')
 
     def __call__(self, cdx):
         load_url = cdx.get('load_url')
         if not load_url:
-            raise Exception
+            return None
 
         recorder = HeaderRecorder(self.SKIP_HEADERS)
 
-        upstream_res = remote_request(load_url, recorder=recorder, stream=True,
-                                      headers={'Accept-Encoding': 'identity'})
+        req_headers = {}
+
+        dt = timestamp_to_datetime(cdx['timestamp'])
+
+        if not cdx.get('is_live'):
+            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)
+
+        upstream_res = remote_request(load_url,
+                                      recorder=recorder,
+                                      stream=True,
+                                      headers=req_headers)
+
+        resp_headers = recorder.get_header()
 
         response.headers['Content-Type'] = 'application/http; msgtype=response'
 
-        response.headers['WARC-Type'] = 'response'
-        response.headers['WARC-Record-ID'] = self._make_warc_id()
+        #response.headers['WARC-Type'] = 'response'
+        #response.headers['WARC-Record-ID'] = self._make_warc_id()
         response.headers['WARC-Target-URI'] = cdx['url']
-        response.headers['WARC-Date'] = self._make_date(cdx['timestamp'])
+        response.headers['WARC-Date'] = self._make_date(dt)
 
         # Try to set content-length, if it is available and valid
         try:
             content_len = int(upstream_res.headers.get('content-length', 0))
             if content_len > 0:
-                content_len += len(recorder.get_header())
+                content_len += len(resp_headers)
                 response.headers['Content-Length'] = content_len
         except:
-            pass
+            raise
 
-        return incr_reader(upstream_res.raw, header=recorder.get_header())
+        return incr_reader(upstream_res.raw, header=resp_headers)
 
     @staticmethod
-    def _make_date(ts):
-        return timestamp_to_datetime(ts).strftime('%Y-%m-%dT%H:%M:%SZ')
+    def _make_date(dt):
+        return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
 
     @staticmethod
     def _make_warc_id(id_=None):
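Note: the loader now converts the CDX timestamp to a datetime once, reuses it for the WARC-Date header, and for non-live entries forwards it upstream as Accept-Datetime so a remote memento endpoint can serve the matching capture. The two conversions in play (sketch; exact formatting per pywb's timeutils):

    from pywb.utils.timeutils import timestamp_to_datetime, datetime_to_http_date

    dt = timestamp_to_datetime('20140126093743')
    print(datetime_to_http_date(dt))           # Sun, 26 Jan 2014 09:37:43 GMT
    print(dt.strftime('%Y-%m-%dT%H:%M:%SZ'))   # WARC-Date: 2014-01-26T09:37:43Z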

test_aggindexsource.py (new file, 62 lines added)
@@ -0,0 +1,62 @@
+from gevent import monkey; monkey.patch_all()
+from aggindexsource import AggIndexSource
+
+from indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource
+import json
+
+
+sources = {
+    'local': FileIndexSource('sample.cdxj'),
+    'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
+    'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
+    'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
+    'rhiz': MementoIndexSource.from_timegate_url('http://webenact.rhizome.org/vvork/', path='*')
+}
+
+source = AggIndexSource(sources, timeout=5.0)
+
+def select_json(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source_name']):
+    return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist])
+
+
+def test_agg_index_1():
+    url = 'http://iana.org/'
+    res = source(dict(url=url, closest='20140126000000', limit=5))
+
+    exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source_name": "ia"},
+           {"timestamp": "20140126200624", "filename": "iana.warc.gz", "source_name": "local"},
+           {"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source_name": "ia"},
+           {"timestamp": "20140129175203", "load_url": "http://web.archive.org/web/20140129175203id_/http://iana.org/", "source_name": "ia"},
+           {"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source_name": "ait"}
+          ]
+
+    assert(select_json(res) == exp)
+
+
+def test_agg_index_2():
+    url = 'http://example.com/'
+    res = source(dict(url=url, closest='20100512', limit=6))
+
+    exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source_name": "bl"},
+           {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source_name": "bl"},
+           {"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source_name": "ia"},
+           {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source_name": "ait"},
+           {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source_name": "ait"},
+           {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source_name": "ia"}]
+
+    assert(select_json(res) == exp)
+
+
+def test_agg_index_3():
+    url = 'http://vvork.com/'
+    res = source(dict(url=url, closest='20141001', limit=5))
+
+    exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source_name": "rhiz"},
+           {"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source_name": "ia"},
+           {"timestamp": "20141020161243", "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/", "source_name": "ia"},
+           {"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source_name": "ia"},
+           {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source_name": "ait"}]
+
+    assert(select_json(res) == exp)
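Note: these tests aggregate a local CDXJ fixture with live archive endpoints, so they need network access, and gevent's monkey-patching must happen before requests is imported (hence the first line). Presumably run from the module directory with:

    py.test -v test_aggindexsource.py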

@@ -1,6 +1,5 @@
-from indexloader import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
-from indexloader import LiveIndexSource
-from indexloader import query_index
+from indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource
+from indexsource import LiveIndexSource
 
 from pywb.utils.timeutils import timestamp_now
@@ -42,11 +41,10 @@ remote_sources = [
 
 # Url Match -- Local Loaders
 # ============================================================================
-@pytest.mark.parametrize("source1", local_sources, ids=["file", "redis"])
-def test_local_cdxj_loader(source1):
+@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
+def test_local_cdxj_loader(source):
     url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
-    res = query_index(source1, dict(url=url,
-                                    limit=3))
+    res = source(dict(url=url, limit=3))
 
     expected = """\
 org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz
@ -58,12 +56,12 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz"""
|
|||||||
|
|
||||||
# Closest -- Local Loaders
|
# Closest -- Local Loaders
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@pytest.mark.parametrize("source1", local_sources, ids=["file", "redis"])
|
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
|
||||||
def test_local_closest_loader(source1):
|
def test_local_closest_loader(source):
|
||||||
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
|
url = 'http://www.iana.org/_css/2013.1/fonts/Inconsolata.otf'
|
||||||
res = query_index(source1, dict(url=url,
|
res = source(dict(url=url,
|
||||||
closest='20140126200930',
|
closest='20140126200930',
|
||||||
limit=3))
|
limit=3))
|
||||||
|
|
||||||
expected = """\
|
expected = """\
|
||||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz
|
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200930 iana.warc.gz
|
||||||
@ -75,9 +73,9 @@ org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126200826 iana.warc.gz"""
|
|||||||
|
|
||||||
# Prefix -- Local Loaders
|
# Prefix -- Local Loaders
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@pytest.mark.parametrize("source1", local_sources, ids=["file", "redis"])
|
@pytest.mark.parametrize("source", local_sources, ids=["file", "redis"])
|
||||||
def test_file_prefix_loader(source1):
|
def test_file_prefix_loader(source):
|
||||||
res = query_index(source1, dict(url='http://iana.org/domains/root/*'))
|
res = source(dict(url='http://iana.org/domains/root/*'))
|
||||||
|
|
||||||
expected = """\
|
expected = """\
|
||||||
org,iana)/domains/root/db 20140126200927 iana.warc.gz
|
org,iana)/domains/root/db 20140126200927 iana.warc.gz
|
||||||
@ -89,10 +87,10 @@ org,iana)/domains/root/servers 20140126201227 iana.warc.gz"""
|
|||||||
|
|
||||||
# Url Match -- Remote Loaders
|
# Url Match -- Remote Loaders
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@pytest.mark.parametrize("source2", remote_sources, ids=["remote_cdx", "memento"])
|
@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
|
||||||
def test_remote_loader(source2):
|
def test_remote_loader(source):
|
||||||
url = 'http://instagram.com/amaliaulman'
|
url = 'http://instagram.com/amaliaulman'
|
||||||
res = query_index(source2, dict(url=url))
|
res = source(dict(url=url))
|
||||||
|
|
||||||
expected = """\
|
expected = """\
|
||||||
com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman
|
com,instagram)/amaliaulman 20141014150552 http://webenact.rhizome.org/all/20141014150552id_/http://instagram.com/amaliaulman
|
||||||
@@ -105,10 +103,10 @@ com,instagram)/amaliaulman 20141014171636 http://webenact.rhizome.org/all/201410
 
 # Url Match -- Remote Loaders
 # ============================================================================
-@pytest.mark.parametrize("source2", remote_sources, ids=["remote_cdx", "memento"])
-def test_remote_closest_loader(source2):
+@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
+def test_remote_closest_loader(source):
     url = 'http://instagram.com/amaliaulman'
-    res = query_index(source2, dict(url=url, closest='20141014162332', limit=1))
+    res = source(dict(url=url, closest='20141014162332', limit=1))
 
     expected = """\
 com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
@@ -116,12 +114,24 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410
     assert(key_ts_res(res, 'load_url') == expected)
 
 
+# Url Match -- Memento
+# ============================================================================
+@pytest.mark.parametrize("source", remote_sources, ids=["remote_cdx", "memento"])
+def test_remote_closest_loader(source):
+    url = 'http://instagram.com/amaliaulman'
+    res = source(dict(url=url, closest='20141014162332', limit=1))
+
+    expected = """\
+com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/20141014162333id_/http://instagram.com/amaliaulman"""
+
+    assert(key_ts_res(res, 'load_url') == expected)
+
+
 # Live Index -- No Load!
 # ============================================================================
 def test_live():
     url = 'http://example.com/'
     source = LiveIndexSource()
-    res = query_index(source, dict(url=url))
+    res = source(dict(url=url))
 
     expected = 'com,example)/ {0} http://example.com/'.format(timestamp_now())
 
@@ -130,5 +140,26 @@ def test_live():
 
 
+# Errors -- Not Found All
+# ============================================================================
+@pytest.mark.parametrize("source", local_sources + remote_sources, ids=["file", "redis", "remote_cdx", "memento"])
+def test_all_not_found(source):
+    url = 'http://x-not-found-x.notfound/'
+    res = source(dict(url=url, limit=3))
+
+    expected = ''
+    assert(key_ts_res(res) == expected)
+
+
+# ============================================================================
+def test_another_remote_not_found():
+    source = MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/')
+    url = 'http://x-not-found-x.notfound/'
+    res = source(dict(url=url, limit=3))
+
+    expected = ''
+    assert(key_ts_res(res) == expected)

utils.py (72 lines changed)
@@ -1,8 +1,4 @@
-import re, json
-from pywb.utils.canonicalize import canonicalize
-from pywb.utils.timeutils import timestamp_to_sec, http_date_to_timestamp
-from pywb.cdx.cdxobject import CDXObject
-
+import re
 
 LINK_SPLIT = re.compile(',\s*(?=[<])')
 LINK_SEG_SPLIT = re.compile(';\s*')
@@ -54,69 +50,3 @@ class MementoUtils(object):
 
         results['mementos'] = mementos
         return results
-
-    @staticmethod
-    def links_to_json(link_header, def_name='timemap', sort=False):
-        results = MementoUtils.parse_links(link_header, def_name)
-
-        #meta = MementoUtils.meta_field('timegate', results)
-        #if meta:
-        #    yield meta
-
-        #meta = MementoUtils.meta_field('timemap', results)
-        #if meta:
-        #    yield meta
-
-        #meta = MementoUtils.meta_field('original', results)
-        #if meta:
-        #    yield meta
-
-        original = results['original']['url']
-        key = canonicalize(original)
-
-        mementos = results['mementos']
-        if sort:
-            mementos = sorted(mementos)
-
-        def link_iter():
-            for val in mementos:
-                dt = val.get('datetime')
-                if not dt:
-                    continue
-
-                ts = http_date_to_timestamp(dt)
-                line = CDXObject()
-                line['urlkey'] = key
-                line['timestamp'] = ts
-                line['url'] = original
-                line['mem_rel'] = val.get('rel', '')
-                line['memento_url'] = val['url']
-                yield line
-
-        return original, link_iter
-
-    @staticmethod
-    def meta_field(name, results):
-        v = results.get(name)
-        if v:
-            c = CDXObject()
-            c['key'] = '@' + name
-            c['url'] = v['url']
-            return c
-
-
-#=================================================================
-def cdx_sort_closest(closest, cdx_json):
-    closest_sec = timestamp_to_sec(closest)
-
-    def get_key(cdx):
-        sec = timestamp_to_sec(cdx['timestamp'])
-        return abs(closest_sec - sec)
-
-    cdx_sorted = sorted(cdx_json, key=get_key)
-    return cdx_sorted