Mirror of https://github.com/webrecorder/pywb.git, synced 2025-03-24 06:59:52 +01:00
inputrequest: add input request handling, either from direct WSGI headers or as a prepared POST request
add timemap link output; rename source_name -> source
This commit is contained in:
parent 1a0b2fba17
commit 398e8f1a77
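The new inputrequest module added in this commit captures the incoming request either straight from the live WSGI environ (WSGIInputRequest) or from a prepared POST body carrying the original request headers (POSTInputRequest). As a rough standalone illustration of the WSGI side, the HTTP_*-to-header-name mapping works like the sketch below; the helper name is hypothetical and not part of the commit.

    # Standalone sketch of the HTTP_* -> header-name mapping used by the new
    # WSGIInputRequest.get_req_headers(); helper name is hypothetical.
    def environ_to_headers(environ):
        headers = {}
        for name, value in environ.items():
            if name.startswith('HTTP_'):
                # e.g. HTTP_USER_AGENT -> User-Agent
                headers[name[5:].title().replace('_', '-')] = value
            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
                headers[name.title().replace('_', '-')] = value
        return headers

    print(environ_to_headers({'HTTP_USER_AGENT': 'test', 'CONTENT_TYPE': 'text/html'}))
    # {'User-Agent': 'test', 'Content-Type': 'text/html'}  (key order may vary)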
@@ -2,29 +2,27 @@ from gevent.pool import Pool
 import gevent
 import json
 import time
+import os

 from heapq import merge
 from collections import deque

-from indexsource import BaseIndexSource
+from indexsource import BaseIndexSource, FileIndexSource
 from pywb.utils.wbexception import NotFoundException


 #=============================================================================
 class BaseAggIndexSource(BaseIndexSource):
-    def __init__(self, sources):
-        self.sources = sources
-
     def do_query(self, name, source, params):
         try:
-            cdx_iter = source.load_index(params)
+            cdx_iter = source.load_index(dict(params))
         except NotFoundException as nf:
             print('Not found in ' + name)
             cdx_iter = iter([])

         def add_name(cdx_iter):
             for cdx in cdx_iter:
-                cdx['source_name'] = name
+                cdx['source'] = name
                 yield cdx

         return add_name(cdx_iter)
@@ -36,6 +34,9 @@ class BaseAggIndexSource(BaseIndexSource):

         return cdx_iter

+    def _load_all(self):
+        raise NotImplemented()
+

 #=============================================================================
 class TimingOutMixin(object):
@@ -63,7 +64,7 @@ class TimingOutMixin(object):
         return False

     def get_valid_sources(self, sources):
-        for name in sources.keys():
+        for name in sources:
             if not self.is_timed_out(name):
                 yield name

@@ -79,10 +80,19 @@ class TimingOutMixin(object):
 #=============================================================================
 class GeventAggIndexSource(BaseAggIndexSource):
     def __init__(self, sources, timeout=5.0, size=None):
-        super(GeventAggIndexSource, self).__init__(sources)
+        self.sources = sources
         self.pool = Pool(size=size)
         self.timeout = timeout

+    def get_sources(self, params):
+        srcs_list = params.get('sources')
+        if not srcs_list:
+            return self.sources
+
+        sel_sources = tuple(srcs_list.split(','))
+
+        return [src for src in self.sources if src in sel_sources]
+
     def get_valid_sources(self, sources):
         return sources.keys()

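The get_sources() addition above lets a request narrow the aggregated lookup to a comma-separated subset of the configured sources (exercised by the new test_agg_index_4 near the end of this diff). A minimal standalone sketch of that filter, with hypothetical names:

    # Standalone sketch of the 'sources' filter added to GeventAggIndexSource.
    def filter_sources(all_sources, params):
        srcs_list = params.get('sources')
        if not srcs_list:
            return list(all_sources)
        sel = tuple(srcs_list.split(','))
        return [name for name in all_sources if name in sel]

    print(filter_sources({'ia': 1, 'ait': 2, 'rhiz': 3}, {'sources': 'rhiz,ait'}))
    # ['ait', 'rhiz']  (dict order may vary on older Pythons)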
@@ -90,15 +100,18 @@ class GeventAggIndexSource(BaseAggIndexSource):
         pass

     def _load_all(self, params):
+        params['_timeout'] = self.timeout
+
         def do_spawn(n):
             return self.pool.spawn(self.do_query, n, self.sources[n], params)

-        jobs = [do_spawn(src) for src in self.get_valid_sources(self.sources)]
+        sources = self.get_sources(params)
+        jobs = [do_spawn(src) for src in self.get_valid_sources(sources)]

         gevent.joinall(jobs, timeout=self.timeout)

         res = []
-        for name, job in zip(self.sources.keys(), jobs):
+        for name, job in zip(sources, jobs):
             if job.value:
                 res.append(job.value)
             else:
@@ -113,29 +126,30 @@ class AggIndexSource(TimingOutMixin, GeventAggIndexSource):


 #=============================================================================
-class SimpleAggIndexSource(BaseAggIndexSource):
+class DirAggIndexSource(BaseAggIndexSource):
+    CDX_EXT = ('.cdx', '.cdxj')
+
+    def __init__(self, base_dir):
+        self.index_template = base_dir
+
+    def _init_files(self, the_dir):
+        sources = {}
+        for name in os.listdir(the_dir):
+            filename = os.path.join(the_dir, name)
+
+            if filename.endswith(self.CDX_EXT):
+                print('Adding ' + filename)
+                sources[name] = FileIndexSource(filename)
+
+        return sources
+
     def _load_all(self, params):
-        return list(map(lambda n: self.do_query(n, self.sources[n], params),
-                        self.sources))
-
-
-#=============================================================================
-class ResourceLoadAgg(object):
-    def __init__(self, load_index, load_resource):
-        self.load_index = load_index
-        self.load_resource = load_resource
-
-    def __call__(self, params):
-        cdx_iter = self.load_index(params)
-        for cdx in cdx_iter:
-            for loader in self.load_resource:
-                try:
-                    resp = loader(cdx)
-                    if resp:
-                        return resp
-                except Exception:
-                    pass
-
-        raise Exception('Not Found')
+        the_dir = self.get_index(params)
+
+        try:
+            sources = self._init_files(the_dir)
+        except Exception:
+            raise NotFoundException(the_dir)
+
+        return list([self.do_query(src, sources[src], params)
+                     for src in sources.keys()])
@@ -21,10 +21,14 @@ class BaseIndexSource(object):
         self.index_template = index_template

     def get_index(self, params):
-        return self.index_template.format(params.get('coll'))
+        res = self.index_template.format(**params)
+        return res

+    def load_index(self, params):
+        raise NotImplemented()
+
     def __call__(self, params):
-        query = CDXQuery(**params)
+        query = CDXQuery(params)

         try:
             cdx_iter = self.load_index(query.params)
@@ -34,10 +38,20 @@ class BaseIndexSource(object):
         cdx_iter = process_cdx(cdx_iter, query)
         return cdx_iter

+    def _include_post_query(self, params):
+        input_req = params.get('_input_req')
+        if input_req:
+            orig_url = params['url']
+            params['url'] = input_req.include_post_query(params['url'])
+            return (params['url'] != orig_url)
+

 #=============================================================================
 class FileIndexSource(BaseIndexSource):
     def load_index(self, params):
+        if self._include_post_query(params):
+            params = CDXQuery(params).params
+
         filename = self.get_index(params)

         with open(filename, 'rb') as fh:
@@ -45,6 +59,8 @@ class FileIndexSource(BaseIndexSource):
             for line in gen:
                 yield CDXObject(line)

+        #return do_load(filename)
+

 #=============================================================================
 class RemoteIndexSource(BaseIndexSource):
@@ -53,11 +69,14 @@ class RemoteIndexSource(BaseIndexSource):
         self.replay_url = replay_url

     def load_index(self, params):
-        url = self.get_index(params)
-        url += '?url=' + params['url']
-        r = requests.get(url)
+        if self._include_post_query(params):
+            params = CDXQuery(**params).params
+
+        api_url = self.get_index(params)
+        api_url += '?url=' + params['url']
+        r = requests.get(api_url, timeout=params.get('_timeout'))

         if r.status_code >= 400:
-            raise NotFoundException(url)
+            raise NotFoundException(api_url)

         lines = r.content.strip().split(b'\n')
         def do_load(lines):
@@ -103,8 +122,11 @@ class RedisIndexSource(BaseIndexSource):
                                         b'[' + params['key'],
                                         b'(' + params['end_key'])

-        for line in index_list:
-            yield CDXObject(line)
+        def do_load(index_list):
+            for line in index_list:
+                yield CDXObject(line)
+
+        return do_load(index_list)


 #=============================================================================
@@ -166,7 +188,7 @@ class MementoIndexSource(BaseIndexSource):

     def get_timemap_links(self, params):
         url = self.timemap_url + params['url']
-        res = requests.get(url)
+        res = requests.get(url, timeout=params.get('_timeout'))
         if res.status_code >= 400:
             raise NotFoundException(url)

@@ -182,9 +204,6 @@ class MementoIndexSource(BaseIndexSource):
         links = self.get_timegate_links(params, closest)
         def_name = 'timegate'

-        #if not links:
-        #    return iter([])
-
         return self.links_to_cdxobject(links, def_name)

     @staticmethod
inputrequest.py (new file, 136 lines)
@@ -0,0 +1,136 @@
+from pywb.utils.loaders import extract_client_cookie
+from pywb.utils.loaders import extract_post_query, append_post_query
+from pywb.utils.loaders import LimitReader
+from pywb.utils.statusandheaders import StatusAndHeadersParser
+
+from six.moves.urllib.parse import urlsplit
+from six import StringIO
+import six
+
+
+#=============================================================================
+class WSGIInputRequest(object):
+    def __init__(self, env):
+        self.env = env
+
+    def get_req_method(self):
+        return self.env['REQUEST_METHOD'].upper()
+
+    def get_req_headers(self, url):
+        headers = {}
+
+        splits = urlsplit(url)
+
+        for name, value in six.iteritems(self.env):
+            if name == 'HTTP_HOST':
+                name = 'Host'
+                value = splits.netloc
+
+            elif name == 'HTTP_ORIGIN':
+                name = 'Origin'
+                value = (splits.scheme + '://' + splits.netloc)
+
+            elif name == 'HTTP_X_CSRFTOKEN':
+                name = 'X-CSRFToken'
+                cookie_val = extract_client_cookie(env, 'csrftoken')
+                if cookie_val:
+                    value = cookie_val
+
+            elif name == 'HTTP_X_FORWARDED_PROTO':
+                name = 'X-Forwarded-Proto'
+                value = splits.scheme
+
+            elif name.startswith('HTTP_'):
+                name = name[5:].title().replace('_', '-')
+
+            elif name in ('CONTENT_LENGTH', 'CONTENT_TYPE'):
+                name = name.title().replace('_', '-')
+
+            else:
+                value = None
+
+            if value:
+                headers[name] = value
+
+        return headers
+
+    def get_req_body(self):
+        input_ = self.env.get('wsgi.input')
+        if not input_:
+            return None
+
+        len_ = self._get_content_length()
+        enc = self._get_header('Transfer-Encoding')
+
+        if len_:
+            data = LimitReader(input_, int(len_))
+        elif enc:
+            data = input_
+        else:
+            data = None
+
+        return data
+        #buf = data.read().decode('utf-8')
+        #print(buf)
+        #return StringIO(buf)
+
+    def _get_content_type(self):
+        return self.env.get('CONTENT_TYPE')
+
+    def _get_content_length(self):
+        return self.env.get('CONTENT_LENGTH')
+
+    def _get_header(self, name):
+        return self.env.get('HTTP_' + name.upper().replace('-', '_'))
+
+    def include_post_query(self, url):
+        if self.get_req_method() != 'POST':
+            return url
+
+        mime = self._get_content_type()
+        mime = mime.split(';')[0] if mime else ''
+        length = self._get_content_length()
+        stream = self.env['wsgi.input']
+
+        buffered_stream = StringIO()
+
+        post_query = extract_post_query('POST', mime, length, stream,
+                                        buffered_stream=buffered_stream)
+
+        if post_query:
+            self.env['wsgi.input'] = buffered_stream
+            url = append_post_query(url, post_query)
+
+        return url
+
+
+#=============================================================================
+class POSTInputRequest(WSGIInputRequest):
+    def __init__(self, env):
+        self.env = env
+
+        parser = StatusAndHeadersParser([], verify=False)
+
+        self.status_headers = parser.parse(self.env['wsgi.input'])
+
+    def get_req_method(self):
+        return self.status_headers.protocol
+
+    def get_req_headers(self, url):
+        headers = {}
+        for n, v in self.status_headers.headers:
+            headers[n] = v
+
+        return headers
+
+    def _get_content_type(self):
+        return self.status_headers.get_header('Content-Type')
+
+    def _get_content_length(self):
+        return self.status_headers.get_header('Content-Length')
+
+    def _get_header(self, name):
+        return self.status_headers.get_header(name)
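include_post_query() above folds a POST body into the lookup URL so a POST capture can be keyed in the index like a GET; the real code delegates to pywb's extract_post_query/append_post_query helpers. A standalone sketch of the idea, with a hypothetical helper name:

    # Standalone sketch of the idea behind include_post_query: append the
    # form body to the URL used for index lookups (hypothetical helper).
    def append_query(url, post_body):
        sep = '&' if '?' in url else '?'
        return url + sep + post_body

    print(append_query('http://example.com/login', 'user=a&pw=b'))
    # http://example.com/login?user=a&pw=b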
@@ -95,7 +95,7 @@ class RecordingHTTPConnection(httplib.HTTPConnection):
             if hasattr(data,'read') and not isinstance(data, array):
                 url = None
                 while True:
-                    buff = data.read(self.BUFF_SIZE)
+                    buff = data.read(BUFF_SIZE)
                     if not buff:
                         break

@@ -9,6 +9,7 @@ from io import BytesIO
 from bottle import response

 import uuid
+from utils import MementoUtils


 #=============================================================================
@@ -23,24 +24,46 @@ def incr_reader(stream, header=None, size=8192):
         else:
             break

+    try:
+        stream.close()
+    except:
+        pass
+

 #=============================================================================
-class WARCPathPrefixLoader(object):
-    def __init__(self, prefix, cdx_loader):
-        self.prefix = prefix
-
-        def add_prefix(filename, cdx):
-            return [self.prefix + filename]
-
-        self.resolve_loader = ResolvingLoader([add_prefix], no_record_parse=True)
-        self.cdx_loader = cdx_loader
-
-    def __call__(self, cdx):
+class WARCPathLoader(object):
+    def __init__(self, paths, cdx_source):
+        self.paths = paths
+        if isinstance(paths, str):
+            self.paths = [paths]
+
+        self.path_checks = list(self.warc_paths())
+
+        self.resolve_loader = ResolvingLoader(self.path_checks,
+                                              no_record_parse=True)
+        self.cdx_source = cdx_source
+
+    def warc_paths(self):
+        for path in self.paths:
+            def check(filename, cdx):
+                try:
+                    full_path = path.format(**cdx)
+                    return full_path
+                except KeyError:
+                    return None
+
+            yield check
+
+
+    def __call__(self, cdx, params):
         if not cdx.get('filename') or cdx.get('offset') is None:
             return None

         failed_files = []
-        headers, payload = self.resolve_loader.load_headers_and_payload(cdx, failed_files, self.cdx_loader)
+        headers, payload = (self.resolve_loader.
+                            load_headers_and_payload(cdx,
+                                                     failed_files,
+                                                     self.cdx_source))

         if headers != payload:
             headers.stream.close()
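WARCPathLoader above treats each configured path as a format template filled from the CDX record, skipping templates whose placeholders are missing. A standalone sketch of that resolution step (hypothetical names):

    # Standalone sketch of the path-template resolution in warc_paths()/check().
    def resolve_paths(path_templates, cdx):
        for path in path_templates:
            try:
                yield path.format(**cdx)
            except KeyError:
                continue

    cdx = {'coll': 'test', 'filename': 'iana.warc.gz'}
    print(list(resolve_paths(['warcs/{coll}/{filename}', '{archive_dir}/{filename}'], cdx)))
    # ['warcs/test/iana.warc.gz']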
@@ -50,6 +73,8 @@ class WARCPathPrefixLoader(object):
         for n, v in record.rec_headers.headers:
             response.headers[n] = v

+        response.headers['WARC-Coll'] = cdx.get('source')
+
         return incr_reader(record.stream)


@@ -82,24 +107,33 @@ class LiveWebLoader(object):
                     b'content-location',
                     b'x-archive')

-    def __call__(self, cdx):
+    def __call__(self, cdx, params):
         load_url = cdx.get('load_url')
         if not load_url:
             return None

         recorder = HeaderRecorder(self.SKIP_HEADERS)

-        req_headers = {}
+        input_req = params['_input_req']
+
+        req_headers = input_req.get_req_headers(cdx['url'])

         dt = timestamp_to_datetime(cdx['timestamp'])

         if not cdx.get('is_live'):
             req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

-        upstream_res = remote_request(load_url,
+        method = input_req.get_req_method()
+        data = input_req.get_req_body()
+
+        upstream_res = remote_request(url=load_url,
+                                      method=method,
                                       recorder=recorder,
                                       stream=True,
-                                      headers=req_headers)
+                                      allow_redirects=False,
+                                      headers=req_headers,
+                                      data=data,
+                                      timeout=params.get('_timeout'))

         resp_headers = recorder.get_header()

@@ -109,6 +143,7 @@ class LiveWebLoader(object):
         #response.headers['WARC-Record-ID'] = self._make_warc_id()
         response.headers['WARC-Target-URI'] = cdx['url']
         response.headers['WARC-Date'] = self._make_date(dt)
+        response.headers['WARC-Coll'] = cdx.get('source', '')

         # Try to set content-length, if it is available and valid
         try:
@@ -131,3 +166,110 @@ class LiveWebLoader(object):
         id_ = uuid.uuid1()
         return '<urn:uuid:{0}>'.format(id_)

+
+#=============================================================================
+def to_cdxj(cdx_iter, fields):
+    response.headers['Content-Type'] = 'text/x-cdxj'
+    return [cdx.to_cdxj(fields) for cdx in cdx_iter]
+
+def to_json(cdx_iter, fields):
+    response.headers['Content-Type'] = 'application/x-ndjson'
+    return [cdx.to_json(fields) for cdx in cdx_iter]
+
+def to_text(cdx_iter, fields):
+    response.headers['Content-Type'] = 'text/plain'
+    return [cdx.to_text(fields) for cdx in cdx_iter]
+
+def to_link(cdx_iter, fields):
+    response.headers['Content-Type'] = 'application/link'
+    return MementoUtils.make_timemap(cdx_iter)
+
+
+#=============================================================================
+class IndexLoader(object):
+    OUTPUTS = {
+        'cdxj': to_cdxj,
+        'json': to_json,
+        'text': to_text,
+        'link': to_link,
+    }
+
+    DEF_OUTPUT = 'cdxj'
+
+    def __init__(self, index_source):
+        self.index_source = index_source
+
+    def __call__(self, params):
+        cdx_iter = self.index_source(params)
+
+        output = params.get('output', self.DEF_OUTPUT)
+        fields = params.get('fields')
+
+        handler = self.OUTPUTS.get(output)
+        if not handler:
+            handler = self.OUTPUTS[self.DEF_OUTPUT]
+
+        res = handler(cdx_iter, fields)
+        return res
+
+
+#=============================================================================
+class ResourceLoader(IndexLoader):
+    def __init__(self, index_source, resource_loaders):
+        super(ResourceLoader, self).__init__(index_source)
+        self.resource_loaders = resource_loaders
+
+    def __call__(self, params):
+        output = params.get('output')
+        if output != 'resource':
+            return super(ResourceLoader, self).__call__(params)
+
+        cdx_iter = self.index_source(params)
+
+        any_found = False
+
+        for cdx in cdx_iter:
+            any_found = True
+            cdx['coll'] = params.get('coll', '')
+
+            for loader in self.resource_loaders:
+                try:
+                    resp = loader(cdx, params)
+                    if resp:
+                        return resp
+                except ArchiveLoadFailed as e:
+                    print(e)
+                    pass
+
+        if any_found:
+            raise ArchiveLoadFailed('Resource Found, could not be Loaded')
+        else:
+            raise ArchiveLoadFailed('No Resource Found')
+
+
+#=============================================================================
+class DefaultResourceLoader(ResourceLoader):
+    def __init__(self, index_source, warc_paths=''):
+        loaders = [WARCPathLoader(warc_paths, index_source),
+                   LiveWebLoader()
+                  ]
+        super(DefaultResourceLoader, self).__init__(index_source, loaders)
+
+
+#=============================================================================
+class LoaderSeq(object):
+    def __init__(self, loaders):
+        self.loaders = loaders
+
+    def __call__(self, params):
+        for loader in self.loaders:
+            try:
+                res = loader(params)
+                if res:
+                    return res
+            except ArchiveLoadFailed:
+                pass
+
+        raise ArchiveLoadFailed('No Resource Found')
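The IndexLoader added above picks a serializer by the 'output' param ('cdxj', 'json', 'text', 'link'), falling back to cdxj, while ResourceLoader takes over when output is 'resource'. A standalone sketch of that dispatch, with hypothetical formatters:

    # Standalone sketch of the output dispatch used by IndexLoader.__call__().
    OUTPUTS = {
        'cdxj': lambda rows: '\n'.join(rows),
        'json': lambda rows: rows,
    }
    DEF_OUTPUT = 'cdxj'

    def render(rows, params):
        handler = OUTPUTS.get(params.get('output', DEF_OUTPUT), OUTPUTS[DEF_OUTPUT])
        return handler(rows)

    print(render(['com,example)/ 20140126 {...}'], {'output': 'cdxj'}))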
@@ -15,7 +15,7 @@ sources = {

 source = AggIndexSource(sources, timeout=5.0)

-def select_json(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source_name']):
+def select_json(cdxlist, fields=['timestamp', 'load_url', 'filename', 'source']):
     return list([json.loads(cdx.to_json(fields)) for cdx in cdxlist])


@@ -24,11 +24,11 @@ def test_agg_index_1():
     res = source(dict(url=url, closest='20140126000000', limit=5))


-    exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source_name": "ia"},
-           {"timestamp": "20140126200624", "filename": "iana.warc.gz", "source_name": "local"},
-           {"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source_name": "ia"},
-           {"timestamp": "20140129175203", "load_url": "http://web.archive.org/web/20140129175203id_/http://iana.org/", "source_name": "ia"},
-           {"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source_name": "ait"}
+    exp = [{"timestamp": "20140126093743", "load_url": "http://web.archive.org/web/20140126093743id_/http://iana.org/", "source": "ia"},
+           {"timestamp": "20140126200624", "filename": "iana.warc.gz", "source": "local"},
+           {"timestamp": "20140123034755", "load_url": "http://web.archive.org/web/20140123034755id_/http://iana.org/", "source": "ia"},
+           {"timestamp": "20140129175203", "load_url": "http://web.archive.org/web/20140129175203id_/http://iana.org/", "source": "ia"},
+           {"timestamp": "20140107040552", "load_url": "http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/", "source": "ait"}
           ]

     assert(select_json(res) == exp)
@@ -38,12 +38,12 @@ def test_agg_index_2():
     url = 'http://example.com/'
     res = source(dict(url=url, closest='20100512', limit=6))

-    exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source_name": "bl"},
-           {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source_name": "bl"},
-           {"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source_name": "ia"},
-           {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source_name": "ait"},
-           {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source_name": "ait"},
-           {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source_name": "ia"}]
+    exp = [{"timestamp": "20100513010014", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100513010014id_/http://example.com/", "source": "bl"},
+           {"timestamp": "20100512204410", "load_url": "http://www.webarchive.org.uk/wayback/archive/20100512204410id_/http://example.com/", "source": "bl"},
+           {"timestamp": "20100513052358", "load_url": "http://web.archive.org/web/20100513052358id_/http://example.com/", "source": "ia"},
+           {"timestamp": "20100511201151", "load_url": "http://wayback.archive-it.org/all/20100511201151id_/http://example.com/", "source": "ait"},
+           {"timestamp": "20100514231857", "load_url": "http://wayback.archive-it.org/all/20100514231857id_/http://example.com/", "source": "ait"},
+           {"timestamp": "20100514231857", "load_url": "http://web.archive.org/web/20100514231857id_/http://example.com/", "source": "ia"}]

     assert(select_json(res) == exp)

@@ -52,11 +52,22 @@ def test_agg_index_3():
     url = 'http://vvork.com/'
     res = source(dict(url=url, closest='20141001', limit=5))

-    exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source_name": "rhiz"},
-           {"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source_name": "ia"},
-           {"timestamp": "20141020161243", "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/", "source_name": "ia"},
-           {"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source_name": "ia"},
-           {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source_name": "ait"}]
+    exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
+           {"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"},
+           {"timestamp": "20141020161243", "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/", "source": "ia"},
+           {"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"},
+           {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]

     assert(select_json(res) == exp)

+
+def test_agg_index_4():
+    url = 'http://vvork.com/'
+    res = source(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
+
+    exp = [{"timestamp": "20141006184357", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
+           {"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
+
+    assert(select_json(res) == exp)
+
utils.py
@@ -1,4 +1,8 @@
 import re
+import six
+
+from pywb.utils.timeutils import timestamp_to_http_date
+

 LINK_SPLIT = re.compile(',\s*(?=[<])')
 LINK_SEG_SPLIT = re.compile(';\s*')
@@ -50,3 +54,42 @@ class MementoUtils(object):

         results['mementos'] = mementos
         return results
+
+    @staticmethod
+    def make_timemap_memento_link(cdx, datetime=None, rel='memento', end=',\n'):
+
+        url = cdx.get('load_url')
+        if not url:
+            url = 'filename://' + cdx.get('filename')
+
+        memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + end
+
+        if not datetime:
+            datetime = timestamp_to_http_date(cdx['timestamp'])
+
+        return memento.format(url, rel, datetime, cdx.get('source', ''))
+
+
+    @staticmethod
+    def make_timemap(cdx_iter):
+        # get first memento as it'll be used for 'from' field
+        try:
+            first_cdx = six.next(cdx_iter)
+            from_date = timestamp_to_http_date(first_cdx['timestamp'])
+        except StopIteration:
+            first_cdx = None
+
+        # first memento link
+        yield MementoUtils.make_timemap_memento_link(first_cdx, datetime=from_date)
+
+        prev_cdx = None
+
+        for cdx in cdx_iter:
+            if prev_cdx:
+                yield MementoUtils.make_timemap_memento_link(prev_cdx)
+
+            prev_cdx = cdx
+
+        # last memento link, if any
+        if prev_cdx:
+            yield MementoUtils.make_timemap_memento_link(prev_cdx, end='')
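For reference, one line produced by the new make_timemap_memento_link() format string might look like the output of this small reproduction (the URL, datetime, and src values are illustrative only):

    # Illustrative only: reproduce the shape of one timemap Link line using
    # the format string added in this commit.
    memento = '<{0}>; rel="{1}"; datetime="{2}"; src="{3}"' + ',\n'
    print(memento.format('http://web.archive.org/web/20140126093743id_/http://iana.org/',
                         'memento',
                         'Sun, 26 Jan 2014 09:37:43 GMT',
                         'ia'))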