mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
handlers: return out_headers directly instead of setting the bottle response; this confines the bottle dependency to app.py (to allow an alternate impl not using bottle)
param parsing: instead of setting custom _src_params and _all_src_params, use a custom ParamFormatter which checks the param dict for params with a prefix and custom name
This commit is contained in:
parent
bdda1b8c03
commit
20ebccc13e
@ -134,13 +134,14 @@ def test_handler_output_cdxj():
|
|||||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
handler = IndexHandler(agg)
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||||
com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
|
com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
assert(headers['Content-Type'] == 'text/x-cdxj')
|
||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
|
|
||||||
@ -149,13 +150,14 @@ def test_handler_output_json():
|
|||||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
handler = IndexHandler(agg)
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
|
||||||
{"urlkey": "com,vvork)/", "timestamp": "20131004231540", "url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
|
{"urlkey": "com,vvork)/", "timestamp": "20131004231540", "url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
assert(headers['Content-Type'] == 'application/x-ndjson')
|
||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
|
|
||||||
@ -163,12 +165,13 @@ def test_handler_output_link():
|
|||||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
handler = IndexHandler(agg)
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
|
<http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
|
||||||
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
|
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
|
||||||
"""
|
"""
|
||||||
|
assert(headers['Content-Type'] == 'application/link')
|
||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
|
|
||||||
@ -177,7 +180,7 @@ def test_handler_output_link_2():
|
|||||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
handler = IndexHandler(agg)
|
handler = IndexHandler(agg)
|
||||||
url = 'http://iana.org/'
|
url = 'http://iana.org/'
|
||||||
res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
|
<http://web.archive.org/web/20140126093743id_/http://iana.org/>; rel="memento"; datetime="Sun, 26 Jan 2014 09:37:43 GMT"; src="ia",
|
||||||
@ -186,6 +189,7 @@ def test_handler_output_link_2():
|
|||||||
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
|
<http://web.archive.org/web/20140129175203id_/http://iana.org/>; rel="memento"; datetime="Wed, 29 Jan 2014 17:52:03 GMT"; src="ia",
|
||||||
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
|
<http://wayback.archive-it.org/all/20140107040552id_/http://iana.org/>; rel="memento"; datetime="Tue, 07 Jan 2014 04:05:52 GMT"; src="ait"
|
||||||
"""
|
"""
|
||||||
|
assert(headers['Content-Type'] == 'application/link')
|
||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
|
exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
|
||||||
@ -199,10 +203,11 @@ def test_handler_output_link_3():
|
|||||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
handler = IndexHandler(agg)
|
handler = IndexHandler(agg)
|
||||||
url = 'http://foo.bar.non-existent'
|
url = 'http://foo.bar.non-existent'
|
||||||
res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
headers, res, errs = handler(dict(url=url, closest='20140126000000', limit=5, output='link'))
|
||||||
|
|
||||||
exp = ''
|
exp = ''
|
||||||
|
|
||||||
|
assert(headers['Content-Type'] == 'application/link')
|
||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
|
|
||||||
exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
|
exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
|
||||||
@ -216,12 +221,13 @@ def test_handler_output_text():
|
|||||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
handler = IndexHandler(agg)
|
handler = IndexHandler(agg)
|
||||||
url = 'http://vvork.com/'
|
url = 'http://vvork.com/'
|
||||||
res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
|
||||||
|
|
||||||
exp = """\
|
exp = """\
|
||||||
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
|
com,vvork)/ 20141006184357 http://www.vvork.com/ memento http://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ http://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
|
||||||
com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
|
com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
|
||||||
"""
|
"""
|
||||||
|
assert(headers['Content-Type'] == 'text/plain')
|
||||||
assert(''.join(res) == exp)
|
assert(''.join(res) == exp)
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
|
|
||||||
@ -229,8 +235,9 @@ com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive
|
|||||||
def test_handler_list_sources():
|
def test_handler_list_sources():
|
||||||
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
agg = GeventTimeoutAggregator(sources, timeout=5.0)
|
||||||
handler = IndexHandler(agg)
|
handler = IndexHandler(agg)
|
||||||
res, errs = handler(dict(mode='list_sources'))
|
headers, res, errs = handler(dict(mode='list_sources'))
|
||||||
|
|
||||||
|
assert(headers == {})
|
||||||
assert(res == {'sources': {'bl': 'memento',
|
assert(res == {'sources': {'bl': 'memento',
|
||||||
'ait': 'memento',
|
'ait': 'memento',
|
||||||
'ia': 'memento',
|
'ia': 'memento',
|
||||||
|
@ -17,6 +17,9 @@ from itertools import chain
|
|||||||
|
|
||||||
from webagg.indexsource import FileIndexSource
|
from webagg.indexsource import FileIndexSource
|
||||||
from pywb.utils.wbexception import NotFoundException, WbException
|
from pywb.utils.wbexception import NotFoundException, WbException
|
||||||
|
|
||||||
|
from webagg.utils import ParamFormatter, res_template
|
||||||
|
|
||||||
import six
|
import six
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
@ -28,43 +31,19 @@ class BaseAggregator(object):
|
|||||||
params['closest'] = timestamp_now()
|
params['closest'] = timestamp_now()
|
||||||
|
|
||||||
query = CDXQuery(params)
|
query = CDXQuery(params)
|
||||||
self._set_src_params(params)
|
|
||||||
|
|
||||||
cdx_iter, errs = self.load_index(query.params)
|
cdx_iter, errs = self.load_index(query.params)
|
||||||
|
|
||||||
cdx_iter = process_cdx(cdx_iter, query)
|
cdx_iter = process_cdx(cdx_iter, query)
|
||||||
return cdx_iter, dict(errs)
|
return cdx_iter, dict(errs)
|
||||||
|
|
||||||
def _set_src_params(self, params):
|
|
||||||
src_params = {}
|
|
||||||
for param, value in six.iteritems(params):
|
|
||||||
if not param.startswith('param.'):
|
|
||||||
continue
|
|
||||||
|
|
||||||
parts = param.split('.', 3)[1:]
|
|
||||||
|
|
||||||
if len(parts) == 2:
|
|
||||||
src = parts[0]
|
|
||||||
name = parts[1]
|
|
||||||
else:
|
|
||||||
src = ''
|
|
||||||
name = parts[0]
|
|
||||||
|
|
||||||
if not src in src_params:
|
|
||||||
src_params[src] = {}
|
|
||||||
|
|
||||||
src_params[src][name] = value
|
|
||||||
|
|
||||||
params['_all_src_params'] = src_params
|
|
||||||
|
|
||||||
def load_child_source_list(self, name, source, params):
|
def load_child_source_list(self, name, source, params):
|
||||||
res = self.load_child_source(name, source, params)
|
res = self.load_child_source(name, source, params)
|
||||||
return list(res[0]), res[1]
|
return list(res[0]), res[1]
|
||||||
|
|
||||||
def load_child_source(self, name, source, params):
|
def load_child_source(self, name, source, params):
|
||||||
try:
|
try:
|
||||||
_src_params = params['_all_src_params'].get(name)
|
params['_formatter'] = ParamFormatter(params, name)
|
||||||
params['_src_params'] = _src_params
|
|
||||||
res = source.load_index(params)
|
res = source.load_index(params)
|
||||||
if isinstance(res, tuple):
|
if isinstance(res, tuple):
|
||||||
cdx_iter, err_list = res
|
cdx_iter, err_list = res
|
||||||
@ -277,18 +256,7 @@ class BaseDirectoryIndexSource(BaseAggregator):
|
|||||||
self.base_dir = base_dir
|
self.base_dir = base_dir
|
||||||
|
|
||||||
def _iter_sources(self, params):
|
def _iter_sources(self, params):
|
||||||
self._set_src_params(params)
|
the_dir = res_template(self.base_dir, params)
|
||||||
# see if specific params (when part of another agg)
|
|
||||||
src_params = params.get('_src_params')
|
|
||||||
if not src_params:
|
|
||||||
# try default param. settings
|
|
||||||
src_params = params.get('_all_src_params', {}).get('')
|
|
||||||
|
|
||||||
if src_params:
|
|
||||||
the_dir = self.base_dir.format(**src_params)
|
|
||||||
else:
|
|
||||||
the_dir = self.base_dir
|
|
||||||
|
|
||||||
the_dir = os.path.join(self.base_prefix, the_dir)
|
the_dir = os.path.join(self.base_prefix, the_dir)
|
||||||
try:
|
try:
|
||||||
sources = list(self._load_files(the_dir))
|
sources = list(self._load_files(the_dir))
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
from webagg.liverec import request as remote_request
|
||||||
|
|
||||||
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
from webagg.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||||
from bottle import route, request, response, default_app, abort
|
from bottle import route, request, response, default_app, abort
|
||||||
import bottle
|
import bottle
|
||||||
@ -7,6 +9,42 @@ import json
|
|||||||
|
|
||||||
JSON_CT = 'application/json; charset=utf-8'
|
JSON_CT = 'application/json; charset=utf-8'
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
route_dict = {}
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
def add_route(path, handler):
|
||||||
|
@route([path, path + '/<mode:path>'], 'ANY')
|
||||||
|
@wrap_error
|
||||||
|
def direct_input_request(mode=''):
|
||||||
|
params = dict(request.query)
|
||||||
|
params['mode'] = mode
|
||||||
|
params['_input_req'] = DirectWSGIInputRequest(request.environ)
|
||||||
|
return handler(params)
|
||||||
|
|
||||||
|
@route([path + '/postreq', path + '/<mode:path>/postreq'], 'POST')
|
||||||
|
@wrap_error
|
||||||
|
def post_fullrequest(mode=''):
|
||||||
|
params = dict(request.query)
|
||||||
|
params['mode'] = mode
|
||||||
|
params['_input_req'] = POSTInputRequest(request.environ)
|
||||||
|
return handler(params)
|
||||||
|
|
||||||
|
global route_dict
|
||||||
|
handler_dict = handler.get_supported_modes()
|
||||||
|
route_dict[path] = handler_dict
|
||||||
|
route_dict[path + '/postreq'] = handler_dict
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
@route('/')
|
||||||
|
def list_routes():
|
||||||
|
return route_dict
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
def err_handler(exc):
|
def err_handler(exc):
|
||||||
response.status = exc.status_code
|
response.status = exc.status_code
|
||||||
response.content_type = JSON_CT
|
response.content_type = JSON_CT
|
||||||
@ -15,10 +53,15 @@ def err_handler(exc):
|
|||||||
return err_msg
|
return err_msg
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
def wrap_error(func):
|
def wrap_error(func):
|
||||||
def wrap_func(*args, **kwargs):
|
def wrap_func(*args, **kwargs):
|
||||||
try:
|
try:
|
||||||
res, errs = func(*args, **kwargs)
|
out_headers, res, errs = func(*args, **kwargs)
|
||||||
|
|
||||||
|
if out_headers:
|
||||||
|
for n, v in out_headers.items():
|
||||||
|
response.headers[n] = v
|
||||||
|
|
||||||
if res:
|
if res:
|
||||||
if errs:
|
if errs:
|
||||||
@ -53,36 +96,7 @@ def wrap_error(func):
|
|||||||
return wrap_func
|
return wrap_func
|
||||||
|
|
||||||
|
|
||||||
route_dict = {}
|
#=============================================================================
|
||||||
|
|
||||||
def add_route(path, handler):
|
|
||||||
@route([path, path + '/<mode:path>'], 'ANY')
|
|
||||||
@wrap_error
|
|
||||||
def direct_input_request(mode=''):
|
|
||||||
params = dict(request.query)
|
|
||||||
params['mode'] = mode
|
|
||||||
params['_input_req'] = DirectWSGIInputRequest(request.environ)
|
|
||||||
return handler(params)
|
|
||||||
|
|
||||||
@route([path + '/postreq', path + '/<mode:path>/postreq'], 'POST')
|
|
||||||
@wrap_error
|
|
||||||
def post_fullrequest(mode=''):
|
|
||||||
params = dict(request.query)
|
|
||||||
params['mode'] = mode
|
|
||||||
params['_input_req'] = POSTInputRequest(request.environ)
|
|
||||||
return handler(params)
|
|
||||||
|
|
||||||
global route_dict
|
|
||||||
handler_dict = handler.get_supported_modes()
|
|
||||||
route_dict[path] = handler_dict
|
|
||||||
route_dict[path + '/postreq'] = handler_dict
|
|
||||||
|
|
||||||
|
|
||||||
@route('/')
|
|
||||||
def list_routes():
|
|
||||||
return route_dict
|
|
||||||
|
|
||||||
|
|
||||||
application = default_app()
|
application = default_app()
|
||||||
application.default_error_handler = err_handler
|
application.default_error_handler = err_handler
|
||||||
|
|
||||||
|
@ -2,25 +2,24 @@ from webagg.responseloader import WARCPathLoader, LiveWebLoader
|
|||||||
from webagg.utils import MementoUtils
|
from webagg.utils import MementoUtils
|
||||||
from pywb.utils.wbexception import BadRequestException, WbException
|
from pywb.utils.wbexception import BadRequestException, WbException
|
||||||
from pywb.utils.wbexception import NotFoundException
|
from pywb.utils.wbexception import NotFoundException
|
||||||
from bottle import response
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
def to_cdxj(cdx_iter, fields):
|
def to_cdxj(cdx_iter, fields):
|
||||||
response.headers['Content-Type'] = 'text/x-cdxj'
|
content_type = 'text/x-cdxj'
|
||||||
return [cdx.to_cdxj(fields) for cdx in cdx_iter]
|
return content_type, (cdx.to_cdxj(fields) for cdx in cdx_iter)
|
||||||
|
|
||||||
def to_json(cdx_iter, fields):
|
def to_json(cdx_iter, fields):
|
||||||
response.headers['Content-Type'] = 'application/x-ndjson'
|
content_type = 'application/x-ndjson'
|
||||||
return [cdx.to_json(fields) for cdx in cdx_iter]
|
return content_type, (cdx.to_json(fields) for cdx in cdx_iter)
|
||||||
|
|
||||||
def to_text(cdx_iter, fields):
|
def to_text(cdx_iter, fields):
|
||||||
response.headers['Content-Type'] = 'text/plain'
|
content_type = 'text/plain'
|
||||||
return [cdx.to_text(fields) for cdx in cdx_iter]
|
return content_type, (cdx.to_text(fields) for cdx in cdx_iter)
|
||||||
|
|
||||||
def to_link(cdx_iter, fields):
|
def to_link(cdx_iter, fields):
|
||||||
response.headers['Content-Type'] = 'application/link'
|
content_type = 'application/link'
|
||||||
return MementoUtils.make_timemap(cdx_iter)
|
return content_type, MementoUtils.make_timemap(cdx_iter)
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -56,10 +55,10 @@ class IndexHandler(object):
|
|||||||
def __call__(self, params):
|
def __call__(self, params):
|
||||||
mode = params.get('mode', 'index')
|
mode = params.get('mode', 'index')
|
||||||
if mode == 'list_sources':
|
if mode == 'list_sources':
|
||||||
return self.index_source.get_source_list(params), {}
|
return {}, self.index_source.get_source_list(params), {}
|
||||||
|
|
||||||
if mode != 'index':
|
if mode != 'index':
|
||||||
return self.get_supported_modes(), {}
|
return {}, self.get_supported_modes(), {}
|
||||||
|
|
||||||
output = params.get('output', self.DEF_OUTPUT)
|
output = params.get('output', self.DEF_OUTPUT)
|
||||||
fields = params.get('fields')
|
fields = params.get('fields')
|
||||||
@ -67,14 +66,15 @@ class IndexHandler(object):
|
|||||||
handler = self.OUTPUTS.get(output)
|
handler = self.OUTPUTS.get(output)
|
||||||
if not handler:
|
if not handler:
|
||||||
errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output)))
|
errs = dict(last_exc=BadRequestException('output={0} not supported'.format(output)))
|
||||||
return None, errs
|
return None, None, errs
|
||||||
|
|
||||||
cdx_iter, errs = self._load_index_source(params)
|
cdx_iter, errs = self._load_index_source(params)
|
||||||
if not cdx_iter:
|
if not cdx_iter:
|
||||||
return None, errs
|
return None, None, errs
|
||||||
|
|
||||||
res = handler(cdx_iter, fields)
|
content_type, res = handler(cdx_iter, fields)
|
||||||
return res, errs
|
out_headers = {'Content-Type': content_type}
|
||||||
|
return out_headers, res, errs
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -94,16 +94,16 @@ class ResourceHandler(IndexHandler):
|
|||||||
|
|
||||||
cdx_iter, errs = self._load_index_source(params)
|
cdx_iter, errs = self._load_index_source(params)
|
||||||
if not cdx_iter:
|
if not cdx_iter:
|
||||||
return None, errs
|
return None, None, errs
|
||||||
|
|
||||||
last_exc = None
|
last_exc = None
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
for loader in self.resource_loaders:
|
for loader in self.resource_loaders:
|
||||||
try:
|
try:
|
||||||
resp = loader(cdx, params)
|
out_headers, resp = loader(cdx, params)
|
||||||
if resp is not None:
|
if resp is not None:
|
||||||
return resp, errs
|
return out_headers, resp, errs
|
||||||
except WbException as e:
|
except WbException as e:
|
||||||
last_exc = e
|
last_exc = e
|
||||||
errs[str(loader)] = repr(e)
|
errs[str(loader)] = repr(e)
|
||||||
@ -111,7 +111,7 @@ class ResourceHandler(IndexHandler):
|
|||||||
if last_exc:
|
if last_exc:
|
||||||
errs['last_exc'] = last_exc
|
errs['last_exc'] = last_exc
|
||||||
|
|
||||||
return None, errs
|
return None, None, errs
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
@ -137,11 +137,11 @@ class HandlerSeq(object):
|
|||||||
def __call__(self, params):
|
def __call__(self, params):
|
||||||
all_errs = {}
|
all_errs = {}
|
||||||
for handler in self.handlers:
|
for handler in self.handlers:
|
||||||
res, errs = handler(params)
|
out_headers, res, errs = handler(params)
|
||||||
all_errs.update(errs)
|
all_errs.update(errs)
|
||||||
if res is not None:
|
if res is not None:
|
||||||
return res, all_errs
|
return out_headers, res, all_errs
|
||||||
|
|
||||||
return None, all_errs
|
return None, None, all_errs
|
||||||
|
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@ from pywb.cdx.query import CDXQuery
|
|||||||
|
|
||||||
from webagg.liverec import patched_requests as requests
|
from webagg.liverec import patched_requests as requests
|
||||||
|
|
||||||
|
from webagg.utils import ParamFormatter, res_template
|
||||||
from webagg.utils import MementoUtils
|
from webagg.utils import MementoUtils
|
||||||
|
|
||||||
|
|
||||||
@ -22,15 +23,6 @@ class BaseIndexSource(object):
|
|||||||
def load_index(self, params): #pragma: no cover
|
def load_index(self, params): #pragma: no cover
|
||||||
raise NotImplemented()
|
raise NotImplemented()
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def res_template(template, params):
|
|
||||||
src_params = params.get('_src_params')
|
|
||||||
if not src_params:
|
|
||||||
res = template.format(url=params['url'])
|
|
||||||
else:
|
|
||||||
res = template.format(url=params['url'], **src_params)
|
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class FileIndexSource(BaseIndexSource):
|
class FileIndexSource(BaseIndexSource):
|
||||||
@ -38,7 +30,7 @@ class FileIndexSource(BaseIndexSource):
|
|||||||
self.filename_template = filename
|
self.filename_template = filename
|
||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
filename = self.res_template(self.filename_template, params)
|
filename = res_template(self.filename_template, params)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
fh = open(filename, 'rb')
|
fh = open(filename, 'rb')
|
||||||
@ -64,7 +56,7 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
self.replay_url = replay_url
|
self.replay_url = replay_url
|
||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
api_url = self.res_template(self.api_url_template, params)
|
api_url = res_template(self.api_url_template, params)
|
||||||
r = requests.get(api_url, timeout=params.get('_timeout'))
|
r = requests.get(api_url, timeout=params.get('_timeout'))
|
||||||
if r.status_code >= 400:
|
if r.status_code >= 400:
|
||||||
raise NotFoundException(api_url)
|
raise NotFoundException(api_url)
|
||||||
@ -73,7 +65,9 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
def do_load(lines):
|
def do_load(lines):
|
||||||
for line in lines:
|
for line in lines:
|
||||||
cdx = CDXObject(line)
|
cdx = CDXObject(line)
|
||||||
cdx['load_url'] = self.replay_url.format(timestamp=cdx['timestamp'], url=cdx['url'])
|
cdx['load_url'] = self.replay_url.format(
|
||||||
|
timestamp=cdx['timestamp'],
|
||||||
|
url=cdx['url'])
|
||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
return do_load(lines)
|
return do_load(lines)
|
||||||
@ -114,7 +108,7 @@ class RedisIndexSource(BaseIndexSource):
|
|||||||
self.redis = redis.StrictRedis.from_url(redis_url)
|
self.redis = redis.StrictRedis.from_url(redis_url)
|
||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
z_key = self.res_template(self.redis_key_template, params)
|
z_key = res_template(self.redis_key_template, params)
|
||||||
index_list = self.redis.zrangebylex(z_key,
|
index_list = self.redis.zrangebylex(z_key,
|
||||||
b'[' + params['key'],
|
b'[' + params['key'],
|
||||||
b'(' + params['end_key'])
|
b'(' + params['end_key'])
|
||||||
@ -173,7 +167,7 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
def get_timegate_links(self, params, closest):
|
def get_timegate_links(self, params, closest):
|
||||||
url = self.res_template(self.timegate_url, params)
|
url = res_template(self.timegate_url, params)
|
||||||
accept_dt = timestamp_to_http_date(closest)
|
accept_dt = timestamp_to_http_date(closest)
|
||||||
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
|
res = requests.head(url, headers={'Accept-Datetime': accept_dt})
|
||||||
if res.status_code >= 400:
|
if res.status_code >= 400:
|
||||||
@ -182,7 +176,7 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
return res.headers.get('Link')
|
return res.headers.get('Link')
|
||||||
|
|
||||||
def get_timemap_links(self, params):
|
def get_timemap_links(self, params):
|
||||||
url = self.res_template(self.timemap_url, params)
|
url = res_template(self.timemap_url, params)
|
||||||
res = requests.get(url, timeout=params.get('_timeout'))
|
res = requests.get(url, timeout=params.get('_timeout'))
|
||||||
if res.status_code >= 400:
|
if res.status_code >= 400:
|
||||||
raise NotFoundException(url)
|
raise NotFoundException(url)
|
||||||
|
@ -9,7 +9,6 @@ from pywb.utils.wbexception import LiveResourceException
|
|||||||
from pywb.warc.resolvingloader import ResolvingLoader
|
from pywb.warc.resolvingloader import ResolvingLoader
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from bottle import response
|
|
||||||
|
|
||||||
import uuid
|
import uuid
|
||||||
import six
|
import six
|
||||||
@ -52,19 +51,19 @@ class StreamIter(six.Iterator):
|
|||||||
#=============================================================================
|
#=============================================================================
|
||||||
class BaseLoader(object):
|
class BaseLoader(object):
|
||||||
def __call__(self, cdx, params):
|
def __call__(self, cdx, params):
|
||||||
res = self._load_resource(cdx, params)
|
out_headers, res = self._load_resource(cdx, params)
|
||||||
if not res:
|
if not res:
|
||||||
return res
|
return None, None
|
||||||
|
|
||||||
response.headers['WARC-Coll'] = cdx.get('source', '')
|
out_headers['WARC-Coll'] = cdx.get('source', '')
|
||||||
|
|
||||||
response.headers['Link'] = MementoUtils.make_link(
|
out_headers['Link'] = MementoUtils.make_link(
|
||||||
response.headers['WARC-Target-URI'],
|
out_headers['WARC-Target-URI'],
|
||||||
'original')
|
'original')
|
||||||
|
|
||||||
memento_dt = iso_date_to_datetime(response.headers['WARC-Date'])
|
memento_dt = iso_date_to_datetime(out_headers['WARC-Date'])
|
||||||
response.headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
|
out_headers['Memento-Datetime'] = datetime_to_http_date(memento_dt)
|
||||||
return res
|
return out_headers, res
|
||||||
|
|
||||||
def _load_resource(self, cdx, params): #pragma: no cover
|
def _load_resource(self, cdx, params): #pragma: no cover
|
||||||
raise NotImplemented()
|
raise NotImplemented()
|
||||||
@ -91,8 +90,8 @@ class WARCPathLoader(BaseLoader):
|
|||||||
for path in self.paths:
|
for path in self.paths:
|
||||||
def check(filename, cdx):
|
def check(filename, cdx):
|
||||||
try:
|
try:
|
||||||
if hasattr(cdx, '_src_params') and cdx._src_params:
|
if hasattr(cdx, '_formatter') and cdx._formatter:
|
||||||
full_path = path.format(**cdx._src_params)
|
full_path = cdx._formatter.format(path)
|
||||||
else:
|
else:
|
||||||
full_path = path
|
full_path = path
|
||||||
full_path += filename
|
full_path += filename
|
||||||
@ -104,9 +103,9 @@ class WARCPathLoader(BaseLoader):
|
|||||||
|
|
||||||
def _load_resource(self, cdx, params):
|
def _load_resource(self, cdx, params):
|
||||||
if not cdx.get('filename') or cdx.get('offset') is None:
|
if not cdx.get('filename') or cdx.get('offset') is None:
|
||||||
return None
|
return None, None
|
||||||
|
|
||||||
cdx._src_params = params.get('_src_params')
|
cdx._formatter = params.get('_formatter')
|
||||||
failed_files = []
|
failed_files = []
|
||||||
headers, payload = (self.resolve_loader.
|
headers, payload = (self.resolve_loader.
|
||||||
load_headers_and_payload(cdx,
|
load_headers_and_payload(cdx,
|
||||||
@ -114,18 +113,19 @@ class WARCPathLoader(BaseLoader):
|
|||||||
self.cdx_index_source))
|
self.cdx_index_source))
|
||||||
|
|
||||||
record = payload
|
record = payload
|
||||||
|
out_headers = {}
|
||||||
|
|
||||||
for n, v in record.rec_headers.headers:
|
for n, v in record.rec_headers.headers:
|
||||||
response.headers[n] = v
|
out_headers[n] = v
|
||||||
|
|
||||||
if headers != payload:
|
if headers != payload:
|
||||||
response.headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
|
out_headers['WARC-Target-URI'] = headers.rec_headers.get_header('WARC-Target-URI')
|
||||||
response.headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
|
out_headers['WARC-Date'] = headers.rec_headers.get_header('WARC-Date')
|
||||||
response.headers['WARC-Refers-To-Target-URI'] = payload.rec_headers.get_header('WARC-Target-URI')
|
out_headers['WARC-Refers-To-Target-URI'] = payload.rec_headers.get_header('WARC-Target-URI')
|
||||||
response.headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
|
out_headers['WARC-Refers-To-Date'] = payload.rec_headers.get_header('WARC-Date')
|
||||||
headers.stream.close()
|
headers.stream.close()
|
||||||
|
|
||||||
return StreamIter(record.stream)
|
return out_headers, StreamIter(record.stream)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'WARCPathLoader'
|
return 'WARCPathLoader'
|
||||||
@ -137,6 +137,7 @@ class HeaderRecorder(BaseRecorder):
|
|||||||
self.buff = BytesIO()
|
self.buff = BytesIO()
|
||||||
self.skip_list = skip_list
|
self.skip_list = skip_list
|
||||||
self.skipped = []
|
self.skipped = []
|
||||||
|
self.target_ip = None
|
||||||
|
|
||||||
def write_response_header_line(self, line):
|
def write_response_header_line(self, line):
|
||||||
if self.accept_header(line):
|
if self.accept_header(line):
|
||||||
@ -152,6 +153,11 @@ class HeaderRecorder(BaseRecorder):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def finish_request(self, socket):
    """Remember the remote address of ``socket`` as ``self.target_ip``.

    ``socket.getpeername()`` is queried once; when it yields a truthy
    value, its first element is stored. Otherwise ``target_ip`` is left
    untouched.
    """
    peer = socket.getpeername()
    if not peer:
        return

    # First element of the peer address is taken as the IP
    self.target_ip = peer[0]
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class LiveWebLoader(BaseLoader):
|
class LiveWebLoader(BaseLoader):
|
||||||
@ -163,7 +169,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
def _load_resource(self, cdx, params):
|
def _load_resource(self, cdx, params):
|
||||||
load_url = cdx.get('load_url')
|
load_url = cdx.get('load_url')
|
||||||
if not load_url:
|
if not load_url:
|
||||||
return None
|
return None, None
|
||||||
|
|
||||||
recorder = HeaderRecorder(self.SKIP_HEADERS)
|
recorder = HeaderRecorder(self.SKIP_HEADERS)
|
||||||
|
|
||||||
@ -200,30 +206,33 @@ class LiveWebLoader(BaseLoader):
|
|||||||
|
|
||||||
resp_headers = recorder.get_header()
|
resp_headers = recorder.get_header()
|
||||||
|
|
||||||
response.headers['Content-Type'] = 'application/http; msgtype=response'
|
out_headers = {}
|
||||||
|
out_headers['Content-Type'] = 'application/http; msgtype=response'
|
||||||
|
|
||||||
#response.headers['WARC-Type'] = 'response'
|
out_headers['WARC-Type'] = 'response'
|
||||||
#response.headers['WARC-Record-ID'] = self._make_warc_id()
|
out_headers['WARC-Record-ID'] = self._make_warc_id()
|
||||||
response.headers['WARC-Target-URI'] = cdx['url']
|
out_headers['WARC-Target-URI'] = cdx['url']
|
||||||
response.headers['WARC-Date'] = self._make_date(dt)
|
out_headers['WARC-Date'] = self._make_date(dt)
|
||||||
|
if recorder.target_ip:
|
||||||
|
out_headers['WARC-IP-Address'] = recorder.target_ip
|
||||||
|
|
||||||
# Try to set content-length, if it is available and valid
|
# Try to set content-length, if it is available and valid
|
||||||
try:
|
try:
|
||||||
content_len = int(upstream_res.headers.get('content-length', 0))
|
content_len = int(upstream_res.headers.get('content-length', 0))
|
||||||
if content_len > 0:
|
if content_len > 0:
|
||||||
content_len += len(resp_headers)
|
content_len += len(resp_headers)
|
||||||
response.headers['Content-Length'] = content_len
|
out_headers['Content-Length'] = content_len
|
||||||
except (KeyError, TypeError):
|
except (KeyError, TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return StreamIter(upstream_res.raw, header=resp_headers)
|
return out_headers, StreamIter(upstream_res.raw, header=resp_headers)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _make_date(dt):
|
def _make_date(dt):
|
||||||
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
|
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _make_warc_id(id_=None): #pragma: no cover
|
def _make_warc_id(id_=None):
|
||||||
if not id_:
|
if not id_:
|
||||||
id_ = uuid.uuid1()
|
id_ = uuid.uuid1()
|
||||||
return '<urn:uuid:{0}>'.format(id_)
|
return '<urn:uuid:{0}>'.format(id_)
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
import six
|
import six
|
||||||
|
import string
|
||||||
|
|
||||||
from pywb.utils.timeutils import timestamp_to_http_date
|
from pywb.utils.timeutils import timestamp_to_http_date
|
||||||
from pywb.utils.wbexception import BadRequestException
|
from pywb.utils.wbexception import BadRequestException
|
||||||
@ -10,12 +11,12 @@ LINK_URL = re.compile('<(.*)>')
|
|||||||
LINK_PROP = re.compile('([\w]+)="([^"]+)')
|
LINK_PROP = re.compile('([\w]+)="([^"]+)')
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=============================================================================
|
||||||
class MementoException(BadRequestException):
    """BadRequestException subclass for this module's errors.

    NOTE(review): presumably raised by the Memento link-parsing helpers;
    confirm against the callers.
    """
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=============================================================================
|
||||||
class MementoUtils(object):
|
class MementoUtils(object):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_links(link_header, def_name='timemap'):
|
def parse_links(link_header, def_name='timemap'):
|
||||||
@ -102,3 +103,42 @@ class MementoUtils(object):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def make_link(url, type):
|
def make_link(url, type):
|
||||||
return '<{0}>; rel="{1}"'.format(url, type)
|
return '<{0}>; rel="{1}"'.format(url, type)
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
class ParamFormatter(string.Formatter):
    """Formatter that resolves template fields from a params dict.

    For a named field ``{key}`` the value is looked up, in order:

    1. ``<prefix><name>.<key>`` in ``params`` (only when ``name`` is set)
    2. ``<prefix><key>`` in ``params``
    3. ``key`` in the keyword arguments passed to ``format()``

    A named field found in none of these renders as an empty string.
    Positional fields (``{0}``, ``{}``) are resolved from the positional
    arguments passed to ``format()``, as in ``string.Formatter``.
    """

    def __init__(self, params, name='', prefix='param.'):
        """
        :param params: dict searched for prefixed parameter values
        :param name: optional name used to build the more specific
                     ``<prefix><name>.<key>`` lookup
        :param prefix: prefix prepended to every key looked up in ``params``
        """
        self.params = params
        self.prefix = prefix
        self.name = name

    def get_value(self, key, args, kwargs):
        """Return the value for field ``key`` (see class docstring)."""
        # string.Formatter passes positional fields as integers; they
        # cannot be concatenated with the string prefix, so defer to the
        # standard positional lookup instead of failing with TypeError.
        if isinstance(key, int):
            return string.Formatter.get_value(self, key, args, kwargs)

        # Most specific key first: 'param.{name}.{key}', then 'param.{key}'
        candidates = []
        if self.name:
            candidates.append(self.prefix + self.name + '.' + key)
        candidates.append(self.prefix + key)

        for named_key in candidates:
            value = self.params.get(named_key)
            if value is not None:
                return value

        # Fall back to the plain '{key}' keyword argument, or '' if absent
        return kwargs.get(key, '')
|
||||||
|
|
||||||
|
|
||||||
|
#=============================================================================
|
||||||
|
def res_template(template, params):
    """Expand ``template`` using the request ``params``.

    The formatter stored under ``params['_formatter']`` is used when it
    is present and truthy; otherwise a ``ParamFormatter`` is built from
    ``params``. ``params['url']`` is always supplied to the template as
    the ``url`` keyword, so a missing ``'url'`` entry raises ``KeyError``.

    :param template: format string to expand
    :param params: dict of request parameters
    :return: the formatted string
    """
    formatter = params.get('_formatter') or ParamFormatter(params)
    return formatter.format(template, url=params['url'])
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user