mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
webagg improvements:
responseloader: direct loader: unrewrite location, content-location headers for non-live responses; autoapp: support custom indexsource list; indexsource: ensure closest query is added for RemoteIndexSource; utils: res_template: urlencode '{url}' param if after '?'
This commit is contained in:
parent
cbe7508afc
commit
74276f58f3
@ -31,10 +31,6 @@ SOURCE_LIST = [LiveIndexSource,
|
|||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class AutoConfigApp(ResAggApp):
|
class AutoConfigApp(ResAggApp):
|
||||||
@staticmethod
|
|
||||||
def register_source(source_cls):
|
|
||||||
SOURCE_LIST.append(source_cls)
|
|
||||||
|
|
||||||
def __init__(self, config_file='./config.yaml'):
|
def __init__(self, config_file='./config.yaml'):
|
||||||
config = load_yaml_config(DEFAULT_CONFIG)
|
config = load_yaml_config(DEFAULT_CONFIG)
|
||||||
|
|
||||||
@ -162,15 +158,16 @@ class AutoConfigApp(ResAggApp):
|
|||||||
return HandlerSeq(handlers)
|
return HandlerSeq(handlers)
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
def init_index_source(value):
|
def init_index_source(value, source_list=None):
|
||||||
|
source_list = source_list or SOURCE_LIST
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
for source_cls in SOURCE_LIST:
|
for source_cls in source_list:
|
||||||
source = source_cls.init_from_string(value)
|
source = source_cls.init_from_string(value)
|
||||||
if source:
|
if source:
|
||||||
return source
|
return source
|
||||||
|
|
||||||
elif isinstance(value, dict):
|
elif isinstance(value, dict):
|
||||||
for source_cls in SOURCE_LIST:
|
for source_cls in source_list:
|
||||||
source = source_cls.init_from_config(value)
|
source = source_cls.init_from_config(value)
|
||||||
if source:
|
if source:
|
||||||
return source
|
return source
|
||||||
@ -182,10 +179,15 @@ def init_index_source(value):
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
def init_index_agg(source_configs, use_gevent=False, timeout=0):
|
def register_source(source_cls):
|
||||||
|
SOURCE_LIST.append(source_cls)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None):
|
||||||
sources = {}
|
sources = {}
|
||||||
for n, v in iteritems(source_configs):
|
for n, v in iteritems(source_configs):
|
||||||
sources[n] = init_index_source(v)
|
sources[n] = init_index_source(v, source_list=source_list)
|
||||||
|
|
||||||
if use_gevent:
|
if use_gevent:
|
||||||
return GeventTimeoutAggregator(sources, timeout=timeout)
|
return GeventTimeoutAggregator(sources, timeout=timeout)
|
||||||
|
@ -26,11 +26,6 @@ def to_link(cdx_iter, fields):
|
|||||||
content_type = 'application/link'
|
content_type = 'application/link'
|
||||||
return content_type, MementoUtils.make_timemap(cdx_iter)
|
return content_type, MementoUtils.make_timemap(cdx_iter)
|
||||||
|
|
||||||
def to_raw(cdx_iter, fields):
|
|
||||||
content_type = 'cdx'
|
|
||||||
return content_type, cdx_iter
|
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class FuzzyMatcher(object):
|
class FuzzyMatcher(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -95,6 +95,9 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
lines = r.content.strip().split(b'\n')
|
lines = r.content.strip().split(b'\n')
|
||||||
def do_load(lines):
|
def do_load(lines):
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
|
||||||
cdx = CDXObject(line)
|
cdx = CDXObject(line)
|
||||||
self._set_load_url(cdx)
|
self._set_load_url(cdx)
|
||||||
yield cdx
|
yield cdx
|
||||||
@ -140,7 +143,7 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
# add specified coll, if any
|
# add specified coll, if any
|
||||||
replay = url.rsplit('/', 1)[0] + coll + '/' + WAYBACK_ORIG_SUFFIX
|
replay = url.rsplit('/', 1)[0] + coll + '/' + WAYBACK_ORIG_SUFFIX
|
||||||
|
|
||||||
url += '?url={url}'
|
url += '?url={url}&closest={timestamp}&sort=closest'
|
||||||
|
|
||||||
return cls(url, replay)
|
return cls(url, replay)
|
||||||
|
|
||||||
@ -201,15 +204,15 @@ class LiveIndexSource(BaseIndexSource):
|
|||||||
#=============================================================================
|
#=============================================================================
|
||||||
class RedisIndexSource(BaseIndexSource):
|
class RedisIndexSource(BaseIndexSource):
|
||||||
def __init__(self, redis_url, redis=None, key_template=None):
|
def __init__(self, redis_url, redis=None, key_template=None):
|
||||||
if redis_url and not redis:
|
if redis_url:
|
||||||
redis, key_template = self.parse_redis_url(redis_url)
|
redis, key_template = self.parse_redis_url(redis_url, redis)
|
||||||
|
|
||||||
self.redis_url = redis_url
|
self.redis_url = redis_url
|
||||||
self.redis = redis
|
self.redis = redis
|
||||||
self.redis_key_template = key_template
|
self.redis_key_template = key_template
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_redis_url(redis_url):
|
def parse_redis_url(redis_url, redis_=None):
|
||||||
parts = redis_url.split('/')
|
parts = redis_url.split('/')
|
||||||
key_prefix = ''
|
key_prefix = ''
|
||||||
if len(parts) > 4:
|
if len(parts) > 4:
|
||||||
@ -217,8 +220,9 @@ class RedisIndexSource(BaseIndexSource):
|
|||||||
redis_url = 'redis://' + parts[2] + '/' + parts[3]
|
redis_url = 'redis://' + parts[2] + '/' + parts[3]
|
||||||
|
|
||||||
redis_key_template = key_prefix
|
redis_key_template = key_prefix
|
||||||
red = redis.StrictRedis.from_url(redis_url)
|
if not redis_:
|
||||||
return red, key_prefix
|
redis_ = redis.StrictRedis.from_url(redis_url)
|
||||||
|
return redis_, key_prefix
|
||||||
|
|
||||||
def load_index(self, params):
|
def load_index(self, params):
|
||||||
return self.load_key_index(self.redis_key_template, params)
|
return self.load_key_index(self.redis_key_template, params)
|
||||||
|
@ -272,6 +272,8 @@ class LiveWebLoader(BaseLoader):
|
|||||||
'content-location',
|
'content-location',
|
||||||
'x-archive')
|
'x-archive')
|
||||||
|
|
||||||
|
UNREWRITE_HEADERS = ('location', 'content-location')
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.num_retries = 3
|
self.num_retries = 3
|
||||||
self.num_pools = 10
|
self.num_pools = 10
|
||||||
@ -342,7 +344,7 @@ class LiveWebLoader(BaseLoader):
|
|||||||
|
|
||||||
self.raise_on_self_redirect(params, cdx,
|
self.raise_on_self_redirect(params, cdx,
|
||||||
str(upstream_res.status),
|
str(upstream_res.status),
|
||||||
upstream_res.headers.get('Location'))
|
self.unrewrite_header(cdx, upstream_res.headers.get('Location')))
|
||||||
|
|
||||||
|
|
||||||
if upstream_res.version == 11:
|
if upstream_res.version == 11:
|
||||||
@ -363,18 +365,30 @@ class LiveWebLoader(BaseLoader):
|
|||||||
#PY 3
|
#PY 3
|
||||||
resp_headers = orig_resp.headers._headers
|
resp_headers = orig_resp.headers._headers
|
||||||
for n, v in resp_headers:
|
for n, v in resp_headers:
|
||||||
if n.lower() in self.SKIP_HEADERS:
|
nl = n.lower()
|
||||||
|
if nl in self.SKIP_HEADERS:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if nl in self.UNREWRITE_HEADERS:
|
||||||
|
v = self.unrewrite_header(cdx, v)
|
||||||
|
|
||||||
http_headers_buff += n + ': ' + v + '\r\n'
|
http_headers_buff += n + ': ' + v + '\r\n'
|
||||||
except: #pragma: no cover
|
except: #pragma: no cover
|
||||||
#PY 2
|
#PY 2
|
||||||
resp_headers = orig_resp.msg.headers
|
resp_headers = orig_resp.msg.headers
|
||||||
for n, v in zip(orig_resp.getheaders(), resp_headers):
|
for (n, v), line in zip(orig_resp.getheaders(), resp_headers):
|
||||||
if n in self.SKIP_HEADERS:
|
if n in self.SKIP_HEADERS:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
http_headers_buff += v
|
new_v = v
|
||||||
|
if n in self.UNREWRITE_HEADERS:
|
||||||
|
new_v = self.unrewrite_header(cdx, v)
|
||||||
|
|
||||||
|
if new_v != v:
|
||||||
|
http_headers_buff += n + ': ' + new_v + '\r\n'
|
||||||
|
else:
|
||||||
|
http_headers_buff += line
|
||||||
|
|
||||||
|
|
||||||
http_headers_buff += '\r\n'
|
http_headers_buff += '\r\n'
|
||||||
http_headers_buff = http_headers_buff.encode('latin-1')
|
http_headers_buff = http_headers_buff.encode('latin-1')
|
||||||
@ -405,6 +419,19 @@ class LiveWebLoader(BaseLoader):
|
|||||||
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
|
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
|
||||||
return (warc_headers, http_headers_buff, upstream_res)
|
return (warc_headers, http_headers_buff, upstream_res)
|
||||||
|
|
||||||
|
def unrewrite_header(self, cdx, value):
|
||||||
|
if not value:
|
||||||
|
return value
|
||||||
|
|
||||||
|
if cdx.get('is_live'):
|
||||||
|
return value
|
||||||
|
|
||||||
|
inx = value.find('/http', 1)
|
||||||
|
if inx < 1:
|
||||||
|
return value
|
||||||
|
|
||||||
|
return value[inx + 1:]
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'LiveWebLoader'
|
return 'LiveWebLoader'
|
||||||
|
|
||||||
|
@ -49,7 +49,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
|
|||||||
def test_remote_cdx(self):
|
def test_remote_cdx(self):
|
||||||
sources = self._get_sources('ait')
|
sources = self._get_sources('ait')
|
||||||
assert isinstance(sources['ait'], RemoteIndexSource)
|
assert isinstance(sources['ait'], RemoteIndexSource)
|
||||||
assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}'
|
assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
|
||||||
assert sources['ait'].replay_url == 'http://wayback.archive-it.org/all/{timestamp}id_/{url}'
|
assert sources['ait'].replay_url == 'http://wayback.archive-it.org/all/{timestamp}id_/{url}'
|
||||||
|
|
||||||
long_form_sources = self._get_sources('ait_long')
|
long_form_sources = self._get_sources('ait_long')
|
||||||
@ -68,7 +68,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
|
|||||||
def test_remote_cdx_2(self):
|
def test_remote_cdx_2(self):
|
||||||
sources = self._get_sources('rhiz_cdx')
|
sources = self._get_sources('rhiz_cdx')
|
||||||
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)
|
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)
|
||||||
assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}'
|
assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}&closest={timestamp}&sort=closest'
|
||||||
assert sources['rhiz_cdx'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}id_/{url}'
|
assert sources['rhiz_cdx'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}id_/{url}'
|
||||||
|
|
||||||
def test_live(self):
|
def test_live(self):
|
||||||
|
@ -22,7 +22,7 @@ collections:
|
|||||||
ait_long:
|
ait_long:
|
||||||
index:
|
index:
|
||||||
type: cdx
|
type: cdx
|
||||||
api_url: http://wayback.archive-it.org/cdx?url={url}
|
api_url: 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
|
||||||
replay_url: http://wayback.archive-it.org/all/{timestamp}id_/{url}
|
replay_url: http://wayback.archive-it.org/all/{timestamp}id_/{url}
|
||||||
|
|
||||||
rhiz_long:
|
rhiz_long:
|
||||||
|
@ -5,12 +5,16 @@ from collections import OrderedDict
|
|||||||
from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
|
from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
|
||||||
|
|
||||||
from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
|
from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
|
||||||
|
from pywb.webagg.indexsource import RemoteIndexSource
|
||||||
|
|
||||||
from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
|
from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
|
||||||
from pywb.webagg.aggregator import DirectoryIndexSource
|
from pywb.webagg.aggregator import DirectoryIndexSource
|
||||||
|
|
||||||
from pywb.webagg.app import ResAggApp
|
from pywb.webagg.app import ResAggApp
|
||||||
from pywb.webagg.utils import MementoUtils
|
from pywb.webagg.utils import MementoUtils
|
||||||
|
|
||||||
|
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||||
|
|
||||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -31,6 +35,13 @@ sources = {
|
|||||||
'live': LiveIndexSource(),
|
'live': LiveIndexSource(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ia_cdx = {
|
||||||
|
'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={timestamp}&sort=closest',
|
||||||
|
'http://web.archive.org/web/{timestamp}id_/{url}')
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -46,6 +57,8 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
|||||||
handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
|
handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
|
||||||
app.add_route('/many', handler1)
|
app.add_route('/many', handler1)
|
||||||
|
|
||||||
|
app.add_route('/cdx_api', DefaultResourceHandler(SimpleAggregator(ia_cdx), TEST_WARC_PATH))
|
||||||
|
|
||||||
source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
|
source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
|
||||||
handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
|
handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
|
||||||
app.add_route('/posttest', handler2)
|
app.add_route('/posttest', handler2)
|
||||||
@ -87,6 +100,7 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
|||||||
'/fallback', '/fallback/postreq',
|
'/fallback', '/fallback/postreq',
|
||||||
'/live', '/live/postreq',
|
'/live', '/live/postreq',
|
||||||
'/many', '/many/postreq',
|
'/many', '/many/postreq',
|
||||||
|
'/cdx_api', '/cdx_api/postreq',
|
||||||
'/posttest', '/posttest/postreq',
|
'/posttest', '/posttest/postreq',
|
||||||
'/seq', '/seq/postreq',
|
'/seq', '/seq/postreq',
|
||||||
'/allredis', '/allredis/postreq',
|
'/allredis', '/allredis/postreq',
|
||||||
@ -194,6 +208,17 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
|||||||
|
|
||||||
assert 'ResErrors' not in resp.headers
|
assert 'ResErrors' not in resp.headers
|
||||||
|
|
||||||
|
def test_agg_select_mem_unrewrite_headers(self):
|
||||||
|
resp = self.testapp.get('/cdx_api/resource?closest=20161103124134&url=http://iana.org/')
|
||||||
|
|
||||||
|
assert resp.headers['WebAgg-Source-Coll'] == 'ia-cdx'
|
||||||
|
|
||||||
|
buff = BytesIO(resp.body)
|
||||||
|
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
|
||||||
|
print(record.status_headers)
|
||||||
|
assert record.status_headers.get_statuscode() == '302'
|
||||||
|
assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
|
||||||
|
|
||||||
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
|
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
|
||||||
def test_agg_select_live(self):
|
def test_agg_select_live(self):
|
||||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
|
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
|
||||||
|
@ -170,7 +170,7 @@ def test_all_not_found(source):
|
|||||||
expected = ''
|
expected = ''
|
||||||
assert(key_ts_res(res) == expected)
|
assert(key_ts_res(res) == expected)
|
||||||
if source == remote_sources[0]:
|
if source == remote_sources[0]:
|
||||||
assert('http://x-not-found-x.notfound/' in errs['source'])
|
assert('http%3A//x-not-found-x.notfound/' in errs['source'])
|
||||||
else:
|
else:
|
||||||
assert(errs == {})
|
assert(errs == {})
|
||||||
|
|
||||||
|
@ -10,6 +10,9 @@ from contextlib import closing
|
|||||||
from pywb.utils.timeutils import timestamp_to_http_date
|
from pywb.utils.timeutils import timestamp_to_http_date
|
||||||
from pywb.utils.wbexception import BadRequestException
|
from pywb.utils.wbexception import BadRequestException
|
||||||
|
|
||||||
|
from six.moves.urllib.parse import quote
|
||||||
|
|
||||||
|
|
||||||
LINK_SPLIT = re.compile(',\s*(?=[<])')
|
LINK_SPLIT = re.compile(',\s*(?=[<])')
|
||||||
LINK_SEG_SPLIT = re.compile(';\s*')
|
LINK_SEG_SPLIT = re.compile(';\s*')
|
||||||
LINK_URL = re.compile('<(.*)>')
|
LINK_URL = re.compile('<(.*)>')
|
||||||
@ -143,7 +146,13 @@ def res_template(template, params, **extra_params):
|
|||||||
formatter = params.get('_formatter')
|
formatter = params.get('_formatter')
|
||||||
if not formatter:
|
if not formatter:
|
||||||
formatter = ParamFormatter(params)
|
formatter = ParamFormatter(params)
|
||||||
res = formatter.format(template, url=params.get('url', ''), **extra_params)
|
|
||||||
|
url = params.get('url', '')
|
||||||
|
qi = template.find('?')
|
||||||
|
if qi >= 0 and template.find('{url}') > qi:
|
||||||
|
url = quote(url)
|
||||||
|
|
||||||
|
res = formatter.format(template, url=url, **extra_params)
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user