mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 08:04:49 +01:00
webagg improvements:
- responseloader: direct loader: unrewrite Location and Content-Location headers for non-live responses
- autoapp: support a custom index source list
- indexsource: ensure the closest query is added for RemoteIndexSource
- utils: res_template: urlencode the '{url}' param if it appears after '?'
This commit is contained in:
parent
cbe7508afc
commit
74276f58f3
@ -31,10 +31,6 @@ SOURCE_LIST = [LiveIndexSource,
|
||||
|
||||
# ============================================================================
|
||||
class AutoConfigApp(ResAggApp):
|
||||
@staticmethod
|
||||
def register_source(source_cls):
|
||||
SOURCE_LIST.append(source_cls)
|
||||
|
||||
def __init__(self, config_file='./config.yaml'):
|
||||
config = load_yaml_config(DEFAULT_CONFIG)
|
||||
|
||||
@ -162,15 +158,16 @@ class AutoConfigApp(ResAggApp):
|
||||
return HandlerSeq(handlers)
|
||||
|
||||
# ============================================================================
|
||||
def init_index_source(value):
|
||||
def init_index_source(value, source_list=None):
|
||||
source_list = source_list or SOURCE_LIST
|
||||
if isinstance(value, str):
|
||||
for source_cls in SOURCE_LIST:
|
||||
for source_cls in source_list:
|
||||
source = source_cls.init_from_string(value)
|
||||
if source:
|
||||
return source
|
||||
|
||||
elif isinstance(value, dict):
|
||||
for source_cls in SOURCE_LIST:
|
||||
for source_cls in source_list:
|
||||
source = source_cls.init_from_config(value)
|
||||
if source:
|
||||
return source
|
||||
@ -182,10 +179,15 @@ def init_index_source(value):
|
||||
|
||||
|
||||
# ============================================================================
|
||||
def init_index_agg(source_configs, use_gevent=False, timeout=0):
|
||||
def register_source(source_cls):
|
||||
SOURCE_LIST.append(source_cls)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None):
|
||||
sources = {}
|
||||
for n, v in iteritems(source_configs):
|
||||
sources[n] = init_index_source(v)
|
||||
sources[n] = init_index_source(v, source_list=source_list)
|
||||
|
||||
if use_gevent:
|
||||
return GeventTimeoutAggregator(sources, timeout=timeout)
|
||||
|
@ -26,11 +26,6 @@ def to_link(cdx_iter, fields):
|
||||
content_type = 'application/link'
|
||||
return content_type, MementoUtils.make_timemap(cdx_iter)
|
||||
|
||||
def to_raw(cdx_iter, fields):
|
||||
content_type = 'cdx'
|
||||
return content_type, cdx_iter
|
||||
|
||||
|
||||
#=============================================================================
|
||||
class FuzzyMatcher(object):
|
||||
def __init__(self):
|
||||
|
@ -95,6 +95,9 @@ class RemoteIndexSource(BaseIndexSource):
|
||||
lines = r.content.strip().split(b'\n')
|
||||
def do_load(lines):
|
||||
for line in lines:
|
||||
if not line:
|
||||
continue
|
||||
|
||||
cdx = CDXObject(line)
|
||||
self._set_load_url(cdx)
|
||||
yield cdx
|
||||
@ -140,7 +143,7 @@ class RemoteIndexSource(BaseIndexSource):
|
||||
# add specified coll, if any
|
||||
replay = url.rsplit('/', 1)[0] + coll + '/' + WAYBACK_ORIG_SUFFIX
|
||||
|
||||
url += '?url={url}'
|
||||
url += '?url={url}&closest={timestamp}&sort=closest'
|
||||
|
||||
return cls(url, replay)
|
||||
|
||||
@ -201,15 +204,15 @@ class LiveIndexSource(BaseIndexSource):
|
||||
#=============================================================================
|
||||
class RedisIndexSource(BaseIndexSource):
|
||||
def __init__(self, redis_url, redis=None, key_template=None):
|
||||
if redis_url and not redis:
|
||||
redis, key_template = self.parse_redis_url(redis_url)
|
||||
if redis_url:
|
||||
redis, key_template = self.parse_redis_url(redis_url, redis)
|
||||
|
||||
self.redis_url = redis_url
|
||||
self.redis = redis
|
||||
self.redis_key_template = key_template
|
||||
|
||||
@staticmethod
|
||||
def parse_redis_url(redis_url):
|
||||
def parse_redis_url(redis_url, redis_=None):
|
||||
parts = redis_url.split('/')
|
||||
key_prefix = ''
|
||||
if len(parts) > 4:
|
||||
@ -217,8 +220,9 @@ class RedisIndexSource(BaseIndexSource):
|
||||
redis_url = 'redis://' + parts[2] + '/' + parts[3]
|
||||
|
||||
redis_key_template = key_prefix
|
||||
red = redis.StrictRedis.from_url(redis_url)
|
||||
return red, key_prefix
|
||||
if not redis_:
|
||||
redis_ = redis.StrictRedis.from_url(redis_url)
|
||||
return redis_, key_prefix
|
||||
|
||||
def load_index(self, params):
|
||||
return self.load_key_index(self.redis_key_template, params)
|
||||
|
@ -272,6 +272,8 @@ class LiveWebLoader(BaseLoader):
|
||||
'content-location',
|
||||
'x-archive')
|
||||
|
||||
UNREWRITE_HEADERS = ('location', 'content-location')
|
||||
|
||||
def __init__(self):
|
||||
self.num_retries = 3
|
||||
self.num_pools = 10
|
||||
@ -342,7 +344,7 @@ class LiveWebLoader(BaseLoader):
|
||||
|
||||
self.raise_on_self_redirect(params, cdx,
|
||||
str(upstream_res.status),
|
||||
upstream_res.headers.get('Location'))
|
||||
self.unrewrite_header(cdx, upstream_res.headers.get('Location')))
|
||||
|
||||
|
||||
if upstream_res.version == 11:
|
||||
@ -363,18 +365,30 @@ class LiveWebLoader(BaseLoader):
|
||||
#PY 3
|
||||
resp_headers = orig_resp.headers._headers
|
||||
for n, v in resp_headers:
|
||||
if n.lower() in self.SKIP_HEADERS:
|
||||
nl = n.lower()
|
||||
if nl in self.SKIP_HEADERS:
|
||||
continue
|
||||
|
||||
if nl in self.UNREWRITE_HEADERS:
|
||||
v = self.unrewrite_header(cdx, v)
|
||||
|
||||
http_headers_buff += n + ': ' + v + '\r\n'
|
||||
except: #pragma: no cover
|
||||
#PY 2
|
||||
resp_headers = orig_resp.msg.headers
|
||||
for n, v in zip(orig_resp.getheaders(), resp_headers):
|
||||
for (n, v), line in zip(orig_resp.getheaders(), resp_headers):
|
||||
if n in self.SKIP_HEADERS:
|
||||
continue
|
||||
|
||||
http_headers_buff += v
|
||||
new_v = v
|
||||
if n in self.UNREWRITE_HEADERS:
|
||||
new_v = self.unrewrite_header(cdx, v)
|
||||
|
||||
if new_v != v:
|
||||
http_headers_buff += n + ': ' + new_v + '\r\n'
|
||||
else:
|
||||
http_headers_buff += line
|
||||
|
||||
|
||||
http_headers_buff += '\r\n'
|
||||
http_headers_buff = http_headers_buff.encode('latin-1')
|
||||
@ -405,6 +419,19 @@ class LiveWebLoader(BaseLoader):
|
||||
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
|
||||
return (warc_headers, http_headers_buff, upstream_res)
|
||||
|
||||
def unrewrite_header(self, cdx, value):
|
||||
if not value:
|
||||
return value
|
||||
|
||||
if cdx.get('is_live'):
|
||||
return value
|
||||
|
||||
inx = value.find('/http', 1)
|
||||
if inx < 1:
|
||||
return value
|
||||
|
||||
return value[inx + 1:]
|
||||
|
||||
def __str__(self):
|
||||
return 'LiveWebLoader'
|
||||
|
||||
|
@ -49,7 +49,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
|
||||
def test_remote_cdx(self):
|
||||
sources = self._get_sources('ait')
|
||||
assert isinstance(sources['ait'], RemoteIndexSource)
|
||||
assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}'
|
||||
assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
|
||||
assert sources['ait'].replay_url == 'http://wayback.archive-it.org/all/{timestamp}id_/{url}'
|
||||
|
||||
long_form_sources = self._get_sources('ait_long')
|
||||
@ -68,7 +68,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
|
||||
def test_remote_cdx_2(self):
|
||||
sources = self._get_sources('rhiz_cdx')
|
||||
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)
|
||||
assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}'
|
||||
assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}&closest={timestamp}&sort=closest'
|
||||
assert sources['rhiz_cdx'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}id_/{url}'
|
||||
|
||||
def test_live(self):
|
||||
|
@ -22,7 +22,7 @@ collections:
|
||||
ait_long:
|
||||
index:
|
||||
type: cdx
|
||||
api_url: http://wayback.archive-it.org/cdx?url={url}
|
||||
api_url: 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
|
||||
replay_url: http://wayback.archive-it.org/all/{timestamp}id_/{url}
|
||||
|
||||
rhiz_long:
|
||||
|
@ -5,12 +5,16 @@ from collections import OrderedDict
|
||||
from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
|
||||
|
||||
from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
|
||||
from pywb.webagg.indexsource import RemoteIndexSource
|
||||
|
||||
from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
|
||||
from pywb.webagg.aggregator import DirectoryIndexSource
|
||||
|
||||
from pywb.webagg.app import ResAggApp
|
||||
from pywb.webagg.utils import MementoUtils
|
||||
|
||||
from pywb.warc.recordloader import ArcWarcRecordLoader
|
||||
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from pywb.utils.bufferedreaders import ChunkedDataReader
|
||||
from io import BytesIO
|
||||
@ -31,6 +35,13 @@ sources = {
|
||||
'live': LiveIndexSource(),
|
||||
}
|
||||
|
||||
ia_cdx = {
|
||||
'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={timestamp}&sort=closest',
|
||||
'http://web.archive.org/web/{timestamp}id_/{url}')
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
||||
@classmethod
|
||||
@ -46,6 +57,8 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
||||
handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
|
||||
app.add_route('/many', handler1)
|
||||
|
||||
app.add_route('/cdx_api', DefaultResourceHandler(SimpleAggregator(ia_cdx), TEST_WARC_PATH))
|
||||
|
||||
source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
|
||||
handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
|
||||
app.add_route('/posttest', handler2)
|
||||
@ -87,6 +100,7 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
||||
'/fallback', '/fallback/postreq',
|
||||
'/live', '/live/postreq',
|
||||
'/many', '/many/postreq',
|
||||
'/cdx_api', '/cdx_api/postreq',
|
||||
'/posttest', '/posttest/postreq',
|
||||
'/seq', '/seq/postreq',
|
||||
'/allredis', '/allredis/postreq',
|
||||
@ -194,6 +208,17 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
|
||||
|
||||
assert 'ResErrors' not in resp.headers
|
||||
|
||||
def test_agg_select_mem_unrewrite_headers(self):
|
||||
resp = self.testapp.get('/cdx_api/resource?closest=20161103124134&url=http://iana.org/')
|
||||
|
||||
assert resp.headers['WebAgg-Source-Coll'] == 'ia-cdx'
|
||||
|
||||
buff = BytesIO(resp.body)
|
||||
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
|
||||
print(record.status_headers)
|
||||
assert record.status_headers.get_statuscode() == '302'
|
||||
assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
|
||||
|
||||
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
|
||||
def test_agg_select_live(self):
|
||||
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')
|
||||
|
@ -170,7 +170,7 @@ def test_all_not_found(source):
|
||||
expected = ''
|
||||
assert(key_ts_res(res) == expected)
|
||||
if source == remote_sources[0]:
|
||||
assert('http://x-not-found-x.notfound/' in errs['source'])
|
||||
assert('http%3A//x-not-found-x.notfound/' in errs['source'])
|
||||
else:
|
||||
assert(errs == {})
|
||||
|
||||
|
@ -10,6 +10,9 @@ from contextlib import closing
|
||||
from pywb.utils.timeutils import timestamp_to_http_date
|
||||
from pywb.utils.wbexception import BadRequestException
|
||||
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
|
||||
LINK_SPLIT = re.compile(',\s*(?=[<])')
|
||||
LINK_SEG_SPLIT = re.compile(';\s*')
|
||||
LINK_URL = re.compile('<(.*)>')
|
||||
@ -143,7 +146,13 @@ def res_template(template, params, **extra_params):
|
||||
formatter = params.get('_formatter')
|
||||
if not formatter:
|
||||
formatter = ParamFormatter(params)
|
||||
res = formatter.format(template, url=params.get('url', ''), **extra_params)
|
||||
|
||||
url = params.get('url', '')
|
||||
qi = template.find('?')
|
||||
if qi >= 0 and template.find('{url}') > qi:
|
||||
url = quote(url)
|
||||
|
||||
res = formatter.format(template, url=url, **extra_params)
|
||||
|
||||
return res
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user