1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

webagg improvements:

responseloader: direct loader: unrewrite location, content-location headers for non-live responses
autoapp: support custom indexsource list
indexsource: ensure closest query is added for RemoteIndexSource
utils res_template: urlencode the '{url}' param if it appears after '?'
This commit is contained in:
Ilya Kreymer 2016-11-21 18:59:22 -08:00
parent cbe7508afc
commit 74276f58f3
9 changed files with 91 additions and 29 deletions

View File

@ -31,10 +31,6 @@ SOURCE_LIST = [LiveIndexSource,
# ============================================================================ # ============================================================================
class AutoConfigApp(ResAggApp): class AutoConfigApp(ResAggApp):
@staticmethod
def register_source(source_cls):
SOURCE_LIST.append(source_cls)
def __init__(self, config_file='./config.yaml'): def __init__(self, config_file='./config.yaml'):
config = load_yaml_config(DEFAULT_CONFIG) config = load_yaml_config(DEFAULT_CONFIG)
@ -162,15 +158,16 @@ class AutoConfigApp(ResAggApp):
return HandlerSeq(handlers) return HandlerSeq(handlers)
# ============================================================================ # ============================================================================
def init_index_source(value): def init_index_source(value, source_list=None):
source_list = source_list or SOURCE_LIST
if isinstance(value, str): if isinstance(value, str):
for source_cls in SOURCE_LIST: for source_cls in source_list:
source = source_cls.init_from_string(value) source = source_cls.init_from_string(value)
if source: if source:
return source return source
elif isinstance(value, dict): elif isinstance(value, dict):
for source_cls in SOURCE_LIST: for source_cls in source_list:
source = source_cls.init_from_config(value) source = source_cls.init_from_config(value)
if source: if source:
return source return source
@ -182,10 +179,15 @@ def init_index_source(value):
# ============================================================================ # ============================================================================
def init_index_agg(source_configs, use_gevent=False, timeout=0): def register_source(source_cls):
SOURCE_LIST.append(source_cls)
# ============================================================================
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None):
sources = {} sources = {}
for n, v in iteritems(source_configs): for n, v in iteritems(source_configs):
sources[n] = init_index_source(v) sources[n] = init_index_source(v, source_list=source_list)
if use_gevent: if use_gevent:
return GeventTimeoutAggregator(sources, timeout=timeout) return GeventTimeoutAggregator(sources, timeout=timeout)

View File

@ -26,11 +26,6 @@ def to_link(cdx_iter, fields):
content_type = 'application/link' content_type = 'application/link'
return content_type, MementoUtils.make_timemap(cdx_iter) return content_type, MementoUtils.make_timemap(cdx_iter)
def to_raw(cdx_iter, fields):
content_type = 'cdx'
return content_type, cdx_iter
#============================================================================= #=============================================================================
class FuzzyMatcher(object): class FuzzyMatcher(object):
def __init__(self): def __init__(self):

View File

@ -95,6 +95,9 @@ class RemoteIndexSource(BaseIndexSource):
lines = r.content.strip().split(b'\n') lines = r.content.strip().split(b'\n')
def do_load(lines): def do_load(lines):
for line in lines: for line in lines:
if not line:
continue
cdx = CDXObject(line) cdx = CDXObject(line)
self._set_load_url(cdx) self._set_load_url(cdx)
yield cdx yield cdx
@ -140,7 +143,7 @@ class RemoteIndexSource(BaseIndexSource):
# add specified coll, if any # add specified coll, if any
replay = url.rsplit('/', 1)[0] + coll + '/' + WAYBACK_ORIG_SUFFIX replay = url.rsplit('/', 1)[0] + coll + '/' + WAYBACK_ORIG_SUFFIX
url += '?url={url}' url += '?url={url}&closest={timestamp}&sort=closest'
return cls(url, replay) return cls(url, replay)
@ -201,15 +204,15 @@ class LiveIndexSource(BaseIndexSource):
#============================================================================= #=============================================================================
class RedisIndexSource(BaseIndexSource): class RedisIndexSource(BaseIndexSource):
def __init__(self, redis_url, redis=None, key_template=None): def __init__(self, redis_url, redis=None, key_template=None):
if redis_url and not redis: if redis_url:
redis, key_template = self.parse_redis_url(redis_url) redis, key_template = self.parse_redis_url(redis_url, redis)
self.redis_url = redis_url self.redis_url = redis_url
self.redis = redis self.redis = redis
self.redis_key_template = key_template self.redis_key_template = key_template
@staticmethod @staticmethod
def parse_redis_url(redis_url): def parse_redis_url(redis_url, redis_=None):
parts = redis_url.split('/') parts = redis_url.split('/')
key_prefix = '' key_prefix = ''
if len(parts) > 4: if len(parts) > 4:
@ -217,8 +220,9 @@ class RedisIndexSource(BaseIndexSource):
redis_url = 'redis://' + parts[2] + '/' + parts[3] redis_url = 'redis://' + parts[2] + '/' + parts[3]
redis_key_template = key_prefix redis_key_template = key_prefix
red = redis.StrictRedis.from_url(redis_url) if not redis_:
return red, key_prefix redis_ = redis.StrictRedis.from_url(redis_url)
return redis_, key_prefix
def load_index(self, params): def load_index(self, params):
return self.load_key_index(self.redis_key_template, params) return self.load_key_index(self.redis_key_template, params)

View File

@ -272,6 +272,8 @@ class LiveWebLoader(BaseLoader):
'content-location', 'content-location',
'x-archive') 'x-archive')
UNREWRITE_HEADERS = ('location', 'content-location')
def __init__(self): def __init__(self):
self.num_retries = 3 self.num_retries = 3
self.num_pools = 10 self.num_pools = 10
@ -342,7 +344,7 @@ class LiveWebLoader(BaseLoader):
self.raise_on_self_redirect(params, cdx, self.raise_on_self_redirect(params, cdx,
str(upstream_res.status), str(upstream_res.status),
upstream_res.headers.get('Location')) self.unrewrite_header(cdx, upstream_res.headers.get('Location')))
if upstream_res.version == 11: if upstream_res.version == 11:
@ -363,18 +365,30 @@ class LiveWebLoader(BaseLoader):
#PY 3 #PY 3
resp_headers = orig_resp.headers._headers resp_headers = orig_resp.headers._headers
for n, v in resp_headers: for n, v in resp_headers:
if n.lower() in self.SKIP_HEADERS: nl = n.lower()
if nl in self.SKIP_HEADERS:
continue continue
if nl in self.UNREWRITE_HEADERS:
v = self.unrewrite_header(cdx, v)
http_headers_buff += n + ': ' + v + '\r\n' http_headers_buff += n + ': ' + v + '\r\n'
except: #pragma: no cover except: #pragma: no cover
#PY 2 #PY 2
resp_headers = orig_resp.msg.headers resp_headers = orig_resp.msg.headers
for n, v in zip(orig_resp.getheaders(), resp_headers): for (n, v), line in zip(orig_resp.getheaders(), resp_headers):
if n in self.SKIP_HEADERS: if n in self.SKIP_HEADERS:
continue continue
http_headers_buff += v new_v = v
if n in self.UNREWRITE_HEADERS:
new_v = self.unrewrite_header(cdx, v)
if new_v != v:
http_headers_buff += n + ': ' + new_v + '\r\n'
else:
http_headers_buff += line
http_headers_buff += '\r\n' http_headers_buff += '\r\n'
http_headers_buff = http_headers_buff.encode('latin-1') http_headers_buff = http_headers_buff.encode('latin-1')
@ -405,6 +419,19 @@ class LiveWebLoader(BaseLoader):
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
return (warc_headers, http_headers_buff, upstream_res) return (warc_headers, http_headers_buff, upstream_res)
def unrewrite_header(self, cdx, value):
if not value:
return value
if cdx.get('is_live'):
return value
inx = value.find('/http', 1)
if inx < 1:
return value
return value[inx + 1:]
def __str__(self): def __str__(self):
return 'LiveWebLoader' return 'LiveWebLoader'

View File

@ -49,7 +49,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
def test_remote_cdx(self): def test_remote_cdx(self):
sources = self._get_sources('ait') sources = self._get_sources('ait')
assert isinstance(sources['ait'], RemoteIndexSource) assert isinstance(sources['ait'], RemoteIndexSource)
assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}' assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
assert sources['ait'].replay_url == 'http://wayback.archive-it.org/all/{timestamp}id_/{url}' assert sources['ait'].replay_url == 'http://wayback.archive-it.org/all/{timestamp}id_/{url}'
long_form_sources = self._get_sources('ait_long') long_form_sources = self._get_sources('ait_long')
@ -68,7 +68,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
def test_remote_cdx_2(self): def test_remote_cdx_2(self):
sources = self._get_sources('rhiz_cdx') sources = self._get_sources('rhiz_cdx')
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource) assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)
assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}' assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}&closest={timestamp}&sort=closest'
assert sources['rhiz_cdx'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}id_/{url}' assert sources['rhiz_cdx'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}id_/{url}'
def test_live(self): def test_live(self):

View File

@ -22,7 +22,7 @@ collections:
ait_long: ait_long:
index: index:
type: cdx type: cdx
api_url: http://wayback.archive-it.org/cdx?url={url} api_url: 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
replay_url: http://wayback.archive-it.org/all/{timestamp}id_/{url} replay_url: http://wayback.archive-it.org/all/{timestamp}id_/{url}
rhiz_long: rhiz_long:

View File

@ -5,12 +5,16 @@ from collections import OrderedDict
from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from pywb.webagg.indexsource import RemoteIndexSource
from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from pywb.webagg.aggregator import DirectoryIndexSource from pywb.webagg.aggregator import DirectoryIndexSource
from pywb.webagg.app import ResAggApp from pywb.webagg.app import ResAggApp
from pywb.webagg.utils import MementoUtils from pywb.webagg.utils import MementoUtils
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO from io import BytesIO
@ -31,6 +35,13 @@ sources = {
'live': LiveIndexSource(), 'live': LiveIndexSource(),
} }
ia_cdx = {
'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={timestamp}&sort=closest',
'http://web.archive.org/web/{timestamp}id_/{url}')
}
class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass): class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
@classmethod @classmethod
@ -46,6 +57,8 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH) handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
app.add_route('/many', handler1) app.add_route('/many', handler1)
app.add_route('/cdx_api', DefaultResourceHandler(SimpleAggregator(ia_cdx), TEST_WARC_PATH))
source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')}) source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH) handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
app.add_route('/posttest', handler2) app.add_route('/posttest', handler2)
@ -87,6 +100,7 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
'/fallback', '/fallback/postreq', '/fallback', '/fallback/postreq',
'/live', '/live/postreq', '/live', '/live/postreq',
'/many', '/many/postreq', '/many', '/many/postreq',
'/cdx_api', '/cdx_api/postreq',
'/posttest', '/posttest/postreq', '/posttest', '/posttest/postreq',
'/seq', '/seq/postreq', '/seq', '/seq/postreq',
'/allredis', '/allredis/postreq', '/allredis', '/allredis/postreq',
@ -194,6 +208,17 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
assert 'ResErrors' not in resp.headers assert 'ResErrors' not in resp.headers
def test_agg_select_mem_unrewrite_headers(self):
resp = self.testapp.get('/cdx_api/resource?closest=20161103124134&url=http://iana.org/')
assert resp.headers['WebAgg-Source-Coll'] == 'ia-cdx'
buff = BytesIO(resp.body)
record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False)
print(record.status_headers)
assert record.status_headers.get_statuscode() == '302'
assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live')) @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
def test_agg_select_live(self): def test_agg_select_live(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016') resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')

View File

@ -170,7 +170,7 @@ def test_all_not_found(source):
expected = '' expected = ''
assert(key_ts_res(res) == expected) assert(key_ts_res(res) == expected)
if source == remote_sources[0]: if source == remote_sources[0]:
assert('http://x-not-found-x.notfound/' in errs['source']) assert('http%3A//x-not-found-x.notfound/' in errs['source'])
else: else:
assert(errs == {}) assert(errs == {})

View File

@ -10,6 +10,9 @@ from contextlib import closing
from pywb.utils.timeutils import timestamp_to_http_date from pywb.utils.timeutils import timestamp_to_http_date
from pywb.utils.wbexception import BadRequestException from pywb.utils.wbexception import BadRequestException
from six.moves.urllib.parse import quote
LINK_SPLIT = re.compile(',\s*(?=[<])') LINK_SPLIT = re.compile(',\s*(?=[<])')
LINK_SEG_SPLIT = re.compile(';\s*') LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>') LINK_URL = re.compile('<(.*)>')
@ -143,7 +146,13 @@ def res_template(template, params, **extra_params):
formatter = params.get('_formatter') formatter = params.get('_formatter')
if not formatter: if not formatter:
formatter = ParamFormatter(params) formatter = ParamFormatter(params)
res = formatter.format(template, url=params.get('url', ''), **extra_params)
url = params.get('url', '')
qi = template.find('?')
if qi >= 0 and template.find('{url}') > qi:
url = quote(url)
res = formatter.format(template, url=url, **extra_params)
return res return res