1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

webagg improvements:

responseloader: direct loader: unrewrite location, content-location headers for non-live responses
autoapp: support custom indexsource list
indexsource: ensure closest query is added for RemoteIndexSource
utils res_template: urlencode '{url}' param if after '?'
This commit is contained in:
Ilya Kreymer 2016-11-21 18:59:22 -08:00
parent cbe7508afc
commit 74276f58f3
9 changed files with 91 additions and 29 deletions

View File

@ -31,10 +31,6 @@ SOURCE_LIST = [LiveIndexSource,
# ============================================================================
class AutoConfigApp(ResAggApp):
@staticmethod
def register_source(source_cls):
    """Add ``source_cls`` to the end of the module-level ``SOURCE_LIST``."""
    SOURCE_LIST.append(source_cls)
def __init__(self, config_file='./config.yaml'):
config = load_yaml_config(DEFAULT_CONFIG)
@ -162,15 +158,16 @@ class AutoConfigApp(ResAggApp):
return HandlerSeq(handlers)
# ============================================================================
def init_index_source(value):
def init_index_source(value, source_list=None):
source_list = source_list or SOURCE_LIST
if isinstance(value, str):
for source_cls in SOURCE_LIST:
for source_cls in source_list:
source = source_cls.init_from_string(value)
if source:
return source
elif isinstance(value, dict):
for source_cls in SOURCE_LIST:
for source_cls in source_list:
source = source_cls.init_from_config(value)
if source:
return source
@ -182,10 +179,15 @@ def init_index_source(value):
# ============================================================================
def init_index_agg(source_configs, use_gevent=False, timeout=0):
def register_source(source_cls):
    """Add ``source_cls`` to the end of the module-level ``SOURCE_LIST``.

    ``SOURCE_LIST`` is the list that ``init_index_source`` falls back to
    when it is not given an explicit ``source_list``.
    """
    SOURCE_LIST.append(source_cls)
# ============================================================================
def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None):
sources = {}
for n, v in iteritems(source_configs):
sources[n] = init_index_source(v)
sources[n] = init_index_source(v, source_list=source_list)
if use_gevent:
return GeventTimeoutAggregator(sources, timeout=timeout)

View File

@ -26,11 +26,6 @@ def to_link(cdx_iter, fields):
content_type = 'application/link'
return content_type, MementoUtils.make_timemap(cdx_iter)
def to_raw(cdx_iter, fields):
    """Return the cdx iterator untouched, tagged with the 'cdx' content type.

    :param cdx_iter: iterator of cdx entries, passed through as-is
    :param fields: accepted for signature parity with the other output
        converters; not used here
    :return: ``(content_type, cdx_iter)`` tuple
    """
    return 'cdx', cdx_iter
#=============================================================================
class FuzzyMatcher(object):
def __init__(self):

View File

@ -95,6 +95,9 @@ class RemoteIndexSource(BaseIndexSource):
lines = r.content.strip().split(b'\n')
def do_load(lines):
for line in lines:
if not line:
continue
cdx = CDXObject(line)
self._set_load_url(cdx)
yield cdx
@ -140,7 +143,7 @@ class RemoteIndexSource(BaseIndexSource):
# add specified coll, if any
replay = url.rsplit('/', 1)[0] + coll + '/' + WAYBACK_ORIG_SUFFIX
url += '?url={url}'
url += '?url={url}&closest={timestamp}&sort=closest'
return cls(url, replay)
@ -201,15 +204,15 @@ class LiveIndexSource(BaseIndexSource):
#=============================================================================
class RedisIndexSource(BaseIndexSource):
def __init__(self, redis_url, redis=None, key_template=None):
if redis_url and not redis:
redis, key_template = self.parse_redis_url(redis_url)
if redis_url:
redis, key_template = self.parse_redis_url(redis_url, redis)
self.redis_url = redis_url
self.redis = redis
self.redis_key_template = key_template
@staticmethod
def parse_redis_url(redis_url):
def parse_redis_url(redis_url, redis_=None):
parts = redis_url.split('/')
key_prefix = ''
if len(parts) > 4:
@ -217,8 +220,9 @@ class RedisIndexSource(BaseIndexSource):
redis_url = 'redis://' + parts[2] + '/' + parts[3]
redis_key_template = key_prefix
red = redis.StrictRedis.from_url(redis_url)
return red, key_prefix
if not redis_:
redis_ = redis.StrictRedis.from_url(redis_url)
return redis_, key_prefix
def load_index(self, params):
return self.load_key_index(self.redis_key_template, params)

View File

@ -272,6 +272,8 @@ class LiveWebLoader(BaseLoader):
'content-location',
'x-archive')
UNREWRITE_HEADERS = ('location', 'content-location')
def __init__(self):
self.num_retries = 3
self.num_pools = 10
@ -342,7 +344,7 @@ class LiveWebLoader(BaseLoader):
self.raise_on_self_redirect(params, cdx,
str(upstream_res.status),
upstream_res.headers.get('Location'))
self.unrewrite_header(cdx, upstream_res.headers.get('Location')))
if upstream_res.version == 11:
@ -363,18 +365,30 @@ class LiveWebLoader(BaseLoader):
#PY 3
resp_headers = orig_resp.headers._headers
for n, v in resp_headers:
if n.lower() in self.SKIP_HEADERS:
nl = n.lower()
if nl in self.SKIP_HEADERS:
continue
if nl in self.UNREWRITE_HEADERS:
v = self.unrewrite_header(cdx, v)
http_headers_buff += n + ': ' + v + '\r\n'
except: #pragma: no cover
#PY 2
resp_headers = orig_resp.msg.headers
for n, v in zip(orig_resp.getheaders(), resp_headers):
for (n, v), line in zip(orig_resp.getheaders(), resp_headers):
if n in self.SKIP_HEADERS:
continue
http_headers_buff += v
new_v = v
if n in self.UNREWRITE_HEADERS:
new_v = self.unrewrite_header(cdx, v)
if new_v != v:
http_headers_buff += n + ': ' + new_v + '\r\n'
else:
http_headers_buff += line
http_headers_buff += '\r\n'
http_headers_buff = http_headers_buff.encode('latin-1')
@ -405,6 +419,19 @@ class LiveWebLoader(BaseLoader):
warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
return (warc_headers, http_headers_buff, upstream_res)
def unrewrite_header(self, cdx, value):
    """Strip a leading replay prefix from a url-valued header.

    :param cdx: mapping for the capture; a truthy ``is_live`` entry means
        the value is returned unchanged
    :param value: header value (may be empty or ``None``)
    :return: the part of ``value`` starting at the first ``http`` that
        directly follows a ``/`` (searching from index 1), or ``value``
        itself when it is empty, live, or contains no such segment
    """
    # Empty headers and live responses are passed through as-is.
    if not value or cdx.get('is_live'):
        return value

    # Search from index 1 so a match at the very start of the value is
    # never treated as a prefix to strip.
    prefix_end = value.find('/http', 1)
    return value if prefix_end < 1 else value[prefix_end + 1:]
def __str__(self):
    """Return the fixed display name of this loader."""
    return 'LiveWebLoader'

View File

@ -49,7 +49,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
def test_remote_cdx(self):
sources = self._get_sources('ait')
assert isinstance(sources['ait'], RemoteIndexSource)
assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}'
assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
assert sources['ait'].replay_url == 'http://wayback.archive-it.org/all/{timestamp}id_/{url}'
long_form_sources = self._get_sources('ait_long')
@ -68,7 +68,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass):
def test_remote_cdx_2(self):
sources = self._get_sources('rhiz_cdx')
assert isinstance(sources['rhiz_cdx'], RemoteIndexSource)
assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}'
assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}&closest={timestamp}&sort=closest'
assert sources['rhiz_cdx'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}id_/{url}'
def test_live(self):

View File

@ -22,7 +22,7 @@ collections:
ait_long:
index:
type: cdx
api_url: http://wayback.archive-it.org/cdx?url={url}
api_url: 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest'
replay_url: http://wayback.archive-it.org/all/{timestamp}id_/{url}
rhiz_long:

View File

@ -5,12 +5,16 @@ from collections import OrderedDict
from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq
from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource
from pywb.webagg.indexsource import RemoteIndexSource
from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator
from pywb.webagg.aggregator import DirectoryIndexSource
from pywb.webagg.app import ResAggApp
from pywb.webagg.utils import MementoUtils
from pywb.warc.recordloader import ArcWarcRecordLoader
from pywb.utils.statusandheaders import StatusAndHeadersParser
from pywb.utils.bufferedreaders import ChunkedDataReader
from io import BytesIO
@ -31,6 +35,13 @@ sources = {
'live': LiveIndexSource(),
}
ia_cdx = {
'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={timestamp}&sort=closest',
'http://web.archive.org/web/{timestamp}id_/{url}')
}
class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
@classmethod
@ -46,6 +57,8 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH)
app.add_route('/many', handler1)
app.add_route('/cdx_api', DefaultResourceHandler(SimpleAggregator(ia_cdx), TEST_WARC_PATH))
source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')})
handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH)
app.add_route('/posttest', handler2)
@ -87,6 +100,7 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
'/fallback', '/fallback/postreq',
'/live', '/live/postreq',
'/many', '/many/postreq',
'/cdx_api', '/cdx_api/postreq',
'/posttest', '/posttest/postreq',
'/seq', '/seq/postreq',
'/allredis', '/allredis/postreq',
@ -194,6 +208,17 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass):
assert 'ResErrors' not in resp.headers
def test_agg_select_mem_unrewrite_headers(self):
    """Check that Location is un-rewritten for a capture served via 'ia-cdx'."""
    # NOTE(review): the '/cdx_api' route appears to be backed by a remote
    # web.archive.org index, so this presumably needs network access -- confirm.
    resp = self.testapp.get('/cdx_api/resource?closest=20161103124134&url=http://iana.org/')
    assert resp.headers['WebAgg-Source-Coll'] == 'ia-cdx'

    # Parse the returned body as a WARC record to inspect its http headers.
    body_stream = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(body_stream, no_record_parse=False)
    print(record.status_headers)

    assert record.status_headers.get_statuscode() == '302'
    # The redirect target should be the original url, without a replay prefix.
    assert record.status_headers.get_header('Location') == 'https://www.iana.org/'
@patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live'))
def test_agg_select_live(self):
resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016')

View File

@ -170,7 +170,7 @@ def test_all_not_found(source):
expected = ''
assert(key_ts_res(res) == expected)
if source == remote_sources[0]:
assert('http://x-not-found-x.notfound/' in errs['source'])
assert('http%3A//x-not-found-x.notfound/' in errs['source'])
else:
assert(errs == {})

View File

@ -10,6 +10,9 @@ from contextlib import closing
from pywb.utils.timeutils import timestamp_to_http_date
from pywb.utils.wbexception import BadRequestException
from six.moves.urllib.parse import quote
LINK_SPLIT = re.compile(',\s*(?=[<])')
LINK_SEG_SPLIT = re.compile(';\s*')
LINK_URL = re.compile('<(.*)>')
@ -143,7 +146,13 @@ def res_template(template, params, **extra_params):
formatter = params.get('_formatter')
if not formatter:
formatter = ParamFormatter(params)
res = formatter.format(template, url=params.get('url', ''), **extra_params)
url = params.get('url', '')
qi = template.find('?')
if qi >= 0 and template.find('{url}') > qi:
url = quote(url)
res = formatter.format(template, url=url, **extra_params)
return res