diff --git a/pywb/webagg/autoapp.py b/pywb/webagg/autoapp.py index a8316a0f..2dacdf03 100644 --- a/pywb/webagg/autoapp.py +++ b/pywb/webagg/autoapp.py @@ -31,10 +31,6 @@ SOURCE_LIST = [LiveIndexSource, # ============================================================================ class AutoConfigApp(ResAggApp): - @staticmethod - def register_source(source_cls): - SOURCE_LIST.append(source_cls) - def __init__(self, config_file='./config.yaml'): config = load_yaml_config(DEFAULT_CONFIG) @@ -162,15 +158,16 @@ class AutoConfigApp(ResAggApp): return HandlerSeq(handlers) # ============================================================================ -def init_index_source(value): +def init_index_source(value, source_list=None): + source_list = source_list or SOURCE_LIST if isinstance(value, str): - for source_cls in SOURCE_LIST: + for source_cls in source_list: source = source_cls.init_from_string(value) if source: return source elif isinstance(value, dict): - for source_cls in SOURCE_LIST: + for source_cls in source_list: source = source_cls.init_from_config(value) if source: return source @@ -182,10 +179,15 @@ def init_index_source(value): # ============================================================================ -def init_index_agg(source_configs, use_gevent=False, timeout=0): +def register_source(source_cls): + SOURCE_LIST.append(source_cls) + + +# ============================================================================ +def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None): sources = {} for n, v in iteritems(source_configs): - sources[n] = init_index_source(v) + sources[n] = init_index_source(v, source_list=source_list) if use_gevent: return GeventTimeoutAggregator(sources, timeout=timeout) diff --git a/pywb/webagg/handlers.py b/pywb/webagg/handlers.py index 505efc0c..7059351a 100644 --- a/pywb/webagg/handlers.py +++ b/pywb/webagg/handlers.py @@ -26,11 +26,6 @@ def to_link(cdx_iter, fields): content_type = 'application/link' return content_type, MementoUtils.make_timemap(cdx_iter) -def to_raw(cdx_iter, fields): - content_type = 'cdx' - return content_type, cdx_iter - - #============================================================================= class FuzzyMatcher(object): def __init__(self): diff --git a/pywb/webagg/indexsource.py b/pywb/webagg/indexsource.py index 6fd70eab..b3937a76 100644 --- a/pywb/webagg/indexsource.py +++ b/pywb/webagg/indexsource.py @@ -95,6 +95,9 @@ class RemoteIndexSource(BaseIndexSource): lines = r.content.strip().split(b'\n') def do_load(lines): for line in lines: + if not line: + continue + cdx = CDXObject(line) self._set_load_url(cdx) yield cdx @@ -140,7 +143,7 @@ class RemoteIndexSource(BaseIndexSource): # add specified coll, if any replay = url.rsplit('/', 1)[0] + coll + '/' + WAYBACK_ORIG_SUFFIX - url += '?url={url}' + url += '?url={url}&closest={timestamp}&sort=closest' return cls(url, replay) @@ -201,15 +204,15 @@ class LiveIndexSource(BaseIndexSource): #============================================================================= class RedisIndexSource(BaseIndexSource): def __init__(self, redis_url, redis=None, key_template=None): - if redis_url and not redis: - redis, key_template = self.parse_redis_url(redis_url) + if redis_url: + redis, key_template = self.parse_redis_url(redis_url, redis) self.redis_url = redis_url self.redis = redis self.redis_key_template = key_template @staticmethod - def parse_redis_url(redis_url): + def parse_redis_url(redis_url, redis_=None): parts = redis_url.split('/') key_prefix = '' if len(parts) > 4: @@ -217,8 +220,9 @@ class RedisIndexSource(BaseIndexSource): redis_url = 'redis://' + parts[2] + '/' + parts[3] redis_key_template = key_prefix - red = redis.StrictRedis.from_url(redis_url) - return red, key_prefix + if not redis_: + redis_ = redis.StrictRedis.from_url(redis_url) + return redis_, key_prefix def load_index(self, params): return self.load_key_index(self.redis_key_template, params) diff --git a/pywb/webagg/responseloader.py b/pywb/webagg/responseloader.py index c05a34eb..f837988c 100644 --- a/pywb/webagg/responseloader.py +++ b/pywb/webagg/responseloader.py @@ -272,6 +272,8 @@ class LiveWebLoader(BaseLoader): 'content-location', 'x-archive') + UNREWRITE_HEADERS = ('location', 'content-location') + def __init__(self): self.num_retries = 3 self.num_pools = 10 @@ -342,7 +344,7 @@ class LiveWebLoader(BaseLoader): self.raise_on_self_redirect(params, cdx, str(upstream_res.status), - upstream_res.headers.get('Location')) + self.unrewrite_header(cdx, upstream_res.headers.get('Location'))) if upstream_res.version == 11: @@ -363,18 +365,30 @@ class LiveWebLoader(BaseLoader): #PY 3 resp_headers = orig_resp.headers._headers for n, v in resp_headers: - if n.lower() in self.SKIP_HEADERS: + nl = n.lower() + if nl in self.SKIP_HEADERS: continue + if nl in self.UNREWRITE_HEADERS: + v = self.unrewrite_header(cdx, v) + http_headers_buff += n + ': ' + v + '\r\n' except: #pragma: no cover #PY 2 resp_headers = orig_resp.msg.headers - for n, v in zip(orig_resp.getheaders(), resp_headers): + for (n, v), line in zip(orig_resp.getheaders(), resp_headers): if n in self.SKIP_HEADERS: continue - http_headers_buff += v + new_v = v + if n in self.UNREWRITE_HEADERS: + new_v = self.unrewrite_header(cdx, v) + + if new_v != v: + http_headers_buff += n + ': ' + new_v + '\r\n' + else: + http_headers_buff += line + http_headers_buff += '\r\n' http_headers_buff = http_headers_buff.encode('latin-1') @@ -405,6 +419,19 @@ class LiveWebLoader(BaseLoader): warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) return (warc_headers, http_headers_buff, upstream_res) + def unrewrite_header(self, cdx, value): + if not value: + return value + + if cdx.get('is_live'): + return value + + inx = value.find('/http', 1) + if inx < 1: + return value + + return value[inx + 1:] + def __str__(self): return 'LiveWebLoader' diff --git a/pywb/webagg/test/test_autoapp.py b/pywb/webagg/test/test_autoapp.py index 0ffb857e..cb703569 100644 --- a/pywb/webagg/test/test_autoapp.py +++ b/pywb/webagg/test/test_autoapp.py @@ -49,7 +49,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass): def test_remote_cdx(self): sources = self._get_sources('ait') assert isinstance(sources['ait'], RemoteIndexSource) - assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}' + assert sources['ait'].api_url == 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest' assert sources['ait'].replay_url == 'http://wayback.archive-it.org/all/{timestamp}id_/{url}' long_form_sources = self._get_sources('ait_long') @@ -68,7 +68,7 @@ class TestAutoConfigApp(TempDirTests, BaseTestClass): def test_remote_cdx_2(self): sources = self._get_sources('rhiz_cdx') assert isinstance(sources['rhiz_cdx'], RemoteIndexSource) - assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}' + assert sources['rhiz_cdx'].api_url == 'http://webenact.rhizome.org/all-cdx?url={url}&closest={timestamp}&sort=closest' assert sources['rhiz_cdx'].replay_url == 'http://webenact.rhizome.org/all/{timestamp}id_/{url}' def test_live(self): diff --git a/pywb/webagg/test/test_autoapp.yaml b/pywb/webagg/test/test_autoapp.yaml index cdd8cb40..7ab962c2 100644 --- a/pywb/webagg/test/test_autoapp.yaml +++ b/pywb/webagg/test/test_autoapp.yaml @@ -22,7 +22,7 @@ collections: ait_long: index: type: cdx - api_url: http://wayback.archive-it.org/cdx?url={url} + api_url: 'http://wayback.archive-it.org/cdx?url={url}&closest={timestamp}&sort=closest' replay_url: http://wayback.archive-it.org/all/{timestamp}id_/{url} rhiz_long: diff --git a/pywb/webagg/test/test_handlers.py b/pywb/webagg/test/test_handlers.py index bbac105a..ecf489ce 100644 --- a/pywb/webagg/test/test_handlers.py +++ b/pywb/webagg/test/test_handlers.py @@ -5,12 +5,16 @@ from collections import OrderedDict from pywb.webagg.handlers import DefaultResourceHandler, HandlerSeq from pywb.webagg.indexsource import MementoIndexSource, FileIndexSource, LiveIndexSource +from pywb.webagg.indexsource import RemoteIndexSource + from pywb.webagg.aggregator import GeventTimeoutAggregator, SimpleAggregator from pywb.webagg.aggregator import DirectoryIndexSource from pywb.webagg.app import ResAggApp from pywb.webagg.utils import MementoUtils +from pywb.warc.recordloader import ArcWarcRecordLoader + from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.bufferedreaders import ChunkedDataReader from io import BytesIO @@ -31,6 +35,13 @@ sources = { 'live': LiveIndexSource(), } +ia_cdx = { + 'ia-cdx': RemoteIndexSource('http://web.archive.org/cdx?url={url}&closest={timestamp}&sort=closest', + 'http://web.archive.org/web/{timestamp}id_/{url}') +} + + + class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass): @classmethod @@ -46,6 +57,8 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass): handler1 = DefaultResourceHandler(source1, TEST_WARC_PATH) app.add_route('/many', handler1) + app.add_route('/cdx_api', DefaultResourceHandler(SimpleAggregator(ia_cdx), TEST_WARC_PATH)) + source2 = SimpleAggregator({'post': FileIndexSource(TEST_CDX_PATH + 'post-test.cdxj')}) handler2 = DefaultResourceHandler(source2, TEST_WARC_PATH) app.add_route('/posttest', handler2) @@ -87,6 +100,7 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass): '/fallback', '/fallback/postreq', '/live', '/live/postreq', '/many', '/many/postreq', + '/cdx_api', '/cdx_api/postreq', '/posttest', '/posttest/postreq', '/seq', '/seq/postreq', '/allredis', '/allredis/postreq', @@ -194,6 +208,17 @@ class TestResAgg(MementoOverrideTests, FakeRedisTests, BaseTestClass): assert 'ResErrors' not in resp.headers + def test_agg_select_mem_unrewrite_headers(self): + resp = self.testapp.get('/cdx_api/resource?closest=20161103124134&url=http://iana.org/') + + assert resp.headers['WebAgg-Source-Coll'] == 'ia-cdx' + + buff = BytesIO(resp.body) + record = ArcWarcRecordLoader().parse_record_stream(buff, no_record_parse=False) + print(record.status_headers) + assert record.status_headers.get_statuscode() == '302' + assert record.status_headers.get_header('Location') == 'https://www.iana.org/' + @patch('pywb.webagg.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live')) def test_agg_select_live(self): resp = self.testapp.get('/many/resource?url=http://vvork.com/&closest=2016') diff --git a/pywb/webagg/test/test_indexsource.py b/pywb/webagg/test/test_indexsource.py index 6171104b..c9c19c55 100644 --- a/pywb/webagg/test/test_indexsource.py +++ b/pywb/webagg/test/test_indexsource.py @@ -170,7 +170,7 @@ def test_all_not_found(source): expected = '' assert(key_ts_res(res) == expected) if source == remote_sources[0]: - assert('http://x-not-found-x.notfound/' in errs['source']) + assert('http%3A//x-not-found-x.notfound/' in errs['source']) else: assert(errs == {}) diff --git a/pywb/webagg/utils.py b/pywb/webagg/utils.py index 78c08396..d15950d8 100644 --- a/pywb/webagg/utils.py +++ b/pywb/webagg/utils.py @@ -10,6 +10,9 @@ from contextlib import closing from pywb.utils.timeutils import timestamp_to_http_date from pywb.utils.wbexception import BadRequestException +from six.moves.urllib.parse import quote + + LINK_SPLIT = re.compile(',\s*(?=[<])') LINK_SEG_SPLIT = re.compile(';\s*') LINK_URL = re.compile('<(.*)>') @@ -143,7 +146,13 @@ def res_template(template, params, **extra_params): formatter = params.get('_formatter') if not formatter: formatter = ParamFormatter(params) - res = formatter.format(template, url=params.get('url', ''), **extra_params) + + url = params.get('url', '') + qi = template.find('?') + if qi >= 0 and template.find('{url}') > qi: + url = quote(url) + + res = formatter.format(template, url=url, **extra_params) return res