diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index cf1bdf4a..c0d0d5fa 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -128,7 +128,7 @@ class ResourceHandler(IndexHandler): return out_headers, resp, errs except (WbException, ArchiveLoadFailed) as e: last_exc = e - if logger.isEnabledFor(logging.DEBUG): + if True or logger.isEnabledFor(logging.DEBUG): traceback.print_exc() errs[str(loader)] = str(e) diff --git a/pywb/warcserver/index/aggregator.py b/pywb/warcserver/index/aggregator.py index 16cd25e9..68f461f4 100644 --- a/pywb/warcserver/index/aggregator.py +++ b/pywb/warcserver/index/aggregator.py @@ -4,6 +4,7 @@ import gevent import json import time import os +import re from warcio.timeutils import timestamp_now @@ -392,3 +393,33 @@ class BaseRedisMultiKeyIndexSource(BaseAggregator, RedisIndexSource): class RedisMultiKeyIndexSource(SeqAggMixin, BaseRedisMultiKeyIndexSource): pass + +#============================================================================= +class BaseRulesAggregator(BaseSourceListAggregator): + def __init__(self, sources, **kwargs): + super(BaseRulesAggregator, self).__init__(sources, **kwargs) + rules = kwargs.get('rules', []) + + self.rules = [] + + for rule in rules: + match = rule['match'] + name = rule['name'] + self.rules.append((re.compile(match), name)) + + def get_all_sources(self, params): + url = params['url'] + + for rx, name in self.rules: + if rx.match(url): + source = self.sources.get(name) + if source: + return {name: source} + + return [] + + +#============================================================================= +class RulesAggregator(SeqAggMixin, BaseRulesAggregator): + pass + diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 1250f42c..0fb2bdab 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -196,8 +196,7 @@ class RemoteIndexSource(BaseIndexSource): #============================================================================= class LiveIndexSource(BaseIndexSource): - def __init__(self, proxy_url='{url}'): - self.proxy_url = proxy_url + def __init__(self): self._init_sesh(DefaultAdapters.live_adapter) def load_index(self, params): @@ -209,7 +208,7 @@ class LiveIndexSource(BaseIndexSource): cdx['urlkey'] = params.get('key').decode('utf-8') cdx['timestamp'] = timestamp_now() cdx['url'] = params['url'] - cdx['load_url'] = res_template(self.proxy_url, params) + cdx['load_url'] = self.get_load_url(params) cdx['is_live'] = 'true' mime = params.get('content_type', '') @@ -231,6 +230,9 @@ class LiveIndexSource(BaseIndexSource): return iter([cdx]) + def get_load_url(self, params): + return params['url'] + def __repr__(self): return '{0}()'.format(self.__class__.__name__) @@ -259,6 +261,26 @@ class LiveIndexSource(BaseIndexSource): return cls() +#============================================================================= +class LiveRewriteIndexSource(LiveIndexSource): + def __init__(self, match, replace): + super(LiveRewriteIndexSource, self).__init__() + self.rx = re.compile(match) + self.replace = replace + + def get_load_url(self, params): + res = self.rx.sub(self.replace, params['url']) + print(res) + return res + + @classmethod + def init_from_config(cls, config): + if config['type'] != 'live_rw': + return + + return cls(config['match'], config['replace']) + + #============================================================================= class RedisIndexSource(BaseIndexSource): def __init__(self, redis_url=None, redis=None, key_template=None, **kwargs): @@ -579,3 +601,4 @@ class WBMementoIndexSource(MementoIndexSource): @classmethod def _init_id(cls): return 'wb-memento' + diff --git a/pywb/warcserver/index/test/test_indexsource.py b/pywb/warcserver/index/test/test_indexsource.py index d4a71705..f7d8693f 100644 --- a/pywb/warcserver/index/test/test_indexsource.py +++ b/pywb/warcserver/index/test/test_indexsource.py @@ -1,7 +1,7 @@ from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource, MementoIndexSource, RedisIndexSource -from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource +from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource, LiveRewriteIndexSource -from pywb.warcserver.index.aggregator import SimpleAggregator +from pywb.warcserver.index.aggregator import SimpleAggregator, RulesAggregator from warcio.timeutils import timestamp_now @@ -144,6 +144,51 @@ com,instagram)/amaliaulman 20141014162333 http://webenact.rhizome.org/all/201410 assert(key_ts_res(res, 'load_url') == expected) assert(errs == {}) + def test_live_rewrite(self): + url = 'http://example.com/some/path?A=B' + source = LiveRewriteIndexSource(match='https?:\/\/example.com\/(.*)', + replace=r'https://other-host.example.org/\1') + + res, errs = self.query_single_source(source, dict(url=url)) + + assert list(res)[0]['load_url'] == 'https://other-host.example.org/some/path?A=B' + + assert(errs == {}) + + def test_cond_aggregator_rewrite(self): + match_url = 'https?:\/\/example.com\/(.*)' + live_rw = LiveRewriteIndexSource(match=match_url, + replace=r'https://other-host.example.org/\1') + + rules = [dict(match=match_url, name='live_rw'), + dict(match='.*', name='live') + ] + + rules_agg = RulesAggregator(sources={'live': LiveIndexSource(), + 'live_rw': live_rw}, + rules=rules) + + # Rewrite Matching url + res, errs = rules_agg({'url': 'http://example.com/some/path?A=B'}) + cdx = list(res)[0] + + # assert rewriting source used + assert cdx['load_url'] == 'https://other-host.example.org/some/path?A=B' + assert cdx['source'] == 'live_rw' + + assert(errs == {}) + + # Don't rewrite other urls + res, errs = rules_agg({'url': 'http://example.net/some/path?A=B'}) + cdx = list(res)[0] + + # assert live source used + assert cdx['load_url'] == 'http://example.net/some/path?A=B' + assert cdx['source'] == 'live' + + assert(errs == {}) + + # Errors -- Not Found All def test_all_not_found(self, all_source): url = 'http://x-not-found-x.notfound/' diff --git a/pywb/warcserver/test/test_warcserver.py b/pywb/warcserver/test/test_warcserver.py index 8a1fe374..2d5b4983 100644 --- a/pywb/warcserver/test/test_warcserver.py +++ b/pywb/warcserver/test/test_warcserver.py @@ -5,6 +5,7 @@ import os from pywb.warcserver.index.indexsource import RemoteIndexSource, LiveIndexSource, MementoIndexSource from pywb.warcserver.index.indexsource import WBMementoIndexSource, FileIndexSource from pywb.warcserver.index.aggregator import BaseSourceListAggregator, DirectoryIndexSource +from pywb.warcserver.index.aggregator import RulesAggregator from pywb.warcserver.handlers import ResourceHandler, HandlerSeq @@ -50,7 +51,7 @@ class TestWarcServer(TempDirTests, BaseTestClass): return handler.index_source.sources def test_list_static(self): - assert len(self.loader.list_fixed_routes()) == 13 + assert len(self.loader.list_fixed_routes()) == 14 def test_list_dynamic(self): assert set(self.loader.list_dynamic_routes()) == set(['auto1', 'auto2']) @@ -125,3 +126,18 @@ class TestWarcServer(TempDirTests, BaseTestClass): assert len(sources) == 1 assert isinstance(sources['live'], LiveIndexSource) + def test_rules_agg(self): + handler = self.loader.fixed_routes.get('many_rules') + assert(handler) + assert isinstance(handler.index_source, RulesAggregator) + + res = handler.index_source.get_all_sources({'url': 'http://example.com/path'}) + assert len(res) == 1 + assert list(res.keys()) == ['local'] + + res = handler.index_source.get_all_sources({'url': 'http://httpbin.org/'}) + assert len(res) == 1 + assert list(res.keys()) == ['liveweb'] + + + diff --git a/pywb/warcserver/test/test_warcserver_config.yaml b/pywb/warcserver/test/test_warcserver_config.yaml index c6b0dfb6..d8f3135b 100644 --- a/pywb/warcserver/test/test_warcserver_config.yaml +++ b/pywb/warcserver/test/test_warcserver_config.yaml @@ -1,3 +1,5 @@ +debug: true + collections: # Live Index @@ -45,6 +47,23 @@ collections: timeout: 10 + # many sources, conditional rules + many_rules: + index_group: + local: + path: ./local/indexes + archive_paths: ./local/data + type: file + + liveweb: live + + rules: + - match: 'https?:\/\/example\.com\/.*' + name: local + + - match: '.*' + name: liveweb + # Local Dir CDX local: index: ./local/indexes diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index 9f9233d1..ead99625 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -3,13 +3,14 @@ from pywb.utils.loaders import load_yaml_config, load_overlay_config from pywb.warcserver.basewarcserver import BaseWarcServer from pywb.warcserver.index.aggregator import CacheDirectoryIndexSource, RedisMultiKeyIndexSource -from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator +from pywb.warcserver.index.aggregator import GeventTimeoutAggregator, SimpleAggregator, RulesAggregator from pywb.warcserver.handlers import DefaultResourceHandler, HandlerSeq from pywb.warcserver.index.indexsource import FileIndexSource, RemoteIndexSource -from pywb.warcserver.index.indexsource import MementoIndexSource, RedisIndexSource -from pywb.warcserver.index.indexsource import LiveIndexSource, WBMementoIndexSource +from pywb.warcserver.index.indexsource import MementoIndexSource, WBMementoIndexSource +from pywb.warcserver.index.indexsource import LiveIndexSource, LiveRewriteIndexSource +from pywb.warcserver.index.indexsource import RedisIndexSource from pywb.warcserver.index.zipnum import ZipNumIndexSource from pywb import DEFAULT_CONFIG @@ -27,6 +28,7 @@ SOURCE_LIST = [LiveIndexSource, FileIndexSource, RemoteIndexSource, ZipNumIndexSource, + LiveRewriteIndexSource, ] @@ -137,6 +139,7 @@ class WarcServer(BaseWarcServer): handler = self.load_coll(name, coll_config) except: print('Invalid Collection: ' + name) + raise if self.debug: import traceback traceback.print_exc() @@ -178,7 +181,8 @@ class WarcServer(BaseWarcServer): raise Exception('no index, index_group or sequence found') timeout = int(coll_config.get('timeout', 0)) - agg = init_index_agg(index_group, True, timeout) + rules = coll_config.get('rules') + agg = init_index_agg(index_group, True, timeout, rules=rules) if not archive_paths: archive_paths = self.config.get('archive_paths') @@ -232,11 +236,15 @@ def register_source(source_cls, end=False): # ============================================================================ -def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None): +def init_index_agg(source_configs, use_gevent=False, timeout=0, source_list=None, + rules=None): sources = {} for n, v in iteritems(source_configs): sources[n] = init_index_source(v, source_list=source_list) + if rules: + return RulesAggregator(sources, rules=rules) + if use_gevent: return GeventTimeoutAggregator(sources, timeout=timeout) else: