From 1dbabef4100310fd3a9f9053528418520b093dc7 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 18 Oct 2017 10:39:18 -0700 Subject: [PATCH] config: custom rules.yaml support and config improvements (addresses #176) (#257) - allow custom 'rules.yaml' to be specified via 'rules_file' config entry, and used by FuzzyMatcher and DefaultRewriter - default rules file specified by DEFAULT_RULES_FILE in pywb package - 'archive_paths' is the key for archive paths instead of 'resource' - 'use_js_obj_proxy' not auto-added to metadata, just set per-deployment --- config.yaml | 4 ++-- pywb/__init__.py | 2 ++ pywb/apps/cli.py | 3 +-- pywb/apps/frontendapp.py | 3 --- pywb/apps/rewriterapp.py | 8 ++++++-- pywb/rewrite/default_rewriter.py | 7 +++++-- pywb/warcserver/handlers.py | 12 +++++++----- pywb/warcserver/index/fuzzymatcher.py | 4 +++- .../warcserver/index/test/test_fuzzymatcher.py | 2 +- .../test/test_warcserver_config.yaml | 6 +++--- pywb/warcserver/warcserver.py | 18 ++++++++++-------- tests/config_test.yaml | 8 +++----- tests/test_cli.py | 4 ++-- tests/test_integration.py | 16 +++++++++------- 14 files changed, 54 insertions(+), 43 deletions(-) diff --git a/config.yaml b/config.yaml index bb0b27b5..420f0d26 100644 --- a/config.yaml +++ b/config.yaml @@ -4,8 +4,8 @@ collections: pywb: - index: ./sample_archive/cdx/ - resource: ./sample_archive/warcs/ + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ # Settings for each collection use_js_obj_proxy: true diff --git a/pywb/__init__.py b/pywb/__init__.py index 2c20cbe7..06fd5597 100644 --- a/pywb/__init__.py +++ b/pywb/__init__.py @@ -2,6 +2,8 @@ __version__ = '0.52.0' DEFAULT_CONFIG = 'pywb/default_config.yaml' +DEFAULT_RULES_FILE = 'pkg://pywb/rules.yaml' + def get_test_dir(): import os diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index ec098321..ae5f786f 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -66,8 +66,7 @@ class BaseCli(object): def load(self): if self.r.live: self.extra_config['collections'] = {'live': - {'index': '$live', - 'use_js_obj_proxy': True}} + {'index': '$live'}} if self.r.debug: self.extra_config['debug'] = True diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 3c581d3f..e7c190e7 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -189,9 +189,6 @@ class FrontEndApp(object): metadata = {'coll': coll, 'type': 'replay'} - if self.warcserver.config.get('use_js_obj_proxy'): - metadata['use_js_obj_proxy'] = True - if coll in self.warcserver.list_fixed_routes(): metadata.update(self.warcserver.get_coll_config(coll)) else: diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index f90b5a19..8bf64e1f 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -65,7 +65,9 @@ class RewriterApp(object): self.frame_mod = None self.replay_mod = '' - self.default_rw = DefaultRewriter(replay_mod=self.replay_mod) + self.default_rw = DefaultRewriter(replay_mod=self.replay_mod, + config=config) + self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod) if not jinja_env: @@ -87,6 +89,8 @@ class RewriterApp(object): self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html')) self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html')) + self.use_js_obj_proxy = config.get('use_js_obj_proxy', False) + self.cookie_tracker = None self.enable_memento = self.config.get('enable_memento') @@ -170,7 +174,7 @@ class RewriterApp(object): urlkey = canonicalize(wb_url.url) - if kwargs.get('use_js_obj_proxy'): + if self.use_js_obj_proxy: content_rw = self.js_proxy_rw else: content_rw = self.default_rw diff --git a/pywb/rewrite/default_rewriter.py b/pywb/rewrite/default_rewriter.py index c41e19cc..e2ce966f 100644 --- a/pywb/rewrite/default_rewriter.py +++ b/pywb/rewrite/default_rewriter.py @@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH from pywb.rewrite.rewrite_hls import RewriteHLS from pywb.rewrite.rewrite_amf import RewriteAMF +from pywb import DEFAULT_RULES_FILE + import copy from werkzeug.useragents import UserAgent @@ -90,8 +92,9 @@ class DefaultRewriter(BaseContentRewriter): 'js': 'text/javascript' } - def __init__(self, rules_file=None, replay_mod=''): - rules_file = rules_file or 'pkg://pywb/rules.yaml' + def __init__(self, replay_mod='', config=None): + config = config or {} + rules_file = config.get('rules_file', DEFAULT_RULES_FILE) super(DefaultRewriter, self).__init__(rules_file, replay_mod) self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS) diff --git a/pywb/warcserver/handlers.py b/pywb/warcserver/handlers.py index 35507f3e..8b5f61e1 100644 --- a/pywb/warcserver/handlers.py +++ b/pywb/warcserver/handlers.py @@ -42,7 +42,7 @@ class IndexHandler(object): def __init__(self, index_source, opts=None, *args, **kwargs): self.index_source = index_source self.opts = opts or {} - self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml') + self.fuzzy = FuzzyMatcher(kwargs.get('rules_file')) def get_supported_modes(self): return dict(modes=['list_sources', 'index']) @@ -96,8 +96,8 @@ class IndexHandler(object): #============================================================================= class ResourceHandler(IndexHandler): - def __init__(self, index_source, resource_loaders): - super(ResourceHandler, self).__init__(index_source) + def __init__(self, index_source, resource_loaders, rules_file=None): + super(ResourceHandler, self).__init__(index_source, rules_file=rules_file) self.resource_loaders = resource_loaders def get_supported_modes(self): @@ -133,12 +133,14 @@ class ResourceHandler(IndexHandler): #============================================================================= class DefaultResourceHandler(ResourceHandler): - def __init__(self, index_source, warc_paths='', forward_proxy_prefix=''): + def __init__(self, index_source, warc_paths='', forward_proxy_prefix='', + rules_file=''): loaders = [WARCPathLoader(warc_paths, index_source), LiveWebLoader(forward_proxy_prefix), VideoLoader() ] - super(DefaultResourceHandler, self).__init__(index_source, loaders) + super(DefaultResourceHandler, self).__init__(index_source, loaders, + rules_file=rules_file) #============================================================================= diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index a38ca170..eca0c490 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -1,5 +1,6 @@ from warcio.utils import to_native_str from pywb.utils.loaders import load_yaml_config +from pywb import DEFAULT_RULES_FILE import re import os @@ -24,7 +25,8 @@ class FuzzyMatcher(object): FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key', 'url', 'matchType', 'filter') - def __init__(self, filename): + def __init__(self, filename=None): + filename = filename or DEFAULT_RULES_FILE config = load_yaml_config(filename) self.rules = [] for rule in config.get('rules'): diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index f1285692..eea00c43 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -26,7 +26,7 @@ class TestFuzzy(object): @classmethod def setup_class(cls): cls.source = SimpleAggregator({'source': EchoParamsSource()}) - cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml') + cls.fuzzy = FuzzyMatcher() def get_params(self, url, actual_url, mime='text/html'): params = {'url': url, diff --git a/pywb/warcserver/test/test_warcserver_config.yaml b/pywb/warcserver/test/test_warcserver_config.yaml index d5515907..cc61b58f 100644 --- a/pywb/warcserver/test/test_warcserver_config.yaml +++ b/pywb/warcserver/test/test_warcserver_config.yaml @@ -46,18 +46,18 @@ collections: # Local Dir CDX local: index: ./local/indexes - resource: ./local/data + archive_paths: ./local/data local_file: index: ./local/indexes/file.cdxj - resource: ./local/data + archive_paths: ./local/data # Sequence many_seq: sequence: - index: ./local/indexes - resource: ./local/data + archive_paths: ./local/data name: local - diff --git a/pywb/warcserver/warcserver.py b/pywb/warcserver/warcserver.py index e51665e0..993f9b6c 100644 --- a/pywb/warcserver/warcserver.py +++ b/pywb/warcserver/warcserver.py @@ -58,6 +58,8 @@ class WarcServer(BaseWarcServer): self.index_paths = self.init_paths('index_paths') self.archive_paths = self.init_paths('archive_paths', self.root_dir) + self.rules_file = self.config.get('rules_file', '') + self.auto_handler = None if self.config.get('enable_auto_colls', True): @@ -98,7 +100,8 @@ class WarcServer(BaseWarcServer): base_dir=self.index_paths, config=self.config) - return DefaultResourceHandler(dir_source, self.archive_paths) + return DefaultResourceHandler(dir_source, self.archive_paths, + rules_file=self.rules_file) def list_fixed_routes(self): return list(self.fixed_routes.keys()) @@ -149,14 +152,12 @@ class WarcServer(BaseWarcServer): if isinstance(coll_config, str): index = coll_config - resource = None + archive_paths = None elif isinstance(coll_config, dict): index = coll_config.get('index') if not index: index = coll_config.get('index_paths') - resource = coll_config.get('resource') - if not resource: - resource = coll_config.get('archive_paths') + archive_paths = coll_config.get('archive_paths') else: raise Exception('collection config must be string or dict') @@ -179,10 +180,11 @@ class WarcServer(BaseWarcServer): timeout = int(coll_config.get('timeout', 0)) agg = init_index_agg(index_group, True, timeout) - if not resource: - resource = self.config.get('archive_paths') + if not archive_paths: + archive_paths = self.config.get('archive_paths') - return DefaultResourceHandler(agg, resource) + return DefaultResourceHandler(agg, archive_paths, + rules_file=self.rules_file) def init_sequence(self, coll_name, seq_config): if not isinstance(seq_config, list): diff --git a/tests/config_test.yaml b/tests/config_test.yaml index ca67e2e2..f83c67bf 100644 --- a/tests/config_test.yaml +++ b/tests/config_test.yaml @@ -2,16 +2,14 @@ debug: true +use_js_obj_proxy: true + collections_root: _test_colls collections: pywb: index: ./sample_archive/cdx/ - resource: ./sample_archive/warcs/ - - with-js-proxy: - index: ./sample_archive/cdx/ - use_js_obj_proxy: true + archive_paths: ./sample_archive/warcs/ # live collection live: $live diff --git a/tests/test_cli.py b/tests/test_cli.py index 4437b6fd..8a7d2b4c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -25,7 +25,7 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass): def test_proxy_cli_rec(self): res = wayback(['--proxy', 'test', '--proxy-record']) assert res.extra_config['proxy']['recording'] == True - assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True} + assert res.extra_config['collections']['live'] == {'index': '$live'} def test_proxy_cli_err_coll(self): with pytest.raises(Exception): @@ -37,6 +37,6 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass): def test_live_all_cli(self): res = wayback(['--all-coll', 'all', '--live']) - assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True}, + assert res.extra_config['collections'] == {'live': {'index': '$live'}, 'all': '$all'} diff --git a/tests/test_integration.py b/tests/test_integration.py index 6e0fe345..6cf70045 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -253,29 +253,31 @@ class TestWbIntegration(BaseConfigTest): assert resp.status_int == 200 assert resp.content_type == 'text/css' - def test_replay_js_mod(self): - # an empty js file - resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js') + def test_replay_js_mod_no_obj_proxy(self): + # an empty js file, (ie11 UA no js obj proxy) + resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js', + headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}) + assert resp.status_int == 200 assert resp.content_length == 0 assert resp.content_type == 'application/x-javascript' def test_replay_js_obj_proxy(self, fmod): # test js proxy obj with jquery -- no user agent - resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod) + resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod) assert resp.status_int == 200 assert resp.content_length != 0 assert resp.content_type == 'application/x-javascript' # test with Chrome user agent - resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, + resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) assert 'let window = _____WB$wombat$assign$function_____(' in resp.text def test_replay_js_ie11_no_obj_proxy(self, fmod): # IE11 user-agent, no proxy - resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, + resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}) assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text @@ -468,7 +470,7 @@ class TestWbIntegration(BaseConfigTest): resp = self.testapp.get('/collinfo.json') assert resp.content_type == 'application/json' value = resp.json - assert len(value['fixed']) == 5 + assert len(value['fixed']) == 4 assert len(value['dynamic']) == 0 #def test_invalid_config(self):