1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

config: custom rules.yaml support and config improvements (addresses #176) (#257)

- allow a custom 'rules.yaml' to be specified via the 'rules_file' config entry;
the specified file is used by both FuzzyMatcher and DefaultRewriter
- default rules file specified by DEFAULT_RULES_FILE in pywb package
- 'archive_paths' is the key for archive paths instead of 'resource'
- 'use_js_obj_proxy' is no longer auto-added to metadata; it is just set per-deployment
This commit is contained in:
Ilya Kreymer 2017-10-18 10:39:18 -07:00 committed by GitHub
parent 61f825330c
commit 1dbabef410
14 changed files with 54 additions and 43 deletions

View File

@ -4,8 +4,8 @@
collections: collections:
pywb: pywb:
index: ./sample_archive/cdx/ index_paths: ./sample_archive/cdx/
resource: ./sample_archive/warcs/ archive_paths: ./sample_archive/warcs/
# Settings for each collection # Settings for each collection
use_js_obj_proxy: true use_js_obj_proxy: true

View File

@ -2,6 +2,8 @@ __version__ = '0.52.0'
DEFAULT_CONFIG = 'pywb/default_config.yaml' DEFAULT_CONFIG = 'pywb/default_config.yaml'
DEFAULT_RULES_FILE = 'pkg://pywb/rules.yaml'
def get_test_dir(): def get_test_dir():
import os import os

View File

@ -66,8 +66,7 @@ class BaseCli(object):
def load(self): def load(self):
if self.r.live: if self.r.live:
self.extra_config['collections'] = {'live': self.extra_config['collections'] = {'live':
{'index': '$live', {'index': '$live'}}
'use_js_obj_proxy': True}}
if self.r.debug: if self.r.debug:
self.extra_config['debug'] = True self.extra_config['debug'] = True

View File

@ -189,9 +189,6 @@ class FrontEndApp(object):
metadata = {'coll': coll, metadata = {'coll': coll,
'type': 'replay'} 'type': 'replay'}
if self.warcserver.config.get('use_js_obj_proxy'):
metadata['use_js_obj_proxy'] = True
if coll in self.warcserver.list_fixed_routes(): if coll in self.warcserver.list_fixed_routes():
metadata.update(self.warcserver.get_coll_config(coll)) metadata.update(self.warcserver.get_coll_config(coll))
else: else:

View File

@ -65,7 +65,9 @@ class RewriterApp(object):
self.frame_mod = None self.frame_mod = None
self.replay_mod = '' self.replay_mod = ''
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod) self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
config=config)
self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod) self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)
if not jinja_env: if not jinja_env:
@ -87,6 +89,8 @@ class RewriterApp(object):
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html')) self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html')) self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))
self.use_js_obj_proxy = config.get('use_js_obj_proxy', False)
self.cookie_tracker = None self.cookie_tracker = None
self.enable_memento = self.config.get('enable_memento') self.enable_memento = self.config.get('enable_memento')
@ -170,7 +174,7 @@ class RewriterApp(object):
urlkey = canonicalize(wb_url.url) urlkey = canonicalize(wb_url.url)
if kwargs.get('use_js_obj_proxy'): if self.use_js_obj_proxy:
content_rw = self.js_proxy_rw content_rw = self.js_proxy_rw
else: else:
content_rw = self.default_rw content_rw = self.default_rw

View File

@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
from pywb.rewrite.rewrite_hls import RewriteHLS from pywb.rewrite.rewrite_hls import RewriteHLS
from pywb.rewrite.rewrite_amf import RewriteAMF from pywb.rewrite.rewrite_amf import RewriteAMF
from pywb import DEFAULT_RULES_FILE
import copy import copy
from werkzeug.useragents import UserAgent from werkzeug.useragents import UserAgent
@ -90,8 +92,9 @@ class DefaultRewriter(BaseContentRewriter):
'js': 'text/javascript' 'js': 'text/javascript'
} }
def __init__(self, rules_file=None, replay_mod=''): def __init__(self, replay_mod='', config=None):
rules_file = rules_file or 'pkg://pywb/rules.yaml' config = config or {}
rules_file = config.get('rules_file', DEFAULT_RULES_FILE)
super(DefaultRewriter, self).__init__(rules_file, replay_mod) super(DefaultRewriter, self).__init__(rules_file, replay_mod)
self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS) self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)

View File

@ -42,7 +42,7 @@ class IndexHandler(object):
def __init__(self, index_source, opts=None, *args, **kwargs): def __init__(self, index_source, opts=None, *args, **kwargs):
self.index_source = index_source self.index_source = index_source
self.opts = opts or {} self.opts = opts or {}
self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml') self.fuzzy = FuzzyMatcher(kwargs.get('rules_file'))
def get_supported_modes(self): def get_supported_modes(self):
return dict(modes=['list_sources', 'index']) return dict(modes=['list_sources', 'index'])
@ -96,8 +96,8 @@ class IndexHandler(object):
#============================================================================= #=============================================================================
class ResourceHandler(IndexHandler): class ResourceHandler(IndexHandler):
def __init__(self, index_source, resource_loaders): def __init__(self, index_source, resource_loaders, rules_file=None):
super(ResourceHandler, self).__init__(index_source) super(ResourceHandler, self).__init__(index_source, rules_file=rules_file)
self.resource_loaders = resource_loaders self.resource_loaders = resource_loaders
def get_supported_modes(self): def get_supported_modes(self):
@ -133,12 +133,14 @@ class ResourceHandler(IndexHandler):
#============================================================================= #=============================================================================
class DefaultResourceHandler(ResourceHandler): class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths='', forward_proxy_prefix=''): def __init__(self, index_source, warc_paths='', forward_proxy_prefix='',
rules_file=''):
loaders = [WARCPathLoader(warc_paths, index_source), loaders = [WARCPathLoader(warc_paths, index_source),
LiveWebLoader(forward_proxy_prefix), LiveWebLoader(forward_proxy_prefix),
VideoLoader() VideoLoader()
] ]
super(DefaultResourceHandler, self).__init__(index_source, loaders) super(DefaultResourceHandler, self).__init__(index_source, loaders,
rules_file=rules_file)
#============================================================================= #=============================================================================

View File

@ -1,5 +1,6 @@
from warcio.utils import to_native_str from warcio.utils import to_native_str
from pywb.utils.loaders import load_yaml_config from pywb.utils.loaders import load_yaml_config
from pywb import DEFAULT_RULES_FILE
import re import re
import os import os
@ -24,7 +25,8 @@ class FuzzyMatcher(object):
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key', FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
'url', 'matchType', 'filter') 'url', 'matchType', 'filter')
def __init__(self, filename): def __init__(self, filename=None):
filename = filename or DEFAULT_RULES_FILE
config = load_yaml_config(filename) config = load_yaml_config(filename)
self.rules = [] self.rules = []
for rule in config.get('rules'): for rule in config.get('rules'):

View File

@ -26,7 +26,7 @@ class TestFuzzy(object):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
cls.source = SimpleAggregator({'source': EchoParamsSource()}) cls.source = SimpleAggregator({'source': EchoParamsSource()})
cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml') cls.fuzzy = FuzzyMatcher()
def get_params(self, url, actual_url, mime='text/html'): def get_params(self, url, actual_url, mime='text/html'):
params = {'url': url, params = {'url': url,

View File

@ -46,18 +46,18 @@ collections:
# Local Dir CDX # Local Dir CDX
local: local:
index: ./local/indexes index: ./local/indexes
resource: ./local/data archive_paths: ./local/data
local_file: local_file:
index: ./local/indexes/file.cdxj index: ./local/indexes/file.cdxj
resource: ./local/data archive_paths: ./local/data
# Sequence # Sequence
many_seq: many_seq:
sequence: sequence:
- -
index: ./local/indexes index: ./local/indexes
resource: ./local/data archive_paths: ./local/data
name: local name: local
- -

View File

@ -58,6 +58,8 @@ class WarcServer(BaseWarcServer):
self.index_paths = self.init_paths('index_paths') self.index_paths = self.init_paths('index_paths')
self.archive_paths = self.init_paths('archive_paths', self.root_dir) self.archive_paths = self.init_paths('archive_paths', self.root_dir)
self.rules_file = self.config.get('rules_file', '')
self.auto_handler = None self.auto_handler = None
if self.config.get('enable_auto_colls', True): if self.config.get('enable_auto_colls', True):
@ -98,7 +100,8 @@ class WarcServer(BaseWarcServer):
base_dir=self.index_paths, base_dir=self.index_paths,
config=self.config) config=self.config)
return DefaultResourceHandler(dir_source, self.archive_paths) return DefaultResourceHandler(dir_source, self.archive_paths,
rules_file=self.rules_file)
def list_fixed_routes(self): def list_fixed_routes(self):
return list(self.fixed_routes.keys()) return list(self.fixed_routes.keys())
@ -149,14 +152,12 @@ class WarcServer(BaseWarcServer):
if isinstance(coll_config, str): if isinstance(coll_config, str):
index = coll_config index = coll_config
resource = None archive_paths = None
elif isinstance(coll_config, dict): elif isinstance(coll_config, dict):
index = coll_config.get('index') index = coll_config.get('index')
if not index: if not index:
index = coll_config.get('index_paths') index = coll_config.get('index_paths')
resource = coll_config.get('resource') archive_paths = coll_config.get('archive_paths')
if not resource:
resource = coll_config.get('archive_paths')
else: else:
raise Exception('collection config must be string or dict') raise Exception('collection config must be string or dict')
@ -179,10 +180,11 @@ class WarcServer(BaseWarcServer):
timeout = int(coll_config.get('timeout', 0)) timeout = int(coll_config.get('timeout', 0))
agg = init_index_agg(index_group, True, timeout) agg = init_index_agg(index_group, True, timeout)
if not resource: if not archive_paths:
resource = self.config.get('archive_paths') archive_paths = self.config.get('archive_paths')
return DefaultResourceHandler(agg, resource) return DefaultResourceHandler(agg, archive_paths,
rules_file=self.rules_file)
def init_sequence(self, coll_name, seq_config): def init_sequence(self, coll_name, seq_config):
if not isinstance(seq_config, list): if not isinstance(seq_config, list):

View File

@ -2,16 +2,14 @@
debug: true debug: true
use_js_obj_proxy: true
collections_root: _test_colls collections_root: _test_colls
collections: collections:
pywb: pywb:
index: ./sample_archive/cdx/ index: ./sample_archive/cdx/
resource: ./sample_archive/warcs/ archive_paths: ./sample_archive/warcs/
with-js-proxy:
index: ./sample_archive/cdx/
use_js_obj_proxy: true
# live collection # live collection
live: $live live: $live

View File

@ -25,7 +25,7 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
def test_proxy_cli_rec(self): def test_proxy_cli_rec(self):
res = wayback(['--proxy', 'test', '--proxy-record']) res = wayback(['--proxy', 'test', '--proxy-record'])
assert res.extra_config['proxy']['recording'] == True assert res.extra_config['proxy']['recording'] == True
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True} assert res.extra_config['collections']['live'] == {'index': '$live'}
def test_proxy_cli_err_coll(self): def test_proxy_cli_err_coll(self):
with pytest.raises(Exception): with pytest.raises(Exception):
@ -37,6 +37,6 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
def test_live_all_cli(self): def test_live_all_cli(self):
res = wayback(['--all-coll', 'all', '--live']) res = wayback(['--all-coll', 'all', '--live'])
assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True}, assert res.extra_config['collections'] == {'live': {'index': '$live'},
'all': '$all'} 'all': '$all'}

View File

@ -253,29 +253,31 @@ class TestWbIntegration(BaseConfigTest):
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_type == 'text/css' assert resp.content_type == 'text/css'
def test_replay_js_mod(self): def test_replay_js_mod_no_obj_proxy(self):
# an empty js file # an empty js file, (ie11 UA no js obj proxy)
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js') resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js',
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_length == 0 assert resp.content_length == 0
assert resp.content_type == 'application/x-javascript' assert resp.content_type == 'application/x-javascript'
def test_replay_js_obj_proxy(self, fmod): def test_replay_js_obj_proxy(self, fmod):
# test js proxy obj with jquery -- no user agent # test js proxy obj with jquery -- no user agent
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod) resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
assert resp.status_int == 200 assert resp.status_int == 200
assert resp.content_length != 0 assert resp.content_length != 0
assert resp.content_type == 'application/x-javascript' assert resp.content_type == 'application/x-javascript'
# test with Chrome user agent # test with Chrome user agent
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
def test_replay_js_ie11_no_obj_proxy(self, fmod): def test_replay_js_ie11_no_obj_proxy(self, fmod):
# IE11 user-agent, no proxy # IE11 user-agent, no proxy
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod, resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}) headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
@ -468,7 +470,7 @@ class TestWbIntegration(BaseConfigTest):
resp = self.testapp.get('/collinfo.json') resp = self.testapp.get('/collinfo.json')
assert resp.content_type == 'application/json' assert resp.content_type == 'application/json'
value = resp.json value = resp.json
assert len(value['fixed']) == 5 assert len(value['fixed']) == 4
assert len(value['dynamic']) == 0 assert len(value['dynamic']) == 0
#def test_invalid_config(self): #def test_invalid_config(self):