1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

config: custom rules.yaml support and config improvements (addresses #176) (#257)

- allow a custom 'rules.yaml' to be specified via the 'rules_file' config entry;
when set, it is used by both FuzzyMatcher and DefaultRewriter
- default rules file specified by DEFAULT_RULES_FILE in pywb package
- 'archive_paths' replaces 'resource' as the config key for archive paths
- 'use_js_obj_proxy' is no longer auto-added to collection metadata; it is read from the config and set per-deployment
This commit is contained in:
Ilya Kreymer 2017-10-18 10:39:18 -07:00 committed by GitHub
parent 61f825330c
commit 1dbabef410
14 changed files with 54 additions and 43 deletions

View File

@ -4,8 +4,8 @@
collections:
pywb:
index: ./sample_archive/cdx/
resource: ./sample_archive/warcs/
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
# Settings for each collection
use_js_obj_proxy: true

View File

@ -2,6 +2,8 @@ __version__ = '0.52.0'
DEFAULT_CONFIG = 'pywb/default_config.yaml'
DEFAULT_RULES_FILE = 'pkg://pywb/rules.yaml'
def get_test_dir():
import os

View File

@ -66,8 +66,7 @@ class BaseCli(object):
def load(self):
if self.r.live:
self.extra_config['collections'] = {'live':
{'index': '$live',
'use_js_obj_proxy': True}}
{'index': '$live'}}
if self.r.debug:
self.extra_config['debug'] = True

View File

@ -189,9 +189,6 @@ class FrontEndApp(object):
metadata = {'coll': coll,
'type': 'replay'}
if self.warcserver.config.get('use_js_obj_proxy'):
metadata['use_js_obj_proxy'] = True
if coll in self.warcserver.list_fixed_routes():
metadata.update(self.warcserver.get_coll_config(coll))
else:

View File

@ -65,7 +65,9 @@ class RewriterApp(object):
self.frame_mod = None
self.replay_mod = ''
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod)
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
config=config)
self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)
if not jinja_env:
@ -87,6 +89,8 @@ class RewriterApp(object):
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))
self.use_js_obj_proxy = config.get('use_js_obj_proxy', False)
self.cookie_tracker = None
self.enable_memento = self.config.get('enable_memento')
@ -170,7 +174,7 @@ class RewriterApp(object):
urlkey = canonicalize(wb_url.url)
if kwargs.get('use_js_obj_proxy'):
if self.use_js_obj_proxy:
content_rw = self.js_proxy_rw
else:
content_rw = self.default_rw

View File

@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
from pywb.rewrite.rewrite_hls import RewriteHLS
from pywb.rewrite.rewrite_amf import RewriteAMF
from pywb import DEFAULT_RULES_FILE
import copy
from werkzeug.useragents import UserAgent
@ -90,8 +92,9 @@ class DefaultRewriter(BaseContentRewriter):
'js': 'text/javascript'
}
def __init__(self, rules_file=None, replay_mod=''):
rules_file = rules_file or 'pkg://pywb/rules.yaml'
def __init__(self, replay_mod='', config=None):
config = config or {}
rules_file = config.get('rules_file', DEFAULT_RULES_FILE)
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)

View File

@ -42,7 +42,7 @@ class IndexHandler(object):
def __init__(self, index_source, opts=None, *args, **kwargs):
self.index_source = index_source
self.opts = opts or {}
self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
self.fuzzy = FuzzyMatcher(kwargs.get('rules_file'))
def get_supported_modes(self):
return dict(modes=['list_sources', 'index'])
@ -96,8 +96,8 @@ class IndexHandler(object):
#=============================================================================
class ResourceHandler(IndexHandler):
def __init__(self, index_source, resource_loaders):
super(ResourceHandler, self).__init__(index_source)
def __init__(self, index_source, resource_loaders, rules_file=None):
super(ResourceHandler, self).__init__(index_source, rules_file=rules_file)
self.resource_loaders = resource_loaders
def get_supported_modes(self):
@ -133,12 +133,14 @@ class ResourceHandler(IndexHandler):
#=============================================================================
class DefaultResourceHandler(ResourceHandler):
def __init__(self, index_source, warc_paths='', forward_proxy_prefix=''):
def __init__(self, index_source, warc_paths='', forward_proxy_prefix='',
rules_file=''):
loaders = [WARCPathLoader(warc_paths, index_source),
LiveWebLoader(forward_proxy_prefix),
VideoLoader()
]
super(DefaultResourceHandler, self).__init__(index_source, loaders)
super(DefaultResourceHandler, self).__init__(index_source, loaders,
rules_file=rules_file)
#=============================================================================

View File

@ -1,5 +1,6 @@
from warcio.utils import to_native_str
from pywb.utils.loaders import load_yaml_config
from pywb import DEFAULT_RULES_FILE
import re
import os
@ -24,7 +25,8 @@ class FuzzyMatcher(object):
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
'url', 'matchType', 'filter')
def __init__(self, filename):
def __init__(self, filename=None):
filename = filename or DEFAULT_RULES_FILE
config = load_yaml_config(filename)
self.rules = []
for rule in config.get('rules'):

View File

@ -26,7 +26,7 @@ class TestFuzzy(object):
@classmethod
def setup_class(cls):
cls.source = SimpleAggregator({'source': EchoParamsSource()})
cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
cls.fuzzy = FuzzyMatcher()
def get_params(self, url, actual_url, mime='text/html'):
params = {'url': url,

View File

@ -46,18 +46,18 @@ collections:
# Local Dir CDX
local:
index: ./local/indexes
resource: ./local/data
archive_paths: ./local/data
local_file:
index: ./local/indexes/file.cdxj
resource: ./local/data
archive_paths: ./local/data
# Sequence
many_seq:
sequence:
-
index: ./local/indexes
resource: ./local/data
archive_paths: ./local/data
name: local
-

View File

@ -58,6 +58,8 @@ class WarcServer(BaseWarcServer):
self.index_paths = self.init_paths('index_paths')
self.archive_paths = self.init_paths('archive_paths', self.root_dir)
self.rules_file = self.config.get('rules_file', '')
self.auto_handler = None
if self.config.get('enable_auto_colls', True):
@ -98,7 +100,8 @@ class WarcServer(BaseWarcServer):
base_dir=self.index_paths,
config=self.config)
return DefaultResourceHandler(dir_source, self.archive_paths)
return DefaultResourceHandler(dir_source, self.archive_paths,
rules_file=self.rules_file)
def list_fixed_routes(self):
return list(self.fixed_routes.keys())
@ -149,14 +152,12 @@ class WarcServer(BaseWarcServer):
if isinstance(coll_config, str):
index = coll_config
resource = None
archive_paths = None
elif isinstance(coll_config, dict):
index = coll_config.get('index')
if not index:
index = coll_config.get('index_paths')
resource = coll_config.get('resource')
if not resource:
resource = coll_config.get('archive_paths')
archive_paths = coll_config.get('archive_paths')
else:
raise Exception('collection config must be string or dict')
@ -179,10 +180,11 @@ class WarcServer(BaseWarcServer):
timeout = int(coll_config.get('timeout', 0))
agg = init_index_agg(index_group, True, timeout)
if not resource:
resource = self.config.get('archive_paths')
if not archive_paths:
archive_paths = self.config.get('archive_paths')
return DefaultResourceHandler(agg, resource)
return DefaultResourceHandler(agg, archive_paths,
rules_file=self.rules_file)
def init_sequence(self, coll_name, seq_config):
if not isinstance(seq_config, list):

View File

@ -2,16 +2,14 @@
debug: true
use_js_obj_proxy: true
collections_root: _test_colls
collections:
pywb:
index: ./sample_archive/cdx/
resource: ./sample_archive/warcs/
with-js-proxy:
index: ./sample_archive/cdx/
use_js_obj_proxy: true
archive_paths: ./sample_archive/warcs/
# live collection
live: $live

View File

@ -25,7 +25,7 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
def test_proxy_cli_rec(self):
res = wayback(['--proxy', 'test', '--proxy-record'])
assert res.extra_config['proxy']['recording'] == True
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}
assert res.extra_config['collections']['live'] == {'index': '$live'}
def test_proxy_cli_err_coll(self):
with pytest.raises(Exception):
@ -37,6 +37,6 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
def test_live_all_cli(self):
res = wayback(['--all-coll', 'all', '--live'])
assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True},
assert res.extra_config['collections'] == {'live': {'index': '$live'},
'all': '$all'}

View File

@ -253,29 +253,31 @@ class TestWbIntegration(BaseConfigTest):
assert resp.status_int == 200
assert resp.content_type == 'text/css'
def test_replay_js_mod(self):
# an empty js file
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js')
def test_replay_js_mod_no_obj_proxy(self):
# an empty js file, (ie11 UA no js obj proxy)
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js',
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
assert resp.status_int == 200
assert resp.content_length == 0
assert resp.content_type == 'application/x-javascript'
def test_replay_js_obj_proxy(self, fmod):
# test js proxy obj with jquery -- no user agent
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
assert resp.status_int == 200
assert resp.content_length != 0
assert resp.content_type == 'application/x-javascript'
# test with Chrome user agent
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
def test_replay_js_ie11_no_obj_proxy(self, fmod):
# IE11 user-agent, no proxy
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
@ -468,7 +470,7 @@ class TestWbIntegration(BaseConfigTest):
resp = self.testapp.get('/collinfo.json')
assert resp.content_type == 'application/json'
value = resp.json
assert len(value['fixed']) == 5
assert len(value['fixed']) == 4
assert len(value['dynamic']) == 0
#def test_invalid_config(self):