mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
- allow custom 'rules.yaml' to be specified via 'rules_file' config entry, and used by FuzzyMatcher and DefaultRewriter
- default rules file specified by DEFAULT_RULES_FILE in pywb package
- 'archive_paths' is the key for archive paths instead of 'resource'
- 'use_js_obj_proxy' not auto-added to metadata, just set per-deployment
This commit is contained in:
parent
61f825330c
commit
1dbabef410
@ -4,8 +4,8 @@
|
|||||||
|
|
||||||
collections:
|
collections:
|
||||||
pywb:
|
pywb:
|
||||||
index: ./sample_archive/cdx/
|
index_paths: ./sample_archive/cdx/
|
||||||
resource: ./sample_archive/warcs/
|
archive_paths: ./sample_archive/warcs/
|
||||||
|
|
||||||
# Settings for each collection
|
# Settings for each collection
|
||||||
use_js_obj_proxy: true
|
use_js_obj_proxy: true
|
||||||
|
@ -2,6 +2,8 @@ __version__ = '0.52.0'
|
|||||||
|
|
||||||
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
||||||
|
|
||||||
|
DEFAULT_RULES_FILE = 'pkg://pywb/rules.yaml'
|
||||||
|
|
||||||
|
|
||||||
def get_test_dir():
|
def get_test_dir():
|
||||||
import os
|
import os
|
||||||
|
@ -66,8 +66,7 @@ class BaseCli(object):
|
|||||||
def load(self):
|
def load(self):
|
||||||
if self.r.live:
|
if self.r.live:
|
||||||
self.extra_config['collections'] = {'live':
|
self.extra_config['collections'] = {'live':
|
||||||
{'index': '$live',
|
{'index': '$live'}}
|
||||||
'use_js_obj_proxy': True}}
|
|
||||||
|
|
||||||
if self.r.debug:
|
if self.r.debug:
|
||||||
self.extra_config['debug'] = True
|
self.extra_config['debug'] = True
|
||||||
|
@ -189,9 +189,6 @@ class FrontEndApp(object):
|
|||||||
metadata = {'coll': coll,
|
metadata = {'coll': coll,
|
||||||
'type': 'replay'}
|
'type': 'replay'}
|
||||||
|
|
||||||
if self.warcserver.config.get('use_js_obj_proxy'):
|
|
||||||
metadata['use_js_obj_proxy'] = True
|
|
||||||
|
|
||||||
if coll in self.warcserver.list_fixed_routes():
|
if coll in self.warcserver.list_fixed_routes():
|
||||||
metadata.update(self.warcserver.get_coll_config(coll))
|
metadata.update(self.warcserver.get_coll_config(coll))
|
||||||
else:
|
else:
|
||||||
|
@ -65,7 +65,9 @@ class RewriterApp(object):
|
|||||||
self.frame_mod = None
|
self.frame_mod = None
|
||||||
self.replay_mod = ''
|
self.replay_mod = ''
|
||||||
|
|
||||||
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod)
|
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
|
||||||
|
config=config)
|
||||||
|
|
||||||
self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)
|
self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)
|
||||||
|
|
||||||
if not jinja_env:
|
if not jinja_env:
|
||||||
@ -87,6 +89,8 @@ class RewriterApp(object):
|
|||||||
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
|
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
|
||||||
self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))
|
self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))
|
||||||
|
|
||||||
|
self.use_js_obj_proxy = config.get('use_js_obj_proxy', False)
|
||||||
|
|
||||||
self.cookie_tracker = None
|
self.cookie_tracker = None
|
||||||
|
|
||||||
self.enable_memento = self.config.get('enable_memento')
|
self.enable_memento = self.config.get('enable_memento')
|
||||||
@ -170,7 +174,7 @@ class RewriterApp(object):
|
|||||||
|
|
||||||
urlkey = canonicalize(wb_url.url)
|
urlkey = canonicalize(wb_url.url)
|
||||||
|
|
||||||
if kwargs.get('use_js_obj_proxy'):
|
if self.use_js_obj_proxy:
|
||||||
content_rw = self.js_proxy_rw
|
content_rw = self.js_proxy_rw
|
||||||
else:
|
else:
|
||||||
content_rw = self.default_rw
|
content_rw = self.default_rw
|
||||||
|
@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
|
|||||||
from pywb.rewrite.rewrite_hls import RewriteHLS
|
from pywb.rewrite.rewrite_hls import RewriteHLS
|
||||||
from pywb.rewrite.rewrite_amf import RewriteAMF
|
from pywb.rewrite.rewrite_amf import RewriteAMF
|
||||||
|
|
||||||
|
from pywb import DEFAULT_RULES_FILE
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
from werkzeug.useragents import UserAgent
|
from werkzeug.useragents import UserAgent
|
||||||
|
|
||||||
@ -90,8 +92,9 @@ class DefaultRewriter(BaseContentRewriter):
|
|||||||
'js': 'text/javascript'
|
'js': 'text/javascript'
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, rules_file=None, replay_mod=''):
|
def __init__(self, replay_mod='', config=None):
|
||||||
rules_file = rules_file or 'pkg://pywb/rules.yaml'
|
config = config or {}
|
||||||
|
rules_file = config.get('rules_file', DEFAULT_RULES_FILE)
|
||||||
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
||||||
self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)
|
self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@ class IndexHandler(object):
|
|||||||
def __init__(self, index_source, opts=None, *args, **kwargs):
|
def __init__(self, index_source, opts=None, *args, **kwargs):
|
||||||
self.index_source = index_source
|
self.index_source = index_source
|
||||||
self.opts = opts or {}
|
self.opts = opts or {}
|
||||||
self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
|
self.fuzzy = FuzzyMatcher(kwargs.get('rules_file'))
|
||||||
|
|
||||||
def get_supported_modes(self):
|
def get_supported_modes(self):
|
||||||
return dict(modes=['list_sources', 'index'])
|
return dict(modes=['list_sources', 'index'])
|
||||||
@ -96,8 +96,8 @@ class IndexHandler(object):
|
|||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class ResourceHandler(IndexHandler):
|
class ResourceHandler(IndexHandler):
|
||||||
def __init__(self, index_source, resource_loaders):
|
def __init__(self, index_source, resource_loaders, rules_file=None):
|
||||||
super(ResourceHandler, self).__init__(index_source)
|
super(ResourceHandler, self).__init__(index_source, rules_file=rules_file)
|
||||||
self.resource_loaders = resource_loaders
|
self.resource_loaders = resource_loaders
|
||||||
|
|
||||||
def get_supported_modes(self):
|
def get_supported_modes(self):
|
||||||
@ -133,12 +133,14 @@ class ResourceHandler(IndexHandler):
|
|||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
class DefaultResourceHandler(ResourceHandler):
|
class DefaultResourceHandler(ResourceHandler):
|
||||||
def __init__(self, index_source, warc_paths='', forward_proxy_prefix=''):
|
def __init__(self, index_source, warc_paths='', forward_proxy_prefix='',
|
||||||
|
rules_file=''):
|
||||||
loaders = [WARCPathLoader(warc_paths, index_source),
|
loaders = [WARCPathLoader(warc_paths, index_source),
|
||||||
LiveWebLoader(forward_proxy_prefix),
|
LiveWebLoader(forward_proxy_prefix),
|
||||||
VideoLoader()
|
VideoLoader()
|
||||||
]
|
]
|
||||||
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
super(DefaultResourceHandler, self).__init__(index_source, loaders,
|
||||||
|
rules_file=rules_file)
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
#=============================================================================
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
|
from pywb import DEFAULT_RULES_FILE
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
@ -24,7 +25,8 @@ class FuzzyMatcher(object):
|
|||||||
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
|
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
|
||||||
'url', 'matchType', 'filter')
|
'url', 'matchType', 'filter')
|
||||||
|
|
||||||
def __init__(self, filename):
|
def __init__(self, filename=None):
|
||||||
|
filename = filename or DEFAULT_RULES_FILE
|
||||||
config = load_yaml_config(filename)
|
config = load_yaml_config(filename)
|
||||||
self.rules = []
|
self.rules = []
|
||||||
for rule in config.get('rules'):
|
for rule in config.get('rules'):
|
||||||
|
@ -26,7 +26,7 @@ class TestFuzzy(object):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls):
|
def setup_class(cls):
|
||||||
cls.source = SimpleAggregator({'source': EchoParamsSource()})
|
cls.source = SimpleAggregator({'source': EchoParamsSource()})
|
||||||
cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
|
cls.fuzzy = FuzzyMatcher()
|
||||||
|
|
||||||
def get_params(self, url, actual_url, mime='text/html'):
|
def get_params(self, url, actual_url, mime='text/html'):
|
||||||
params = {'url': url,
|
params = {'url': url,
|
||||||
|
@ -46,18 +46,18 @@ collections:
|
|||||||
# Local Dir CDX
|
# Local Dir CDX
|
||||||
local:
|
local:
|
||||||
index: ./local/indexes
|
index: ./local/indexes
|
||||||
resource: ./local/data
|
archive_paths: ./local/data
|
||||||
|
|
||||||
local_file:
|
local_file:
|
||||||
index: ./local/indexes/file.cdxj
|
index: ./local/indexes/file.cdxj
|
||||||
resource: ./local/data
|
archive_paths: ./local/data
|
||||||
|
|
||||||
# Sequence
|
# Sequence
|
||||||
many_seq:
|
many_seq:
|
||||||
sequence:
|
sequence:
|
||||||
-
|
-
|
||||||
index: ./local/indexes
|
index: ./local/indexes
|
||||||
resource: ./local/data
|
archive_paths: ./local/data
|
||||||
name: local
|
name: local
|
||||||
|
|
||||||
-
|
-
|
||||||
|
@ -58,6 +58,8 @@ class WarcServer(BaseWarcServer):
|
|||||||
self.index_paths = self.init_paths('index_paths')
|
self.index_paths = self.init_paths('index_paths')
|
||||||
self.archive_paths = self.init_paths('archive_paths', self.root_dir)
|
self.archive_paths = self.init_paths('archive_paths', self.root_dir)
|
||||||
|
|
||||||
|
self.rules_file = self.config.get('rules_file', '')
|
||||||
|
|
||||||
self.auto_handler = None
|
self.auto_handler = None
|
||||||
|
|
||||||
if self.config.get('enable_auto_colls', True):
|
if self.config.get('enable_auto_colls', True):
|
||||||
@ -98,7 +100,8 @@ class WarcServer(BaseWarcServer):
|
|||||||
base_dir=self.index_paths,
|
base_dir=self.index_paths,
|
||||||
config=self.config)
|
config=self.config)
|
||||||
|
|
||||||
return DefaultResourceHandler(dir_source, self.archive_paths)
|
return DefaultResourceHandler(dir_source, self.archive_paths,
|
||||||
|
rules_file=self.rules_file)
|
||||||
|
|
||||||
def list_fixed_routes(self):
|
def list_fixed_routes(self):
|
||||||
return list(self.fixed_routes.keys())
|
return list(self.fixed_routes.keys())
|
||||||
@ -149,14 +152,12 @@ class WarcServer(BaseWarcServer):
|
|||||||
|
|
||||||
if isinstance(coll_config, str):
|
if isinstance(coll_config, str):
|
||||||
index = coll_config
|
index = coll_config
|
||||||
resource = None
|
archive_paths = None
|
||||||
elif isinstance(coll_config, dict):
|
elif isinstance(coll_config, dict):
|
||||||
index = coll_config.get('index')
|
index = coll_config.get('index')
|
||||||
if not index:
|
if not index:
|
||||||
index = coll_config.get('index_paths')
|
index = coll_config.get('index_paths')
|
||||||
resource = coll_config.get('resource')
|
archive_paths = coll_config.get('archive_paths')
|
||||||
if not resource:
|
|
||||||
resource = coll_config.get('archive_paths')
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise Exception('collection config must be string or dict')
|
raise Exception('collection config must be string or dict')
|
||||||
@ -179,10 +180,11 @@ class WarcServer(BaseWarcServer):
|
|||||||
timeout = int(coll_config.get('timeout', 0))
|
timeout = int(coll_config.get('timeout', 0))
|
||||||
agg = init_index_agg(index_group, True, timeout)
|
agg = init_index_agg(index_group, True, timeout)
|
||||||
|
|
||||||
if not resource:
|
if not archive_paths:
|
||||||
resource = self.config.get('archive_paths')
|
archive_paths = self.config.get('archive_paths')
|
||||||
|
|
||||||
return DefaultResourceHandler(agg, resource)
|
return DefaultResourceHandler(agg, archive_paths,
|
||||||
|
rules_file=self.rules_file)
|
||||||
|
|
||||||
def init_sequence(self, coll_name, seq_config):
|
def init_sequence(self, coll_name, seq_config):
|
||||||
if not isinstance(seq_config, list):
|
if not isinstance(seq_config, list):
|
||||||
|
@ -2,16 +2,14 @@
|
|||||||
|
|
||||||
debug: true
|
debug: true
|
||||||
|
|
||||||
|
use_js_obj_proxy: true
|
||||||
|
|
||||||
collections_root: _test_colls
|
collections_root: _test_colls
|
||||||
|
|
||||||
collections:
|
collections:
|
||||||
pywb:
|
pywb:
|
||||||
index: ./sample_archive/cdx/
|
index: ./sample_archive/cdx/
|
||||||
resource: ./sample_archive/warcs/
|
archive_paths: ./sample_archive/warcs/
|
||||||
|
|
||||||
with-js-proxy:
|
|
||||||
index: ./sample_archive/cdx/
|
|
||||||
use_js_obj_proxy: true
|
|
||||||
|
|
||||||
# live collection
|
# live collection
|
||||||
live: $live
|
live: $live
|
||||||
|
@ -25,7 +25,7 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
|
|||||||
def test_proxy_cli_rec(self):
|
def test_proxy_cli_rec(self):
|
||||||
res = wayback(['--proxy', 'test', '--proxy-record'])
|
res = wayback(['--proxy', 'test', '--proxy-record'])
|
||||||
assert res.extra_config['proxy']['recording'] == True
|
assert res.extra_config['proxy']['recording'] == True
|
||||||
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}
|
assert res.extra_config['collections']['live'] == {'index': '$live'}
|
||||||
|
|
||||||
def test_proxy_cli_err_coll(self):
|
def test_proxy_cli_err_coll(self):
|
||||||
with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
@ -37,6 +37,6 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
|
|||||||
|
|
||||||
def test_live_all_cli(self):
|
def test_live_all_cli(self):
|
||||||
res = wayback(['--all-coll', 'all', '--live'])
|
res = wayback(['--all-coll', 'all', '--live'])
|
||||||
assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True},
|
assert res.extra_config['collections'] == {'live': {'index': '$live'},
|
||||||
'all': '$all'}
|
'all': '$all'}
|
||||||
|
|
||||||
|
@ -253,29 +253,31 @@ class TestWbIntegration(BaseConfigTest):
|
|||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert resp.content_type == 'text/css'
|
assert resp.content_type == 'text/css'
|
||||||
|
|
||||||
def test_replay_js_mod(self):
|
def test_replay_js_mod_no_obj_proxy(self):
|
||||||
# an empty js file
|
# an empty js file, (ie11 UA no js obj proxy)
|
||||||
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js')
|
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js',
|
||||||
|
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
||||||
|
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert resp.content_length == 0
|
assert resp.content_length == 0
|
||||||
assert resp.content_type == 'application/x-javascript'
|
assert resp.content_type == 'application/x-javascript'
|
||||||
|
|
||||||
def test_replay_js_obj_proxy(self, fmod):
|
def test_replay_js_obj_proxy(self, fmod):
|
||||||
# test js proxy obj with jquery -- no user agent
|
# test js proxy obj with jquery -- no user agent
|
||||||
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
|
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
|
||||||
|
|
||||||
assert resp.status_int == 200
|
assert resp.status_int == 200
|
||||||
assert resp.content_length != 0
|
assert resp.content_length != 0
|
||||||
assert resp.content_type == 'application/x-javascript'
|
assert resp.content_type == 'application/x-javascript'
|
||||||
|
|
||||||
# test with Chrome user agent
|
# test with Chrome user agent
|
||||||
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||||
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
|
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
|
||||||
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
|
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
|
||||||
|
|
||||||
def test_replay_js_ie11_no_obj_proxy(self, fmod):
|
def test_replay_js_ie11_no_obj_proxy(self, fmod):
|
||||||
# IE11 user-agent, no proxy
|
# IE11 user-agent, no proxy
|
||||||
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||||
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
||||||
|
|
||||||
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
|
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
|
||||||
@ -468,7 +470,7 @@ class TestWbIntegration(BaseConfigTest):
|
|||||||
resp = self.testapp.get('/collinfo.json')
|
resp = self.testapp.get('/collinfo.json')
|
||||||
assert resp.content_type == 'application/json'
|
assert resp.content_type == 'application/json'
|
||||||
value = resp.json
|
value = resp.json
|
||||||
assert len(value['fixed']) == 5
|
assert len(value['fixed']) == 4
|
||||||
assert len(value['dynamic']) == 0
|
assert len(value['dynamic']) == 0
|
||||||
|
|
||||||
#def test_invalid_config(self):
|
#def test_invalid_config(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user