mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
- allow a custom 'rules.yaml' to be specified via the 'rules_file' config entry, and used by FuzzyMatcher and DefaultRewriter
- default rules file is specified by DEFAULT_RULES_FILE in the pywb package
- 'archive_paths' is the key for archive paths instead of 'resource'
- 'use_js_obj_proxy' is not auto-added to metadata, just set per-deployment
This commit is contained in:
parent
61f825330c
commit
1dbabef410
@ -4,8 +4,8 @@
|
||||
|
||||
collections:
|
||||
pywb:
|
||||
index: ./sample_archive/cdx/
|
||||
resource: ./sample_archive/warcs/
|
||||
index_paths: ./sample_archive/cdx/
|
||||
archive_paths: ./sample_archive/warcs/
|
||||
|
||||
# Settings for each collection
|
||||
use_js_obj_proxy: true
|
||||
|
@ -2,6 +2,8 @@ __version__ = '0.52.0'
|
||||
|
||||
DEFAULT_CONFIG = 'pywb/default_config.yaml'
|
||||
|
||||
DEFAULT_RULES_FILE = 'pkg://pywb/rules.yaml'
|
||||
|
||||
|
||||
def get_test_dir():
|
||||
import os
|
||||
|
@ -66,8 +66,7 @@ class BaseCli(object):
|
||||
def load(self):
|
||||
if self.r.live:
|
||||
self.extra_config['collections'] = {'live':
|
||||
{'index': '$live',
|
||||
'use_js_obj_proxy': True}}
|
||||
{'index': '$live'}}
|
||||
|
||||
if self.r.debug:
|
||||
self.extra_config['debug'] = True
|
||||
|
@ -189,9 +189,6 @@ class FrontEndApp(object):
|
||||
metadata = {'coll': coll,
|
||||
'type': 'replay'}
|
||||
|
||||
if self.warcserver.config.get('use_js_obj_proxy'):
|
||||
metadata['use_js_obj_proxy'] = True
|
||||
|
||||
if coll in self.warcserver.list_fixed_routes():
|
||||
metadata.update(self.warcserver.get_coll_config(coll))
|
||||
else:
|
||||
|
@ -65,7 +65,9 @@ class RewriterApp(object):
|
||||
self.frame_mod = None
|
||||
self.replay_mod = ''
|
||||
|
||||
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod)
|
||||
self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
|
||||
config=config)
|
||||
|
||||
self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)
|
||||
|
||||
if not jinja_env:
|
||||
@ -87,6 +89,8 @@ class RewriterApp(object):
|
||||
self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
|
||||
self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))
|
||||
|
||||
self.use_js_obj_proxy = config.get('use_js_obj_proxy', False)
|
||||
|
||||
self.cookie_tracker = None
|
||||
|
||||
self.enable_memento = self.config.get('enable_memento')
|
||||
@ -170,7 +174,7 @@ class RewriterApp(object):
|
||||
|
||||
urlkey = canonicalize(wb_url.url)
|
||||
|
||||
if kwargs.get('use_js_obj_proxy'):
|
||||
if self.use_js_obj_proxy:
|
||||
content_rw = self.js_proxy_rw
|
||||
else:
|
||||
content_rw = self.default_rw
|
||||
|
@ -15,6 +15,8 @@ from pywb.rewrite.rewrite_dash import RewriteDASH
|
||||
from pywb.rewrite.rewrite_hls import RewriteHLS
|
||||
from pywb.rewrite.rewrite_amf import RewriteAMF
|
||||
|
||||
from pywb import DEFAULT_RULES_FILE
|
||||
|
||||
import copy
|
||||
from werkzeug.useragents import UserAgent
|
||||
|
||||
@ -90,8 +92,9 @@ class DefaultRewriter(BaseContentRewriter):
|
||||
'js': 'text/javascript'
|
||||
}
|
||||
|
||||
def __init__(self, rules_file=None, replay_mod=''):
|
||||
rules_file = rules_file or 'pkg://pywb/rules.yaml'
|
||||
def __init__(self, replay_mod='', config=None):
|
||||
config = config or {}
|
||||
rules_file = config.get('rules_file', DEFAULT_RULES_FILE)
|
||||
super(DefaultRewriter, self).__init__(rules_file, replay_mod)
|
||||
self.all_rewriters = copy.copy(self.DEFAULT_REWRITERS)
|
||||
|
||||
|
@ -42,7 +42,7 @@ class IndexHandler(object):
|
||||
def __init__(self, index_source, opts=None, *args, **kwargs):
|
||||
self.index_source = index_source
|
||||
self.opts = opts or {}
|
||||
self.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
|
||||
self.fuzzy = FuzzyMatcher(kwargs.get('rules_file'))
|
||||
|
||||
def get_supported_modes(self):
|
||||
return dict(modes=['list_sources', 'index'])
|
||||
@ -96,8 +96,8 @@ class IndexHandler(object):
|
||||
|
||||
#=============================================================================
|
||||
class ResourceHandler(IndexHandler):
|
||||
def __init__(self, index_source, resource_loaders):
|
||||
super(ResourceHandler, self).__init__(index_source)
|
||||
def __init__(self, index_source, resource_loaders, rules_file=None):
|
||||
super(ResourceHandler, self).__init__(index_source, rules_file=rules_file)
|
||||
self.resource_loaders = resource_loaders
|
||||
|
||||
def get_supported_modes(self):
|
||||
@ -133,12 +133,14 @@ class ResourceHandler(IndexHandler):
|
||||
|
||||
#=============================================================================
|
||||
class DefaultResourceHandler(ResourceHandler):
|
||||
def __init__(self, index_source, warc_paths='', forward_proxy_prefix=''):
|
||||
def __init__(self, index_source, warc_paths='', forward_proxy_prefix='',
|
||||
rules_file=''):
|
||||
loaders = [WARCPathLoader(warc_paths, index_source),
|
||||
LiveWebLoader(forward_proxy_prefix),
|
||||
VideoLoader()
|
||||
]
|
||||
super(DefaultResourceHandler, self).__init__(index_source, loaders)
|
||||
super(DefaultResourceHandler, self).__init__(index_source, loaders,
|
||||
rules_file=rules_file)
|
||||
|
||||
|
||||
#=============================================================================
|
||||
|
@ -1,5 +1,6 @@
|
||||
from warcio.utils import to_native_str
|
||||
from pywb.utils.loaders import load_yaml_config
|
||||
from pywb import DEFAULT_RULES_FILE
|
||||
|
||||
import re
|
||||
import os
|
||||
@ -24,7 +25,8 @@ class FuzzyMatcher(object):
|
||||
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
|
||||
'url', 'matchType', 'filter')
|
||||
|
||||
def __init__(self, filename):
|
||||
def __init__(self, filename=None):
|
||||
filename = filename or DEFAULT_RULES_FILE
|
||||
config = load_yaml_config(filename)
|
||||
self.rules = []
|
||||
for rule in config.get('rules'):
|
||||
|
@ -26,7 +26,7 @@ class TestFuzzy(object):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
cls.source = SimpleAggregator({'source': EchoParamsSource()})
|
||||
cls.fuzzy = FuzzyMatcher('pkg://pywb/rules.yaml')
|
||||
cls.fuzzy = FuzzyMatcher()
|
||||
|
||||
def get_params(self, url, actual_url, mime='text/html'):
|
||||
params = {'url': url,
|
||||
|
@ -46,18 +46,18 @@ collections:
|
||||
# Local Dir CDX
|
||||
local:
|
||||
index: ./local/indexes
|
||||
resource: ./local/data
|
||||
archive_paths: ./local/data
|
||||
|
||||
local_file:
|
||||
index: ./local/indexes/file.cdxj
|
||||
resource: ./local/data
|
||||
archive_paths: ./local/data
|
||||
|
||||
# Sequence
|
||||
many_seq:
|
||||
sequence:
|
||||
-
|
||||
index: ./local/indexes
|
||||
resource: ./local/data
|
||||
archive_paths: ./local/data
|
||||
name: local
|
||||
|
||||
-
|
||||
|
@ -58,6 +58,8 @@ class WarcServer(BaseWarcServer):
|
||||
self.index_paths = self.init_paths('index_paths')
|
||||
self.archive_paths = self.init_paths('archive_paths', self.root_dir)
|
||||
|
||||
self.rules_file = self.config.get('rules_file', '')
|
||||
|
||||
self.auto_handler = None
|
||||
|
||||
if self.config.get('enable_auto_colls', True):
|
||||
@ -98,7 +100,8 @@ class WarcServer(BaseWarcServer):
|
||||
base_dir=self.index_paths,
|
||||
config=self.config)
|
||||
|
||||
return DefaultResourceHandler(dir_source, self.archive_paths)
|
||||
return DefaultResourceHandler(dir_source, self.archive_paths,
|
||||
rules_file=self.rules_file)
|
||||
|
||||
def list_fixed_routes(self):
|
||||
return list(self.fixed_routes.keys())
|
||||
@ -149,14 +152,12 @@ class WarcServer(BaseWarcServer):
|
||||
|
||||
if isinstance(coll_config, str):
|
||||
index = coll_config
|
||||
resource = None
|
||||
archive_paths = None
|
||||
elif isinstance(coll_config, dict):
|
||||
index = coll_config.get('index')
|
||||
if not index:
|
||||
index = coll_config.get('index_paths')
|
||||
resource = coll_config.get('resource')
|
||||
if not resource:
|
||||
resource = coll_config.get('archive_paths')
|
||||
archive_paths = coll_config.get('archive_paths')
|
||||
|
||||
else:
|
||||
raise Exception('collection config must be string or dict')
|
||||
@ -179,10 +180,11 @@ class WarcServer(BaseWarcServer):
|
||||
timeout = int(coll_config.get('timeout', 0))
|
||||
agg = init_index_agg(index_group, True, timeout)
|
||||
|
||||
if not resource:
|
||||
resource = self.config.get('archive_paths')
|
||||
if not archive_paths:
|
||||
archive_paths = self.config.get('archive_paths')
|
||||
|
||||
return DefaultResourceHandler(agg, resource)
|
||||
return DefaultResourceHandler(agg, archive_paths,
|
||||
rules_file=self.rules_file)
|
||||
|
||||
def init_sequence(self, coll_name, seq_config):
|
||||
if not isinstance(seq_config, list):
|
||||
|
@ -2,16 +2,14 @@
|
||||
|
||||
debug: true
|
||||
|
||||
use_js_obj_proxy: true
|
||||
|
||||
collections_root: _test_colls
|
||||
|
||||
collections:
|
||||
pywb:
|
||||
index: ./sample_archive/cdx/
|
||||
resource: ./sample_archive/warcs/
|
||||
|
||||
with-js-proxy:
|
||||
index: ./sample_archive/cdx/
|
||||
use_js_obj_proxy: true
|
||||
archive_paths: ./sample_archive/warcs/
|
||||
|
||||
# live collection
|
||||
live: $live
|
||||
|
@ -25,7 +25,7 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
|
||||
def test_proxy_cli_rec(self):
|
||||
res = wayback(['--proxy', 'test', '--proxy-record'])
|
||||
assert res.extra_config['proxy']['recording'] == True
|
||||
assert res.extra_config['collections']['live'] == {'index': '$live', 'use_js_obj_proxy': True}
|
||||
assert res.extra_config['collections']['live'] == {'index': '$live'}
|
||||
|
||||
def test_proxy_cli_err_coll(self):
|
||||
with pytest.raises(Exception):
|
||||
@ -37,6 +37,6 @@ class TestProxyCLIConfig(CollsDirMixin, BaseTestClass):
|
||||
|
||||
def test_live_all_cli(self):
|
||||
res = wayback(['--all-coll', 'all', '--live'])
|
||||
assert res.extra_config['collections'] == {'live': {'index': '$live', 'use_js_obj_proxy': True},
|
||||
assert res.extra_config['collections'] == {'live': {'index': '$live'},
|
||||
'all': '$all'}
|
||||
|
||||
|
@ -253,29 +253,31 @@ class TestWbIntegration(BaseConfigTest):
|
||||
assert resp.status_int == 200
|
||||
assert resp.content_type == 'text/css'
|
||||
|
||||
def test_replay_js_mod(self):
|
||||
# an empty js file
|
||||
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js')
|
||||
def test_replay_js_mod_no_obj_proxy(self):
|
||||
# an empty js file, (ie11 UA no js obj proxy)
|
||||
resp = self.testapp.get('/pywb/20140126201054js_/http://www.iana.org/_js/2013.1/iana.js',
|
||||
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
||||
|
||||
assert resp.status_int == 200
|
||||
assert resp.content_length == 0
|
||||
assert resp.content_type == 'application/x-javascript'
|
||||
|
||||
def test_replay_js_obj_proxy(self, fmod):
|
||||
# test js proxy obj with jquery -- no user agent
|
||||
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
|
||||
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod)
|
||||
|
||||
assert resp.status_int == 200
|
||||
assert resp.content_length != 0
|
||||
assert resp.content_type == 'application/x-javascript'
|
||||
|
||||
# test with Chrome user agent
|
||||
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
|
||||
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
|
||||
|
||||
def test_replay_js_ie11_no_obj_proxy(self, fmod):
|
||||
# IE11 user-agent, no proxy
|
||||
resp = self.get('/with-js-proxy/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
|
||||
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
|
||||
|
||||
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
|
||||
@ -468,7 +470,7 @@ class TestWbIntegration(BaseConfigTest):
|
||||
resp = self.testapp.get('/collinfo.json')
|
||||
assert resp.content_type == 'application/json'
|
||||
value = resp.json
|
||||
assert len(value['fixed']) == 5
|
||||
assert len(value['fixed']) == 4
|
||||
assert len(value['dynamic']) == 0
|
||||
|
||||
#def test_invalid_config(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user