mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-16 00:24:48 +01:00
add optional filters to default Route
add examples to config.yaml and test_config.yaml, and an integration test. Per-route config is inherited from the global config if only the name is set.
This commit is contained in:
parent
d347b4952b
commit
00a7691f69
@ -88,12 +88,12 @@ After pywb and all its dependencies are installed, the uwsgi server will start u
|
||||
spawned uWSGI worker 1 (and the only) (pid: 123, cores: 1)
|
||||
```
|
||||
|
||||
At this point, you can open a web browser and navigate to `http://localhost:8080` for testing.
|
||||
At this point, you can open a web browser and navigate to the examples above for testing.
|
||||
|
||||
|
||||
### Automated Tests
|
||||
|
||||
Currently pywb consists of numerous doctests against the sample archive.
|
||||
Currently pywb includes numerous doctests which test rewriting and loading of data from the sample archive.
|
||||
|
||||
The `run-tests.py` file currently contains a few basic integration tests against the default config.
|
||||
|
||||
@ -203,7 +203,7 @@ The directions are for running in a shell:
|
||||
sort -m mypath/cdx/*.cdx | sort -c > mypath/merged_cdx/merge_1.cdx
|
||||
```
|
||||
|
||||
(The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate cdx format and should be all the same!
|
||||
(The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate the cdx format and should be all the same!
|
||||
They are always first and pywb ignores them)
|
||||
|
||||
|
||||
|
@ -12,6 +12,9 @@ collections:
|
||||
# - a string value indicating remote http cdx server
|
||||
pywb: ./sample_archive/cdx/
|
||||
|
||||
# ex with filtering: filter CDX lines by filename starting with 'dupe'
|
||||
#pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
|
||||
|
||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||
# are also supported
|
||||
|
@ -66,12 +66,13 @@ class Route:
|
||||
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
|
||||
|
||||
|
||||
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_QUERY_LOOKAHEAD):
|
||||
def __init__(self, regex, handler, coll_group = 0, filters = [], lookahead = SLASH_QUERY_LOOKAHEAD):
|
||||
self.path = regex
|
||||
self.regex = re.compile(regex + lookahead)
|
||||
self.handler = handler
|
||||
# collection id from regex group (default 0)
|
||||
self.coll_group = coll_group
|
||||
self.filters = filters
|
||||
|
||||
|
||||
def __call__(self, env, use_abs_prefix):
|
||||
@ -106,7 +107,9 @@ class Route:
|
||||
return self._handle_request(wbrequest)
|
||||
|
||||
def _add_filters(self, wbrequest, matcher):
|
||||
pass
|
||||
for filter in self.filters:
|
||||
last_grp = len(matcher.groups())
|
||||
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
|
||||
|
||||
def _handle_request(self, wbrequest):
|
||||
return self.handler(wbrequest)
|
||||
|
@ -20,32 +20,34 @@ def pywb_config_manual(config = {}):
|
||||
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
|
||||
|
||||
for name, value in collections.iteritems():
|
||||
route_config = config
|
||||
|
||||
if isinstance(value, dict):
|
||||
# if a dict, extend with base properties
|
||||
index_paths = value['index_paths']
|
||||
value.extend(config)
|
||||
config = value
|
||||
value.update(route_config)
|
||||
route_config = value
|
||||
else:
|
||||
index_paths = str(value)
|
||||
|
||||
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
|
||||
|
||||
# cdx query handler
|
||||
if config.get('enable_cdx_api', False):
|
||||
if route_config.get('enable_cdx_api', False):
|
||||
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
|
||||
|
||||
wb_handler = config_utils.create_wb_handler(
|
||||
cdx_source = cdx_source,
|
||||
archive_paths = config.get('archive_paths', './sample_archive/warcs/'),
|
||||
head_html = config.get('head_insert_html'),
|
||||
query_html = config.get('query_html'),
|
||||
search_html = config.get('search_html'),
|
||||
static_path = config.get('static_path', hostpaths[0] + 'static/')
|
||||
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
|
||||
head_html = route_config.get('head_insert_html'),
|
||||
query_html = route_config.get('query_html'),
|
||||
search_html = route_config.get('search_html'),
|
||||
static_path = route_config.get('static_path', hostpaths[0] + 'static/')
|
||||
)
|
||||
|
||||
logging.info('Adding Collection: ' + name)
|
||||
|
||||
routes.append(archivalrouter.Route(name, wb_handler))
|
||||
routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', [])))
|
||||
|
||||
|
||||
if config.get('debug_echo_env', False):
|
||||
|
13
run-tests.py
13
run-tests.py
@ -37,6 +37,19 @@ class TestWb:
|
||||
# 3 Captures + header
|
||||
assert len(resp.html.find_all('tr')) == 4
|
||||
|
||||
def test_calendar_query_filtered(self):
|
||||
# unfiltered collection
|
||||
resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css')
|
||||
self._assert_basic_html(resp)
|
||||
# 17 Captures + header
|
||||
assert len(resp.html.find_all('tr')) == 18
|
||||
|
||||
# filtered collection
|
||||
resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css')
|
||||
self._assert_basic_html(resp)
|
||||
# 1 Capture (filtered) + header
|
||||
assert len(resp.html.find_all('tr')) == 2
|
||||
|
||||
def test_cdx_query(self):
|
||||
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
|
||||
self._assert_basic_text(resp)
|
||||
|
@ -12,6 +12,10 @@ collections:
|
||||
# - a string value indicating remote http cdx server
|
||||
pywb: ./sample_archive/cdx/
|
||||
|
||||
# ex with filtering: filter CDX lines by filename starting with 'dupe'
|
||||
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
|
||||
|
||||
|
||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||
# are also supported
|
||||
|
Loading…
x
Reference in New Issue
Block a user