mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
add optional filters to default Route
add examples to config.yaml and test_config.yaml, plus an integration test; per-route config is inherited from the global config if only the name is set
This commit is contained in:
parent
d347b4952b
commit
00a7691f69
@ -88,12 +88,12 @@ After pywb and all its dependencies are installed, the uwsgi server will start u
|
|||||||
spawned uWSGI worker 1 (and the only) (pid: 123, cores: 1)
|
spawned uWSGI worker 1 (and the only) (pid: 123, cores: 1)
|
||||||
```
|
```
|
||||||
|
|
||||||
At this point, you can open a web browser and navigate to `http://localhost:8080` for testing.
|
At this point, you can open a web browser and navigate to the examples above for testing.
|
||||||
|
|
||||||
|
|
||||||
### Automated Tests
|
### Automated Tests
|
||||||
|
|
||||||
Currently pywb consists of numerous doctests against the sample archive.
|
Currently pywb includes numerous doctests which test rewriting and loading of data from the sample archive.
|
||||||
|
|
||||||
The `run-tests.py` file currently contains a few basic integration tests against the default config.
|
The `run-tests.py` file currently contains a few basic integration tests against the default config.
|
||||||
|
|
||||||
@ -203,7 +203,7 @@ The directions are for running in a shell:
|
|||||||
sort -m mypath/cdx/*.cdx | sort -c > mypath/merged_cdx/merge_1.cdx
|
sort -m mypath/cdx/*.cdx | sort -c > mypath/merged_cdx/merge_1.cdx
|
||||||
```
|
```
|
||||||
|
|
||||||
(The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate cdx format and should be all the same!
|
(The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate the cdx format and should all be the same!
|
||||||
They are always first and pywb ignores them)
|
They are always first and pywb ignores them)
|
||||||
|
|
||||||
|
|
||||||
|
@ -12,6 +12,9 @@ collections:
|
|||||||
# - a string value indicating remote http cdx server
|
# - a string value indicating remote http cdx server
|
||||||
pywb: ./sample_archive/cdx/
|
pywb: ./sample_archive/cdx/
|
||||||
|
|
||||||
|
# ex with filtering: filter CDX lines by filename starting with 'dupe'
|
||||||
|
#pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
|
||||||
|
|
||||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||||
# are also supported
|
# are also supported
|
||||||
|
@ -66,12 +66,13 @@ class Route:
|
|||||||
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
|
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_QUERY_LOOKAHEAD):
|
def __init__(self, regex, handler, coll_group = 0, filters = [], lookahead = SLASH_QUERY_LOOKAHEAD):
|
||||||
self.path = regex
|
self.path = regex
|
||||||
self.regex = re.compile(regex + lookahead)
|
self.regex = re.compile(regex + lookahead)
|
||||||
self.handler = handler
|
self.handler = handler
|
||||||
# collection id from regex group (default 0)
|
# collection id from regex group (default 0)
|
||||||
self.coll_group = coll_group
|
self.coll_group = coll_group
|
||||||
|
self.filters = filters
|
||||||
|
|
||||||
|
|
||||||
def __call__(self, env, use_abs_prefix):
|
def __call__(self, env, use_abs_prefix):
|
||||||
@ -106,7 +107,9 @@ class Route:
|
|||||||
return self._handle_request(wbrequest)
|
return self._handle_request(wbrequest)
|
||||||
|
|
||||||
def _add_filters(self, wbrequest, matcher):
|
def _add_filters(self, wbrequest, matcher):
|
||||||
pass
|
for filter in self.filters:
|
||||||
|
last_grp = len(matcher.groups())
|
||||||
|
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
|
||||||
|
|
||||||
def _handle_request(self, wbrequest):
|
def _handle_request(self, wbrequest):
|
||||||
return self.handler(wbrequest)
|
return self.handler(wbrequest)
|
||||||
|
@ -20,32 +20,34 @@ def pywb_config_manual(config = {}):
|
|||||||
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
|
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
|
||||||
|
|
||||||
for name, value in collections.iteritems():
|
for name, value in collections.iteritems():
|
||||||
|
route_config = config
|
||||||
|
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
# if a dict, extend with base properties
|
# if a dict, extend with base properties
|
||||||
index_paths = value['index_paths']
|
index_paths = value['index_paths']
|
||||||
value.extend(config)
|
value.update(route_config)
|
||||||
config = value
|
route_config = value
|
||||||
else:
|
else:
|
||||||
index_paths = str(value)
|
index_paths = str(value)
|
||||||
|
|
||||||
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
|
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
|
||||||
|
|
||||||
# cdx query handler
|
# cdx query handler
|
||||||
if config.get('enable_cdx_api', False):
|
if route_config.get('enable_cdx_api', False):
|
||||||
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
|
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
|
||||||
|
|
||||||
wb_handler = config_utils.create_wb_handler(
|
wb_handler = config_utils.create_wb_handler(
|
||||||
cdx_source = cdx_source,
|
cdx_source = cdx_source,
|
||||||
archive_paths = config.get('archive_paths', './sample_archive/warcs/'),
|
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
|
||||||
head_html = config.get('head_insert_html'),
|
head_html = route_config.get('head_insert_html'),
|
||||||
query_html = config.get('query_html'),
|
query_html = route_config.get('query_html'),
|
||||||
search_html = config.get('search_html'),
|
search_html = route_config.get('search_html'),
|
||||||
static_path = config.get('static_path', hostpaths[0] + 'static/')
|
static_path = route_config.get('static_path', hostpaths[0] + 'static/')
|
||||||
)
|
)
|
||||||
|
|
||||||
logging.info('Adding Collection: ' + name)
|
logging.info('Adding Collection: ' + name)
|
||||||
|
|
||||||
routes.append(archivalrouter.Route(name, wb_handler))
|
routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', [])))
|
||||||
|
|
||||||
|
|
||||||
if config.get('debug_echo_env', False):
|
if config.get('debug_echo_env', False):
|
||||||
|
13
run-tests.py
13
run-tests.py
@ -37,6 +37,19 @@ class TestWb:
|
|||||||
# 3 Captures + header
|
# 3 Captures + header
|
||||||
assert len(resp.html.find_all('tr')) == 4
|
assert len(resp.html.find_all('tr')) == 4
|
||||||
|
|
||||||
|
def test_calendar_query_filtered(self):
|
||||||
|
# unfiltered collection
|
||||||
|
resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css')
|
||||||
|
self._assert_basic_html(resp)
|
||||||
|
# 17 Captures + header
|
||||||
|
assert len(resp.html.find_all('tr')) == 18
|
||||||
|
|
||||||
|
# filtered collection
|
||||||
|
resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css')
|
||||||
|
self._assert_basic_html(resp)
|
||||||
|
# 1 Capture (filtered) + header
|
||||||
|
assert len(resp.html.find_all('tr')) == 2
|
||||||
|
|
||||||
def test_cdx_query(self):
|
def test_cdx_query(self):
|
||||||
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
|
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
|
||||||
self._assert_basic_text(resp)
|
self._assert_basic_text(resp)
|
||||||
|
@ -12,6 +12,10 @@ collections:
|
|||||||
# - a string value indicating remote http cdx server
|
# - a string value indicating remote http cdx server
|
||||||
pywb: ./sample_archive/cdx/
|
pywb: ./sample_archive/cdx/
|
||||||
|
|
||||||
|
# ex with filtering: filter CDX lines by filename starting with 'dupe'
|
||||||
|
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
|
||||||
|
|
||||||
|
|
||||||
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
|
||||||
# SURT keys are recommended for future indices, but non-SURT cdxs
|
# SURT keys are recommended for future indices, but non-SURT cdxs
|
||||||
# are also supported
|
# are also supported
|
||||||
|
Loading…
x
Reference in New Issue
Block a user