1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add optional filters to default Route

add examples to config.yaml and test_config.yaml and integration test
per route config is inherited globally if only name is set
This commit is contained in:
Ilya Kreymer 2014-02-06 17:28:08 -08:00
parent d347b4952b
commit 00a7691f69
6 changed files with 39 additions and 14 deletions

View File

@ -88,12 +88,12 @@ After pywb and all its dependencies are installed, the uwsgi server will start u
spawned uWSGI worker 1 (and the only) (pid: 123, cores: 1)
```
At this point, you can open a web browser and navigate to `http://localhost:8080` for testing.
At this point, you can open a web browser and navigate to the examples above for testing.
### Automated Tests
Currently pywb consists of numerous doctests against the sample archive.
Currently pywb includes numerous doctests which test rewriting and loading of data from the sample archive.
The `run-tests.py` file currently contains a few basic integration tests against the default config.
@ -203,7 +203,7 @@ The directions are for running in a shell:
sort -m mypath/cdx/*.cdx | sort -c > mypath/merged_cdx/merge_1.cdx
```
(The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate cdx format and should be all the same!
(The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate the cdx format and should be all the same!
They are always first and pywb ignores them)

View File

@ -12,6 +12,9 @@ collections:
# - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe'
#pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported

View File

@ -66,12 +66,13 @@ class Route:
# Lookahead appended to each route regex: the match must be followed by
# '/', the end of the path, or the start of a query string ('?').
# Raw string so that '\?' reaches the regex engine as written instead of
# being treated as an (invalid) string escape sequence.
SLASH_QUERY_LOOKAHEAD = r'(?=/|$|\?)'
def __init__(self, regex, handler, coll_group=0, filters=None, lookahead=SLASH_QUERY_LOOKAHEAD):
    """
    Create a route matching ``regex`` and dispatching to ``handler``.

    :param regex: route pattern; also kept uncompiled as ``self.path``
    :param handler: callable invoked with the request for this route
    :param coll_group: collection id from regex group (default 0)
    :param filters: optional list of filter format strings for this route;
        defaults to a new empty list
    :param lookahead: regex suffix appended to ``regex`` before compiling
    """
    self.path = regex
    self.regex = re.compile(regex + lookahead)
    self.handler = handler
    # collection id from regex group (default 0)
    self.coll_group = coll_group
    # A None sentinel is used instead of a `[]` default so that routes
    # created without filters never share one mutable list object.
    self.filters = [] if filters is None else filters
def __call__(self, env, use_abs_prefix):
@ -106,7 +107,9 @@ class Route:
return self._handle_request(wbrequest)
def _add_filters(self, wbrequest, matcher):
pass
for filter in self.filters:
last_grp = len(matcher.groups())
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
def _handle_request(self, wbrequest):
return self.handler(wbrequest)

View File

@ -20,32 +20,34 @@ def pywb_config_manual(config = {}):
collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
for name, value in collections.iteritems():
route_config = config
if isinstance(value, dict):
# if a dict, extend with base properties
index_paths = value['index_paths']
value.extend(config)
config = value
value.update(route_config)
route_config = value
else:
index_paths = str(value)
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
# cdx query handler
if config.get('enable_cdx_api', False):
if route_config.get('enable_cdx_api', False):
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
wb_handler = config_utils.create_wb_handler(
cdx_source = cdx_source,
archive_paths = config.get('archive_paths', './sample_archive/warcs/'),
head_html = config.get('head_insert_html'),
query_html = config.get('query_html'),
search_html = config.get('search_html'),
static_path = config.get('static_path', hostpaths[0] + 'static/')
archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
head_html = route_config.get('head_insert_html'),
query_html = route_config.get('query_html'),
search_html = route_config.get('search_html'),
static_path = route_config.get('static_path', hostpaths[0] + 'static/')
)
logging.info('Adding Collection: ' + name)
routes.append(archivalrouter.Route(name, wb_handler))
routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', [])))
if config.get('debug_echo_env', False):

View File

@ -37,6 +37,19 @@ class TestWb:
# 3 Captures + header
assert len(resp.html.find_all('tr')) == 4
def test_calendar_query_filtered(self):
    """
    Compare calendar query results for the unfiltered and filtered
    collections: the same url is queried in both and the number of
    table rows (captures plus one header row) is checked.
    """
    # (query path, expected number of <tr> rows including the header)
    cases = [
        # unfiltered collection: 17 Captures + header
        ('/pywb/*/http://www.iana.org/_css/2013.1/screen.css', 18),
        # filtered collection: 1 Capture (filtered) + header
        ('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css', 2),
    ]

    for query_path, expected_rows in cases:
        resp = self.testapp.get(query_path)
        self._assert_basic_html(resp)
        assert len(resp.html.find_all('tr')) == expected_rows
def test_cdx_query(self):
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
self._assert_basic_text(resp)

View File

@ -12,6 +12,10 @@ collections:
# - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe'
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported