1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

add optional filters to default Route

add examples to config.yaml and test_config.yaml and integration test
per-route config is inherited from the global config if only the name is set
This commit is contained in:
Ilya Kreymer 2014-02-06 17:28:08 -08:00
parent d347b4952b
commit 00a7691f69
6 changed files with 39 additions and 14 deletions

View File

@ -88,12 +88,12 @@ After pywb and all its dependencies are installed, the uwsgi server will start u
spawned uWSGI worker 1 (and the only) (pid: 123, cores: 1) spawned uWSGI worker 1 (and the only) (pid: 123, cores: 1)
``` ```
At this point, you can open a web browser and navigate to `http://localhost:8080` for testing. At this point, you can open a web browser and navigate to the examples above for testing.
### Automated Tests ### Automated Tests
Currently pywb consists of numerous doctests against the sample archive. Currently pywb includes numerous doctests which test rewriting and loading of data from the sample archive.
The `run-tests.py` file currently contains a few basic integration tests against the default config. The `run-tests.py` file currently contains a few basic integration tests against the default config.
@ -203,7 +203,7 @@ The directions are for running in a shell:
sort -m mypath/cdx/*.cdx | sort -c > mypath/merged_cdx/merge_1.cdx sort -m mypath/cdx/*.cdx | sort -c > mypath/merged_cdx/merge_1.cdx
``` ```
(The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate cdx format and should be all the same! (The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate the cdx format and should be all the same!
They are always first and pywb ignores them) They are always first and pywb ignores them)

View File

@ -12,6 +12,9 @@ collections:
# - a string value indicating remote http cdx server # - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/ pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe'
#pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs # SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported # are also supported

View File

@ -66,12 +66,13 @@ class Route:
SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)' SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)'
def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_QUERY_LOOKAHEAD): def __init__(self, regex, handler, coll_group = 0, filters = [], lookahead = SLASH_QUERY_LOOKAHEAD):
self.path = regex self.path = regex
self.regex = re.compile(regex + lookahead) self.regex = re.compile(regex + lookahead)
self.handler = handler self.handler = handler
# collection id from regex group (default 0) # collection id from regex group (default 0)
self.coll_group = coll_group self.coll_group = coll_group
self.filters = filters
def __call__(self, env, use_abs_prefix): def __call__(self, env, use_abs_prefix):
@ -106,7 +107,9 @@ class Route:
return self._handle_request(wbrequest) return self._handle_request(wbrequest)
def _add_filters(self, wbrequest, matcher): def _add_filters(self, wbrequest, matcher):
pass for filter in self.filters:
last_grp = len(matcher.groups())
wbrequest.query_filter.append(filter.format(matcher.group(last_grp)))
def _handle_request(self, wbrequest): def _handle_request(self, wbrequest):
return self.handler(wbrequest) return self.handler(wbrequest)

View File

@ -20,32 +20,34 @@ def pywb_config_manual(config = {}):
collections = config.get('collections', {'pywb': './sample_archive/cdx/'}) collections = config.get('collections', {'pywb': './sample_archive/cdx/'})
for name, value in collections.iteritems(): for name, value in collections.iteritems():
route_config = config
if isinstance(value, dict): if isinstance(value, dict):
# if a dict, extend with base properties # if a dict, extend with base properties
index_paths = value['index_paths'] index_paths = value['index_paths']
value.extend(config) value.update(route_config)
config = value route_config = value
else: else:
index_paths = str(value) index_paths = str(value)
cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config) cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config)
# cdx query handler # cdx query handler
if config.get('enable_cdx_api', False): if route_config.get('enable_cdx_api', False):
routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source))) routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source)))
wb_handler = config_utils.create_wb_handler( wb_handler = config_utils.create_wb_handler(
cdx_source = cdx_source, cdx_source = cdx_source,
archive_paths = config.get('archive_paths', './sample_archive/warcs/'), archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'),
head_html = config.get('head_insert_html'), head_html = route_config.get('head_insert_html'),
query_html = config.get('query_html'), query_html = route_config.get('query_html'),
search_html = config.get('search_html'), search_html = route_config.get('search_html'),
static_path = config.get('static_path', hostpaths[0] + 'static/') static_path = route_config.get('static_path', hostpaths[0] + 'static/')
) )
logging.info('Adding Collection: ' + name) logging.info('Adding Collection: ' + name)
routes.append(archivalrouter.Route(name, wb_handler)) routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', [])))
if config.get('debug_echo_env', False): if config.get('debug_echo_env', False):

View File

@ -37,6 +37,19 @@ class TestWb:
# 3 Captures + header # 3 Captures + header
assert len(resp.html.find_all('tr')) == 4 assert len(resp.html.find_all('tr')) == 4
def test_calendar_query_filtered(self):
# unfiltered collection
resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css')
self._assert_basic_html(resp)
# 17 Captures + header
assert len(resp.html.find_all('tr')) == 18
# filtered collection
resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css')
self._assert_basic_html(resp)
# 1 Capture (filtered) + header
assert len(resp.html.find_all('tr')) == 2
def test_cdx_query(self): def test_cdx_query(self):
resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/') resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/')
self._assert_basic_text(resp) self._assert_basic_text(resp)

View File

@ -12,6 +12,10 @@ collections:
# - a string value indicating remote http cdx server # - a string value indicating remote http cdx server
pywb: ./sample_archive/cdx/ pywb: ./sample_archive/cdx/
# ex with filtering: filter CDX lines by filename starting with 'dupe'
pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']}
# indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # indicate if cdx files are sorted by SURT keys -- eg: com,example)/
# SURT keys are recommended for future indices, but non-SURT cdxs # SURT keys are recommended for future indices, but non-SURT cdxs
# are also supported # are also supported