From 00a7691f69f1c7482ec3e08c9a8fc36b3e5bcd5b Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 6 Feb 2014 17:28:08 -0800 Subject: [PATCH] add optional filters to default Route add examples to config.yaml and test_config.yaml and integration test per route config is inherited globally if only name is set --- README.md | 6 +++--- config.yaml | 3 +++ pywb/archivalrouter.py | 7 +++++-- pywb/pywb_init.py | 20 +++++++++++--------- run-tests.py | 13 +++++++++++++ test_config.yaml | 4 ++++ 6 files changed, 39 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index e91b0a15..ca3de93c 100644 --- a/README.md +++ b/README.md @@ -88,12 +88,12 @@ After pywb and all its dependencies are installed, the uwsgi server will start u spawned uWSGI worker 1 (and the only) (pid: 123, cores: 1) ``` -At this point, you can open a web browser and navigate to `http://localhost:8080` for testing. +At this point, you can open a web browser and navigate to the examples above for testing. ### Automated Tests -Currently pywb consists of numerous doctests against the sample archive. +Currently pywb includes numerous doctests which test rewriting and loading of data from the sample archive. The `run-tests.py` file currently contains a few basic integration tests against the default config. @@ -203,7 +203,7 @@ The directions are for running in a shell: sort -m mypath/cdx/*.cdx | sort -c > mypath/merged_cdx/merge_1.cdx ``` - (The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate cdx format and should be all the same! + (The merged cdx will start with several ` CDX` headers due to the merge. These headers indicate the cdx format and should be all the same! They are always first and pywb ignores them) diff --git a/config.yaml b/config.yaml index 934ded44..00793a2c 100644 --- a/config.yaml +++ b/config.yaml @@ -12,6 +12,9 @@ collections: # - a string value indicating remote http cdx server pywb: ./sample_archive/cdx/ + # ex with filtering: filter CDX lines by filename starting with 'dupe' + #pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']} + # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # SURT keys are recommended for future indices, but non-SURT cdxs # are also supported diff --git a/pywb/archivalrouter.py b/pywb/archivalrouter.py index 04143878..e805095a 100644 --- a/pywb/archivalrouter.py +++ b/pywb/archivalrouter.py @@ -66,12 +66,13 @@ class Route: SLASH_QUERY_LOOKAHEAD ='(?=/|$|\?)' - def __init__(self, regex, handler, coll_group = 0, lookahead = SLASH_QUERY_LOOKAHEAD): + def __init__(self, regex, handler, coll_group = 0, filters = [], lookahead = SLASH_QUERY_LOOKAHEAD): self.path = regex self.regex = re.compile(regex + lookahead) self.handler = handler # collection id from regex group (default 0) self.coll_group = coll_group + self.filters = filters def __call__(self, env, use_abs_prefix): @@ -106,7 +107,9 @@ class Route: return self._handle_request(wbrequest) def _add_filters(self, wbrequest, matcher): - pass + for filter in self.filters: + last_grp = len(matcher.groups()) + wbrequest.query_filter.append(filter.format(matcher.group(last_grp))) def _handle_request(self, wbrequest): return self.handler(wbrequest) diff --git a/pywb/pywb_init.py b/pywb/pywb_init.py index 6db4103b..616e7fac 100644 --- a/pywb/pywb_init.py +++ b/pywb/pywb_init.py @@ -20,32 +20,34 @@ def pywb_config_manual(config = {}): collections = config.get('collections', {'pywb': './sample_archive/cdx/'}) for name, value in collections.iteritems(): + route_config = config + if isinstance(value, dict): # if a dict, extend with base properies index_paths = value['index_paths'] - value.extend(config) - config = value + value.update(route_config) + route_config = value else: index_paths = str(value) cdx_source = indexreader.IndexReader.make_best_cdx_source(index_paths, **config) # cdx query handler - if config.get('enable_cdx_api', False): + if route_config.get('enable_cdx_api', False): routes.append(archivalrouter.Route(name + '-cdx', handlers.CDXHandler(cdx_source))) wb_handler = config_utils.create_wb_handler( cdx_source = cdx_source, - archive_paths = config.get('archive_paths', './sample_archive/warcs/'), - head_html = config.get('head_insert_html'), - query_html = config.get('query_html'), - search_html = config.get('search_html'), - static_path = config.get('static_path', hostpaths[0] + 'static/') + archive_paths = route_config.get('archive_paths', './sample_archive/warcs/'), + head_html = route_config.get('head_insert_html'), + query_html = route_config.get('query_html'), + search_html = route_config.get('search_html'), + static_path = route_config.get('static_path', hostpaths[0] + 'static/') ) logging.info('Adding Collection: ' + name) - routes.append(archivalrouter.Route(name, wb_handler)) + routes.append(archivalrouter.Route(name, wb_handler, filters = route_config.get('filters', []))) if config.get('debug_echo_env', False): diff --git a/run-tests.py b/run-tests.py index bfba91ed..550f871c 100644 --- a/run-tests.py +++ b/run-tests.py @@ -37,6 +37,19 @@ class TestWb: # 3 Captures + header assert len(resp.html.find_all('tr')) == 4 + def test_calendar_query_filtered(self): + # unfiltered collection + resp = self.testapp.get('/pywb/*/http://www.iana.org/_css/2013.1/screen.css') + self._assert_basic_html(resp) + # 17 Captures + header + assert len(resp.html.find_all('tr')) == 18 + + # filtered collection + resp = self.testapp.get('/pywb-filt/*/http://www.iana.org/_css/2013.1/screen.css') + self._assert_basic_html(resp) + # 1 Capture (filtered) + header + assert len(resp.html.find_all('tr')) == 2 + def test_cdx_query(self): resp = self.testapp.get('/pywb/cdx_/*/http://www.iana.org/') self._assert_basic_text(resp) diff --git a/test_config.yaml b/test_config.yaml index df16f0b3..8227c091 100644 --- a/test_config.yaml +++ b/test_config.yaml @@ -12,6 +12,10 @@ collections: # - a string value indicating remote http cdx server pywb: ./sample_archive/cdx/ + # ex with filtering: filter CDX lines by filename starting with 'dupe' + pywb-filt: {'index_paths': './sample_archive/cdx/', 'filters': ['filename:dupe*']} + + # indicate if cdx files are sorted by SURT keys -- eg: com,example)/ # SURT keys are recommended for future indices, but non-SURT cdxs # are also supported