From 2c74ea9f2399a12c7c956c9c5ab2b5c215d38183 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 27 Mar 2014 21:43:30 -0700 Subject: [PATCH] fuzzy match: make filter string optionally overridable setup.py: unset PYWB_CONFIG_ENV --- pywb/cdx/cdxdomainspecific.py | 7 ++++++- pywb/cdx/cdxserver.py | 27 +++++++++++++-------------- pywb/rules.yaml | 4 +++- setup.py | 2 ++ 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index fb15c6c4..4804ff47 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -88,7 +88,8 @@ class FuzzyQuery: matched_rule = rule if len(m.groups()) == 1: - filter_.append('~urlkey:' + m.group(1)) + #filter_.append('~urlkey:' + m.group(1)) + filter_.append(rule.filter.format(m.group(1))) break @@ -113,15 +114,19 @@ class FuzzyQuery: #================================================================= class CDXDomainSpecificRule(BaseRule): + DEFAULT_FILTER = '~urlkey:{0}' + def __init__(self, name, config): super(CDXDomainSpecificRule, self).__init__(name, config) if isinstance(config, basestring): self.regex = re.compile(config) self.replace = None + self.filter = self.DEFAULT_FILTER else: self.regex = re.compile(config.get('match')) self.replace = config.get('replace') + self.filter = config.get('filter', self.DEFAULT_FILTER) def unsurt(self): """ diff --git a/pywb/cdx/cdxserver.py b/pywb/cdx/cdxserver.py index 90443c85..77fe5e6e 100644 --- a/pywb/cdx/cdxserver.py +++ b/pywb/cdx/cdxserver.py @@ -35,9 +35,6 @@ class BaseCDXServer(object): if not self.url_canon: self.url_canon = UrlCanonicalizer(surt_ordered) - # set perms checker, if any - #self.perms_checker = kwargs.get('perms_checker') - def _check_cdx_iter(self, cdx_iter, query): """ Check cdx iter semantics If `cdx_iter` is empty (no matches), check if fuzzy matching @@ -61,17 +58,18 @@ class BaseCDXServer(object): return self.load_cdx(**fuzzy_query_params) msg = 'No Captures found for: ' + query.url - print self.fuzzy_query - print query.params raise NotFoundException(msg) + def _calc_search_keys(self, query): + return calc_search_range(url=query.url, + match_type=query.match_type, + url_canon=self.url_canon) + def load_cdx(self, **params): query = CDXQuery(**params) - url = query.url - key, end_key = calc_search_range(url=url, - match_type=query.match_type, - url_canon=self.url_canon) + key, end_key = self._calc_search_keys(query) + query.set_key(key, end_key) cdx_iter = self._load_cdx_query(query) @@ -211,7 +209,7 @@ class RemoteCDXServer(BaseCDXServer): #================================================================= -def create_cdx_server(config, ds_rules_file=None): +def create_cdx_server(config, ds_rules_file=None, server_cls=None): if hasattr(config, 'get'): paths = config.get('index_paths') surt_ordered = config.get('surt_ordered', True) @@ -223,10 +221,11 @@ def create_cdx_server(config, ds_rules_file=None): logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) - if isinstance(paths, str) and is_http(paths): - server_cls = RemoteCDXServer - else: - server_cls = CDXServer + if not server_cls: + if isinstance(paths, str) and is_http(paths): + server_cls = RemoteCDXServer + else: + server_cls = CDXServer return server_cls(paths, config=pass_config, diff --git a/pywb/rules.yaml b/pywb/rules.yaml index e9892962..cd7325eb 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -58,4 +58,6 @@ rules: #================================================================= # Applies to all urls -- should be last - url_prefix: '' - fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?' + fuzzy_lookup: + match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' + filter: '=urlkey:{0}' diff --git a/setup.py b/setup.py index f3faed33..bd6c1e65 100755 --- a/setup.py +++ b/setup.py @@ -25,6 +25,8 @@ class PyTest(TestCommand): def run_tests(self): import pytest import sys + import os + os.environ.pop('PYWB_CONFIG_FILE', None) cmdline = ' --cov-config .coveragerc --cov pywb' cmdline += ' -v --doctest-module ./pywb/ tests/' errcode = pytest.main(cmdline)