1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

fuzzy match: make filter string optionally overridable

setup.py: unset PYWB_CONFIG_ENV
This commit is contained in:
Ilya Kreymer 2014-03-27 21:43:30 -07:00
parent 41d51a6427
commit 2c74ea9f23
4 changed files with 24 additions and 16 deletions

View File

@ -88,7 +88,8 @@ class FuzzyQuery:
matched_rule = rule matched_rule = rule
if len(m.groups()) == 1: if len(m.groups()) == 1:
filter_.append('~urlkey:' + m.group(1)) #filter_.append('~urlkey:' + m.group(1))
filter_.append(rule.filter.format(m.group(1)))
break break
@ -113,15 +114,19 @@ class FuzzyQuery:
#================================================================= #=================================================================
class CDXDomainSpecificRule(BaseRule): class CDXDomainSpecificRule(BaseRule):
DEFAULT_FILTER = '~urlkey:{0}'
def __init__(self, name, config): def __init__(self, name, config):
super(CDXDomainSpecificRule, self).__init__(name, config) super(CDXDomainSpecificRule, self).__init__(name, config)
if isinstance(config, basestring): if isinstance(config, basestring):
self.regex = re.compile(config) self.regex = re.compile(config)
self.replace = None self.replace = None
self.filter = self.DEFAULT_FILTER
else: else:
self.regex = re.compile(config.get('match')) self.regex = re.compile(config.get('match'))
self.replace = config.get('replace') self.replace = config.get('replace')
self.filter = config.get('filter', self.DEFAULT_FILTER)
def unsurt(self): def unsurt(self):
""" """

View File

@ -35,9 +35,6 @@ class BaseCDXServer(object):
if not self.url_canon: if not self.url_canon:
self.url_canon = UrlCanonicalizer(surt_ordered) self.url_canon = UrlCanonicalizer(surt_ordered)
# set perms checker, if any
#self.perms_checker = kwargs.get('perms_checker')
def _check_cdx_iter(self, cdx_iter, query): def _check_cdx_iter(self, cdx_iter, query):
""" Check cdx iter semantics """ Check cdx iter semantics
If `cdx_iter` is empty (no matches), check if fuzzy matching If `cdx_iter` is empty (no matches), check if fuzzy matching
@ -61,17 +58,18 @@ class BaseCDXServer(object):
return self.load_cdx(**fuzzy_query_params) return self.load_cdx(**fuzzy_query_params)
msg = 'No Captures found for: ' + query.url msg = 'No Captures found for: ' + query.url
print self.fuzzy_query
print query.params
raise NotFoundException(msg) raise NotFoundException(msg)
def _calc_search_keys(self, query):
return calc_search_range(url=query.url,
match_type=query.match_type,
url_canon=self.url_canon)
def load_cdx(self, **params): def load_cdx(self, **params):
query = CDXQuery(**params) query = CDXQuery(**params)
url = query.url key, end_key = self._calc_search_keys(query)
key, end_key = calc_search_range(url=url,
match_type=query.match_type,
url_canon=self.url_canon)
query.set_key(key, end_key) query.set_key(key, end_key)
cdx_iter = self._load_cdx_query(query) cdx_iter = self._load_cdx_query(query)
@ -211,7 +209,7 @@ class RemoteCDXServer(BaseCDXServer):
#================================================================= #=================================================================
def create_cdx_server(config, ds_rules_file=None): def create_cdx_server(config, ds_rules_file=None, server_cls=None):
if hasattr(config, 'get'): if hasattr(config, 'get'):
paths = config.get('index_paths') paths = config.get('index_paths')
surt_ordered = config.get('surt_ordered', True) surt_ordered = config.get('surt_ordered', True)
@ -223,10 +221,11 @@ def create_cdx_server(config, ds_rules_file=None):
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered)) logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
if isinstance(paths, str) and is_http(paths): if not server_cls:
server_cls = RemoteCDXServer if isinstance(paths, str) and is_http(paths):
else: server_cls = RemoteCDXServer
server_cls = CDXServer else:
server_cls = CDXServer
return server_cls(paths, return server_cls(paths,
config=pass_config, config=pass_config,

View File

@ -58,4 +58,6 @@ rules:
#================================================================= #=================================================================
# Applies to all urls -- should be last # Applies to all urls -- should be last
- url_prefix: '' - url_prefix: ''
fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?' fuzzy_lookup:
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
filter: '=urlkey:{0}'

View File

@ -25,6 +25,8 @@ class PyTest(TestCommand):
def run_tests(self): def run_tests(self):
import pytest import pytest
import sys import sys
import os
os.environ.pop('PYWB_CONFIG_FILE', None)
cmdline = ' --cov-config .coveragerc --cov pywb' cmdline = ' --cov-config .coveragerc --cov pywb'
cmdline += ' -v --doctest-module ./pywb/ tests/' cmdline += ' -v --doctest-module ./pywb/ tests/'
errcode = pytest.main(cmdline) errcode = pytest.main(cmdline)