mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
fuzzy match: make filter string optionally overridable
setup.py: unset PYWB_CONFIG_FILE
This commit is contained in:
parent
41d51a6427
commit
2c74ea9f23
@ -88,7 +88,8 @@ class FuzzyQuery:
|
|||||||
matched_rule = rule
|
matched_rule = rule
|
||||||
|
|
||||||
if len(m.groups()) == 1:
|
if len(m.groups()) == 1:
|
||||||
filter_.append('~urlkey:' + m.group(1))
|
#filter_.append('~urlkey:' + m.group(1))
|
||||||
|
filter_.append(rule.filter.format(m.group(1)))
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -113,15 +114,19 @@ class FuzzyQuery:
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXDomainSpecificRule(BaseRule):
|
class CDXDomainSpecificRule(BaseRule):
|
||||||
|
DEFAULT_FILTER = '~urlkey:{0}'
|
||||||
|
|
||||||
def __init__(self, name, config):
|
def __init__(self, name, config):
|
||||||
super(CDXDomainSpecificRule, self).__init__(name, config)
|
super(CDXDomainSpecificRule, self).__init__(name, config)
|
||||||
|
|
||||||
if isinstance(config, basestring):
|
if isinstance(config, basestring):
|
||||||
self.regex = re.compile(config)
|
self.regex = re.compile(config)
|
||||||
self.replace = None
|
self.replace = None
|
||||||
|
self.filter = self.DEFAULT_FILTER
|
||||||
else:
|
else:
|
||||||
self.regex = re.compile(config.get('match'))
|
self.regex = re.compile(config.get('match'))
|
||||||
self.replace = config.get('replace')
|
self.replace = config.get('replace')
|
||||||
|
self.filter = config.get('filter', self.DEFAULT_FILTER)
|
||||||
|
|
||||||
def unsurt(self):
|
def unsurt(self):
|
||||||
"""
|
"""
|
||||||
|
@ -35,9 +35,6 @@ class BaseCDXServer(object):
|
|||||||
if not self.url_canon:
|
if not self.url_canon:
|
||||||
self.url_canon = UrlCanonicalizer(surt_ordered)
|
self.url_canon = UrlCanonicalizer(surt_ordered)
|
||||||
|
|
||||||
# set perms checker, if any
|
|
||||||
#self.perms_checker = kwargs.get('perms_checker')
|
|
||||||
|
|
||||||
def _check_cdx_iter(self, cdx_iter, query):
|
def _check_cdx_iter(self, cdx_iter, query):
|
||||||
""" Check cdx iter semantics
|
""" Check cdx iter semantics
|
||||||
If `cdx_iter` is empty (no matches), check if fuzzy matching
|
If `cdx_iter` is empty (no matches), check if fuzzy matching
|
||||||
@ -61,17 +58,18 @@ class BaseCDXServer(object):
|
|||||||
return self.load_cdx(**fuzzy_query_params)
|
return self.load_cdx(**fuzzy_query_params)
|
||||||
|
|
||||||
msg = 'No Captures found for: ' + query.url
|
msg = 'No Captures found for: ' + query.url
|
||||||
print self.fuzzy_query
|
|
||||||
print query.params
|
|
||||||
raise NotFoundException(msg)
|
raise NotFoundException(msg)
|
||||||
|
|
||||||
|
def _calc_search_keys(self, query):
|
||||||
|
return calc_search_range(url=query.url,
|
||||||
|
match_type=query.match_type,
|
||||||
|
url_canon=self.url_canon)
|
||||||
|
|
||||||
def load_cdx(self, **params):
|
def load_cdx(self, **params):
|
||||||
query = CDXQuery(**params)
|
query = CDXQuery(**params)
|
||||||
|
|
||||||
url = query.url
|
key, end_key = self._calc_search_keys(query)
|
||||||
key, end_key = calc_search_range(url=url,
|
|
||||||
match_type=query.match_type,
|
|
||||||
url_canon=self.url_canon)
|
|
||||||
query.set_key(key, end_key)
|
query.set_key(key, end_key)
|
||||||
|
|
||||||
cdx_iter = self._load_cdx_query(query)
|
cdx_iter = self._load_cdx_query(query)
|
||||||
@ -211,7 +209,7 @@ class RemoteCDXServer(BaseCDXServer):
|
|||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def create_cdx_server(config, ds_rules_file=None):
|
def create_cdx_server(config, ds_rules_file=None, server_cls=None):
|
||||||
if hasattr(config, 'get'):
|
if hasattr(config, 'get'):
|
||||||
paths = config.get('index_paths')
|
paths = config.get('index_paths')
|
||||||
surt_ordered = config.get('surt_ordered', True)
|
surt_ordered = config.get('surt_ordered', True)
|
||||||
@ -223,10 +221,11 @@ def create_cdx_server(config, ds_rules_file=None):
|
|||||||
|
|
||||||
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
|
||||||
|
|
||||||
if isinstance(paths, str) and is_http(paths):
|
if not server_cls:
|
||||||
server_cls = RemoteCDXServer
|
if isinstance(paths, str) and is_http(paths):
|
||||||
else:
|
server_cls = RemoteCDXServer
|
||||||
server_cls = CDXServer
|
else:
|
||||||
|
server_cls = CDXServer
|
||||||
|
|
||||||
return server_cls(paths,
|
return server_cls(paths,
|
||||||
config=pass_config,
|
config=pass_config,
|
||||||
|
@ -58,4 +58,6 @@ rules:
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
# Applies to all urls -- should be last
|
# Applies to all urls -- should be last
|
||||||
- url_prefix: ''
|
- url_prefix: ''
|
||||||
fuzzy_lookup: '[&?](?:_|uncache)=[\d]+[&]?'
|
fuzzy_lookup:
|
||||||
|
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
||||||
|
filter: '=urlkey:{0}'
|
||||||
|
2
setup.py
2
setup.py
@ -25,6 +25,8 @@ class PyTest(TestCommand):
|
|||||||
def run_tests(self):
|
def run_tests(self):
|
||||||
import pytest
|
import pytest
|
||||||
import sys
|
import sys
|
||||||
|
import os
|
||||||
|
os.environ.pop('PYWB_CONFIG_FILE', None)
|
||||||
cmdline = ' --cov-config .coveragerc --cov pywb'
|
cmdline = ' --cov-config .coveragerc --cov pywb'
|
||||||
cmdline += ' -v --doctest-module ./pywb/ tests/'
|
cmdline += ' -v --doctest-module ./pywb/ tests/'
|
||||||
errcode = pytest.main(cmdline)
|
errcode = pytest.main(cmdline)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user