mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
fuzzy match: add support for specifying regex and args separately for
fuzzy_lookup match
This commit is contained in:
parent
ffb702ce03
commit
8d6845a552
@ -13,11 +13,6 @@ from query import CDXQuery
|
|||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||||
"""
|
|
||||||
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
|
|
||||||
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
|
|
||||||
'example,example,test)/path/index.html?id=value'
|
|
||||||
"""
|
|
||||||
canon = None
|
canon = None
|
||||||
fuzzy = None
|
fuzzy = None
|
||||||
|
|
||||||
@ -162,24 +157,24 @@ class CDXDomainSpecificRule(BaseRule):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_regex(config):
|
def make_regex(config):
|
||||||
|
# just query args
|
||||||
if isinstance(config, list):
|
if isinstance(config, list):
|
||||||
string = CDXDomainSpecificRule.make_query_match_regex(config)
|
string = CDXDomainSpecificRule.make_query_match_regex(config)
|
||||||
# assumes string
|
|
||||||
|
# split out base and args
|
||||||
|
elif isinstance(config, dict):
|
||||||
|
string = config.get('regex', '')
|
||||||
|
string += CDXDomainSpecificRule.make_query_match_regex(
|
||||||
|
config.get('args', []))
|
||||||
|
|
||||||
|
# else assume string
|
||||||
else:
|
else:
|
||||||
string = config
|
string = str(config)
|
||||||
|
|
||||||
return re.compile(string)
|
return re.compile(string)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_query_match_regex(params_list):
|
def make_query_match_regex(params_list):
|
||||||
r"""
|
|
||||||
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
|
|
||||||
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
|
||||||
|
|
||||||
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
|
|
||||||
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
|
|
||||||
|
|
||||||
"""
|
|
||||||
params_list.sort()
|
params_list.sort()
|
||||||
|
|
||||||
def conv(value):
|
def conv(value):
|
||||||
@ -188,8 +183,3 @@ class CDXDomainSpecificRule(BaseRule):
|
|||||||
params_list = map(conv, params_list)
|
params_list = map(conv, params_list)
|
||||||
final_str = '.*'.join(params_list)
|
final_str = '.*'.join(params_list)
|
||||||
return final_str
|
return final_str
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import doctest
|
|
||||||
doctest.testmod()
|
|
||||||
|
40
pywb/cdx/test/test_cdxdomainspecific.py
Normal file
40
pywb/cdx/test/test_cdxdomainspecific.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
r"""
|
||||||
|
Load Rules
|
||||||
|
|
||||||
|
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
|
||||||
|
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
|
||||||
|
'example,example,test)/path/index.html?id=value'
|
||||||
|
|
||||||
|
|
||||||
|
# Fuzzy Query Args Builder
|
||||||
|
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
|
||||||
|
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||||
|
|
||||||
|
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
|
||||||
|
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
|
||||||
|
|
||||||
|
|
||||||
|
# Fuzzy Match Query + Args
|
||||||
|
|
||||||
|
# list
|
||||||
|
>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern
|
||||||
|
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||||
|
|
||||||
|
# dict
|
||||||
|
>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern
|
||||||
|
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||||
|
|
||||||
|
# string
|
||||||
|
>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern
|
||||||
|
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule
|
||||||
|
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import doctest
|
||||||
|
doctest.testmod()
|
@ -148,12 +148,22 @@ rules:
|
|||||||
|
|
||||||
- url_prefix: 'com,youtube,c'
|
- url_prefix: 'com,youtube,c'
|
||||||
|
|
||||||
fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)'
|
fuzzy_lookup:
|
||||||
|
match:
|
||||||
|
regex: 'com,youtube,c.*/videogoodput.*'
|
||||||
|
args:
|
||||||
|
- id
|
||||||
|
|
||||||
- url_prefix: 'com,googlevideo,'
|
- url_prefix: 'com,googlevideo,'
|
||||||
|
|
||||||
fuzzy_lookup:
|
fuzzy_lookup:
|
||||||
match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)'
|
match:
|
||||||
|
regex: 'com,googlevideo.*/videoplayback.*'
|
||||||
|
args:
|
||||||
|
- id
|
||||||
|
- itag
|
||||||
|
- mime
|
||||||
|
|
||||||
filter:
|
filter:
|
||||||
- '~urlkey:{0}'
|
- '~urlkey:{0}'
|
||||||
- '!mimetype:text/plain'
|
- '!mimetype:text/plain'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user