mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
fuzzy match: add support for specifying regex and args separately for
fuzzy_lookup match
This commit is contained in:
parent
ffb702ce03
commit
8d6845a552
@ -13,11 +13,6 @@ from query import CDXQuery
|
||||
|
||||
#=================================================================
|
||||
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
|
||||
"""
|
||||
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
|
||||
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
|
||||
'example,example,test)/path/index.html?id=value'
|
||||
"""
|
||||
canon = None
|
||||
fuzzy = None
|
||||
|
||||
@ -162,24 +157,24 @@ class CDXDomainSpecificRule(BaseRule):
|
||||
|
||||
@staticmethod
|
||||
def make_regex(config):
|
||||
# just query args
|
||||
if isinstance(config, list):
|
||||
string = CDXDomainSpecificRule.make_query_match_regex(config)
|
||||
# assumes string
|
||||
|
||||
# split out base and args
|
||||
elif isinstance(config, dict):
|
||||
string = config.get('regex', '')
|
||||
string += CDXDomainSpecificRule.make_query_match_regex(
|
||||
config.get('args', []))
|
||||
|
||||
# else assume string
|
||||
else:
|
||||
string = config
|
||||
string = str(config)
|
||||
|
||||
return re.compile(string)
|
||||
|
||||
@staticmethod
|
||||
def make_query_match_regex(params_list):
|
||||
r"""
|
||||
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
|
||||
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||
|
||||
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
|
||||
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
|
||||
|
||||
"""
|
||||
params_list.sort()
|
||||
|
||||
def conv(value):
|
||||
@ -188,8 +183,3 @@ class CDXDomainSpecificRule(BaseRule):
|
||||
params_list = map(conv, params_list)
|
||||
final_str = '.*'.join(params_list)
|
||||
return final_str
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
||||
|
40
pywb/cdx/test/test_cdxdomainspecific.py
Normal file
40
pywb/cdx/test/test_cdxdomainspecific.py
Normal file
@ -0,0 +1,40 @@
|
||||
r"""
|
||||
Load Rules
|
||||
|
||||
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
|
||||
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
|
||||
'example,example,test)/path/index.html?id=value'
|
||||
|
||||
|
||||
# Fuzzy Query Args Builder
|
||||
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
|
||||
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||
|
||||
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
|
||||
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
|
||||
|
||||
|
||||
# Fuzzy Match Query + Args
|
||||
|
||||
# list
|
||||
>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern
|
||||
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||
|
||||
# dict
|
||||
>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern
|
||||
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||
|
||||
# string
|
||||
>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern
|
||||
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule
|
||||
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
doctest.testmod()
|
@ -148,12 +148,22 @@ rules:
|
||||
|
||||
- url_prefix: 'com,youtube,c'
|
||||
|
||||
fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)'
|
||||
fuzzy_lookup:
|
||||
match:
|
||||
regex: 'com,youtube,c.*/videogoodput.*'
|
||||
args:
|
||||
- id
|
||||
|
||||
- url_prefix: 'com,googlevideo,'
|
||||
|
||||
fuzzy_lookup:
|
||||
match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)'
|
||||
match:
|
||||
regex: 'com,googlevideo.*/videoplayback.*'
|
||||
args:
|
||||
- id
|
||||
- itag
|
||||
- mime
|
||||
|
||||
filter:
|
||||
- '~urlkey:{0}'
|
||||
- '!mimetype:text/plain'
|
||||
|
Loading…
x
Reference in New Issue
Block a user