1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

fuzzy match: add support for specifying regex and args separately for

fuzzy_lookup match
This commit is contained in:
Ilya Kreymer 2014-12-26 14:29:51 -08:00
parent ffb702ce03
commit 8d6845a552
3 changed files with 62 additions and 22 deletions

View File

@ -13,11 +13,6 @@ from query import CDXQuery
#=================================================================
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
"""
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
'example,example,test)/path/index.html?id=value'
"""
canon = None
fuzzy = None
@ -162,24 +157,24 @@ class CDXDomainSpecificRule(BaseRule):
@staticmethod
def make_regex(config):
    """
    Compile the fuzzy-match regex described by a rule config.

    ``config`` may take one of three shapes:

    * list -- query arg names; the pattern is whatever
      ``make_query_match_regex`` builds from them.
    * dict -- the optional ``'regex'`` entry (default ``''``) is used
      as the base pattern, and the query-arg pattern built from the
      optional ``'args'`` entry (default ``[]``) is appended to it.
    * anything else -- converted with ``str()`` and used as the full
      pattern.

    Returns the compiled pattern object produced by ``re.compile``.
    """
    # just query args
    if isinstance(config, list):
        string = CDXDomainSpecificRule.make_query_match_regex(config)

    # split out base and args
    elif isinstance(config, dict):
        string = config.get('regex', '')
        string += CDXDomainSpecificRule.make_query_match_regex(
            config.get('args', []))

    # else assume string
    else:
        string = str(config)

    return re.compile(string)
@staticmethod
def make_query_match_regex(params_list):
r"""
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
"""
params_list.sort()
def conv(value):
@ -188,8 +183,3 @@ class CDXDomainSpecificRule(BaseRule):
params_list = map(conv, params_list)
final_str = '.*'.join(params_list)
return final_str
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -0,0 +1,40 @@
r"""
Load Rules
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
'example,example,test)/path/index.html?id=value'
# Fuzzy Query Args Builder
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
# Fuzzy Match Query + Args
# list
>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
# dict
>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
# string
>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
"""
from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
# When executed as a script, run the doctests found in this module.
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@ -148,12 +148,22 @@ rules:
- url_prefix: 'com,youtube,c'
fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)'
fuzzy_lookup:
match:
regex: 'com,youtube,c.*/videogoodput.*'
args:
- id
- url_prefix: 'com,googlevideo,'
fuzzy_lookup:
match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)'
match:
regex: 'com,googlevideo.*/videoplayback.*'
args:
- id
- itag
- mime
filter:
- '~urlkey:{0}'
- '!mimetype:text/plain'