From 8d6845a55210b0462647711668ddcf5af98e721f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 26 Dec 2014 14:29:51 -0800 Subject: [PATCH] fuzzy match: add support for specifying regex and args separately for fuzzy_lookup match --- pywb/cdx/cdxdomainspecific.py | 30 +++++++------------ pywb/cdx/test/test_cdxdomainspecific.py | 40 +++++++++++++++++++++++++ pywb/rules.yaml | 14 +++++++-- 3 files changed, 62 insertions(+), 22 deletions(-) create mode 100644 pywb/cdx/test/test_cdxdomainspecific.py diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index 3fb55862..8c27da01 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -13,11 +13,6 @@ from query import CDXQuery #================================================================= def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered): - """ - >>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True) - >>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d') - 'example,example,test)/path/index.html?id=value' - """ canon = None fuzzy = None @@ -162,24 +157,24 @@ class CDXDomainSpecificRule(BaseRule): @staticmethod def make_regex(config): + # just query args if isinstance(config, list): string = CDXDomainSpecificRule.make_query_match_regex(config) - # assumes string + + # split out base and args + elif isinstance(config, dict): + string = config.get('regex', '') + string += CDXDomainSpecificRule.make_query_match_regex( + config.get('args', [])) + + # else assume string else: - string = config + string = str(config) return re.compile(string) @staticmethod def make_query_match_regex(params_list): - r""" - >>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc']) - '[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' - - >>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()']) - '[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)' - - """ params_list.sort() def conv(value): @@ -188,8 +183,3 @@ class 
CDXDomainSpecificRule(BaseRule): params_list = map(conv, params_list) final_str = '.*'.join(params_list) return final_str - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pywb/cdx/test/test_cdxdomainspecific.py b/pywb/cdx/test/test_cdxdomainspecific.py new file mode 100644 index 00000000..906a3103 --- /dev/null +++ b/pywb/cdx/test/test_cdxdomainspecific.py @@ -0,0 +1,40 @@ +r""" +Load Rules + +>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True) +>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d') +'example,example,test)/path/index.html?id=value' + + +# Fuzzy Query Args Builder +>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc']) +'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' + +>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()']) +'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)' + + +# Fuzzy Match Query + Args + +# list +>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern +'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' + +# dict +>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern +'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' + +# string +>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern +'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)' + +""" + + +from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule +from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 643daced..b40ed818 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -148,12 +148,22 @@ rules: - url_prefix: 'com,youtube,c' - fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)' + fuzzy_lookup: + match: + regex: 'com,youtube,c.*/videogoodput.*' + args: + - id - url_prefix: 
'com,googlevideo,' fuzzy_lookup: - match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)' + match: + regex: 'com,googlevideo.*/videoplayback.*' + args: + - id + - itag + - mime + filter: - '~urlkey:{0}' - '!mimetype:text/plain'