From c10df57e07719ddb6521babdb9aef56d7182a0f5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2014 11:10:49 -0800 Subject: [PATCH] rules: add support for customizing matchType prefix, adding multiple filters --- pywb/cdx/cdxdomainspecific.py | 21 ++++++++++++++++++--- pywb/rules.yaml | 11 ++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/pywb/cdx/cdxdomainspecific.py b/pywb/cdx/cdxdomainspecific.py index ce15e024..c7c05545 100644 --- a/pywb/cdx/cdxdomainspecific.py +++ b/pywb/cdx/cdxdomainspecific.py @@ -2,6 +2,7 @@ import yaml import re import logging import pkg_resources +import urlparse from pywb.utils.dsrules import BaseRule, RuleSet @@ -89,7 +90,8 @@ class FuzzyQuery: groups = m.groups() for g in groups: - filter_.append(rule.filter.format(g)) + for f in matched_rule.filter: + filter_.append(f.format(g)) break @@ -104,9 +106,15 @@ class FuzzyQuery: if inx > 0: url = url[:inx + 1] + + if matched_rule.match_type == 'domain': + host = urlparse.urlsplit(url).netloc + # remove the subdomain + url = host.split('.', 1)[1] + params = query.params params.update({'url': url, - 'matchType': 'prefix', + 'matchType': matched_rule.match_type, 'filter': filter_}) if 'reverse' in params: @@ -115,12 +123,17 @@ class FuzzyQuery: if 'closest' in params: del params['closest'] + if 'end_key' in params: + del params['end_key'] + + print(params) return params #================================================================= class CDXDomainSpecificRule(BaseRule): - DEFAULT_FILTER = '~urlkey:{0}' + DEFAULT_FILTER = ['~urlkey:{0}'] + DEFAULT_MATCH_TYPE = 'prefix' def __init__(self, name, config): super(CDXDomainSpecificRule, self).__init__(name, config) @@ -129,10 +142,12 @@ class CDXDomainSpecificRule(BaseRule): self.regex = self.make_regex(config) self.replace = None self.filter = self.DEFAULT_FILTER + self.match_type = self.DEFAULT_MATCH_TYPE else: self.regex = self.make_regex(config.get('match')) self.replace = config.get('replace') self.filter = config.get('filter', self.DEFAULT_FILTER) + self.match_type = config.get('type', self.DEFAULT_MATCH_TYPE) def unsurt(self): """ diff --git a/pywb/rules.yaml b/pywb/rules.yaml index 8cdcb528..68b2759d 100644 --- a/pywb/rules.yaml +++ b/pywb/rules.yaml @@ -142,8 +142,13 @@ rules: - url_prefix: 'com,googlevideo,' - #fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])' - fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)' + fuzzy_lookup: + match: 'com,googlevideo.*/videoplayback.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)' + filter: + - '~urlkey:{0}' + - '!mimetype:text/plain' + + type: 'domain' # testing rules -- not for valid domain @@ -169,5 +174,5 @@ rules: - url_prefix: '' fuzzy_lookup: match: '(.*)[&?](?:_|uncache)=[\d]+[&]?' - filter: '=urlkey:{0}' + filter: ['=urlkey:{0}'] replace: '?'