mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
rules: add support for customizing matchType prefix, adding multiple
filters
This commit is contained in:
parent
d3ef47342c
commit
c10df57e07
@ -2,6 +2,7 @@ import yaml
|
|||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
|
import urlparse
|
||||||
|
|
||||||
from pywb.utils.dsrules import BaseRule, RuleSet
|
from pywb.utils.dsrules import BaseRule, RuleSet
|
||||||
|
|
||||||
@ -89,7 +90,8 @@ class FuzzyQuery:
|
|||||||
|
|
||||||
groups = m.groups()
|
groups = m.groups()
|
||||||
for g in groups:
|
for g in groups:
|
||||||
filter_.append(rule.filter.format(g))
|
for f in matched_rule.filter:
|
||||||
|
filter_.append(f.format(g))
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -104,9 +106,15 @@ class FuzzyQuery:
|
|||||||
if inx > 0:
|
if inx > 0:
|
||||||
url = url[:inx + 1]
|
url = url[:inx + 1]
|
||||||
|
|
||||||
|
|
||||||
|
if matched_rule.match_type == 'domain':
|
||||||
|
host = urlparse.urlsplit(url).netloc
|
||||||
|
# remove the subdomain
|
||||||
|
url = host.split('.', 1)[1]
|
||||||
|
|
||||||
params = query.params
|
params = query.params
|
||||||
params.update({'url': url,
|
params.update({'url': url,
|
||||||
'matchType': 'prefix',
|
'matchType': matched_rule.match_type,
|
||||||
'filter': filter_})
|
'filter': filter_})
|
||||||
|
|
||||||
if 'reverse' in params:
|
if 'reverse' in params:
|
||||||
@ -115,12 +123,17 @@ class FuzzyQuery:
|
|||||||
if 'closest' in params:
|
if 'closest' in params:
|
||||||
del params['closest']
|
del params['closest']
|
||||||
|
|
||||||
|
if 'end_key' in params:
|
||||||
|
del params['end_key']
|
||||||
|
|
||||||
|
print(params)
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class CDXDomainSpecificRule(BaseRule):
|
class CDXDomainSpecificRule(BaseRule):
|
||||||
DEFAULT_FILTER = '~urlkey:{0}'
|
DEFAULT_FILTER = ['~urlkey:{0}']
|
||||||
|
DEFAULT_MATCH_TYPE = 'prefix'
|
||||||
|
|
||||||
def __init__(self, name, config):
|
def __init__(self, name, config):
|
||||||
super(CDXDomainSpecificRule, self).__init__(name, config)
|
super(CDXDomainSpecificRule, self).__init__(name, config)
|
||||||
@ -129,10 +142,12 @@ class CDXDomainSpecificRule(BaseRule):
|
|||||||
self.regex = self.make_regex(config)
|
self.regex = self.make_regex(config)
|
||||||
self.replace = None
|
self.replace = None
|
||||||
self.filter = self.DEFAULT_FILTER
|
self.filter = self.DEFAULT_FILTER
|
||||||
|
self.match_type = self.DEFAULT_MATCH_TYPE
|
||||||
else:
|
else:
|
||||||
self.regex = self.make_regex(config.get('match'))
|
self.regex = self.make_regex(config.get('match'))
|
||||||
self.replace = config.get('replace')
|
self.replace = config.get('replace')
|
||||||
self.filter = config.get('filter', self.DEFAULT_FILTER)
|
self.filter = config.get('filter', self.DEFAULT_FILTER)
|
||||||
|
self.match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
||||||
|
|
||||||
def unsurt(self):
|
def unsurt(self):
|
||||||
"""
|
"""
|
||||||
|
@ -142,8 +142,13 @@ rules:
|
|||||||
|
|
||||||
- url_prefix: 'com,googlevideo,'
|
- url_prefix: 'com,googlevideo,'
|
||||||
|
|
||||||
#fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
|
fuzzy_lookup:
|
||||||
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)'
|
match: 'com,googlevideo.*/videoplayback.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)'
|
||||||
|
filter:
|
||||||
|
- '~urlkey:{0}'
|
||||||
|
- '!mimetype:text/plain'
|
||||||
|
|
||||||
|
type: 'domain'
|
||||||
|
|
||||||
|
|
||||||
# testing rules -- not for valid domain
|
# testing rules -- not for valid domain
|
||||||
@ -169,5 +174,5 @@ rules:
|
|||||||
- url_prefix: ''
|
- url_prefix: ''
|
||||||
fuzzy_lookup:
|
fuzzy_lookup:
|
||||||
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
||||||
filter: '=urlkey:{0}'
|
filter: ['=urlkey:{0}']
|
||||||
replace: '?'
|
replace: '?'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user