mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
rules: add support for customizing matchType prefix, adding multiple
filters
This commit is contained in:
parent
d3ef47342c
commit
c10df57e07
@ -2,6 +2,7 @@ import yaml
|
||||
import re
|
||||
import logging
|
||||
import pkg_resources
|
||||
import urlparse
|
||||
|
||||
from pywb.utils.dsrules import BaseRule, RuleSet
|
||||
|
||||
@ -89,7 +90,8 @@ class FuzzyQuery:
|
||||
|
||||
groups = m.groups()
|
||||
for g in groups:
|
||||
filter_.append(rule.filter.format(g))
|
||||
for f in matched_rule.filter:
|
||||
filter_.append(f.format(g))
|
||||
|
||||
break
|
||||
|
||||
@ -104,9 +106,15 @@ class FuzzyQuery:
|
||||
if inx > 0:
|
||||
url = url[:inx + 1]
|
||||
|
||||
|
||||
if matched_rule.match_type == 'domain':
|
||||
host = urlparse.urlsplit(url).netloc
|
||||
# remove the subdomain
|
||||
url = host.split('.', 1)[1]
|
||||
|
||||
params = query.params
|
||||
params.update({'url': url,
|
||||
'matchType': 'prefix',
|
||||
'matchType': matched_rule.match_type,
|
||||
'filter': filter_})
|
||||
|
||||
if 'reverse' in params:
|
||||
@ -115,12 +123,17 @@ class FuzzyQuery:
|
||||
if 'closest' in params:
|
||||
del params['closest']
|
||||
|
||||
if 'end_key' in params:
|
||||
del params['end_key']
|
||||
|
||||
print(params)
|
||||
return params
|
||||
|
||||
|
||||
#=================================================================
|
||||
class CDXDomainSpecificRule(BaseRule):
|
||||
DEFAULT_FILTER = '~urlkey:{0}'
|
||||
DEFAULT_FILTER = ['~urlkey:{0}']
|
||||
DEFAULT_MATCH_TYPE = 'prefix'
|
||||
|
||||
def __init__(self, name, config):
|
||||
super(CDXDomainSpecificRule, self).__init__(name, config)
|
||||
@ -129,10 +142,12 @@ class CDXDomainSpecificRule(BaseRule):
|
||||
self.regex = self.make_regex(config)
|
||||
self.replace = None
|
||||
self.filter = self.DEFAULT_FILTER
|
||||
self.match_type = self.DEFAULT_MATCH_TYPE
|
||||
else:
|
||||
self.regex = self.make_regex(config.get('match'))
|
||||
self.replace = config.get('replace')
|
||||
self.filter = config.get('filter', self.DEFAULT_FILTER)
|
||||
self.match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
|
||||
|
||||
def unsurt(self):
|
||||
"""
|
||||
|
@ -142,8 +142,13 @@ rules:
|
||||
|
||||
- url_prefix: 'com,googlevideo,'
|
||||
|
||||
#fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]).*(mime=[^&]).*(signature=[^&])'
|
||||
fuzzy_lookup: 'com,googlevideo.*/videoplayback?.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)'
|
||||
fuzzy_lookup:
|
||||
match: 'com,googlevideo.*/videoplayback.*(id=[^&]+).*(itag=[^&]+).*(mime=[^&]+)'
|
||||
filter:
|
||||
- '~urlkey:{0}'
|
||||
- '!mimetype:text/plain'
|
||||
|
||||
type: 'domain'
|
||||
|
||||
|
||||
# testing rules -- not for valid domain
|
||||
@ -169,5 +174,5 @@ rules:
|
||||
- url_prefix: ''
|
||||
fuzzy_lookup:
|
||||
match: '(.*)[&?](?:_|uncache)=[\d]+[&]?'
|
||||
filter: '=urlkey:{0}'
|
||||
filter: ['=urlkey:{0}']
|
||||
replace: '?'
|
||||
|
Loading…
x
Reference in New Issue
Block a user