mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
indexsource: if filtering enabled, live index source can check status and mime (excluding fuzzy match)
cdxops: cleanup filtering, move class to CDXFilter, avoid ambiguous naming
This commit is contained in:
parent
dd961f893f
commit
324a36b5b7
@ -157,6 +157,63 @@ def cdx_reverse(cdx_iter, limit):
|
|||||||
yield cdx
|
yield cdx
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
class CDXFilter(object):
    """
    A single cdx filter parsed from a filter string.

    The filter string is parsed as::

        [!][=|~][field:]expression

    * a leading ``!`` inverts the result of the comparison
    * ``=`` selects exact (equality) comparison
    * ``~`` selects substring ("contains") comparison
    * with neither prefix, ``expression`` is compiled as a regex and
      applied with ``match()`` (anchored at the start of the value)
    * ``field:`` limits the comparison to ``cdx[field]``; alternate field
      names are resolved through ``CDXObject.CDX_ALT_FIELDS``. Without a
      field, the comparison is applied to ``str(cdx)``.

    NOTE: the string is split on the first ``:``, so an expression that
    contains ``:`` but has no field prefix is read as ``field:expression``.

    Instances are callable: ``filter(cdx)`` returns True if the cdx
    passes the filter.
    """

    def __init__(self, string):
        """
        Parse ``string`` into comparison mode, field and expression.

        Raises ``re.error`` if regex mode is selected and the expression
        is not a valid regular expression.
        """
        # always define both, so every instance exposes the same
        # attributes no matter which comparison mode is selected
        self.regex = None
        self.filter_str = None

        # invert filter
        self.invert = string.startswith('!')
        if self.invert:
            string = string[1:]

        use_regex = False

        # exact match
        if string.startswith('='):
            string = string[1:]
            self.compare_func = self.exact
        # contains match
        elif string.startswith('~'):
            string = string[1:]
            self.compare_func = self.contains
        # regex match (default)
        else:
            use_regex = True
            self.compare_func = self.rx_match

        field, sep, expr = string.partition(':')

        # no field set, apply filter to entire cdx
        if not sep:
            self.field = ''
        # apply filter to cdx[field]
        else:
            self.field = CDXObject.CDX_ALT_FIELDS.get(field, field)
            string = expr

        # make regex if regex mode
        if use_regex:
            self.regex = re.compile(string)
        else:
            self.filter_str = string

    def __call__(self, cdx):
        """
        Return True if ``cdx`` passes this filter, False otherwise.

        The compared value is ``str(cdx)`` when no field is set, else
        ``str(cdx.get(field, ''))``.
        """
        if not self.field:
            val = str(cdx)
        else:
            val = str(cdx.get(self.field, ''))

        matched = self.compare_func(val)

        # xor with the invert flag flips the result for '!' filters
        return matched ^ self.invert

    def exact(self, val):
        """Return True if ``val`` equals the filter expression."""
        return (self.filter_str == val)

    def contains(self, val):
        """Return True if the filter expression is a substring of ``val``."""
        return (self.filter_str in val)

    def rx_match(self, val):
        """Return True if the compiled regex matches at the start of ``val``."""
        res = self.regex.match(val)
        return res is not None
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def cdx_filter(cdx_iter, filter_strings):
|
def cdx_filter(cdx_iter, filter_strings):
|
||||||
"""
|
"""
|
||||||
@ -167,63 +224,7 @@ def cdx_filter(cdx_iter, filter_strings):
|
|||||||
if isinstance(filter_strings, str):
|
if isinstance(filter_strings, str):
|
||||||
filter_strings = [filter_strings]
|
filter_strings = [filter_strings]
|
||||||
|
|
||||||
filters = []
|
filters = [CDXFilter(filter_str) for filter_str in filter_strings]
|
||||||
|
|
||||||
class Filter:
|
|
||||||
def __init__(self, string):
|
|
||||||
# invert filter
|
|
||||||
self.invert = string.startswith('!')
|
|
||||||
if self.invert:
|
|
||||||
string = string[1:]
|
|
||||||
|
|
||||||
# exact match
|
|
||||||
if string.startswith('='):
|
|
||||||
string = string[1:]
|
|
||||||
self.compare_func = self.exact
|
|
||||||
# contains match
|
|
||||||
elif string.startswith('~'):
|
|
||||||
string = string[1:]
|
|
||||||
self.compare_func = self.contains
|
|
||||||
else:
|
|
||||||
self.compare_func = self.regex
|
|
||||||
|
|
||||||
parts = string.split(':', 1)
|
|
||||||
# no field set, apply filter to entire cdx
|
|
||||||
if len(parts) == 1:
|
|
||||||
self.field = ''
|
|
||||||
# apply filter to cdx[field]
|
|
||||||
else:
|
|
||||||
self.field = parts[0]
|
|
||||||
self.field = CDXObject.CDX_ALT_FIELDS.get(self.field,
|
|
||||||
self.field)
|
|
||||||
string = parts[1]
|
|
||||||
|
|
||||||
# make regex if regex mode
|
|
||||||
if self.compare_func == self.regex:
|
|
||||||
self.regex = re.compile(string)
|
|
||||||
else:
|
|
||||||
self.filter_str = string
|
|
||||||
|
|
||||||
def __call__(self, cdx):
|
|
||||||
if not self.field:
|
|
||||||
val = str(cdx)
|
|
||||||
else:
|
|
||||||
val = cdx.get(self.field, '')
|
|
||||||
|
|
||||||
matched = self.compare_func(val)
|
|
||||||
|
|
||||||
return matched ^ self.invert
|
|
||||||
|
|
||||||
def exact(self, val):
|
|
||||||
return (self.filter_str == val)
|
|
||||||
|
|
||||||
def contains(self, val):
|
|
||||||
return (self.filter_str in val)
|
|
||||||
|
|
||||||
def regex(self, val):
|
|
||||||
return self.regex.match(val) is not None
|
|
||||||
|
|
||||||
filters = list(map(Filter, filter_strings))
|
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
if all(x(cdx) for x in filters):
|
if all(x(cdx) for x in filters):
|
||||||
|
@ -193,19 +193,38 @@ class RemoteIndexSource(BaseIndexSource):
|
|||||||
class LiveIndexSource(BaseIndexSource):
|
class LiveIndexSource(BaseIndexSource):
|
||||||
def __init__(self, proxy_url='{url}'):
|
def __init__(self, proxy_url='{url}'):
|
||||||
self.proxy_url = proxy_url
|
self.proxy_url = proxy_url
|
||||||
|
self._init_sesh()
|
||||||
|
|
||||||
def load_index(self, params):
    """
    Return an iterator over a single cdx entry describing the live url.

    The entry is built from ``params``: ``key`` (bytes, decoded as utf-8)
    becomes ``urlkey``, ``url`` is copied as-is, ``load_url`` is
    ``self.proxy_url`` expanded via ``res_template``, the timestamp is
    the current time and ``is_live`` is ``'true'``.

    If ``params`` has a ``filter`` and no ``content_type``, a HEAD request
    is made to the url so that ``status`` and ``mime`` can be filled in
    for filtering. This is best-effort: if the request fails, the entry
    is returned without ``status`` and with an empty ``mime``.

    Raises ``NotFoundException`` for fuzzy-match queries
    (``params['is_fuzzy']`` set), which are not supported for live
    resources.
    """
    # no fuzzy match for live resources
    if params.get('is_fuzzy'):
        raise NotFoundException(params['url'] + '*')

    cdx = CDXObject()
    cdx['urlkey'] = params.get('key').decode('utf-8')
    cdx['timestamp'] = timestamp_now()
    cdx['url'] = params['url']
    cdx['load_url'] = res_template(self.proxy_url, params)
    cdx['is_live'] = 'true'

    mime = params.get('content_type', '')

    if params.get('filter') and not mime:
        try:
            # pass the request timeout the same way handle_timemap() does,
            # so an unresponsive host cannot block the lookup indefinitely
            # NOTE(review): assumes self.sesh is the same kind of session
            # object used by the other index sources -- confirm
            res = self.sesh.head(cdx['url'],
                                 timeout=params.get('_timeout'))

            # 405 only says HEAD is not allowed, not what a GET would return
            if res.status_code != 405:
                cdx['status'] = str(res.status_code)

            content_type = res.headers.get('Content-Type')
            if content_type:
                # drop parameters such as '; charset=...'
                mime = content_type.split(';')[0].strip()

        except Exception:
            # deliberate best-effort: a failed probe must not fail the
            # lookup, the entry is simply returned without status/mime
            pass

    cdx['mime'] = mime

    return iter([cdx])
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '{0}()'.format(self.__class__.__name__)
|
return '{0}()'.format(self.__class__.__name__)
|
||||||
@ -383,11 +402,16 @@ class MementoIndexSource(BaseIndexSource):
|
|||||||
def handle_timemap(self, params):
|
def handle_timemap(self, params):
|
||||||
url = res_template(self.timemap_url, params)
|
url = res_template(self.timemap_url, params)
|
||||||
headers = self._get_headers(params)
|
headers = self._get_headers(params)
|
||||||
res = self.sesh.get(url,
|
try:
|
||||||
headers=headers,
|
res = self.sesh.get(url,
|
||||||
timeout=params.get('_timeout'))
|
headers=headers,
|
||||||
|
timeout=params.get('_timeout'))
|
||||||
|
|
||||||
if res.status_code >= 400 or not res.text:
|
res.raise_for_status()
|
||||||
|
assert(res.text)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print('FAILED: ' + str(e))
|
||||||
raise NotFoundException(url)
|
raise NotFoundException(url)
|
||||||
|
|
||||||
links = res.text
|
links = res.text
|
||||||
|
Loading…
x
Reference in New Issue
Block a user