mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
warcserver/cdx query: filter improvements (#285)
- pywb.utils.format: add query_to_dict() to convert query string with support for list for certain params - support multiple values for 'filter' cdx server param (fixes #284) - pywb.utils.format: add to_bool() to convert string/int to bool (eg. for query args) - fuzzymatch: add 'allowFuzzy' (default to true) to allow disabling fuzzy matcher - tests: fuzzymatcher: test disabling fuzzy matcher with allowFuzzy=0 - tests: cdx-server api: add multiple filter tests, with and without fuzzy matching
This commit is contained in:
parent
cd304cc2d7
commit
273b3eec30
@ -1,8 +1,8 @@
|
|||||||
from six.moves.urllib.parse import quote
|
from six.moves.urllib.parse import quote, parse_qsl
|
||||||
import string
|
import string
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
# ============================================================================
|
||||||
class ParamFormatter(string.Formatter):
|
class ParamFormatter(string.Formatter):
|
||||||
def __init__(self, params, name='', prefix='param.'):
|
def __init__(self, params, name='', prefix='param.'):
|
||||||
self.params = params
|
self.params = params
|
||||||
@ -33,7 +33,7 @@ class ParamFormatter(string.Formatter):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
#=============================================================================
|
# =============================================================================
|
||||||
def res_template(template, params, **extra_params):
|
def res_template(template, params, **extra_params):
|
||||||
formatter = params.get('_formatter')
|
formatter = params.get('_formatter')
|
||||||
if not formatter:
|
if not formatter:
|
||||||
@ -49,3 +49,37 @@ def res_template(template, params, **extra_params):
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
def to_bool(val):
    """Convert a query-arg style value (string, bytes, int, ...) to a bool.

    Falsy values (``None``, ``''``, ``0``, empty containers) are ``False``.
    A string is ``False`` only when it spells one of the "off" tokens
    ``0``, ``false``, ``f`` or ``off`` (case-insensitive, surrounding
    whitespace ignored); any other non-empty string is ``True``.
    Every other type falls back to ``bool(val)``.
    """
    if not val:
        return False

    # On Python 3 a bytes value such as b'0' is truthy and is not a ``str``,
    # so it would wrongly come out True; decode it so it is compared as text.
    # On Python 2 ``bytes`` is ``str`` and the value is left untouched.
    if isinstance(val, bytes) and not isinstance(val, str):
        val = val.decode('utf-8', 'replace')

    if isinstance(val, str):
        return val.strip().lower() not in ('0', 'false', 'f', 'off')

    return bool(val)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
def query_to_dict(query_str, multi=None):
    """Parse a URL query string into a dict.

    :param query_str: raw query string, e.g. ``'url=x&filter=a&filter=b'``
    :param multi: optional name, or iterable of names, of params that may
        carry several values
    :return: dict mapping each param name to its value. For names not listed
        in ``multi`` the last occurrence wins (same as ``dict(parse_qsl())``).
        For names in ``multi`` a single occurrence stays a plain string and
        repeated occurrences are collected, in order, into a list.
    """
    pairlist = parse_qsl(query_str)
    if not multi:
        return dict(pairlist)

    # A bare string would otherwise be searched by substring
    # ('f' in 'filter' is True), so treat it as a single param name.
    multi_names = {multi} if isinstance(multi, str) else set(multi)

    obj = {}
    for name, value in pairlist:
        if name not in multi_names or name not in obj:
            # single-valued param (last one wins) or first value of a multi
            obj[name] = value
        elif isinstance(obj[name], list):
            obj[name].append(value)
        else:
            # second occurrence: promote the stored string to a list
            obj[name] = [obj[name], value]

    return obj
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest
|
||||||
|
from pywb.utils.format import query_to_dict
|
||||||
|
|
||||||
from werkzeug.routing import Map, Rule
|
from werkzeug.routing import Map, Rule
|
||||||
from werkzeug.exceptions import HTTPException
|
from werkzeug.exceptions import HTTPException
|
||||||
@ -7,7 +8,6 @@ import requests
|
|||||||
import traceback
|
import traceback
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from six.moves.urllib.parse import parse_qsl
|
|
||||||
import six
|
import six
|
||||||
|
|
||||||
JSON_CT = 'application/json; charset=utf-8'
|
JSON_CT = 'application/json; charset=utf-8'
|
||||||
@ -60,7 +60,7 @@ class BaseWarcServer(object):
|
|||||||
def get_query_dict(self, environ):
|
def get_query_dict(self, environ):
|
||||||
query_str = environ.get('QUERY_STRING')
|
query_str = environ.get('QUERY_STRING')
|
||||||
if query_str:
|
if query_str:
|
||||||
return dict(parse_qsl(query_str))
|
return query_to_dict(query_str, multi=['filter'])
|
||||||
else:
|
else:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
from warcio.utils import to_native_str
|
from warcio.utils import to_native_str
|
||||||
|
|
||||||
from pywb.utils.loaders import load_yaml_config
|
from pywb.utils.loaders import load_yaml_config
|
||||||
|
from pywb.utils.format import to_bool
|
||||||
from pywb import DEFAULT_RULES_FILE
|
from pywb import DEFAULT_RULES_FILE
|
||||||
|
|
||||||
import re
|
import re
|
||||||
@ -65,7 +67,7 @@ class FuzzyMatcher(object):
|
|||||||
|
|
||||||
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
|
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
|
||||||
|
|
||||||
def get_fuzzy_match(self, urlkey, params):
|
def get_fuzzy_match(self, urlkey, url, params):
|
||||||
filters = set()
|
filters = set()
|
||||||
matched_rule = None
|
matched_rule = None
|
||||||
|
|
||||||
@ -93,8 +95,6 @@ class FuzzyMatcher(object):
|
|||||||
if not matched_rule:
|
if not matched_rule:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
url = params['url']
|
|
||||||
|
|
||||||
# support matching w/o query if no additional filters
|
# support matching w/o query if no additional filters
|
||||||
# don't include trailing '?' if no filters and replace_after '?'
|
# don't include trailing '?' if no filters and replace_after '?'
|
||||||
no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
|
no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
|
||||||
@ -161,10 +161,14 @@ class FuzzyMatcher(object):
|
|||||||
if found:
|
if found:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# if fuzzy matching disabled
|
||||||
|
if not to_bool(params.get('allowFuzzy', True)):
|
||||||
|
return
|
||||||
|
|
||||||
url = params['url']
|
url = params['url']
|
||||||
urlkey = to_native_str(params['key'], 'utf-8')
|
urlkey = to_native_str(params['key'], 'utf-8')
|
||||||
|
|
||||||
res = self.get_fuzzy_match(urlkey, params)
|
res = self.get_fuzzy_match(urlkey, url, params)
|
||||||
if not res:
|
if not res:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from six.moves.urllib.parse import urlencode
|
from six.moves.urllib.parse import urlencode
|
||||||
from pywb.warcserver.index.cdxobject import CDXException
|
from pywb.warcserver.index.cdxobject import CDXException
|
||||||
from pywb.utils.canonicalize import calc_search_range
|
from pywb.utils.canonicalize import calc_search_range
|
||||||
|
from pywb.utils.format import to_bool
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -128,17 +129,9 @@ class CDXQuery(object):
|
|||||||
def page_count(self):
|
def page_count(self):
|
||||||
return self._get_bool('showNumPages')
|
return self._get_bool('showNumPages')
|
||||||
|
|
||||||
def _get_bool(self, name, def_val=False):
|
def _get_bool(self, name):
|
||||||
v = self.params.get(name)
|
v = self.params.get(name)
|
||||||
if v:
|
return to_bool(v)
|
||||||
try:
|
|
||||||
v = int(v)
|
|
||||||
except ValueError as ex:
|
|
||||||
v = (v.lower() == 'true')
|
|
||||||
else:
|
|
||||||
v = def_val
|
|
||||||
|
|
||||||
return bool(v)
|
|
||||||
|
|
||||||
def urlencode(self):
|
def urlencode(self):
|
||||||
return urlencode(self.params, True)
|
return urlencode(self.params, True)
|
||||||
|
@ -133,6 +133,14 @@ class TestFuzzy(object):
|
|||||||
|
|
||||||
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
assert list(cdx_iter) == self.get_expected(url=actual_url, filters=filters)
|
||||||
|
|
||||||
|
def test_no_fuzzy_disabled(self):
    # With allowFuzzy=0 the fuzzy matcher must yield no results for a url
    # that differs from the indexed url only by its query string.
    params = self.get_params('http://example.com/?_=123',
                             'http://example.com/')
    params['allowFuzzy'] = 0

    cdx_iter, errs = self.fuzzy(self.source, params)
    results = list(cdx_iter)
    assert results == []
|
||||||
|
|
||||||
def test_no_fuzzy_custom_rule_video_id_diff(self):
|
def test_no_fuzzy_custom_rule_video_id_diff(self):
|
||||||
url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234'
|
url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234'
|
||||||
actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'
|
actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234'
|
||||||
|
@ -61,7 +61,7 @@ class TestCDXApp(BaseTestClass):
|
|||||||
suburls += 1
|
suburls += 1
|
||||||
assert suburls > 0
|
assert suburls > 0
|
||||||
|
|
||||||
def test_filters(self):
|
def test_filters_1(self):
|
||||||
"""
|
"""
|
||||||
filter cdxes by mimetype and filename field, exact match.
|
filter cdxes by mimetype and filename field, exact match.
|
||||||
"""
|
"""
|
||||||
@ -71,12 +71,50 @@ class TestCDXApp(BaseTestClass):
|
|||||||
assert resp.status_code == 200
|
assert resp.status_code == 200
|
||||||
assert resp.content_type == 'text/x-cdxj'
|
assert resp.content_type == 'text/x-cdxj'
|
||||||
|
|
||||||
for l in resp.text.splitlines():
|
lines = resp.text.splitlines()
|
||||||
|
assert len(lines) > 0
|
||||||
|
|
||||||
|
for l in lines:
|
||||||
cdx = CDXObject(l.encode('utf-8'))
|
cdx = CDXObject(l.encode('utf-8'))
|
||||||
assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
|
assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css'
|
||||||
|
assert cdx['timestamp'] == '20140127171239'
|
||||||
assert cdx['mime'] == 'warc/revisit'
|
assert cdx['mime'] == 'warc/revisit'
|
||||||
assert cdx['filename'] == 'dupes.warc.gz'
|
assert cdx['filename'] == 'dupes.warc.gz'
|
||||||
|
|
||||||
|
def test_filters_2_no_fuzzy_no_match(self):
    """
    two filters with fuzzy matching disabled: the query succeeds
    but returns no cdx lines
    """
    filters = ('!mime:warc/revisit', 'filename:dupes.warc.gz')
    resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
                      filter=filters,
                      allowFuzzy='false')

    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    assert len(resp.text.splitlines()) == 0
|
||||||
|
|
||||||
|
def test_filters_3(self):
    """
    two negated filters (mimetype and filename): exactly one cdx line
    is expected and its fields are checked
    """
    negated = ('!mime:warc/revisit', '!filename:dupes.warc.gz')
    resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
                      filter=negated)

    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    cdx_lines = resp.text.splitlines()
    assert len(cdx_lines) == 1

    expected_fields = (
        ('urlkey', 'org,iana)/_css/2013.1/screen.css'),
        ('timestamp', '20140126200625'),
        ('mime', 'text/css'),
        ('filename', 'iana.warc.gz'),
    )
    for line in cdx_lines:
        cdx = CDXObject(line.encode('utf-8'))
        for field, value in expected_fields:
            assert cdx[field] == value
|
||||||
|
|
||||||
def test_limit(self):
|
def test_limit(self):
|
||||||
resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
|
resp = self.query('http://www.iana.org/_css/2013.1/screen.css',
|
||||||
limit='1')
|
limit='1')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user