From 273b3eec303838b15220d142ec533f4a25e9a77d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 29 Jan 2018 15:08:50 -0800 Subject: [PATCH] warcserver/cdx query: filter improvements (#285) - pywb.utils.format: add query_to_dict() to convert query string with support for list for certain params - support multiple values for 'filter' cdx server param (fixes #284) - pywb.utils.format: add to_bool() to convert string/int to bool (e.g. for query args) - fuzzymatch: add 'allowFuzzy' (default to true) to allow disabling fuzzy matcher - tests: fuzzymatcher: test disabling fuzzy matcher with allowFuzzy=0 - tests: cdx-server api: add multiple filter tests, with and without fuzzy matching --- pywb/utils/format.py | 40 ++++++++++++++++-- pywb/warcserver/basewarcserver.py | 4 +- pywb/warcserver/index/fuzzymatcher.py | 12 ++++-- pywb/warcserver/index/query.py | 13 ++---- .../index/test/test_fuzzymatcher.py | 8 ++++ tests/test_cdx_server_app.py | 42 ++++++++++++++++++- 6 files changed, 98 insertions(+), 21 deletions(-) diff --git a/pywb/utils/format.py b/pywb/utils/format.py index 527553c4..394dae81 100644 --- a/pywb/utils/format.py +++ b/pywb/utils/format.py @@ -1,8 +1,8 @@ -from six.moves.urllib.parse import quote +from six.moves.urllib.parse import quote, parse_qsl import string -#============================================================================= +# ============================================================================ class ParamFormatter(string.Formatter): def __init__(self, params, name='', prefix='param.'): self.params = params @@ -33,7 +33,7 @@ class ParamFormatter(string.Formatter): return value -#============================================================================= +# ============================================================================= def res_template(template, params, **extra_params): formatter = params.get('_formatter') if not formatter: @@ -49,3 +49,37 @@ def res_template(template, params, **extra_params): return res +# 
============================================================================= +def to_bool(val): + if not val: + return False + + if isinstance(val, str): + return val.lower() not in ('0', 'false', 'f', 'off') + else: + return bool(val) + + +# ============================================================================= +def query_to_dict(query_str, multi=None): + pairlist = parse_qsl(query_str) + if not multi: + return dict(pairlist) + + obj = {} + for n, v in pairlist: + if n not in multi: + obj[n] = v + continue + + # make_list + if n not in obj: + obj[n] = v + elif isinstance(obj[n], list): + obj[n].append(v) + else: + obj[n] = [obj[n], v] + + return obj + + diff --git a/pywb/warcserver/basewarcserver.py b/pywb/warcserver/basewarcserver.py index e1d2fdfb..a5082e29 100644 --- a/pywb/warcserver/basewarcserver.py +++ b/pywb/warcserver/basewarcserver.py @@ -1,4 +1,5 @@ from pywb.warcserver.inputrequest import DirectWSGIInputRequest, POSTInputRequest +from pywb.utils.format import query_to_dict from werkzeug.routing import Map, Rule from werkzeug.exceptions import HTTPException @@ -7,7 +8,6 @@ import requests import traceback import json -from six.moves.urllib.parse import parse_qsl import six JSON_CT = 'application/json; charset=utf-8' @@ -60,7 +60,7 @@ class BaseWarcServer(object): def get_query_dict(self, environ): query_str = environ.get('QUERY_STRING') if query_str: - return dict(parse_qsl(query_str)) + return query_to_dict(query_str, multi=['filter']) else: return {} diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py index 46ed03e5..23c16e8f 100644 --- a/pywb/warcserver/index/fuzzymatcher.py +++ b/pywb/warcserver/index/fuzzymatcher.py @@ -1,5 +1,7 @@ from warcio.utils import to_native_str + from pywb.utils.loaders import load_yaml_config +from pywb.utils.format import to_bool from pywb import DEFAULT_RULES_FILE import re @@ -65,7 +67,7 @@ class FuzzyMatcher(object): return FuzzyRule(url_prefix, regex, replace_after, 
filter_str, match_type, find_all) - def get_fuzzy_match(self, urlkey, params): + def get_fuzzy_match(self, urlkey, url, params): filters = set() matched_rule = None @@ -93,8 +95,6 @@ class FuzzyMatcher(object): if not matched_rule: return None - url = params['url'] - # support matching w/o query if no additional filters # don't include trailing '?' if no filters and replace_after '?' no_filters = (filters == {'urlkey:'}) and (matched_rule.replace_after == '?') @@ -161,10 +161,14 @@ class FuzzyMatcher(object): if found: return + # if fuzzy matching disabled + if not to_bool(params.get('allowFuzzy', True)): + return + url = params['url'] urlkey = to_native_str(params['key'], 'utf-8') - res = self.get_fuzzy_match(urlkey, params) + res = self.get_fuzzy_match(urlkey, url, params) if not res: return diff --git a/pywb/warcserver/index/query.py b/pywb/warcserver/index/query.py index 64538a59..2d7fb670 100644 --- a/pywb/warcserver/index/query.py +++ b/pywb/warcserver/index/query.py @@ -1,6 +1,7 @@ from six.moves.urllib.parse import urlencode from pywb.warcserver.index.cdxobject import CDXException from pywb.utils.canonicalize import calc_search_range +from pywb.utils.format import to_bool #================================================================= @@ -128,17 +129,9 @@ class CDXQuery(object): def page_count(self): return self._get_bool('showNumPages') - def _get_bool(self, name, def_val=False): + def _get_bool(self, name): v = self.params.get(name) - if v: - try: - v = int(v) - except ValueError as ex: - v = (v.lower() == 'true') - else: - v = def_val - - return bool(v) + return to_bool(v) def urlencode(self): return urlencode(self.params, True) diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py index 27a92088..b309ae6e 100644 --- a/pywb/warcserver/index/test/test_fuzzymatcher.py +++ b/pywb/warcserver/index/test/test_fuzzymatcher.py @@ -133,6 +133,14 @@ class TestFuzzy(object): assert list(cdx_iter) == 
self.get_expected(url=actual_url, filters=filters) + def test_no_fuzzy_disabled(self): + url = 'http://example.com/?_=123' + actual_url = 'http://example.com/' + params = self.get_params(url, actual_url) + params['allowFuzzy'] = 0 + cdx_iter, errs = self.fuzzy(self.source, params) + assert list(cdx_iter) == [] + def test_no_fuzzy_custom_rule_video_id_diff(self): url = 'http://youtube.com/get_video_info?a=b&html=true&___abc=123&video_id=ABCD&id=1234' actual_url = 'http://youtube.com/get_video_info?a=d&html=true&___abc=125&video_id=ABCE&id=1234' diff --git a/tests/test_cdx_server_app.py b/tests/test_cdx_server_app.py index c01dee91..a66bb975 100644 --- a/tests/test_cdx_server_app.py +++ b/tests/test_cdx_server_app.py @@ -61,7 +61,7 @@ class TestCDXApp(BaseTestClass): suburls += 1 assert suburls > 0 - def test_filters(self): + def test_filters_1(self): """ filter cdxes by mimetype and filename field, exact match. """ @@ -71,12 +71,50 @@ class TestCDXApp(BaseTestClass): assert resp.status_code == 200 assert resp.content_type == 'text/x-cdxj' - for l in resp.text.splitlines(): + lines = resp.text.splitlines() + assert len(lines) > 0 + + for l in lines: cdx = CDXObject(l.encode('utf-8')) assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css' + assert cdx['timestamp'] == '20140127171239' assert cdx['mime'] == 'warc/revisit' assert cdx['filename'] == 'dupes.warc.gz' + def test_filters_2_no_fuzzy_no_match(self): + """ + two filters, disable fuzzy matching + """ + resp = self.query('http://www.iana.org/_css/2013.1/screen.css', + filter=('!mime:warc/revisit', 'filename:dupes.warc.gz'), + allowFuzzy='false') + + assert resp.status_code == 200 + assert resp.content_type == 'text/x-cdxj' + + lines = resp.text.splitlines() + assert len(lines) == 0 + + def test_filters_3(self): + """ + filter cdxes by mimetype and filename field, exact match. 
+ """ + resp = self.query('http://www.iana.org/_css/2013.1/screen.css', + filter=('!mime:warc/revisit', '!filename:dupes.warc.gz')) + + assert resp.status_code == 200 + assert resp.content_type == 'text/x-cdxj' + + lines = resp.text.splitlines() + assert len(lines) == 1 + + for l in lines: + cdx = CDXObject(l.encode('utf-8')) + assert cdx['urlkey'] == 'org,iana)/_css/2013.1/screen.css' + assert cdx['timestamp'] == '20140126200625' + assert cdx['mime'] == 'text/css' + assert cdx['filename'] == 'iana.warc.gz' + def test_limit(self): resp = self.query('http://www.iana.org/_css/2013.1/screen.css', limit='1')