1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

memento 404 fix: ensure timemap includes memento headers only on a successful (200) response

fuzzy match limit: add 'fuzzy_search_limit' option to default_filters in rules.yaml
default the fuzzy matching search limit to 100 results, to avoid timeouts on large result sets that don't contain any matches
This commit is contained in:
Ilya Kreymer 2019-02-18 12:27:56 -08:00 committed by John Berlin
parent 0a9ad5c8dc
commit 54a4e38531
No known key found for this signature in database
GPG Key ID: 6EF5E4B442011B02
6 changed files with 34 additions and 1 deletions

View File

@ -506,7 +506,7 @@ class RewriterApp(object):
response = WbResponse.text_response(response, content_type=content_type)
if self.enable_memento:
if self.enable_memento and response.status_headers.statusline.startswith('200'):
self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
response.status_headers, is_timegate, is_proxy)
return response

View File

@ -1,5 +1,8 @@
# Default Filters
default_filters:
# max number of results to request for a fuzzy match prefix search
fuzzy_search_limit: '100'
# exts that should *not* be treated as files (ignore all query args)
not_exts:
- asp

View File

@ -55,6 +55,10 @@
<script src='{{ static_prefix }}/vidrw.js'> </script>
{% endif %}
{% if config.enable_transclusions %}
<script src="{{ static_prefix }}/transclusions.js"> </script>
{% endif %}
{{ banner_html }}
<!-- End WB Insert -->

View File

@ -38,6 +38,8 @@ class FuzzyMatcher(object):
self.default_filters = config.get('default_filters')
self.fuzzy_search_limit = self.default_filters.get('fuzzy_search_limit')
self.url_normalize_rx = [(re.compile(rule['match']), rule['replace']) for rule in self.default_filters['url_normalize']]
def parse_fuzzy_rule(self, rule):
@ -121,6 +123,9 @@ class FuzzyMatcher(object):
'filter': filters,
'is_fuzzy': '1'}
if self.fuzzy_search_limit:
fuzzy_params['limit'] = self.fuzzy_search_limit
for key in iterkeys(params):
if key not in self.FUZZY_SKIP_PARAMS:
fuzzy_params[key] = params[key]

View File

@ -12,6 +12,9 @@ class EchoParamsSource(BaseIndexSource):
if params.get('matchType', 'exact') == 'exact':
return iter([])
assert params.get('is_fuzzy') == '1'
assert params.get('limit') == '100'
cdx = {'urlkey': canonicalize(params.get('cdx_url')),
'mime': params.get('mime'),
'filter': params.get('filter'),

View File

@ -273,7 +273,25 @@ class TestMementoRedirectClassic(MementoMixin, BaseConfigTest):
resp = self.testapp.get('/pywb/2/http://www.iana.org/', headers=headers)
assert resp.status_code == 200
assert VARY not in resp.headers
assert MEMENTO_DATETIME in resp.headers
def test_timegate_error_not_found(self):
    """A 404 for an unknown URL must not carry any Memento headers."""
    response = self.testapp.get('/pywb/http://example.com/x-not-found', status=404)
    assert response.status_code == 404

    # No Memento Headers
    for header_name in (VARY, MEMENTO_DATETIME, 'Link'):
        assert header_name not in response.headers
def test_timemap_error_not_found(self):
    """A 404 from the link timemap endpoint must not carry any Memento headers."""
    response = self.testapp.get('/pywb/timemap/link/http://example.com/x-not-found', status=404)
    assert response.status_code == 404

    # No Memento Headers
    for header_name in (VARY, MEMENTO_DATETIME, 'Link'):
        assert header_name not in response.headers