From 66ac3ca114d765a2c0405c144bba074e681795a2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 7 Nov 2019 10:25:49 -0800 Subject: [PATCH] config limit: add query_limit config options to specify optional limit for both exact and prefix queries, addresses ukwa/ukwa-pywb#49 (#518) --- pywb/apps/frontendapp.py | 5 +++++ pywb/apps/rewriterapp.py | 3 +++ tests/config_test_redirect_classic.yaml | 2 ++ tests/test_redirect_classic.py | 4 ++++ 4 files changed, 14 insertions(+) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index 1bc4c5f4..e6211d16 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -92,6 +92,7 @@ class FrontEndApp(object): self.static_handler = StaticHandler(static_path) self.cdx_api_endpoint = config.get('cdx_api_endpoint', '/cdx') + self.query_limit = config.get('query_limit') upstream_paths = self.get_upstream_paths(self.warcserver_server.port) @@ -355,6 +356,10 @@ class FrontEndApp(object): cdx_url += '&' if '?' in cdx_url else '?' cdx_url += environ.get('QUERY_STRING') + if self.query_limit: + cdx_url += '&' if '?' in cdx_url else '?' + cdx_url += 'limit=' + str(self.query_limit) + try: res = requests.get(cdx_url, stream=True) diff --git a/pywb/apps/rewriterapp.py b/pywb/apps/rewriterapp.py index 600bade2..98a7f953 100644 --- a/pywb/apps/rewriterapp.py +++ b/pywb/apps/rewriterapp.py @@ -713,6 +713,9 @@ class RewriterApp(object): if 'memento_format' in kwargs: params['memento_format'] = kwargs['memento_format'] + if 'limit' in kwargs: + params['limit'] = kwargs['limit'] + upstream_url = self.get_upstream_url(wb_url, kwargs, params) upstream_url = upstream_url.replace('/resource/postreq', '/index') diff --git a/tests/config_test_redirect_classic.yaml b/tests/config_test_redirect_classic.yaml index c3a6000a..9eca05ab 100644 --- a/tests/config_test_redirect_classic.yaml +++ b/tests/config_test_redirect_classic.yaml @@ -17,3 +17,5 @@ enable_memento: true enable_prefer: true debug: true + +query_limit: 10 diff --git a/tests/test_redirect_classic.py b/tests/test_redirect_classic.py index 9d200e70..3eea7a0b 100644 --- a/tests/test_redirect_classic.py +++ b/tests/test_redirect_classic.py @@ -74,4 +74,8 @@ class TestRedirectClassic(BaseConfigTest): resp = self.get('/live/{0}http://example.com/?test=test', fmod_slash) assert resp.status_int == 200 + def test_replay_limit_cdx(self): + resp = self.testapp.get('/pywb/cdx?url=http://www.iana.org/*&output=json') + assert resp.content_type == 'text/x-ndjson' + assert len(resp.text.rstrip().split('\n')) == 10