From 9f258fa64cf49c8fd8b9cd92cba19b8e2e129b28 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 1 Feb 2014 14:47:07 -0800 Subject: [PATCH] fix up cdx server query interface supports /cdx?url=... and other params including filter= collapse_time=<0-14> resolve_revisits= reverse= closest= --- pywb/cdxserve.py | 16 +++++++++++++--- pywb/handlers.py | 9 +++++++-- run-tests.py | 32 ++++++++++++++++++++++---------- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/pywb/cdxserve.py b/pywb/cdxserve.py index 752b26f9..9deadb49 100644 --- a/pywb/cdxserve.py +++ b/pywb/cdxserve.py @@ -27,7 +27,7 @@ def cdx_serve(key, params, sources, match_func = binsearch.iter_exact): if resolve_revisits: cdx_iter = cdx_resolve_revisits(cdx_iter) - filters = params.get('filters', None) + filters = params.get('filter', None) if filters: cdx_iter = cdx_filter(cdx_iter, filters) @@ -141,7 +141,7 @@ def cdx_reverse(cdx_iter, limit): # apply filter to cdx[field] def cdx_filter(cdx_iter, filter_strings): """ - >>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filters = ['mimetype:text/html']) + >>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filter = ['mimetype:text/html']) org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz @@ -151,8 +151,16 @@ def cdx_filter(cdx_iter, filter_strings): org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + + + >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', filter = 'statuscode:200') + org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz """ + # Support single strings as well + if isinstance(filter_strings, str): + filter_strings = [filter_strings] + filters = [] class Filter: @@ -197,12 +205,14 @@ def cdx_collapse_time_status(cdx_iter, timelen = 10): org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz # resolved revisits - >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = 11, resolve_revisits = True) + >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True) org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz """ + timelen = int(timelen) + last_token = None for cdx in cdx_iter: diff --git a/pywb/handlers.py b/pywb/handlers.py index 50d3e644..63d6fed8 100644 --- a/pywb/handlers.py +++ b/pywb/handlers.py @@ -73,12 +73,17 @@ class CDXHandler(BaseHandler): # use url= param to get actual url params = urlparse.parse_qs(wbrequest.env['QUERY_STRING']) + # parse_qs produces arrays for single values + # cdxreader expects singleton params for all except filters, so convert here + # use first value of the list + for name, val in params.iteritems(): + if name != 'filter': + params[name] = val[0] + url = params.get('url') if not url: raise WbException('Must specify a url= param to query cdx server') - url = url[0] - cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False) return self.view.render_response(wbrequest, cdx_lines) diff --git a/run-tests.py b/run-tests.py index d3f6972e..f8761266 100644 --- a/run-tests.py +++ b/run-tests.py @@ -1,6 +1,6 @@ import webtest import pywb.pywb_init - +from pywb.indexreader import CDXCaptureResult class TestWb: def setup(self): @@ -73,6 +73,27 @@ class TestWb: assert 'Mon, Jan 27 2014 17:12:51' in resp.body assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body + def test_cdx_server_filters(self): + resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz') + self._assert_basic_text(resp) + actual_len = len(resp.body.rstrip().split('\n')) + assert actual_len == 1, actual_len + + def test_cdx_server_advanced(self): + # combine collapsing, reversing and revisit resolving + resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true') + + # convert back to CDXCaptureResult + cdxs = map(CDXCaptureResult, resp.body.rstrip().split('\n')) + assert len(cdxs) == 3, len(cdxs) + + # verify timestamps + timestamps = map(lambda cdx: cdx['timestamp'], cdxs) + assert timestamps == ['20140127171239', '20140126201054', '20140126200625'] + + # verify orig filenames (2 revisits, one non) + origfilenames = map(lambda cdx: cdx['orig.filename'], cdxs) + assert origfilenames == ['iana.warc.gz', 'iana.warc.gz', '-'] def test_error(self): @@ -81,12 +102,3 @@ class TestWb: assert 'Bad Request Url: http://?abc' in resp.body - - -def run(): - test = TestWb() - test.setup() - test.test_root() - - -#run()