1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

fix up cdx server query interface

Supports /cdx?url=... plus the following optional query params:
filter=<regex>
collapse_time=<0-14>
resolve_revisits=<true|false>
reverse=<true|false>
closest=<timestamp>
This commit is contained in:
Ilya Kreymer 2014-02-01 14:47:07 -08:00
parent b685772b96
commit 9f258fa64c
3 changed files with 42 additions and 15 deletions

View File

@ -27,7 +27,7 @@ def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
if resolve_revisits: if resolve_revisits:
cdx_iter = cdx_resolve_revisits(cdx_iter) cdx_iter = cdx_resolve_revisits(cdx_iter)
filters = params.get('filters', None) filters = params.get('filter', None)
if filters: if filters:
cdx_iter = cdx_filter(cdx_iter, filters) cdx_iter = cdx_filter(cdx_iter, filters)
@ -141,7 +141,7 @@ def cdx_reverse(cdx_iter, limit):
# apply filter to cdx[field] # apply filter to cdx[field]
def cdx_filter(cdx_iter, filter_strings): def cdx_filter(cdx_iter, filter_strings):
""" """
>>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filters = ['mimetype:text/html']) >>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filter = ['mimetype:text/html'])
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
@ -151,8 +151,16 @@ def cdx_filter(cdx_iter, filter_strings):
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', filter = 'statuscode:200')
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
""" """
# Support single strings as well
if isinstance(filter_strings, str):
filter_strings = [filter_strings]
filters = [] filters = []
class Filter: class Filter:
@ -197,12 +205,14 @@ def cdx_collapse_time_status(cdx_iter, timelen = 10):
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
# resolved revisits # resolved revisits
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = 11, resolve_revisits = True) >>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - - org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
""" """
timelen = int(timelen)
last_token = None last_token = None
for cdx in cdx_iter: for cdx in cdx_iter:

View File

@ -73,12 +73,17 @@ class CDXHandler(BaseHandler):
# use url= param to get actual url # use url= param to get actual url
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING']) params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
# parse_qs returns a list for every param, even single-valued ones
# cdxreader expects a single value for every param except 'filter', so convert here
# by keeping only the first value of each list
for name, val in params.iteritems():
if name != 'filter':
params[name] = val[0]
url = params.get('url') url = params.get('url')
if not url: if not url:
raise WbException('Must specify a url= param to query cdx server') raise WbException('Must specify a url= param to query cdx server')
url = url[0]
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False) cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
return self.view.render_response(wbrequest, cdx_lines) return self.view.render_response(wbrequest, cdx_lines)

View File

@ -1,6 +1,6 @@
import webtest import webtest
import pywb.pywb_init import pywb.pywb_init
from pywb.indexreader import CDXCaptureResult
class TestWb: class TestWb:
def setup(self): def setup(self):
@ -73,6 +73,27 @@ class TestWb:
assert 'Mon, Jan 27 2014 17:12:51' in resp.body assert 'Mon, Jan 27 2014 17:12:51' in resp.body
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
def test_cdx_server_filters(self):
    """Query /cdx with two ``filter=`` params and expect exactly one line.

    The request repeats the ``filter`` query parameter (one on mimetype,
    one on filename); the test then checks that the response body
    consists of a single line.
    """
    resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')
    self._assert_basic_text(resp)

    # splitlines() yields [] for an empty body, whereas ''.split('\n')
    # yields [''] -- which would let an empty response count as one line.
    actual_len = len(resp.body.rstrip().splitlines())
    assert actual_len == 1, actual_len
def test_cdx_server_advanced(self):
    """Query /cdx with collapse_time, resolve_revisits and reverse combined.

    Each line of the response body is parsed back into a CDXCaptureResult,
    then the 'timestamp' and 'orig.filename' fields of the three expected
    results are checked.
    """
    # combine collapsing, reversing and revisit resolving
    resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')

    # convert back to CDXCaptureResult
    # (list comprehensions rather than map(): len() and == below need real
    # lists, which map() only provides on Python 2)
    cdxs = [CDXCaptureResult(line) for line in resp.body.rstrip().split('\n')]
    assert len(cdxs) == 3, len(cdxs)

    # verify timestamps
    timestamps = [cdx['timestamp'] for cdx in cdxs]
    assert timestamps == ['20140127171239', '20140126201054', '20140126200625']

    # verify orig filenames (2 revisits, one non)
    origfilenames = [cdx['orig.filename'] for cdx in cdxs]
    assert origfilenames == ['iana.warc.gz', 'iana.warc.gz', '-']
def test_error(self): def test_error(self):
@ -81,12 +102,3 @@ class TestWb:
assert 'Bad Request Url: http://?abc' in resp.body assert 'Bad Request Url: http://?abc' in resp.body
def run():
    """Manually drive one test: build a TestWb, call setup(), then test_root()."""
    wb_test = TestWb()
    wb_test.setup()
    wb_test.test_root()
#run()