mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
fix up cdx server query interface
Supports /cdx?url=... and other params, including: filter=<regex>, collapse_time=<0-14>, resolve_revisits=<true|false>, reverse=<true|false>, closest=<timestamp>.
This commit is contained in:
parent
b685772b96
commit
9f258fa64c
@ -27,7 +27,7 @@ def cdx_serve(key, params, sources, match_func = binsearch.iter_exact):
|
|||||||
if resolve_revisits:
|
if resolve_revisits:
|
||||||
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
cdx_iter = cdx_resolve_revisits(cdx_iter)
|
||||||
|
|
||||||
filters = params.get('filters', None)
|
filters = params.get('filter', None)
|
||||||
if filters:
|
if filters:
|
||||||
cdx_iter = cdx_filter(cdx_iter, filters)
|
cdx_iter = cdx_filter(cdx_iter, filters)
|
||||||
|
|
||||||
@ -141,7 +141,7 @@ def cdx_reverse(cdx_iter, limit):
|
|||||||
# apply filter to cdx[field]
|
# apply filter to cdx[field]
|
||||||
def cdx_filter(cdx_iter, filter_strings):
|
def cdx_filter(cdx_iter, filter_strings):
|
||||||
"""
|
"""
|
||||||
>>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filters = ['mimetype:text/html'])
|
>>> test_cdx(key = 'org,iana)/domains', match_func = binsearch.iter_prefix, filter = ['mimetype:text/html'])
|
||||||
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
org,iana)/domains 20140126200825 http://www.iana.org/domains text/html 200 7UPSCLNWNZP33LGW6OJGSF2Y4CDG4ES7 - - 2912 610534 iana.warc.gz
|
||||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||||
@ -151,8 +151,16 @@ def cdx_filter(cdx_iter, filter_strings):
|
|||||||
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ text/html 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 671278 iana.warc.gz
|
||||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||||
|
|
||||||
|
|
||||||
|
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', filter = 'statuscode:200')
|
||||||
|
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Support single strings as well
|
||||||
|
if isinstance(filter_strings, str):
|
||||||
|
filter_strings = [filter_strings]
|
||||||
|
|
||||||
filters = []
|
filters = []
|
||||||
|
|
||||||
class Filter:
|
class Filter:
|
||||||
@ -197,12 +205,14 @@ def cdx_collapse_time_status(cdx_iter, timelen = 10):
|
|||||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css warc/revisit - BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz
|
||||||
|
|
||||||
# resolved revisits
|
# resolved revisits
|
||||||
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = 11, resolve_revisits = True)
|
>>> test_cdx(key = 'org,iana)/_css/2013.1/screen.css', collapse_time = '11', resolve_revisits = True)
|
||||||
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
org,iana)/_css/2013.1/screen.css 20140126200625 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 8754 41238 iana.warc.gz - - -
|
||||||
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
org,iana)/_css/2013.1/screen.css 20140126201054 http://www.iana.org/_css/2013.1/screen.css text/css 200 BUAEPXZNN44AIX3NLXON4QDV6OY2H5QD - - 543 706476 iana.warc.gz 8754 41238 iana.warc.gz
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
timelen = int(timelen)
|
||||||
|
|
||||||
last_token = None
|
last_token = None
|
||||||
|
|
||||||
for cdx in cdx_iter:
|
for cdx in cdx_iter:
|
||||||
|
@ -73,12 +73,17 @@ class CDXHandler(BaseHandler):
|
|||||||
# use url= param to get actual url
|
# use url= param to get actual url
|
||||||
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
|
params = urlparse.parse_qs(wbrequest.env['QUERY_STRING'])
|
||||||
|
|
||||||
|
# parse_qs produces arrays for single values
|
||||||
|
# cdxreader expects singleton params for all except filters, so convert here
|
||||||
|
# use first value of the list
|
||||||
|
for name, val in params.iteritems():
|
||||||
|
if name != 'filter':
|
||||||
|
params[name] = val[0]
|
||||||
|
|
||||||
url = params.get('url')
|
url = params.get('url')
|
||||||
if not url:
|
if not url:
|
||||||
raise WbException('Must specify a url= param to query cdx server')
|
raise WbException('Must specify a url= param to query cdx server')
|
||||||
|
|
||||||
url = url[0]
|
|
||||||
|
|
||||||
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
|
cdx_lines = self.cdx_reader.load_cdx(url, params, parsed_cdx = False)
|
||||||
|
|
||||||
return self.view.render_response(wbrequest, cdx_lines)
|
return self.view.render_response(wbrequest, cdx_lines)
|
||||||
|
32
run-tests.py
32
run-tests.py
@ -1,6 +1,6 @@
|
|||||||
import webtest
|
import webtest
|
||||||
import pywb.pywb_init
|
import pywb.pywb_init
|
||||||
|
from pywb.indexreader import CDXCaptureResult
|
||||||
|
|
||||||
class TestWb:
|
class TestWb:
|
||||||
def setup(self):
|
def setup(self):
|
||||||
@ -73,6 +73,27 @@ class TestWb:
|
|||||||
assert 'Mon, Jan 27 2014 17:12:51' in resp.body
|
assert 'Mon, Jan 27 2014 17:12:51' in resp.body
|
||||||
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
|
assert '/pywb/20140127171251/http://www.iana.org/domains/example' in resp.body
|
||||||
|
|
||||||
|
def test_cdx_server_filters(self):
|
||||||
|
resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/screen.css&filter=mimetype:warc/revisit&filter=filename:dupes.warc.gz')
|
||||||
|
self._assert_basic_text(resp)
|
||||||
|
actual_len = len(resp.body.rstrip().split('\n'))
|
||||||
|
assert actual_len == 1, actual_len
|
||||||
|
|
||||||
|
def test_cdx_server_advanced(self):
|
||||||
|
# combine collapsing, reversing and revisit resolving
|
||||||
|
resp = self.testapp.get('/cdx?url=http://www.iana.org/_css/2013.1/print.css&collapse_time=11&resolve_revisits=true&reverse=true')
|
||||||
|
|
||||||
|
# convert back to CDXCaptureResult
|
||||||
|
cdxs = map(CDXCaptureResult, resp.body.rstrip().split('\n'))
|
||||||
|
assert len(cdxs) == 3, len(cdxs)
|
||||||
|
|
||||||
|
# verify timestamps
|
||||||
|
timestamps = map(lambda cdx: cdx['timestamp'], cdxs)
|
||||||
|
assert timestamps == ['20140127171239', '20140126201054', '20140126200625']
|
||||||
|
|
||||||
|
# verify orig filenames (2 revisits, one non)
|
||||||
|
origfilenames = map(lambda cdx: cdx['orig.filename'], cdxs)
|
||||||
|
assert origfilenames == ['iana.warc.gz', 'iana.warc.gz', '-']
|
||||||
|
|
||||||
|
|
||||||
def test_error(self):
|
def test_error(self):
|
||||||
@ -81,12 +102,3 @@ class TestWb:
|
|||||||
assert 'Bad Request Url: http://?abc' in resp.body
|
assert 'Bad Request Url: http://?abc' in resp.body
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def run():
|
|
||||||
test = TestWb()
|
|
||||||
test.setup()
|
|
||||||
test.test_root()
|
|
||||||
|
|
||||||
|
|
||||||
#run()
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user