diff --git a/pywb/cdx/cdxops.py b/pywb/cdx/cdxops.py index bf5d7d68..ca4ad3fb 100644 --- a/pywb/cdx/cdxops.py +++ b/pywb/cdx/cdxops.py @@ -234,7 +234,7 @@ def cdx_collapse_time_status(cdx_iter, timelen=10): last_token = None for cdx in cdx_iter: - curr_token = (cdx[TIMESTAMP][:timelen], cdx[STATUSCODE]) + curr_token = (cdx[TIMESTAMP][:timelen], cdx.get(STATUSCODE, '')) # yield if last_dedup_time is diff, otherwise skip if curr_token != last_token: diff --git a/pywb/cdx/test/test_zipnum.py b/pywb/cdx/test/test_zipnum.py index 8fcdcf35..2d180ab9 100644 --- a/pywb/cdx/test/test_zipnum.py +++ b/pywb/cdx/test/test_zipnum.py @@ -22,16 +22,19 @@ org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db tex org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz # Pages -- default page size +>>> zip_ops_test(url='http://iana.org/domains/example', matchType='exact', showNumPages=True) +{"blocks": 1, "pages": 1, "pageSize": 10} + >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True) -{"blocks": 37, "pages": 4, "pageSize": 10} +{"blocks": 38, "pages": 4, "pageSize": 10} # set page size >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True) -{"blocks": 37, "pages": 10, "pageSize": 4} +{"blocks": 38, "pages": 10, "pageSize": 4} # set page size -- alt domain query >>> zip_ops_test(url='*.iana.org', pageSize=4, showNumPages=True) -{"blocks": 37, "pages": 10, "pageSize": 4} +{"blocks": 38, "pages": 10, "pageSize": 4} # first page >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0) @@ -145,7 +148,7 @@ def test_zip_prefix_load(): results = list(results) assert len(results) == 1, results - assert json.loads(results[0]) == {"blocks": 37, "pages": 4, "pageSize": 10} + assert json.loads(results[0]) == {"blocks": 38, "pages": 4, "pageSize": 10} # Test simple query diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 703125fc..71813dfb 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -172,6 +172,7 @@ class ZipNumCluster(CDXSource): try: first_line = first_iter.next() except StopIteration: + reader.close() raise first = IDXObject(first_line) @@ -188,7 +189,7 @@ class ZipNumCluster(CDXSource): if query.page_count: info = dict(pages=total_pages, pageSize=pagesize, - blocks=diff) + blocks=diff + 1) yield json.dumps(info) reader.close() return