1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 08:04:49 +01:00

cdx-server query & zipnum: fixes for showNumPages query:

- if the query is contained within a single secondary index block, the first line of the cdx must be read to determine whether there are any matches
- if there are no matches, don't throw a 404 exception; always return the json info with 0 pages
This commit is contained in:
Ilya Kreymer 2015-03-28 16:15:24 -07:00
parent 313a2efeac
commit f3a066f58b
2 changed files with 38 additions and 11 deletions

View File

@ -33,9 +33,17 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
{"blocks": 38, "pages": 10, "pageSize": 4}
# set page size -- alt domain query
>>> zip_ops_test(url='*.iana.org', pageSize=4, showNumPages=True)
>>> zip_ops_test(url='*.iana.org', pageSize='4', showNumPages=True)
{"blocks": 38, "pages": 10, "pageSize": 4}
# page size for non-existent, but secondary index match
>>> zip_ops_test(url='iana.org/domains/int/blah', pageSize=4, showNumPages=True)
{"blocks": 0, "pages": 0, "pageSize": 4}
# page size for non-existent, no secondary index match
>>> zip_ops_test(url='*.foo.bar', showNumPages=True)
{"blocks": 0, "pages": 0, "pageSize": 10}
# first page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
com,example)/ 20140127171200 zipnum 0 276 1

View File

@ -152,7 +152,18 @@ class ZipNumCluster(CDXSource):
return gen_cdx()
def _page_info(self, pages, pagesize, blocks):
info = dict(pages=pages,
pageSize=pagesize,
blocks=blocks)
return json.dumps(info)
def compute_page_range(self, reader, query):
pagesize = query.page_size
if not pagesize:
pagesize = self.max_blocks
else:
pagesize = int(pagesize)
# Get End
end_iter = search(reader, query.end_key, prev_size=1)
@ -163,7 +174,6 @@ class ZipNumCluster(CDXSource):
end_line = read_last_line(reader)
# Get Start
first_iter = iter_range(reader,
query.key,
query.end_key,
@ -173,24 +183,33 @@ class ZipNumCluster(CDXSource):
first_line = first_iter.next()
except StopIteration:
reader.close()
raise
if query.page_count:
yield self._page_info(0, pagesize, 0)
return
else:
raise
first = IDXObject(first_line)
end = IDXObject(end_line)
diff = end['lineno'] - first['lineno']
pagesize = query.page_size
if not pagesize:
pagesize = self.max_blocks
total_pages = diff / pagesize + 1
if query.page_count:
info = dict(pages=total_pages,
pageSize=pagesize,
blocks=diff + 1)
yield json.dumps(info)
blocks = diff + 1
# same line, so actually need to look at cdx
# to determine if it exists
if total_pages == 1:
try:
block_cdx_iter = self.idx_to_cdx([first_line], query)
block = block_cdx_iter.next()
cdx = block.next()
except StopIteration:
total_pages = 0
blocks = 0
yield self._page_info(total_pages, pagesize, blocks)
reader.close()
return