1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-16 00:24:48 +01:00

zipnum: support for the pagination API. The cdx server is now bounded by pageSize (default 10 blocks):

showNumPages=true returns JSON indicating the number of pages; page=N selects a page, where N ranges from 0 to numPages - 1
loaders: add read_last_line() to read the last line of a seekable file; used to read the last line of the index file
when the search for the end key runs past the end of the index
tests: additional test for binsearch boundary conditions
zipnum: secondary index output supports json also
This commit is contained in:
Ilya Kreymer 2015-03-24 18:56:13 -07:00
parent 872607c07d
commit 2af5a25009
10 changed files with 246 additions and 35 deletions

@ -229,5 +229,8 @@ class IDXObject(OrderedDict):
"""
return str(self) + '\n'
def to_json(self, fields=None):
    """
    Return this index entry encoded as one newline-terminated JSON line.

    ``fields`` is accepted but not used here: the whole object is
    always encoded.
    """
    encoded = json_encode(self)
    return encoded + '\n'
def __str__(self):
    """Return the ``idxline`` attribute stored on this object."""
    return self.idxline

@ -23,6 +23,11 @@ def cdx_load(sources, query, process=True):
:param process: bool, perform processing sorting/filtering/grouping ops
"""
cdx_iter = create_merged_cdx_gen(sources, query)
# page count is a special case, no further processing
if query.page_count:
return cdx_iter
cdx_iter = make_obj_iter(cdx_iter, query)
if process and not query.secondary_index_only:

@ -86,6 +86,18 @@ class CDXQuery(object):
def secondary_index_only(self):
return self._get_bool('showPagedIndex')
@property
def page(self):
    """Requested page number: the 'page' param as an int, 0 if absent."""
    requested = self.params.get('page', 0)
    return int(requested)
@property
def page_size(self):
    """Raw 'pageSize' param exactly as supplied, or None if it is absent."""
    size = self.params.get('pageSize')
    return size
@property
def page_count(self):
    """Boolean value of the 'showNumPages' param, read via ``_get_bool``."""
    show_num_pages = self._get_bool('showNumPages')
    return show_num_pages
def _get_bool(self, name, def_val=False):
v = self.params.get(name)
if v:

@ -170,7 +170,8 @@ test_cdx_dir = get_test_dir() + 'cdx/'
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
kwparams['url'] = url
kwparams['output'] = 'cdxobject'
if not 'output' in kwparams:
kwparams['output'] = 'cdxobject'
fields = kwparams.get('fields')
if fields:
fields = fields.split(',')
@ -179,7 +180,10 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
results = server.load_cdx(**kwparams)
for x in results:
l = x.to_text(fields).replace('\t', ' ')
if not isinstance(x, str):
l = x.to_text(fields).replace('\t', ' ')
else:
l = x
sys.stdout.write(l)

@ -1,16 +1,16 @@
"""
>>> zip_ops_test(url = 'http://iana.org')
>>> zip_ops_test(url='http://iana.org')
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
# test idx index (tabs replaced with 4 spaces)
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True)
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix')
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
@ -21,6 +21,77 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# Pages -- default page size
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
{"blocks": 37, "pages": 4, "pageSize": 10}
# set page size
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
{"blocks": 37, "pages": 10, "pageSize": 4}
# first page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
com,example)/ 20140127171200 zipnum 0 276 1
org,iana)/ 20140127171238 zipnum 276 328 2
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
# next page + json
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1)
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1151, "length": 235, "lineno": 5}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1386, "length": 306, "lineno": 6}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}
# last page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9)
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
org,iana)/time-zones 20140126200737 zipnum 9623 145 38
# last page cdx
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9)
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
# last page reverse -- not yet supported
#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9)
#org,iana)/time-zones 20140126200737 zipnum 9623 145 38
#org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
# last page reverse CDX
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9)
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
# invalid page
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10)
Traceback (most recent call last):
Exception: Page 10 invalid: First Page is 0, Last Page is 9
>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True)
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/
>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True)
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.aaa/
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
Traceback (most recent call last):
NotFoundException: No Captures found for: http://aaa.zz/
"""
from test_cdxops import cdx_ops_test

@ -4,13 +4,14 @@ import itertools
import logging
from io import BytesIO
import datetime
import json
from cdxsource import CDXSource
from cdxobject import IDXObject
from pywb.utils.loaders import BlockLoader
from pywb.utils.loaders import BlockLoader, read_last_line
from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch
from pywb.utils.binsearch import iter_range, linearsearch, search
#=================================================================
@ -22,24 +23,10 @@ class ZipBlocks:
self.count = count
#=================================================================
def readline_to_iter(stream):
    """
    Yield successive lines from ``stream`` using ``readline()``.

    Iteration stops at the first empty (falsy) result from ``readline()``.
    The stream is always closed afterwards, whether the generator is
    exhausted, closed early by the consumer, or an error is raised.

    :param stream: object providing ``readline()`` and ``close()``
    """
    try:
        while True:
            line = stream.readline()
            if not line:
                break
            yield line
    finally:
        # runs on exhaustion, on generator close and on errors
        stream.close()
#=================================================================
class ZipNumCluster(CDXSource):
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
DEFAULT_MAX_BLOCKS = 50
DEFAULT_MAX_BLOCKS = 10
def __init__(self, summary, config=None):
@ -114,22 +101,91 @@ class ZipNumCluster(CDXSource):
reader = open(self.summary, 'rb')
idx_iter = iter_range(reader,
query.key,
query.end_key,
prev_size=1)
idx_iter = self.compute_page_range(reader, query)
if query.secondary_index_only:
if query.secondary_index_only or query.page_count:
return idx_iter
blocks = self.idx_to_cdx(idx_iter, query)
def gen_cdx():
for blk in blocks:
for cdx in blk:
yield cdx
return gen_cdx()
def compute_page_range(self, reader, query):
    """
    Generator over the secondary index (idx) lines for the requested page.

    The line numbers of the first and last matching idx lines give the
    span of blocks covered by the query; that span is split into pages
    of ``query.page_size`` blocks (``self.max_blocks`` if not set).

    * If ``query.page_count`` is set, a single JSON string with the keys
      ``pages``, ``pageSize`` and ``blocks`` is yielded instead of idx
      lines. ``blocks`` is the difference between the last and first
      line numbers.
    * Otherwise the idx lines belonging to page ``query.page`` are yielded.
    * If no idx line matches the query, nothing is yielded.

    ``reader`` is closed when the generator finishes, is closed early by
    its consumer, or raises.

    :param reader: seekable file object over the idx summary file
    :param query: query providing key, end_key, page, page_size, page_count
    :raises Exception: if ``query.page`` is outside ``0 .. total_pages - 1``
    """
    try:
        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = next(end_iter)
        except StopIteration:
            # end key is past the last idx line: use the last line of the file
            end_line = read_last_line(reader)

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = next(first_iter)
        except StopIteration:
            # nothing in range: end the generator with no results
            # (re-raising StopIteration inside a generator is an error
            # under PEP 479)
            return

        first = IDXObject(first_line)
        end = IDXObject(end_line)
        diff = end['lineno'] - first['lineno']

        # pageSize may arrive as a string (e.g. from a query string)
        pagesize = query.page_size
        pagesize = int(pagesize) if pagesize else 0
        if pagesize <= 0:
            pagesize = self.max_blocks

        # explicit floor division: same result on Python 2 and 3
        total_pages = diff // pagesize + 1

        if query.page_count:
            info = dict(pages=total_pages,
                        pageSize=pagesize,
                        blocks=diff)
            yield json.dumps(info)
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            raise Exception(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = min(startline + pagesize - 1, diff)

        if curr_page == 0:
            yield first_line
        else:
            # first_iter has already been advanced past its first line,
            # so the slice start is shifted back by one
            startline -= 1

        for idx in itertools.islice(first_iter, startline, endline):
            yield idx
    finally:
        reader.close()
def search_by_line_num(self, reader, line):  # pragma: no cover
    """
    Yield the first result of ``search()`` over ``reader`` for ``line``,
    comparing lines by the integer in their last tab-separated field.
    """
    def lineno_of(idx_line):
        # the last tab-separated field is parsed as an integer
        return int(idx_line.rsplit('\t', 1)[-1])

    def line_cmp(line1, line2):
        return cmp(lineno_of(line1), lineno_of(line2))

    matches = search(reader, line, compare_func=line_cmp)
    yield next(matches)
def idx_to_cdx(self, idx_iter, query):
blocks = None
@ -178,6 +234,10 @@ class ZipNumCluster(CDXSource):
raise Exception('No Locations Found for: ' + block.part)
def load_blocks(self, location, blocks, ranges, query):
""" Load one or more blocks of compressed cdx lines, return
a line iterator which decompresses and returns one line at a time,
bounded by query.key and query.end_key
"""
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
@ -188,7 +248,8 @@ class ZipNumCluster(CDXSource):
def decompress_block(range_):
decomp = gzip_decompressor()
buff = decomp.decompress(reader.read(range_))
return readline_to_iter(BytesIO(buff))
for line in BytesIO(buff):
yield line
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

@ -84,7 +84,7 @@ def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
# no matches, so return empty iterator
if not matched:
return []
return iter([])
return itertools.chain(prev_deque, iter_)

@ -112,6 +112,27 @@ def extract_client_cookie(env, cookie_name):
return value
#=================================================================
def read_last_line(fh, offset=256):
    """ Read last line from a seekable file. Start reading
    from ``offset`` bytes before end of file, and double the backwards
    seek until a line break is found. If the beginning of the file is
    reached (a single line, or no lines at all), return the whole file.

    :param fh: seekable file object supporting end-relative seeks
    :param offset: initial number of bytes to read back from the end;
                   values below 1 are treated as 1
    :return: the last line, including its trailing newline if present;
             an empty string for an empty file
    """
    fh.seek(0, 2)
    size = fh.tell()

    # a non-positive offset never grows when doubled, which would
    # otherwise loop forever on any non-empty file
    offset = max(int(offset), 1)

    while offset < size:
        fh.seek(-offset, 2)
        lines = fh.readlines()
        # more than one line read: a line break precedes the last line,
        # so the last line is complete
        if len(lines) > 1:
            return lines[-1]
        offset *= 2

    fh.seek(0, 0)
    lines = fh.readlines()
    if not lines:
        # empty file: read() at EOF returns an empty value of the
        # stream's own type instead of raising IndexError
        return fh.read()

    return lines[-1]
#=================================================================
class BlockLoader(object):
"""

@ -53,6 +53,21 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
# Test at boundary
>>> print_binsearch_results('a)/', iter_exact)
>>> print_binsearch_results_range('a)/', 'a-', iter_range)
>>> print_binsearch_results_range('a)/', 'org,iana)/_css/2013.1/fonts/inconsolata.otf ', iter_range)
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
>>> print_binsearch_results('z)/', iter_exact)
>>> print_binsearch_results_range('z)/', 'z-', iter_range)
>>> print_binsearch_results_range('org,iana)/protocols', 'z-', iter_range)
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
"""

@ -82,6 +82,24 @@ True
# length too long
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
'foo=bar&dir=/baz'
# test read_last_line
>>> read_last_line(BytesIO('A\nB\nC'))
'C'
>>> read_last_line(BytesIO('Some Line\nLonger Line\nLongest Last Line LL'), offset=8)
'Longest Last Line LL'
>>> read_last_line(BytesIO('A\nBC'))
'BC'
>>> read_last_line(BytesIO('A\nBC\n'))
'BC\n'
>>> read_last_line(BytesIO('ABC'))
'ABC'
"""
@ -91,6 +109,7 @@ import os
from io import BytesIO
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
from pywb.utils.loaders import read_last_line
from pywb import get_test_dir