mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-16 00:24:48 +01:00
zipnum: support for pagination api! #34 and #83. cdx server now bounded by pageSize (default 10 blocks);
showNumPages=true returns json indicating the number of pages, and page=N selects a page number in the range 0 to numPages - 1.
loaders: add read_last_line() to read the last line of a seekable file, used to read the last line of the index file when at the end.
tests: additional test for binsearch boundary conditions.
zipnum: secondary index output also supports json.
This commit is contained in:
parent
872607c07d
commit
2af5a25009
@ -229,5 +229,8 @@ class IDXObject(OrderedDict):
|
||||
"""
|
||||
return str(self) + '\n'
|
||||
|
||||
def to_json(self, fields=None):
|
||||
return json_encode(self) + '\n'
|
||||
|
||||
def __str__(self):
|
||||
return self.idxline
|
||||
|
@ -23,6 +23,11 @@ def cdx_load(sources, query, process=True):
|
||||
:param process: bool, perform processing sorting/filtering/grouping ops
|
||||
"""
|
||||
cdx_iter = create_merged_cdx_gen(sources, query)
|
||||
|
||||
# page count is a special case, no further processing
|
||||
if query.page_count:
|
||||
return cdx_iter
|
||||
|
||||
cdx_iter = make_obj_iter(cdx_iter, query)
|
||||
|
||||
if process and not query.secondary_index_only:
|
||||
|
@ -86,6 +86,18 @@ class CDXQuery(object):
|
||||
def secondary_index_only(self):
|
||||
return self._get_bool('showPagedIndex')
|
||||
|
||||
@property
|
||||
def page(self):
|
||||
return int(self.params.get('page', 0))
|
||||
|
||||
@property
def page_size(self):
    """ Return the requested page size as an int, or None if not set.

    The value is read from the 'pageSize' query param. It is converted
    with int() (same as the 'page' param) so that a value arriving as a
    string can be used directly in page arithmetic.

    :raises ValueError: if 'pageSize' is set but is not a valid integer
    """
    value = self.params.get('pageSize')

    # missing / empty / zero: report "not set" so the caller can
    # fall back to its own default
    if not value:
        return None

    return int(value)
|
||||
|
||||
@property
|
||||
def page_count(self):
|
||||
return self._get_bool('showNumPages')
|
||||
|
||||
def _get_bool(self, name, def_val=False):
|
||||
v = self.params.get(name)
|
||||
if v:
|
||||
|
@ -170,7 +170,8 @@ test_cdx_dir = get_test_dir() + 'cdx/'
|
||||
|
||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||
kwparams['url'] = url
|
||||
kwparams['output'] = 'cdxobject'
|
||||
if not 'output' in kwparams:
|
||||
kwparams['output'] = 'cdxobject'
|
||||
fields = kwparams.get('fields')
|
||||
if fields:
|
||||
fields = fields.split(',')
|
||||
@ -179,7 +180,10 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||
results = server.load_cdx(**kwparams)
|
||||
|
||||
for x in results:
|
||||
l = x.to_text(fields).replace('\t', ' ')
|
||||
if not isinstance(x, str):
|
||||
l = x.to_text(fields).replace('\t', ' ')
|
||||
else:
|
||||
l = x
|
||||
sys.stdout.write(l)
|
||||
|
||||
|
||||
|
@ -1,16 +1,16 @@
|
||||
"""
|
||||
>>> zip_ops_test(url = 'http://iana.org')
|
||||
>>> zip_ops_test(url='http://iana.org')
|
||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
|
||||
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
|
||||
|
||||
# test idx index (tabs replaced with 4 spaces)
|
||||
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True)
|
||||
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
|
||||
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||
|
||||
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix')
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||
@ -21,6 +21,77 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
|
||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
# Pages -- default page size
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
|
||||
{"blocks": 37, "pages": 4, "pageSize": 10}
|
||||
|
||||
# set page size
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
|
||||
{"blocks": 37, "pages": 10, "pageSize": 4}
|
||||
|
||||
# first page
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
|
||||
com,example)/ 20140127171200 zipnum 0 276 1
|
||||
org,iana)/ 20140127171238 zipnum 276 328 2
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
|
||||
|
||||
# next page + json
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1)
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1151, "length": 235, "lineno": 5}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1386, "length": 306, "lineno": 6}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}
|
||||
|
||||
# last page
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9)
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||
org,iana)/time-zones 20140126200737 zipnum 9623 145 38
|
||||
|
||||
# last page cdx
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9)
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
|
||||
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
|
||||
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
|
||||
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
|
||||
# last page reverse -- not yet supported
|
||||
#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9)
|
||||
#org,iana)/time-zones 20140126200737 zipnum 9623 145 38
|
||||
#org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||
|
||||
|
||||
# last page reverse CDX
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9)
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
|
||||
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
|
||||
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
|
||||
# invalid page
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10)
|
||||
Traceback (most recent call last):
|
||||
Exception: Page 10 invalid: First Page is 0, Last Page is 9
|
||||
|
||||
|
||||
>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True)
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://aaa.aaa/
|
||||
|
||||
>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True)
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://aaa.aaa/
|
||||
|
||||
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://aaa.zz/
|
||||
|
||||
"""
|
||||
|
||||
from test_cdxops import cdx_ops_test
|
||||
|
@ -4,13 +4,14 @@ import itertools
|
||||
import logging
|
||||
from io import BytesIO
|
||||
import datetime
|
||||
import json
|
||||
|
||||
from cdxsource import CDXSource
|
||||
from cdxobject import IDXObject
|
||||
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||
from pywb.utils.bufferedreaders import gzip_decompressor
|
||||
from pywb.utils.binsearch import iter_range, linearsearch
|
||||
from pywb.utils.binsearch import iter_range, linearsearch, search
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -22,24 +23,10 @@ class ZipBlocks:
|
||||
self.count = count
|
||||
|
||||
|
||||
#=================================================================
|
||||
def readline_to_iter(stream):
    """ Yield the lines of stream one at a time, using readline(),
    until readline() returns an empty value.

    The stream is always closed when the generator finishes,
    is closed early by the consumer, or readline() raises.

    :param stream: object providing readline() and close()
    """
    try:
        line = stream.readline()
        while line:
            yield line
            line = stream.readline()

    finally:
        stream.close()
|
||||
|
||||
|
||||
#=================================================================
|
||||
class ZipNumCluster(CDXSource):
|
||||
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
|
||||
DEFAULT_MAX_BLOCKS = 50
|
||||
DEFAULT_MAX_BLOCKS = 10
|
||||
|
||||
def __init__(self, summary, config=None):
|
||||
|
||||
@ -114,22 +101,91 @@ class ZipNumCluster(CDXSource):
|
||||
|
||||
reader = open(self.summary, 'rb')
|
||||
|
||||
idx_iter = iter_range(reader,
|
||||
query.key,
|
||||
query.end_key,
|
||||
prev_size=1)
|
||||
idx_iter = self.compute_page_range(reader, query)
|
||||
|
||||
if query.secondary_index_only:
|
||||
if query.secondary_index_only or query.page_count:
|
||||
return idx_iter
|
||||
|
||||
blocks = self.idx_to_cdx(idx_iter, query)
|
||||
|
||||
def gen_cdx():
|
||||
for blk in blocks:
|
||||
for cdx in blk:
|
||||
yield cdx
|
||||
|
||||
return gen_cdx()
|
||||
|
||||
|
||||
def compute_page_range(self, reader, query):
    """ Generator over the secondary index lines for one page of a query.

    Determines the first and last index lines covered by the query,
    and from their 'lineno' fields the number of blocks and pages.

    - If query.page_count is set, yields a single json string with
      'pages', 'pageSize' and 'blocks' and nothing else.
    - Otherwise yields the raw index lines belonging to query.page.
    - If no index line is found for the start of the range,
      nothing is yielded.

    The reader is closed when the generator finishes for any reason
    (exhausted, closed early by the consumer, or an error is raised).

    :param reader: open, seekable index file
    :param query: query providing key, end_key, page, page_size
                  and page_count
    :raises Exception: if query.page is outside 0 .. total pages - 1
    """
    try:
        # Get End
        end_iter = search(reader, query.end_key, prev_size=1)

        try:
            end_line = next(end_iter)
        except StopIteration:
            # search produced nothing for end_key:
            # use the last line of the index instead
            end_line = read_last_line(reader)

        # Get Start
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        try:
            first_line = next(first_iter)
        except StopIteration:
            # nothing in range: end the generator with no results
            return

        first = IDXObject(first_line)
        end = IDXObject(end_line)

        diff = end['lineno'] - first['lineno']

        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            # may arrive as a string when taken from request params
            pagesize = int(pagesize)

        total_pages = diff // pagesize + 1

        if query.page_count:
            info = dict(pages=total_pages,
                        pageSize=pagesize,
                        blocks=diff)
            yield json.dumps(info)
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            raise Exception(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = min(startline + pagesize - 1, diff)

        if curr_page == 0:
            yield first_line
        else:
            # first_line was already consumed from first_iter,
            # so remaining offsets are shifted by one
            startline -= 1

        for idx in itertools.islice(first_iter, startline, endline):
            yield idx

    finally:
        reader.close()
|
||||
|
||||
|
||||
def search_by_line_num(self, reader, line):  # pragma: no cover
    """ Generator yielding the first result of search() over reader,
    where lines are ordered by the integer found after their last tab
    rather than by the full line text.

    :param reader: file-like object passed through to search()
    :param line: line whose trailing tab-separated integer is the key
    """
    def trailing_number(idx_line):
        # last tab-separated field, as an int
        return int(idx_line.rsplit('\t', 1)[-1])

    def line_cmp(line1, line2):
        return cmp(trailing_number(line1), trailing_number(line2))

    matches = search(reader, line, compare_func=line_cmp)
    yield matches.next()
|
||||
|
||||
def idx_to_cdx(self, idx_iter, query):
|
||||
blocks = None
|
||||
@ -178,6 +234,10 @@ class ZipNumCluster(CDXSource):
|
||||
raise Exception('No Locations Found for: ' + block.part)
|
||||
|
||||
def load_blocks(self, location, blocks, ranges, query):
|
||||
""" Load one or more blocks of compressed cdx lines, return
|
||||
a line iterator which decompresses and returns one line at a time,
|
||||
bounded by query.key and query.end_key
|
||||
"""
|
||||
|
||||
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
|
||||
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
||||
@ -188,7 +248,8 @@ class ZipNumCluster(CDXSource):
|
||||
def decompress_block(range_):
|
||||
decomp = gzip_decompressor()
|
||||
buff = decomp.decompress(reader.read(range_))
|
||||
return readline_to_iter(BytesIO(buff))
|
||||
for line in BytesIO(buff):
|
||||
yield line
|
||||
|
||||
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
|
||||
|
||||
|
@ -84,7 +84,7 @@ def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
|
||||
|
||||
# no matches, so return empty iterator
|
||||
if not matched:
|
||||
return []
|
||||
return iter([])
|
||||
|
||||
return itertools.chain(prev_deque, iter_)
|
||||
|
||||
|
@ -112,6 +112,27 @@ def extract_client_cookie(env, cookie_name):
|
||||
return value
|
||||
|
||||
|
||||
#=================================================================
|
||||
def read_last_line(fh, offset=256):
    """ Read last line from a seekable file. Start reading
    from offset bytes before end of file, and double the backwards
    seek until a line break is found. If the beginning of the file
    is reached (no line breaks), just return the whole file.
    An empty file returns an empty value of the type fh reads.

    :param fh: seekable file-like object supporting seek() relative
               to end of file, tell() and readlines()
    :param offset: initial number of bytes to read back from the end,
                   must be positive
    :raises ValueError: if offset is not positive
    """
    # a non-positive offset would never grow when doubled
    if offset <= 0:
        raise ValueError('offset must be a positive number of bytes')

    fh.seek(0, 2)
    size = fh.tell()

    while offset < size:
        fh.seek(-offset, 2)
        lines = fh.readlines()
        # more than one line read: the final one starts after a
        # line break, so it is complete
        if len(lines) > 1:
            return lines[-1]
        offset *= 2

    fh.seek(0, 0)
    lines = fh.readlines()
    if not lines:
        # empty file: reading at end of file gives an empty value
        # of the matching type
        return fh.read()

    return lines[-1]
|
||||
|
||||
|
||||
#=================================================================
|
||||
class BlockLoader(object):
|
||||
"""
|
||||
|
@ -53,6 +53,21 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
|
||||
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
|
||||
|
||||
|
||||
# Test at boundary
|
||||
>>> print_binsearch_results('a)/', iter_exact)
|
||||
>>> print_binsearch_results_range('a)/', 'a-', iter_range)
|
||||
|
||||
>>> print_binsearch_results_range('a)/', 'org,iana)/_css/2013.1/fonts/inconsolata.otf ', iter_range)
|
||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||
|
||||
>>> print_binsearch_results('z)/', iter_exact)
|
||||
>>> print_binsearch_results_range('z)/', 'z-', iter_range)
|
||||
|
||||
>>> print_binsearch_results_range('org,iana)/protocols', 'z-', iter_range)
|
||||
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
@ -82,6 +82,24 @@ True
|
||||
# length too long
|
||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
|
||||
'foo=bar&dir=/baz'
|
||||
|
||||
|
||||
# test read_last_line
|
||||
>>> read_last_line(BytesIO('A\nB\nC'))
|
||||
'C'
|
||||
|
||||
>>> read_last_line(BytesIO('Some Line\nLonger Line\nLongest Last Line LL'), offset=8)
|
||||
'Longest Last Line LL'
|
||||
|
||||
>>> read_last_line(BytesIO('A\nBC'))
|
||||
'BC'
|
||||
|
||||
>>> read_last_line(BytesIO('A\nBC\n'))
|
||||
'BC\n'
|
||||
|
||||
>>> read_last_line(BytesIO('ABC'))
|
||||
'ABC'
|
||||
|
||||
"""
|
||||
|
||||
|
||||
@ -91,6 +109,7 @@ import os
|
||||
from io import BytesIO
|
||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
||||
from pywb.utils.loaders import read_last_line
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user