@property
def page_size(self):
    """ Return the 'pageSize' query param as an int.

    The raw param may arrive as a string (e.g. parsed from a query
    string), so it is converted with int(), the same way the ``page``
    property converts the 'page' param. Without the conversion a string
    value would break the arithmetic done on the page size.

    :return: the page size as an int, or None if the param is missing
             or empty (both falsy, so a caller testing ``not page_size``
             still falls back to its own default)
    :raises ValueError: if the param is present but not a valid integer
    """
    value = self.params.get('pageSize')

    # unset or blank: signal "no explicit page size" to the caller
    if value is None or value == '':
        return None

    return int(value)
server.load_cdx(**kwparams) for x in results: - l = x.to_text(fields).replace('\t', ' ') + if not isinstance(x, str): + l = x.to_text(fields).replace('\t', ' ') + else: + l = x sys.stdout.write(l) diff --git a/pywb/cdx/test/test_zipnum.py b/pywb/cdx/test/test_zipnum.py index 95079d52..1d581a26 100644 --- a/pywb/cdx/test/test_zipnum.py +++ b/pywb/cdx/test/test_zipnum.py @@ -1,16 +1,16 @@ """ ->>> zip_ops_test(url = 'http://iana.org') +>>> zip_ops_test(url='http://iana.org') org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz # test idx index (tabs replacad with 4 spaces) ->>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True) +>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True) org,iana)/dnssec 20140126201307 zipnum 8511 373 35 org,iana)/domains/int 20140126201239 zipnum 8884 353 36 org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 ->>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix') +>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix') org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz @@ -21,6 +21,77 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te org,iana)/domains/root/db 20140126200928 
http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz +# Pages -- default page size +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True) +{"blocks": 37, "pages": 4, "pageSize": 10} + +# set page size +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True) +{"blocks": 37, "pages": 10, "pageSize": 4} + +# first page +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0) +com,example)/ 20140127171200 zipnum 0 276 1 +org,iana)/ 20140127171238 zipnum 276 328 2 +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4 + +# next page + json +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1) +{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1151, "length": 235, "lineno": 5} +{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1386, "length": 306, "lineno": 6} +{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7} +{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8} + +# last page +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9) +org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 +org,iana)/time-zones 20140126200737 zipnum 9623 145 38 + +# last page cdx +>>> zip_ops_test(url='http://iana.org/domains/', 
matchType='domain', pageSize=4, page=9) +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz +org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz +org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz +org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz +org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz +org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz + + +# last page reverse -- not yet supported +#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9) +#org,iana)/time-zones 20140126200737 zipnum 9623 145 38 +#org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 + + +# last page reverse CDX +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9) +org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz +org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz +org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz +org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 
200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz +org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz +org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz + + +# invalid page +>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) +Traceback (most recent call last): +Exception: Page 10 invalid: First Page is 0, Last Page is 9 + + +>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True) +Traceback (most recent call last): +NotFoundException: No Captures found for: http://aaa.aaa/ + +>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True) +Traceback (most recent call last): +NotFoundException: No Captures found for: http://aaa.aaa/ + +>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True) +Traceback (most recent call last): +NotFoundException: No Captures found for: http://aaa.zz/ + """ from test_cdxops import cdx_ops_test diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 87ec1340..9a56fcf7 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -4,13 +4,14 @@ import itertools import logging from io import BytesIO import datetime +import json from cdxsource import CDXSource from cdxobject import IDXObject -from pywb.utils.loaders import BlockLoader +from pywb.utils.loaders import BlockLoader, read_last_line from pywb.utils.bufferedreaders import gzip_decompressor -from pywb.utils.binsearch import iter_range, linearsearch +from pywb.utils.binsearch import iter_range, linearsearch, search #================================================================= @@ -22,24 +23,10 @@ class ZipBlocks: self.count = count -#================================================================= -def readline_to_iter(stream): - try: - 
count = 0 - buff = stream.readline() - while buff: - count += 1 - yield buff - buff = stream.readline() - - finally: - stream.close() - - #================================================================= class ZipNumCluster(CDXSource): DEFAULT_RELOAD_INTERVAL = 10 # in minutes - DEFAULT_MAX_BLOCKS = 50 + DEFAULT_MAX_BLOCKS = 10 def __init__(self, summary, config=None): @@ -114,22 +101,91 @@ class ZipNumCluster(CDXSource): reader = open(self.summary, 'rb') - idx_iter = iter_range(reader, - query.key, - query.end_key, - prev_size=1) + idx_iter = self.compute_page_range(reader, query) - if query.secondary_index_only: + if query.secondary_index_only or query.page_count: return idx_iter + + blocks = self.idx_to_cdx(idx_iter, query) + + def gen_cdx(): + for blk in blocks: + for cdx in blk: + yield cdx + + return gen_cdx() + + + def compute_page_range(self, reader, query): + + # Get End + end_iter = search(reader, query.end_key, prev_size=1) + + try: + end_line = end_iter.next() + except StopIteration: + end_line = read_last_line(reader) + + # Get Start + + first_iter = iter_range(reader, + query.key, + query.end_key, + prev_size=1) + + try: + first_line = first_iter.next() + except StopIteration: + raise + + first = IDXObject(first_line) + + end = IDXObject(end_line) + diff = end['lineno'] - first['lineno'] + + pagesize = query.page_size + if not pagesize: + pagesize = self.max_blocks + + total_pages = diff / pagesize + 1 + + if query.page_count: + info = dict(pages=total_pages, + pageSize=pagesize, + blocks=diff) + yield json.dumps(info) + reader.close() + return + + curr_page = query.page + if curr_page >= total_pages or curr_page < 0: + msg = 'Page {0} invalid: First Page is 0, Last Page is {1}' + reader.close() + raise Exception(msg.format(curr_page, total_pages - 1)) + + startline = curr_page * pagesize + endline = min(startline + pagesize - 1, diff) + + if curr_page == 0: + yield first_line else: - blocks = self.idx_to_cdx(idx_iter, query) + startline -= 1 - def 
def read_last_line(fh, offset=256):
    """ Read the last line from a seekable file.

    Start reading ``offset`` bytes before the end of the file and keep
    doubling the backwards seek until a line break is found, so only the
    tail of the file has to be read in the common case.

    :param fh: seekable file-like object providing seek(), tell(),
               readlines() and read(). End-relative seeks are used, so
               a real file should be opened in binary mode.
    :param offset: initial number of bytes before the end of the file to
                   start reading from; values below 1 are treated as 1
    :return: the last line, including its trailing newline if the file
             ends with one. If the file holds only one line, that line
             is returned. If the file is empty, an empty string of the
             stream's own type is returned.
    """
    fh.seek(0, 2)
    size = fh.tell()

    # a zero or negative offset never grows when doubled, which would
    # make the loop below spin forever on a non-empty file
    offset = max(offset, 1)

    while offset < size:
        fh.seek(-offset, 2)
        lines = fh.readlines()
        # more than one entry means the seek landed before the final
        # line break, so the last entry is a complete line
        if len(lines) > 1:
            return lines[-1]
        offset *= 2

    # reached the beginning of the file: read everything
    fh.seek(0, 0)
    lines = fh.readlines()
    if lines:
        return lines[-1]

    # empty file: read() at EOF yields '' / b'' matching the stream type
    return fh.read()
b/pywb/utils/test/test_loaders.py @@ -82,6 +82,24 @@ True # length too long >>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data)) 'foo=bar&dir=/baz' + + +# test read_last_line +>>> read_last_line(BytesIO('A\nB\nC')) +'C' + +>>> read_last_line(BytesIO('Some Line\nLonger Line\nLongest Last Line LL'), offset=8) +'Longest Last Line LL' + +>>> read_last_line(BytesIO('A\nBC')) +'BC' + +>>> read_last_line(BytesIO('A\nBC\n')) +'BC\n' + +>>> read_last_line(BytesIO('ABC')) +'ABC' + """ @@ -91,6 +109,7 @@ import os from io import BytesIO from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query +from pywb.utils.loaders import read_last_line from pywb import get_test_dir