zipnum: support for pagination api! #34 and #83. cdx server now bounded by pageSize (default 10 blocks),

showNumPages=true returns json indicating num pages; page=N can be set to a page number from 0 to numPages - 1
loaders: add read_last_line() to read last line of a seekable file, used to read last line of index file when at end
tests: additional test for binsearch boundary conditions
zipnum: secondary index output supports json also
2025-03-17 08:54:02 +01:00 · 2015-03-24 18:56:13 -07:00 · 2015-03-24 18:56:13 -07:00 · 2af5a25009
commit 2af5a25009
parent 872607c07d
10 changed files with 246 additions and 35 deletions
--- a/pywb/cdx/cdxobject.py
+++ b/pywb/cdx/cdxobject.py
@ -229,5 +229,8 @@ class IDXObject(OrderedDict):
        """
        return str(self) + '\n'

+    def to_json(self, fields=None):
+        return json_encode(self) + '\n'
+
    def __str__(self):
        return self.idxline
--- a/pywb/cdx/cdxops.py
+++ b/pywb/cdx/cdxops.py
@ -23,6 +23,11 @@ def cdx_load(sources, query, process=True):
    :param process: bool, perform processing sorting/filtering/grouping ops
    """
    cdx_iter = create_merged_cdx_gen(sources, query)
+
+    # page count is a special case, no further processing
+    if query.page_count:
+        return cdx_iter
+
    cdx_iter = make_obj_iter(cdx_iter, query)

    if process and not query.secondary_index_only:
--- a/pywb/cdx/query.py
+++ b/pywb/cdx/query.py
@ -86,6 +86,18 @@ class CDXQuery(object):
    def secondary_index_only(self):
        return self._get_bool('showPagedIndex')

+    @property
+    def page(self):
+        return int(self.params.get('page', 0))
+
+    @property
+    def page_size(self):
+        return self.params.get('pageSize')
+
+    @property
+    def page_count(self):
+        return self._get_bool('showNumPages')
+
    def _get_bool(self, name, def_val=False):
        v = self.params.get(name)
        if v:
--- a/pywb/cdx/test/test_cdxops.py
+++ b/pywb/cdx/test/test_cdxops.py
@ -170,7 +170,8 @@ test_cdx_dir = get_test_dir() + 'cdx/'

 def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    kwparams['url'] = url
-    kwparams['output'] = 'cdxobject'
+    if not 'output' in kwparams:
+        kwparams['output'] = 'cdxobject'
    fields = kwparams.get('fields')
    if fields:
        fields = fields.split(',')
@ -179,7 +180,10 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
    results = server.load_cdx(**kwparams)

    for x in results:
-        l = x.to_text(fields).replace('\t', '    ')
+        if not isinstance(x, str):
+            l = x.to_text(fields).replace('\t', '    ')
+        else:
+            l = x
        sys.stdout.write(l)


--- a/pywb/cdx/test/test_zipnum.py
+++ b/pywb/cdx/test/test_zipnum.py
@ -1,16 +1,16 @@
 """
->>> zip_ops_test(url = 'http://iana.org')
+>>> zip_ops_test(url='http://iana.org')
 org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
 org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
 org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz

 # test idx index (tabs replacad with 4 spaces)
->>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True)
 org,iana)/dnssec 20140126201307    zipnum    8511    373    35
 org,iana)/domains/int 20140126201239    zipnum    8884    353    36
 org,iana)/domains/root/servers 20140126201227    zipnum    9237    386    37

->>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix')
 org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
 org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
 org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
@ -21,6 +21,77 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
 org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
 org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz

+# Pages -- default page size
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
+{"blocks": 37, "pages": 4, "pageSize": 10}
+
+# set page size
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
+{"blocks": 37, "pages": 10, "pageSize": 4}
+
+# first page
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
+com,example)/ 20140127171200    zipnum    0    276    1
+org,iana)/ 20140127171238    zipnum    276    328    2
+org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055    zipnum    604    312    3
+org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718    zipnum    916    235    4
+
+# next page + json
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1)
+{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1151, "length": 235, "lineno": 5}
+{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1386, "length": 306, "lineno": 6}
+{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
+{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}
+
+# last page
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9)
+org,iana)/domains/root/servers 20140126201227    zipnum    9237    386    37
+org,iana)/time-zones 20140126200737    zipnum    9623    145    38
+
+# last page cdx
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9)
+org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
+org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
+org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
+org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
+org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
+org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
+
+
+# last page reverse -- not yet supported
+#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9)
+#org,iana)/time-zones 20140126200737    zipnum    9623    145    38
+#org,iana)/domains/root/servers 20140126201227    zipnum    9237    386    37
+
+
+# last page reverse CDX
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9)
+org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
+org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
+org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
+org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
+org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
+org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
+
+
+# invalid page
+>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10)
+Traceback (most recent call last):
+Exception: Page 10 invalid: First Page is 0, Last Page is 9
+
+
+>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True)
+Traceback (most recent call last):
+NotFoundException: No Captures found for: http://aaa.aaa/
+
+>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True)
+Traceback (most recent call last):
+NotFoundException: No Captures found for: http://aaa.aaa/
+
+>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
+Traceback (most recent call last):
+NotFoundException: No Captures found for: http://aaa.zz/
+
 """

 from test_cdxops import cdx_ops_test
--- a/pywb/cdx/zipnum.py
+++ b/pywb/cdx/zipnum.py
@ -4,13 +4,14 @@ import itertools
 import logging
 from io import BytesIO
 import datetime
+import json

 from cdxsource import CDXSource
 from cdxobject import IDXObject

-from pywb.utils.loaders import BlockLoader
+from pywb.utils.loaders import BlockLoader, read_last_line
 from pywb.utils.bufferedreaders import gzip_decompressor
-from pywb.utils.binsearch import iter_range, linearsearch
+from pywb.utils.binsearch import iter_range, linearsearch, search


 #=================================================================
@ -22,24 +23,10 @@ class ZipBlocks:
        self.count = count


-#=================================================================
-def readline_to_iter(stream):
-    try:
-        count = 0
-        buff = stream.readline()
-        while buff:
-            count += 1
-            yield buff
-            buff = stream.readline()
-
-    finally:
-        stream.close()
-
-
 #=================================================================
 class ZipNumCluster(CDXSource):
    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
-    DEFAULT_MAX_BLOCKS = 50
+    DEFAULT_MAX_BLOCKS = 10

    def __init__(self, summary, config=None):

@ -114,22 +101,91 @@ class ZipNumCluster(CDXSource):

        reader = open(self.summary, 'rb')

-        idx_iter = iter_range(reader,
-                              query.key,
-                              query.end_key,
-                              prev_size=1)
+        idx_iter = self.compute_page_range(reader, query)

-        if query.secondary_index_only:
+        if query.secondary_index_only or query.page_count:
            return idx_iter
+
+        blocks = self.idx_to_cdx(idx_iter, query)
+
+        def gen_cdx():
+            for blk in blocks:
+                for cdx in blk:
+                    yield cdx
+
+        return gen_cdx()
+
+
+    def compute_page_range(self, reader, query):
+
+        # Get End
+        end_iter = search(reader, query.end_key, prev_size=1)
+
+        try:
+            end_line = end_iter.next()
+        except StopIteration:
+            end_line = read_last_line(reader)
+
+        # Get Start
+
+        first_iter = iter_range(reader,
+                                query.key,
+                                query.end_key,
+                                prev_size=1)
+
+        try:
+            first_line = first_iter.next()
+        except StopIteration:
+            raise
+
+        first = IDXObject(first_line)
+
+        end = IDXObject(end_line)
+        diff = end['lineno'] - first['lineno']
+
+        pagesize = query.page_size
+        if not pagesize:
+            pagesize = self.max_blocks
+
+        total_pages = diff / pagesize + 1
+
+        if query.page_count:
+            info = dict(pages=total_pages,
+                        pageSize=pagesize,
+                        blocks=diff)
+            yield json.dumps(info)
+            reader.close()
+            return
+
+        curr_page = query.page
+        if curr_page >= total_pages or curr_page < 0:
+            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
+            reader.close()
+            raise Exception(msg.format(curr_page, total_pages - 1))
+
+        startline = curr_page * pagesize
+        endline = min(startline + pagesize - 1, diff)
+
+        if curr_page == 0:
+            yield first_line
        else:
-            blocks = self.idx_to_cdx(idx_iter, query)
+            startline -= 1

-            def gen_cdx():
-                for blk in blocks:
-                    for cdx in blk:
-                        yield cdx
+        idxiter = itertools.islice(first_iter, startline, endline)
+        for idx in idxiter:
+            yield idx

-            return gen_cdx()
+        reader.close()
+
+
+    def search_by_line_num(self, reader, line):  # pragma: no cover
+        def line_cmp(line1, line2):
+            line1_no = int(line1.rsplit('\t', 1)[-1])
+            line2_no = int(line2.rsplit('\t', 1)[-1])
+            return cmp(line1_no, line2_no)
+
+        line_iter = search(reader, line, compare_func=line_cmp)
+        yield line_iter.next()

    def idx_to_cdx(self, idx_iter, query):
        blocks = None
@ -178,6 +234,10 @@ class ZipNumCluster(CDXSource):
            raise Exception('No Locations Found for: ' + block.part)

    def load_blocks(self, location, blocks, ranges, query):
+        """ Load one or more blocks of compressed cdx lines, return
+        a line iterator which decompresses and returns one line at a time,
+        bounded by query.key and query.end_key
+        """

        if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
            msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
@ -188,7 +248,8 @@ class ZipNumCluster(CDXSource):
        def decompress_block(range_):
            decomp = gzip_decompressor()
            buff = decomp.decompress(reader.read(range_))
-            return readline_to_iter(BytesIO(buff))
+            for line in BytesIO(buff):
+                yield line

        iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))

--- a/pywb/utils/binsearch.py
+++ b/pywb/utils/binsearch.py
@ -84,7 +84,7 @@ def linearsearch(iter_, key, prev_size=0, compare_func=cmp):

    # no matches, so return empty iterator
    if not matched:
-        return []
+        return iter([])

    return itertools.chain(prev_deque, iter_)

--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@ -112,6 +112,27 @@ def extract_client_cookie(env, cookie_name):
    return value


+#=================================================================
+def read_last_line(fh, offset=256):
+    """ Read last line from a seekable file. Start reading
+    from buff before end of file, and double backwards seek
+    until line break is found. If reached beginning of file
+    (no lines), just return whole file
+    """
+    fh.seek(0, 2)
+    size = fh.tell()
+
+    while offset < size:
+        fh.seek(-offset, 2)
+        lines = fh.readlines()
+        if len(lines) > 1:
+            return lines[-1]
+        offset *= 2
+
+    fh.seek(0, 0)
+    return fh.readlines()[-1]
+
+
 #=================================================================
 class BlockLoader(object):
    """
--- a/pywb/utils/test/test_binsearch.py
+++ b/pywb/utils/test/test_binsearch.py
@ -53,6 +53,21 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
 org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz


+# Test at boundary
+>>> print_binsearch_results('a)/', iter_exact)
+>>> print_binsearch_results_range('a)/', 'a-', iter_range)
+
+>>> print_binsearch_results_range('a)/', 'org,iana)/_css/2013.1/fonts/inconsolata.otf ', iter_range)
+org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
+
+>>> print_binsearch_results('z)/', iter_exact)
+>>> print_binsearch_results_range('z)/', 'z-', iter_range)
+
+>>> print_binsearch_results_range('org,iana)/protocols', 'z-', iter_range)
+org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
+org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
+
+
 """


--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@ -82,6 +82,24 @@ True
 # length too long
 >>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
 'foo=bar&dir=/baz'
+
+
+# test read_last_line
+>>> read_last_line(BytesIO('A\nB\nC'))
+'C'
+
+>>> read_last_line(BytesIO('Some Line\nLonger Line\nLongest Last Line LL'), offset=8)
+'Longest Last Line LL'
+
+>>> read_last_line(BytesIO('A\nBC'))
+'BC'
+
+>>> read_last_line(BytesIO('A\nBC\n'))
+'BC\n'
+
+>>> read_last_line(BytesIO('ABC'))
+'ABC'
+
 """


@ -91,6 +109,7 @@ import os
 from io import BytesIO
 from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
 from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
+from pywb.utils.loaders import read_last_line

 from pywb import get_test_dir