diff --git a/pywb/cdx/test/test_zipnum.py b/pywb/cdx/test/test_zipnum.py index b6a398e7..5c671c83 100644 --- a/pywb/cdx/test/test_zipnum.py +++ b/pywb/cdx/test/test_zipnum.py @@ -6,9 +6,10 @@ org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3 # test idx index (tabs replacad with 4 spaces) >>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True) -org,iana)/dnssec 20140126201307 zipnum 8511 373 35 -org,iana)/domains/int 20140126201239 zipnum 8884 353 36 -org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 +org,iana)/dnssec 20140126201307 zipnum 8517 373 35 +org,iana)/domains/int 20140126201239 zipnum 8890 355 36 +org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37 + >>> zip_ops_test(url='http://iana.org/domains/*') org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz @@ -46,29 +47,30 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s # first page >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0) -com,example)/ 20140127171200 zipnum 0 276 1 -org,iana)/ 20140127171238 zipnum 276 328 2 -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4 +com,example)/ 20140127171200 zipnum 0 275 1 +org,iana)/ 20140127171238 zipnum 275 328 2 +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4 + # first page -- simplified query >>> zip_ops_test(url='*.iana.org/path_part_ignored/', showPagedIndex=True, pageSize=4) -com,example)/ 20140127171200 zipnum 0 276 1 -org,iana)/ 20140127171238 zipnum 276 328 2 -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4 +com,example)/ 20140127171200 zipnum 0 275 1 +org,iana)/ 20140127171238 zipnum 275 328 2 +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4 # next page + json >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1) -{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1151, "length": 235, "lineno": 5} -{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1386, "length": 306, "lineno": 6} +{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5} +{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6} {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7} {"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8} # last page >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9) -org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 -org,iana)/time-zones 20140126200737 zipnum 9623 145 38 +org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37 +org,iana)/time-zones 20140126200737 zipnum 9631 166 38 # last page cdx >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9) @@ -78,7 +80,8 @@ org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/perfo org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz - +org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz +org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz # last page reverse -- not yet supported #>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9) @@ -88,6 +91,8 @@ org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 # last page reverse CDX >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9) +org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz +org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz @@ -95,6 +100,20 @@ org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/perfo org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz +# last url prefix +>>> zip_ops_test(url='http://iana.org/time-zones*') +org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz +org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz +org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz + +# last url prefix w/ slash +>>> zip_ops_test(url='http://iana.org/time-zones/*') +org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz +org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz + +# last url exact +>>> zip_ops_test(url='http://iana.org/time-zones/Y') +org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz # invalid page >>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10) @@ -110,7 +129,16 @@ NotFoundException: No Captures found for: http://aaa.aaa/ Traceback (most recent call last): NotFoundException: No Captures found for: http://aaa.aaa/ (domain query) +# list last index line, as we don't know if there are any captures at end >>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True) +org,iana)/time-zones 20140126200737 zipnum 9631 166 38 + +# read cdx to find 0 pages +>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showNumPages=True) +{"blocks": 0, "pages": 0, "pageSize": 10} + +# read cdx to find no captures +>>> zip_ops_test(url='http://aaa.zz/', matchType='domain') Traceback (most recent call last): NotFoundException: No Captures found for: http://aaa.zz/ (domain query) diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index a1bf4c18..a81f359f 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -22,6 +22,7 @@ class ZipBlocks: self.length = length self.count = count + #================================================================= #TODO: see if these could be combined with warc path resolvers @@ -134,8 +135,10 @@ class ZipNumCluster(CDXSource): def load_cdx(self, query): self.loc_resolver.load_loc() + return self._do_load_cdx(self.summary, query) - reader = open(self.summary, 'rb') + def _do_load_cdx(self, filename, query): + reader = open(filename, 'rb') idx_iter = self.compute_page_range(reader, query) @@ -165,13 +168,16 @@ class ZipNumCluster(CDXSource): else: pagesize = int(pagesize) + last_line = None + # Get End end_iter = search(reader, query.end_key, prev_size=1) try: end_line = end_iter.next() except StopIteration: - end_line = read_last_line(reader) + last_line = read_last_line(reader) + end_line = last_line # Get Start first_iter = iter_range(reader, @@ -182,34 +188,40 @@ class ZipNumCluster(CDXSource): try: first_line = first_iter.next() except StopIteration: - reader.close() - if query.page_count: - yield self._page_info(0, pagesize, 0) - return + if end_line == last_line and query.key >= last_line: + first_line = last_line else: - raise + reader.close() + if query.page_count: + yield self._page_info(0, pagesize, 0) + return + else: + raise first = IDXObject(first_line) end = IDXObject(end_line) - diff = end['lineno'] - first['lineno'] - total_pages = diff / pagesize + 1 + try: + blocks = end['lineno'] - first['lineno'] + total_pages = blocks / pagesize + 1 + except: + blocks = -1 + total_pages = 1 if query.page_count: - blocks = diff + 1 # same line, so actually need to look at cdx # to determine if it exists - if total_pages == 1: + if blocks == 0: try: block_cdx_iter = self.idx_to_cdx([first_line], query) block = block_cdx_iter.next() cdx = block.next() except StopIteration: total_pages = 0 - blocks = 0 + blocks = -1 - yield self._page_info(total_pages, pagesize, blocks) + yield self._page_info(total_pages, pagesize, blocks + 1) reader.close() return @@ -220,7 +232,9 @@ class ZipNumCluster(CDXSource): raise CDXException(msg.format(curr_page, total_pages - 1)) startline = curr_page * pagesize - endline = min(startline + pagesize - 1, diff) + endline = startline + pagesize - 1 + if blocks >= 0: + endline = min(endline, blocks) if curr_page == 0: yield first_line diff --git a/sample_archive/zipcdx/zipnum-sample.cdx.gz b/sample_archive/zipcdx/zipnum-sample.cdx.gz index 8687b97a..540f7a66 100644 Binary files a/sample_archive/zipcdx/zipnum-sample.cdx.gz and b/sample_archive/zipcdx/zipnum-sample.cdx.gz differ diff --git a/sample_archive/zipcdx/zipnum-sample.idx b/sample_archive/zipcdx/zipnum-sample.idx index 6697c131..e7ef2645 100644 --- a/sample_archive/zipcdx/zipnum-sample.idx +++ b/sample_archive/zipcdx/zipnum-sample.idx @@ -1,9 +1,9 @@ -com,example)/ 20140127171200 zipnum 0 276 1 -org,iana)/ 20140127171238 zipnum 276 328 2 -org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 5 -org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 6 +com,example)/ 20140127171200 zipnum 0 275 1 +org,iana)/ 20140127171238 zipnum 275 328 2 +org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1150 235 5 +org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1385 307 6 org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 7 org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 8 org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 9 @@ -12,27 +12,27 @@ org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234 org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 12 org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 13 org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 14 -org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 15 -org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 16 -org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 17 -org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 18 -org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 19 -org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 20 -org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 21 -org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 22 -org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 23 -org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 24 -org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 25 -org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 26 -org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 27 -org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 28 -org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 29 -org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 30 -org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 31 -org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 32 -org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 33 -org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 34 -org,iana)/dnssec 20140126201307 zipnum 8511 373 35 -org,iana)/domains/int 20140126201239 zipnum 8884 353 36 -org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37 -org,iana)/time-zones 20140126200737 zipnum 9623 145 38 +org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 209 15 +org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3881 276 16 +org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4157 210 17 +org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4367 211 18 +org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4578 216 19 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4794 236 20 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5030 219 21 +org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5249 221 22 +org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5470 299 23 +org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5769 210 24 +org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5979 212 25 +org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6191 281 26 +org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6472 300 27 +org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6772 213 28 +org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6985 216 29 +org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7201 270 30 +org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7471 215 31 +org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7686 210 32 +org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7896 211 33 +org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8107 410 34 +org,iana)/dnssec 20140126201307 zipnum 8517 373 35 +org,iana)/domains/int 20140126201239 zipnum 8890 355 36 +org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37 +org,iana)/time-zones 20140126200737 zipnum 9631 166 38