mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
zipnum: fix bug where urls in the last block were not accessible. When iter_range() fails, check whether last_line == end_line,
and if so, check whether start_line should also be end_line (#112). Support non-line-numbered idx files without pagination queries. Add new zipnum-sample to test cdx lines in the last block (previous sample had only one line in last block except the first).
This commit is contained in:
parent
d104c03135
commit
a51b2936f3
@ -6,9 +6,10 @@ org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3
|
||||
|
||||
# test idx index (tabs replaced with 4 spaces)
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True)
|
||||
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
|
||||
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||
org,iana)/dnssec 20140126201307 zipnum 8517 373 35
|
||||
org,iana)/domains/int 20140126201239 zipnum 8890 355 36
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37
|
||||
|
||||
|
||||
>>> zip_ops_test(url='http://iana.org/domains/*')
|
||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||
@ -46,29 +47,30 @@ org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/s
|
||||
|
||||
# first page
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
|
||||
com,example)/ 20140127171200 zipnum 0 276 1
|
||||
org,iana)/ 20140127171238 zipnum 276 328 2
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
|
||||
com,example)/ 20140127171200 zipnum 0 275 1
|
||||
org,iana)/ 20140127171238 zipnum 275 328 2
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4
|
||||
|
||||
|
||||
# first page -- simplified query
|
||||
>>> zip_ops_test(url='*.iana.org/path_part_ignored/', showPagedIndex=True, pageSize=4)
|
||||
com,example)/ 20140127171200 zipnum 0 276 1
|
||||
org,iana)/ 20140127171238 zipnum 276 328 2
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
|
||||
com,example)/ 20140127171200 zipnum 0 275 1
|
||||
org,iana)/ 20140127171238 zipnum 275 328 2
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4
|
||||
|
||||
# next page + json
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1)
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1151, "length": 235, "lineno": 5}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1386, "length": 306, "lineno": 6}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1150, "length": 235, "lineno": 5}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1385, "length": 307, "lineno": 6}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
|
||||
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}
|
||||
|
||||
# last page
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9)
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||
org,iana)/time-zones 20140126200737 zipnum 9623 145 38
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37
|
||||
org,iana)/time-zones 20140126200737 zipnum 9631 166 38
|
||||
|
||||
# last page cdx
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9)
|
||||
@ -78,7 +80,8 @@ org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/perfo
|
||||
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
|
||||
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
# last page reverse -- not yet supported
|
||||
#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9)
|
||||
@ -88,6 +91,8 @@ org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200
|
||||
|
||||
# last page reverse CDX
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9)
|
||||
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
|
||||
@ -95,6 +100,20 @@ org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/perfo
|
||||
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
|
||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||
|
||||
# last url prefix
|
||||
>>> zip_ops_test(url='http://iana.org/time-zones*')
|
||||
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
# last url prefix w/ slash
|
||||
>>> zip_ops_test(url='http://iana.org/time-zones/*')
|
||||
org,iana)/time-zones/x 20140126200737 http://www.iana.org/time-zones/X text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
# last url exact
|
||||
>>> zip_ops_test(url='http://iana.org/time-zones/Y')
|
||||
org,iana)/time-zones/y 20140126200737 http://www.iana.org/time-zones/Y text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||
|
||||
# invalid page
|
||||
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10)
|
||||
@ -110,7 +129,16 @@ NotFoundException: No Captures found for: http://aaa.aaa/
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://aaa.aaa/ (domain query)
|
||||
|
||||
# list last index line, as we don't know if there are any captures at end
|
||||
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
|
||||
org,iana)/time-zones 20140126200737 zipnum 9631 166 38
|
||||
|
||||
# read cdx to find 0 pages
|
||||
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showNumPages=True)
|
||||
{"blocks": 0, "pages": 0, "pageSize": 10}
|
||||
|
||||
# read cdx to find no captures
|
||||
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain')
|
||||
Traceback (most recent call last):
|
||||
NotFoundException: No Captures found for: http://aaa.zz/ (domain query)
|
||||
|
||||
|
@ -22,6 +22,7 @@ class ZipBlocks:
|
||||
self.length = length
|
||||
self.count = count
|
||||
|
||||
|
||||
#=================================================================
|
||||
#TODO: see if these could be combined with warc path resolvers
|
||||
|
||||
@ -134,8 +135,10 @@ class ZipNumCluster(CDXSource):
|
||||
|
||||
def load_cdx(self, query):
|
||||
self.loc_resolver.load_loc()
|
||||
return self._do_load_cdx(self.summary, query)
|
||||
|
||||
reader = open(self.summary, 'rb')
|
||||
def _do_load_cdx(self, filename, query):
|
||||
reader = open(filename, 'rb')
|
||||
|
||||
idx_iter = self.compute_page_range(reader, query)
|
||||
|
||||
@ -165,13 +168,16 @@ class ZipNumCluster(CDXSource):
|
||||
else:
|
||||
pagesize = int(pagesize)
|
||||
|
||||
last_line = None
|
||||
|
||||
# Get End
|
||||
end_iter = search(reader, query.end_key, prev_size=1)
|
||||
|
||||
try:
|
||||
end_line = end_iter.next()
|
||||
except StopIteration:
|
||||
end_line = read_last_line(reader)
|
||||
last_line = read_last_line(reader)
|
||||
end_line = last_line
|
||||
|
||||
# Get Start
|
||||
first_iter = iter_range(reader,
|
||||
@ -182,34 +188,40 @@ class ZipNumCluster(CDXSource):
|
||||
try:
|
||||
first_line = first_iter.next()
|
||||
except StopIteration:
|
||||
reader.close()
|
||||
if query.page_count:
|
||||
yield self._page_info(0, pagesize, 0)
|
||||
return
|
||||
if end_line == last_line and query.key >= last_line:
|
||||
first_line = last_line
|
||||
else:
|
||||
raise
|
||||
reader.close()
|
||||
if query.page_count:
|
||||
yield self._page_info(0, pagesize, 0)
|
||||
return
|
||||
else:
|
||||
raise
|
||||
|
||||
first = IDXObject(first_line)
|
||||
|
||||
end = IDXObject(end_line)
|
||||
diff = end['lineno'] - first['lineno']
|
||||
|
||||
total_pages = diff / pagesize + 1
|
||||
try:
|
||||
blocks = end['lineno'] - first['lineno']
|
||||
total_pages = blocks / pagesize + 1
|
||||
except:
|
||||
blocks = -1
|
||||
total_pages = 1
|
||||
|
||||
if query.page_count:
|
||||
blocks = diff + 1
|
||||
# same line, so actually need to look at cdx
|
||||
# to determine if it exists
|
||||
if total_pages == 1:
|
||||
if blocks == 0:
|
||||
try:
|
||||
block_cdx_iter = self.idx_to_cdx([first_line], query)
|
||||
block = block_cdx_iter.next()
|
||||
cdx = block.next()
|
||||
except StopIteration:
|
||||
total_pages = 0
|
||||
blocks = 0
|
||||
blocks = -1
|
||||
|
||||
yield self._page_info(total_pages, pagesize, blocks)
|
||||
yield self._page_info(total_pages, pagesize, blocks + 1)
|
||||
reader.close()
|
||||
return
|
||||
|
||||
@ -220,7 +232,9 @@ class ZipNumCluster(CDXSource):
|
||||
raise CDXException(msg.format(curr_page, total_pages - 1))
|
||||
|
||||
startline = curr_page * pagesize
|
||||
endline = min(startline + pagesize - 1, diff)
|
||||
endline = startline + pagesize - 1
|
||||
if blocks >= 0:
|
||||
endline = min(endline, blocks)
|
||||
|
||||
if curr_page == 0:
|
||||
yield first_line
|
||||
|
Binary file not shown.
@ -1,9 +1,9 @@
|
||||
com,example)/ 20140127171200 zipnum 0 276 1
|
||||
org,iana)/ 20140127171238 zipnum 276 328 2
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1151 235 5
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1386 306 6
|
||||
com,example)/ 20140127171200 zipnum 0 275 1
|
||||
org,iana)/ 20140127171238 zipnum 275 328 2
|
||||
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 603 312 3
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 915 235 4
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912 zipnum 1150 235 5
|
||||
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240 zipnum 1385 307 6
|
||||
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654 zipnum 1692 235 7
|
||||
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816 zipnum 1927 231 8
|
||||
org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126201128 zipnum 2158 236 9
|
||||
@ -12,27 +12,27 @@ org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126200805 zipnum 2706 234
|
||||
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201055 zipnum 2940 235 12
|
||||
org,iana)/_css/2013.1/fonts/opensans-semibold.ttf 20140126201308 zipnum 3175 289 13
|
||||
org,iana)/_css/2013.1/print.css 20140126200737 zipnum 3464 208 14
|
||||
org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 207 15
|
||||
org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3879 276 16
|
||||
org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4155 210 17
|
||||
org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4365 211 18
|
||||
org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4576 216 19
|
||||
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4792 236 20
|
||||
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5028 219 21
|
||||
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5247 221 22
|
||||
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5468 299 23
|
||||
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5767 210 24
|
||||
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5977 212 25
|
||||
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6189 281 26
|
||||
org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6470 298 27
|
||||
org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6768 213 28
|
||||
org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6981 216 29
|
||||
org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7197 270 30
|
||||
org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7467 215 31
|
||||
org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7682 209 32
|
||||
org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7891 210 33
|
||||
org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8101 410 34
|
||||
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
|
||||
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||
org,iana)/time-zones 20140126200737 zipnum 9623 145 38
|
||||
org,iana)/_css/2013.1/print.css 20140126200929 zipnum 3672 209 15
|
||||
org,iana)/_css/2013.1/print.css 20140126201248 zipnum 3881 276 16
|
||||
org,iana)/_css/2013.1/screen.css 20140126200706 zipnum 4157 210 17
|
||||
org,iana)/_css/2013.1/screen.css 20140126200825 zipnum 4367 211 18
|
||||
org,iana)/_css/2013.1/screen.css 20140126201227 zipnum 4578 216 19
|
||||
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200654 zipnum 4794 236 20
|
||||
org,iana)/_img/2013.1/iana-logo-header.svg 20140126200816 zipnum 5030 219 21
|
||||
org,iana)/_img/2013.1/iana-logo-header.svg 20140126201128 zipnum 5249 221 22
|
||||
org,iana)/_img/2013.1/iana-logo-homepage.png 20140126200625 zipnum 5470 299 23
|
||||
org,iana)/_img/2013.1/icann-logo.svg 20140126200719 zipnum 5769 210 24
|
||||
org,iana)/_img/2013.1/icann-logo.svg 20140126200912 zipnum 5979 212 25
|
||||
org,iana)/_img/2013.1/icann-logo.svg 20140126201240 zipnum 6191 281 26
|
||||
org,iana)/_img/bookmark_icon.ico 20140126200631 zipnum 6472 300 27
|
||||
org,iana)/_js/2013.1/iana.js 20140126200716 zipnum 6772 213 28
|
||||
org,iana)/_js/2013.1/iana.js 20140126200912 zipnum 6985 216 29
|
||||
org,iana)/_js/2013.1/iana.js 20140126201239 zipnum 7201 270 30
|
||||
org,iana)/_js/2013.1/jquery.js 20140126200653 zipnum 7471 215 31
|
||||
org,iana)/_js/2013.1/jquery.js 20140126200816 zipnum 7686 210 32
|
||||
org,iana)/_js/2013.1/jquery.js 20140126201127 zipnum 7896 211 33
|
||||
org,iana)/_js/2013.1/jquery.js 20140127171239 zipnum 8107 410 34
|
||||
org,iana)/dnssec 20140126201307 zipnum 8517 373 35
|
||||
org,iana)/domains/int 20140126201239 zipnum 8890 355 36
|
||||
org,iana)/domains/root/servers 20140126201227 zipnum 9245 386 37
|
||||
org,iana)/time-zones 20140126200737 zipnum 9631 166 38
|
||||
|
Loading…
x
Reference in New Issue
Block a user