mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
zipnum: support for pagination API (#34 and #83). The cdx server is now bounded by pageSize (default 10 blocks);
showNumPages=true returns JSON indicating the number of pages, and page=N selects a page in the range 0 to numPages - 1. loaders: add read_last_line() to read the last line of a seekable file, used to read the last line of the index file when at the end. tests: additional test for binsearch boundary conditions. zipnum: secondary index output also supports JSON.
This commit is contained in:
parent
872607c07d
commit
2af5a25009
@ -229,5 +229,8 @@ class IDXObject(OrderedDict):
|
|||||||
"""
|
"""
|
||||||
return str(self) + '\n'
|
return str(self) + '\n'
|
||||||
|
|
||||||
|
def to_json(self, fields=None):
    """Return this index entry encoded as JSON, terminated by a newline.

    ``fields`` is accepted but is not used by this implementation.
    """
    encoded = json_encode(self)
    return encoded + '\n'
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.idxline
|
return self.idxline
|
||||||
|
@ -23,6 +23,11 @@ def cdx_load(sources, query, process=True):
|
|||||||
:param process: bool, perform processing sorting/filtering/grouping ops
|
:param process: bool, perform processing sorting/filtering/grouping ops
|
||||||
"""
|
"""
|
||||||
cdx_iter = create_merged_cdx_gen(sources, query)
|
cdx_iter = create_merged_cdx_gen(sources, query)
|
||||||
|
|
||||||
|
# page count is a special case, no further processing
|
||||||
|
if query.page_count:
|
||||||
|
return cdx_iter
|
||||||
|
|
||||||
cdx_iter = make_obj_iter(cdx_iter, query)
|
cdx_iter = make_obj_iter(cdx_iter, query)
|
||||||
|
|
||||||
if process and not query.secondary_index_only:
|
if process and not query.secondary_index_only:
|
||||||
|
@ -86,6 +86,18 @@ class CDXQuery(object):
|
|||||||
def secondary_index_only(self):
|
def secondary_index_only(self):
|
||||||
return self._get_bool('showPagedIndex')
|
return self._get_bool('showPagedIndex')
|
||||||
|
|
||||||
|
@property
def page(self):
    """Zero-based page number requested via the ``page`` parameter.

    A missing or empty value is treated as page 0, so a query such as
    ``page=`` no longer fails in ``int()``.

    :raises ValueError: if the parameter is present but not an integer.
    """
    return int(self.params.get('page') or 0)
|
||||||
|
|
||||||
|
@property
def page_size(self):
    """Requested page size from the ``pageSize`` parameter, as an int.

    Returns ``None`` when the parameter is missing or empty, so callers
    can keep using a falsy check to fall back to their own default.
    The value is converted to ``int`` because parameters may arrive as
    strings, and the page arithmetic needs a number.

    :raises ValueError: if the parameter is present but not an integer.
    """
    raw_size = self.params.get('pageSize')
    if not raw_size:
        return None
    return int(raw_size)
|
||||||
|
|
||||||
|
@property
def page_count(self):
    """Boolean view of the ``showNumPages`` parameter.

    Delegates to ``self._get_bool``; when true, the caller wants the
    page-count summary rather than index/cdx lines.
    """
    show_num_pages = self._get_bool('showNumPages')
    return show_num_pages
|
||||||
|
|
||||||
def _get_bool(self, name, def_val=False):
|
def _get_bool(self, name, def_val=False):
|
||||||
v = self.params.get(name)
|
v = self.params.get(name)
|
||||||
if v:
|
if v:
|
||||||
|
@ -170,7 +170,8 @@ test_cdx_dir = get_test_dir() + 'cdx/'
|
|||||||
|
|
||||||
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
||||||
kwparams['url'] = url
|
kwparams['url'] = url
|
||||||
kwparams['output'] = 'cdxobject'
|
if not 'output' in kwparams:
|
||||||
|
kwparams['output'] = 'cdxobject'
|
||||||
fields = kwparams.get('fields')
|
fields = kwparams.get('fields')
|
||||||
if fields:
|
if fields:
|
||||||
fields = fields.split(',')
|
fields = fields.split(',')
|
||||||
@ -179,7 +180,10 @@ def cdx_ops_test(url, sources = [test_cdx_dir + 'iana.cdx'], **kwparams):
|
|||||||
results = server.load_cdx(**kwparams)
|
results = server.load_cdx(**kwparams)
|
||||||
|
|
||||||
for x in results:
|
for x in results:
|
||||||
l = x.to_text(fields).replace('\t', ' ')
|
if not isinstance(x, str):
|
||||||
|
l = x.to_text(fields).replace('\t', ' ')
|
||||||
|
else:
|
||||||
|
l = x
|
||||||
sys.stdout.write(l)
|
sys.stdout.write(l)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,16 +1,16 @@
|
|||||||
"""
|
"""
|
||||||
>>> zip_ops_test(url = 'http://iana.org')
|
>>> zip_ops_test(url='http://iana.org')
|
||||||
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||||
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
|
org,iana)/ 20140127171238 http://iana.org unk 302 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 343 1858 dupes.warc.gz
|
||||||
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
|
org,iana)/ 20140127171238 http://www.iana.org/ warc/revisit - OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 536 2678 dupes.warc.gz
|
||||||
|
|
||||||
# test idx index (tabs replaced with 4 spaces)
|
# test idx index (tabs replaced with 4 spaces)
|
||||||
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix', showPagedIndex = True)
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix', showPagedIndex=True)
|
||||||
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
|
org,iana)/dnssec 20140126201307 zipnum 8511 373 35
|
||||||
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
|
org,iana)/domains/int 20140126201239 zipnum 8884 353 36
|
||||||
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||||
|
|
||||||
>>> zip_ops_test(url = 'http://iana.org/domains/', matchType = 'prefix')
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='prefix')
|
||||||
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
org,iana)/domains/arpa 20140126201248 http://www.iana.org/domains/arpa text/html 200 QOFZZRN6JIKAL2JRL6ZC2VVG42SPKGHT - - 2939 759039 iana.warc.gz
|
||||||
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
org,iana)/domains/example 20140128051539 http://www.iana.org/domains/example text/html 302 JZ622UA23G5ZU6Y3XAKH4LINONUEICEG - - 577 2907 example.warc.gz
|
||||||
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
org,iana)/domains/idn-tables 20140126201127 http://www.iana.org/domains/idn-tables text/html 200 HNCUFTJMOQOGAEY6T56KVC3T7TVLKGEW - - 8118 715878 iana.warc.gz
|
||||||
@ -21,6 +21,77 @@ org,iana)/domains/root/db 20140126200927 http://www.iana.org/domains/root/db/ te
|
|||||||
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
org,iana)/domains/root/db 20140126200928 http://www.iana.org/domains/root/db text/html 200 DHXA725IW5VJJFRTWBQT6BEZKRE7H57S - - 18365 672225 iana.warc.gz
|
||||||
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||||
|
|
||||||
|
# Pages -- default page size
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showNumPages=True)
|
||||||
|
{"blocks": 37, "pages": 4, "pageSize": 10}
|
||||||
|
|
||||||
|
# set page size
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, showNumPages=True)
|
||||||
|
{"blocks": 37, "pages": 10, "pageSize": 4}
|
||||||
|
|
||||||
|
# first page
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=0)
|
||||||
|
com,example)/ 20140127171200 zipnum 0 276 1
|
||||||
|
org,iana)/ 20140127171238 zipnum 276 328 2
|
||||||
|
org,iana)/_css/2013.1/fonts/inconsolata.otf 20140126201055 zipnum 604 312 3
|
||||||
|
org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200718 zipnum 916 235 4
|
||||||
|
|
||||||
|
# next page + json
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', output='json', showPagedIndex=True, pageSize=4, page=1)
|
||||||
|
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126200912", "part": "zipnum", "offset": 1151, "length": 235, "lineno": 5}
|
||||||
|
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-bold.ttf 20140126201240", "part": "zipnum", "offset": 1386, "length": 306, "lineno": 6}
|
||||||
|
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200654", "part": "zipnum", "offset": 1692, "length": 235, "lineno": 7}
|
||||||
|
{"urlkey": "org,iana)/_css/2013.1/fonts/opensans-regular.ttf 20140126200816", "part": "zipnum", "offset": 1927, "length": 231, "lineno": 8}
|
||||||
|
|
||||||
|
# last page
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=9)
|
||||||
|
org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||||
|
org,iana)/time-zones 20140126200737 zipnum 9623 145 38
|
||||||
|
|
||||||
|
# last page cdx
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', pageSize=4, page=9)
|
||||||
|
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||||
|
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
|
||||||
|
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
|
||||||
|
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
|
||||||
|
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||||
|
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||||
|
|
||||||
|
|
||||||
|
# last page reverse -- not yet supported
|
||||||
|
#>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, showPagedIndex=True, pageSize=4, page=9)
|
||||||
|
#org,iana)/time-zones 20140126200737 zipnum 9623 145 38
|
||||||
|
#org,iana)/domains/root/servers 20140126201227 zipnum 9237 386 37
|
||||||
|
|
||||||
|
|
||||||
|
# last page reverse CDX
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', reverse=True, pageSize=4, page=9)
|
||||||
|
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||||
|
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||||
|
org,iana)/performance/ietf-statistics 20140126200804 http://www.iana.org/performance/ietf-statistics text/html 200 XOFML5WNBQMTSULLIIPLSP6U5MX33HN6 - - 3712 582987 iana.warc.gz
|
||||||
|
org,iana)/performance/ietf-draft-status 20140126200815 http://www.iana.org/performance/ietf-draft-status text/html 200 T5IQTX6DWV5KABGH454CYEDWKRI5Y23E - - 2940 597667 iana.warc.gz
|
||||||
|
org,iana)/numbers 20140126200651 http://www.iana.org/numbers text/html 200 HWT5UZKURYLW5QNWVZCWFCANGEMU7XWK - - 3498 321385 iana.warc.gz
|
||||||
|
org,iana)/domains/root/servers 20140126201227 http://www.iana.org/domains/root/servers text/html 200 AFW34N3S4NK2RJ6QWMVPB5E2AIUETAHU - - 3137 733840 iana.warc.gz
|
||||||
|
|
||||||
|
|
||||||
|
# invalid page
|
||||||
|
>>> zip_ops_test(url='http://iana.org/domains/', matchType='domain', showPagedIndex=True, pageSize=4, page=10)
|
||||||
|
Traceback (most recent call last):
|
||||||
|
Exception: Page 10 invalid: First Page is 0, Last Page is 9
|
||||||
|
|
||||||
|
|
||||||
|
>>> zip_ops_test(url='http://aaa.aaa/', matchType='exact', showPagedIndex=True)
|
||||||
|
Traceback (most recent call last):
|
||||||
|
NotFoundException: No Captures found for: http://aaa.aaa/
|
||||||
|
|
||||||
|
>>> zip_ops_test(url='http://aaa.aaa/', matchType='domain', showPagedIndex=True)
|
||||||
|
Traceback (most recent call last):
|
||||||
|
NotFoundException: No Captures found for: http://aaa.aaa/
|
||||||
|
|
||||||
|
>>> zip_ops_test(url='http://aaa.zz/', matchType='domain', showPagedIndex=True)
|
||||||
|
Traceback (most recent call last):
|
||||||
|
NotFoundException: No Captures found for: http://aaa.zz/
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from test_cdxops import cdx_ops_test
|
from test_cdxops import cdx_ops_test
|
||||||
|
@ -4,13 +4,14 @@ import itertools
|
|||||||
import logging
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import datetime
|
import datetime
|
||||||
|
import json
|
||||||
|
|
||||||
from cdxsource import CDXSource
|
from cdxsource import CDXSource
|
||||||
from cdxobject import IDXObject
|
from cdxobject import IDXObject
|
||||||
|
|
||||||
from pywb.utils.loaders import BlockLoader
|
from pywb.utils.loaders import BlockLoader, read_last_line
|
||||||
from pywb.utils.bufferedreaders import gzip_decompressor
|
from pywb.utils.bufferedreaders import gzip_decompressor
|
||||||
from pywb.utils.binsearch import iter_range, linearsearch
|
from pywb.utils.binsearch import iter_range, linearsearch, search
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -22,24 +23,10 @@ class ZipBlocks:
|
|||||||
self.count = count
|
self.count = count
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
|
||||||
def readline_to_iter(stream):
|
|
||||||
try:
|
|
||||||
count = 0
|
|
||||||
buff = stream.readline()
|
|
||||||
while buff:
|
|
||||||
count += 1
|
|
||||||
yield buff
|
|
||||||
buff = stream.readline()
|
|
||||||
|
|
||||||
finally:
|
|
||||||
stream.close()
|
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ZipNumCluster(CDXSource):
|
class ZipNumCluster(CDXSource):
|
||||||
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
|
DEFAULT_RELOAD_INTERVAL = 10 # in minutes
|
||||||
DEFAULT_MAX_BLOCKS = 50
|
DEFAULT_MAX_BLOCKS = 10
|
||||||
|
|
||||||
def __init__(self, summary, config=None):
|
def __init__(self, summary, config=None):
|
||||||
|
|
||||||
@ -114,22 +101,91 @@ class ZipNumCluster(CDXSource):
|
|||||||
|
|
||||||
reader = open(self.summary, 'rb')
|
reader = open(self.summary, 'rb')
|
||||||
|
|
||||||
idx_iter = iter_range(reader,
|
idx_iter = self.compute_page_range(reader, query)
|
||||||
query.key,
|
|
||||||
query.end_key,
|
|
||||||
prev_size=1)
|
|
||||||
|
|
||||||
if query.secondary_index_only:
|
if query.secondary_index_only or query.page_count:
|
||||||
return idx_iter
|
return idx_iter
|
||||||
|
|
||||||
|
blocks = self.idx_to_cdx(idx_iter, query)
|
||||||
|
|
||||||
|
def gen_cdx():
|
||||||
|
for blk in blocks:
|
||||||
|
for cdx in blk:
|
||||||
|
yield cdx
|
||||||
|
|
||||||
|
return gen_cdx()
|
||||||
|
|
||||||
|
|
||||||
|
def compute_page_range(self, reader, query):
    """Yield the secondary-index lines for the page selected by ``query``.

    This is a generator.  Depending on the query it produces:

    * ``query.page_count`` set: a single JSON string with the keys
      ``pages``, ``pageSize`` and ``blocks``;
    * otherwise: the raw index lines belonging to page ``query.page``.

    If no index line matches the query range, nothing is yielded.

    :param reader: seekable, open index (summary) file; it is closed
        when the generator finishes, fails, or is closed/collected.
    :param query: object providing ``key``, ``end_key``, ``page``,
        ``page_size`` and ``page_count``.
    :raises Exception: if ``query.page`` is outside ``0..total_pages-1``.
    """
    try:
        # Locate the last index line of the range.  If the search yields
        # nothing, fall back to the final line of the index file.
        end_iter = search(reader, query.end_key, prev_size=1)
        end_line = next(end_iter, None)
        if end_line is None:
            end_line = read_last_line(reader)

        # Locate the first index line of the range.
        first_iter = iter_range(reader,
                                query.key,
                                query.end_key,
                                prev_size=1)

        first_line = next(first_iter, None)
        if first_line is None:
            # Nothing matched: end the generator cleanly instead of
            # letting StopIteration escape (a RuntimeError under PEP 479).
            return

        first = IDXObject(first_line)
        end = IDXObject(end_line)

        # Number of index lines between the first and last line of the range.
        diff = end['lineno'] - first['lineno']

        pagesize = query.page_size
        if not pagesize:
            pagesize = self.max_blocks
        else:
            # The parameter may arrive as a string from the query.
            pagesize = int(pagesize)

        # Floor division keeps the page count an integer on Python 2 and 3.
        total_pages = diff // pagesize + 1

        if query.page_count:
            info = dict(pages=total_pages,
                        pageSize=pagesize,
                        blocks=diff)
            yield json.dumps(info)
            return

        curr_page = query.page
        if curr_page >= total_pages or curr_page < 0:
            msg = 'Page {0} invalid: First Page is 0, Last Page is {1}'
            raise Exception(msg.format(curr_page, total_pages - 1))

        startline = curr_page * pagesize
        endline = min(startline + pagesize - 1, diff)

        # first_line has already been consumed from first_iter, so the
        # remaining iterator is shifted by one: page 0 emits first_line
        # directly, later pages start one position earlier.
        if curr_page == 0:
            yield first_line
        else:
            startline -= 1

        for idx in itertools.islice(first_iter, startline, endline):
            yield idx
    finally:
        # Always release the file handle, including when the consumer
        # abandons the generator early or an error is raised.
        reader.close()
|
||||||
|
|
||||||
|
|
||||||
|
def search_by_line_num(self, reader, line):  # pragma: no cover
    """Yield the first index line found when searching by line number.

    Lines are compared by the integer in their last tab-separated
    field.  If the search produces no result, nothing is yielded.

    :param reader: index file passed through to ``search``.
    :param line: key line whose last tab-separated field is the
        line number to look for.
    """
    def line_num(text):
        # The line number is the last tab-separated field.
        return int(text.rsplit('\t', 1)[-1])

    def line_cmp(line1, line2):
        first_no = line_num(line1)
        second_no = line_num(line2)
        # Portable three-way comparison (same result as cmp() for ints).
        return (first_no > second_no) - (first_no < second_no)

    # Only the first match is wanted.  Looping avoids StopIteration
    # leaking out of the generator when there is no match.
    for match in search(reader, line, compare_func=line_cmp):
        yield match
        return
|
||||||
|
|
||||||
def idx_to_cdx(self, idx_iter, query):
|
def idx_to_cdx(self, idx_iter, query):
|
||||||
blocks = None
|
blocks = None
|
||||||
@ -178,6 +234,10 @@ class ZipNumCluster(CDXSource):
|
|||||||
raise Exception('No Locations Found for: ' + block.part)
|
raise Exception('No Locations Found for: ' + block.part)
|
||||||
|
|
||||||
def load_blocks(self, location, blocks, ranges, query):
|
def load_blocks(self, location, blocks, ranges, query):
|
||||||
|
""" Load one or more blocks of compressed cdx lines, return
|
||||||
|
a line iterator which decompresses and returns one line at a time,
|
||||||
|
bounded by query.key and query.end_key
|
||||||
|
"""
|
||||||
|
|
||||||
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
|
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
|
||||||
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
||||||
@ -188,7 +248,8 @@ class ZipNumCluster(CDXSource):
|
|||||||
def decompress_block(range_):
|
def decompress_block(range_):
|
||||||
decomp = gzip_decompressor()
|
decomp = gzip_decompressor()
|
||||||
buff = decomp.decompress(reader.read(range_))
|
buff = decomp.decompress(reader.read(range_))
|
||||||
return readline_to_iter(BytesIO(buff))
|
for line in BytesIO(buff):
|
||||||
|
yield line
|
||||||
|
|
||||||
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
|
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
|
||||||
|
|
||||||
|
@ -84,7 +84,7 @@ def linearsearch(iter_, key, prev_size=0, compare_func=cmp):
|
|||||||
|
|
||||||
# no matches, so return empty iterator
|
# no matches, so return empty iterator
|
||||||
if not matched:
|
if not matched:
|
||||||
return []
|
return iter([])
|
||||||
|
|
||||||
return itertools.chain(prev_deque, iter_)
|
return itertools.chain(prev_deque, iter_)
|
||||||
|
|
||||||
|
@ -112,6 +112,27 @@ def extract_client_cookie(env, cookie_name):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def read_last_line(fh, offset=256):
    """Read the last line from a seekable file.

    Start reading ``offset`` bytes before the end of the file and keep
    doubling the backwards seek until the data read contains a line
    break (i.e. more than one line).  If the beginning of the file is
    reached first, the whole file is read and its last line returned.

    :param fh: seekable file object supporting end-relative seeks
        (e.g. opened in binary mode, or ``BytesIO``).
    :param offset: initial number of bytes to look back from the end;
        values below 1 are treated as 1.
    :return: the last line, including its trailing newline if it has
        one, or an empty string/bytes value for an empty file.
    """
    # A non-positive offset would never grow when doubled and would
    # loop forever, so clamp it.
    offset = max(offset, 1)

    fh.seek(0, 2)
    size = fh.tell()

    while offset < size:
        fh.seek(-offset, 2)
        lines = fh.readlines()
        # More than one line means a full line break was seen, so the
        # final element is the complete last line.
        if len(lines) > 1:
            return lines[-1]
        offset *= 2

    fh.seek(0, 0)
    lines = fh.readlines()
    if not lines:
        # Empty file: read(0) gives an empty value of the file's own
        # type (bytes or str) instead of raising IndexError.
        return fh.read(0)
    return lines[-1]
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class BlockLoader(object):
|
class BlockLoader(object):
|
||||||
"""
|
"""
|
||||||
|
@ -53,6 +53,21 @@ org,iana)/_js/2013.1/jquery.js 20140126201307 https://www.iana.org/_js/2013.1/jq
|
|||||||
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
|
org,iana)/about 20140126200706 http://www.iana.org/about text/html 200 6G77LZKFAVKH4PCWWKMW6TRJPSHWUBI3 - - 2962 483588 iana.warc.gz
|
||||||
|
|
||||||
|
|
||||||
|
# Test at boundary
|
||||||
|
>>> print_binsearch_results('a)/', iter_exact)
|
||||||
|
>>> print_binsearch_results_range('a)/', 'a-', iter_range)
|
||||||
|
|
||||||
|
>>> print_binsearch_results_range('a)/', 'org,iana)/_css/2013.1/fonts/inconsolata.otf ', iter_range)
|
||||||
|
org,iana)/ 20140126200624 http://www.iana.org/ text/html 200 OSSAPWJ23L56IYVRW3GFEAR4MCJMGPTB - - 2258 334 iana.warc.gz
|
||||||
|
|
||||||
|
>>> print_binsearch_results('z)/', iter_exact)
|
||||||
|
>>> print_binsearch_results_range('z)/', 'z-', iter_range)
|
||||||
|
|
||||||
|
>>> print_binsearch_results_range('org,iana)/protocols', 'z-', iter_range)
|
||||||
|
org,iana)/protocols 20140126200715 http://www.iana.org/protocols text/html 200 IRUJZEUAXOUUG224ZMI4VWTUPJX6XJTT - - 63663 496277 iana.warc.gz
|
||||||
|
org,iana)/time-zones 20140126200737 http://www.iana.org/time-zones text/html 200 4Z27MYWOSXY2XDRAJRW7WRMT56LXDD4R - - 2449 569675 iana.warc.gz
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@ -82,6 +82,24 @@ True
|
|||||||
# length too long
|
# length too long
|
||||||
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
|
>>> extract_post_query('POST', 'application/x-www-form-urlencoded', len(post_data) + 4, BytesIO(post_data))
|
||||||
'foo=bar&dir=/baz'
|
'foo=bar&dir=/baz'
|
||||||
|
|
||||||
|
|
||||||
|
# test read_last_line
|
||||||
|
>>> read_last_line(BytesIO('A\nB\nC'))
|
||||||
|
'C'
|
||||||
|
|
||||||
|
>>> read_last_line(BytesIO('Some Line\nLonger Line\nLongest Last Line LL'), offset=8)
|
||||||
|
'Longest Last Line LL'
|
||||||
|
|
||||||
|
>>> read_last_line(BytesIO('A\nBC'))
|
||||||
|
'BC'
|
||||||
|
|
||||||
|
>>> read_last_line(BytesIO('A\nBC\n'))
|
||||||
|
'BC\n'
|
||||||
|
|
||||||
|
>>> read_last_line(BytesIO('ABC'))
|
||||||
|
'ABC'
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -91,6 +109,7 @@ import os
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||||
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
||||||
|
from pywb.utils.loaders import read_last_line
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user