mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
optimize zipnum to support loading multiple contiguous blocks,
decompressing each one individually. #17
This commit is contained in:
parent
8e840ccaaf
commit
434fd23a95
@ -39,9 +39,8 @@ def main(paths=None):
|
|||||||
try:
|
try:
|
||||||
params = extract_params_from_wsgi_env(env)
|
params = extract_params_from_wsgi_env(env)
|
||||||
response = cdxserver.load_cdx(**params)
|
response = cdxserver.load_cdx(**params)
|
||||||
start_response('200 OK', [('Content-Type', 'text/plain')])
|
|
||||||
|
|
||||||
response = list(response)
|
start_response('200 OK', [('Content-Type', 'text/plain')])
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
import traceback
|
import traceback
|
||||||
|
@ -2,18 +2,19 @@ import os
|
|||||||
import collections
|
import collections
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
|
from cStringIO import StringIO
|
||||||
|
|
||||||
from cdxsource import CDXSource
|
from cdxsource import CDXSource
|
||||||
from cdxobject import IDXObject
|
from cdxobject import IDXObject
|
||||||
|
|
||||||
from pywb.utils.loaders import FileLoader
|
from pywb.utils.loaders import FileLoader, LimitReader
|
||||||
from pywb.utils.loaders import SeekableTextFileReader
|
from pywb.utils.loaders import SeekableTextFileReader
|
||||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
from pywb.utils.bufferedreaders import gzip_decompressor
|
||||||
from pywb.utils.binsearch import iter_range, linearsearch
|
from pywb.utils.binsearch import iter_range, linearsearch
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
class ZipBlock:
|
class ZipBlocks:
|
||||||
def __init__(self, part, offset, length, count):
|
def __init__(self, part, offset, length, count):
|
||||||
self.part = part
|
self.part = part
|
||||||
self.offset = offset
|
self.offset = offset
|
||||||
@ -81,35 +82,38 @@ class ZipNumCluster(CDXSource):
|
|||||||
return gen_cdx()
|
return gen_cdx()
|
||||||
|
|
||||||
def idx_to_cdx(self, idx_iter, params):
|
def idx_to_cdx(self, idx_iter, params):
|
||||||
block = None
|
blocks = None
|
||||||
max_blocks = 1
|
max_blocks = 10
|
||||||
|
ranges = []
|
||||||
|
|
||||||
for idx in idx_iter:
|
for idx in idx_iter:
|
||||||
idx = IDXObject(idx)
|
idx = IDXObject(idx)
|
||||||
|
|
||||||
if (block and block.part == idx['part'] and
|
if (blocks and blocks.part == idx['part'] and
|
||||||
block.offset + block.length == idx['offset'] and
|
blocks.offset + blocks.length == idx['offset'] and
|
||||||
block.count < max_blocks):
|
blocks.count < max_blocks):
|
||||||
|
|
||||||
block.length += idx['length']
|
blocks.length += idx['length']
|
||||||
block.count += 1
|
blocks.count += 1
|
||||||
|
ranges.append(idx['length'])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if block:
|
if blocks:
|
||||||
yield self.block_to_cdx_iter(block, params)
|
yield self.block_to_cdx_iter(blocks, ranges, params)
|
||||||
|
|
||||||
block = ZipBlock(idx['part'], idx['offset'], idx['length'], 1)
|
blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'], 1)
|
||||||
|
ranges = [blocks.length]
|
||||||
|
|
||||||
if block:
|
if blocks:
|
||||||
yield self.block_to_cdx_iter(block, params)
|
yield self.block_to_cdx_iter(blocks, ranges, params)
|
||||||
|
|
||||||
def block_to_cdx_iter(self, block, params):
|
def block_to_cdx_iter(self, blocks, ranges, params):
|
||||||
last_exc = None
|
last_exc = None
|
||||||
last_traceback = None
|
last_traceback = None
|
||||||
|
|
||||||
for location in self.lookup_loc(block.part):
|
for location in self.lookup_loc(blocks.part):
|
||||||
try:
|
try:
|
||||||
return self.load_block(location, block, params)
|
return self.load_blocks(location, blocks, ranges, params)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
last_exc = exc
|
last_exc = exc
|
||||||
import sys
|
import sys
|
||||||
@ -120,22 +124,20 @@ class ZipNumCluster(CDXSource):
|
|||||||
else:
|
else:
|
||||||
raise Exception('No Locations Found for: ' + block.part)
|
# The parameter is named `blocks` after the rename; `block` is unbound here and would raise NameError.
raise Exception('No Locations Found for: ' + blocks.part)
|
||||||
|
|
||||||
def load_block(self, location, block, params):
|
def load_blocks(self, location, blocks, ranges, params):
|
||||||
|
|
||||||
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
|
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
|
||||||
msg = 'Loading {block.count} blocks from {location}:\
|
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
||||||
{block.offset}+{block.length}'.format(block=block, location=location)
|
logging.debug(msg.format(b=blocks, loc=location))
|
||||||
logging.debug(msg)
|
|
||||||
|
|
||||||
reader = FileLoader().load(location,
|
reader = FileLoader().load(location, blocks.offset, blocks.length)
|
||||||
block.offset,
|
|
||||||
block.length)
|
|
||||||
|
|
||||||
# read whole zip block into buffer
|
def decompress_block(range_):
|
||||||
reader = DecompressingBufferedReader(reader,
|
decomp = gzip_decompressor()
|
||||||
decomp_type='gzip',
|
buff = decomp.decompress(reader.read(range_))
|
||||||
block_size=block.length)
|
return readline_to_iter(StringIO(buff))
|
||||||
|
|
||||||
iter_ = readline_to_iter(reader)
|
iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
|
||||||
|
|
||||||
# start bound
|
# start bound
|
||||||
iter_ = linearsearch(iter_, params['key'])
|
iter_ = linearsearch(iter_, params['key'])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user