1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-24 06:59:52 +01:00

optimize zipnum to support loading multiple contiguous blocks,

decompressing each one individually. #17
This commit is contained in:
Ilya Kreymer 2014-02-22 10:45:30 -08:00
parent 8e840ccaaf
commit 434fd23a95
2 changed files with 33 additions and 32 deletions

View File

@ -39,9 +39,8 @@ def main(paths=None):
try: try:
params = extract_params_from_wsgi_env(env) params = extract_params_from_wsgi_env(env)
response = cdxserver.load_cdx(**params) response = cdxserver.load_cdx(**params)
start_response('200 OK', [('Content-Type', 'text/plain')])
response = list(response) start_response('200 OK', [('Content-Type', 'text/plain')])
except Exception as exc: except Exception as exc:
import traceback import traceback

View File

@ -2,18 +2,19 @@ import os
import collections import collections
import itertools import itertools
import logging import logging
from cStringIO import StringIO
from cdxsource import CDXSource from cdxsource import CDXSource
from cdxobject import IDXObject from cdxobject import IDXObject
from pywb.utils.loaders import FileLoader from pywb.utils.loaders import FileLoader, LimitReader
from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.loaders import SeekableTextFileReader
from pywb.utils.bufferedreaders import DecompressingBufferedReader from pywb.utils.bufferedreaders import gzip_decompressor
from pywb.utils.binsearch import iter_range, linearsearch from pywb.utils.binsearch import iter_range, linearsearch
#================================================================= #=================================================================
class ZipBlock: class ZipBlocks:
def __init__(self, part, offset, length, count): def __init__(self, part, offset, length, count):
self.part = part self.part = part
self.offset = offset self.offset = offset
@ -81,35 +82,38 @@ class ZipNumCluster(CDXSource):
return gen_cdx() return gen_cdx()
def idx_to_cdx(self, idx_iter, params): def idx_to_cdx(self, idx_iter, params):
block = None blocks = None
max_blocks = 1 max_blocks = 10
ranges = []
for idx in idx_iter: for idx in idx_iter:
idx = IDXObject(idx) idx = IDXObject(idx)
if (block and block.part == idx['part'] and if (blocks and blocks.part == idx['part'] and
block.offset + block.length == idx['offset'] and blocks.offset + blocks.length == idx['offset'] and
block.count < max_blocks): blocks.count < max_blocks):
block.length += idx['length'] blocks.length += idx['length']
block.count += 1 blocks.count += 1
ranges.append(idx['length'])
else: else:
if block: if blocks:
yield self.block_to_cdx_iter(block, params) yield self.block_to_cdx_iter(blocks, ranges, params)
block = ZipBlock(idx['part'], idx['offset'], idx['length'], 1) blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'], 1)
ranges = [blocks.length]
if block: if blocks:
yield self.block_to_cdx_iter(block, params) yield self.block_to_cdx_iter(blocks, ranges, params)
def block_to_cdx_iter(self, block, params): def block_to_cdx_iter(self, blocks, ranges, params):
last_exc = None last_exc = None
last_traceback = None last_traceback = None
for location in self.lookup_loc(block.part): for location in self.lookup_loc(blocks.part):
try: try:
return self.load_block(location, block, params) return self.load_blocks(location, blocks, ranges, params)
except Exception as exc: except Exception as exc:
last_exc = exc last_exc = exc
import sys import sys
@ -120,22 +124,20 @@ class ZipNumCluster(CDXSource):
else: else:
raise Exception('No Locations Found for: ' + block.part) raise Exception('No Locations Found for: ' + block.part)
def load_block(self, location, block, params): def load_blocks(self, location, blocks, ranges, params):
if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG): if (logging.getLogger().getEffectiveLevel() <= logging.DEBUG):
msg = 'Loading {block.count} blocks from {location}:\ msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
{block.offset}+{block.length}'.format(block=block, location=location) logging.debug(msg.format(b=blocks, loc=location))
logging.debug(msg)
reader = FileLoader().load(location, reader = FileLoader().load(location, blocks.offset, blocks.length)
block.offset,
block.length)
# read whole zip block into buffer def decompress_block(range_):
reader = DecompressingBufferedReader(reader, decomp = gzip_decompressor()
decomp_type='gzip', buff = decomp.decompress(reader.read(range_))
block_size=block.length) return readline_to_iter(StringIO(buff))
iter_ = readline_to_iter(reader) iter_ = itertools.chain(*itertools.imap(decompress_block, ranges))
# start bound # start bound
iter_ = linearsearch(iter_, params['key']) iter_ = linearsearch(iter_, params['key'])