
add zipnum location reloading support

default to 10 min interval #17
Ilya Kreymer 2014-02-22 16:42:42 -08:00
parent 1754f15831
commit d8d7435d77
2 changed files with 68 additions and 24 deletions

File 1 (CDX server module):

@@ -6,6 +6,8 @@ from zipnum import ZipNumCluster
 from cdxobject import CDXObject, CaptureNotFoundException, CDXException
 from cdxdomainspecific import load_domain_specific_cdx_rules
 
+from pywb.utils.loaders import is_http
+
 from itertools import chain
 import logging
 import os
@@ -82,7 +84,7 @@ class CDXServer(BaseCDXServer):
     def __init__(self, paths, **kwargs):
         super(CDXServer, self).__init__(**kwargs)
-        self.sources = create_cdx_sources(paths)
+        self.sources = create_cdx_sources(paths, kwargs.get('config'))
 
     def load_cdx(self, **params):
         # if key not set, assume 'url' is set and needs canonicalization
@@ -154,8 +156,7 @@ def create_cdx_server(config, ds_rules_file=None):
     logging.debug('CDX Surt-Ordered? ' + str(surt_ordered))
 
-    if (isinstance(paths, str) and
-            any(paths.startswith(x) for x in ['http://', 'https://'])):
+    if isinstance(paths, str) and is_http(paths):
         server_cls = RemoteCDXServer
     else:
         server_cls = CDXServer
@@ -167,7 +168,7 @@ def create_cdx_server(config, ds_rules_file=None):
 #=================================================================
-def create_cdx_sources(paths):
+def create_cdx_sources(paths, config=None):
     sources = []
 
     if not isinstance(paths, list):
@@ -175,13 +176,13 @@ def create_cdx_sources(paths):
     for path in paths:
         if isinstance(path, CDXSource):
-            add_cdx_source(sources, path)
+            add_cdx_source(sources, path, config)
         elif isinstance(path, str):
             if os.path.isdir(path):
                 for file in os.listdir(path):
-                    add_cdx_source(sources, path + file)
+                    add_cdx_source(sources, path + file, config)
             else:
-                add_cdx_source(sources, path)
+                add_cdx_source(sources, path, config)
 
     if len(sources) == 0:
         logging.exception('No CDX Sources Found from: ' + str(sources))
@@ -190,9 +191,9 @@ def create_cdx_sources(paths):
 #=================================================================
-def add_cdx_source(sources, source):
+def add_cdx_source(sources, source, config):
     if not isinstance(source, CDXSource):
-        source = create_cdx_source(source)
+        source = create_cdx_source(source, config)
         if not source:
             return
@@ -201,15 +202,15 @@ def add_cdx_source(sources, source):
 #=================================================================
-def create_cdx_source(filename):
-    if filename.startswith('http://') or filename.startswith('https://'):
+def create_cdx_source(filename, config):
+    if is_http(filename):
         return RemoteCDXSource(filename)
 
     if filename.endswith('.cdx'):
         return CDXFile(filename)
 
     if filename.endswith('.summary'):
-        return ZipNumCluster(filename)
+        return ZipNumCluster(filename, config)
 
     return None
     #TODO: support zipnum
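For reference, a hedged sketch of how the source dispatch behaves after this change. The paths and the config dict below are made up for illustration; only the function name, the config pass-through, and the return types come from the diff:

config = {'reload_interval': 10, 'max_blocks': 50}      # hypothetical config dict

create_cdx_source('http://cdx.example.com/cdx', config)  # is_http() -> RemoteCDXSource
create_cdx_source('/indexes/index.cdx', config)          # .cdx -> CDXFile
create_cdx_source('/indexes/cluster.summary', config)    # .summary -> ZipNumCluster(filename, config)
create_cdx_source('/indexes/notes.txt', config)          # unrecognized -> None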

File 2 (zipnum module):

@@ -3,6 +3,7 @@ import collections
 import itertools
 import logging
 from cStringIO import StringIO
+import datetime
 
 from cdxsource import CDXSource
 from cdxobject import IDXObject
@@ -38,29 +39,72 @@ def readline_to_iter(stream):
 #=================================================================
 class ZipNumCluster(CDXSource):
-    def __init__(self, summary, loc=None):
+    DEFAULT_RELOAD_INTERVAL = 10  # in minutes
+    DEFAULT_MAX_BLOCKS = 50
+
+    def __init__(self, summary, config=None):
+        loc = None
+        cookie_maker = None
+        self.max_blocks = self.DEFAULT_MAX_BLOCKS
+        reload_ival = self.DEFAULT_RELOAD_INTERVAL
+
+        if config:
+            loc = config.get('zipnum_loc')
+            cookie_maker = config.get('cookie_maker')
+            self.max_blocks = config.get('max_blocks',
+                                         self.max_blocks)
+            reload_ival = config.get('reload_interval', reload_ival)
+
         if not loc:
             splits = os.path.splitext(summary)
             loc = splits[0] + '.loc'
 
         self.summary = summary
-        self.loc = loc
-        self.loc_map = self.load_loc(loc)
+        self.loc_filename = loc
+
+        # initial loc map
+        self.loc_map = {}
+        self.load_loc()
+
+        # reload interval
+        self.loc_update_time = datetime.datetime.now()
+        self.reload_interval = datetime.timedelta(minutes=reload_ival)
+
+        self.blk_loader = BlockLoader(cookie_maker=cookie_maker)
 
-    @staticmethod
-    def load_loc(loc_file):
-        loc_map = {}
-        with open(loc_file) as fh:
+    def load_loc(self):
+        logging.debug('Loading loc from: ' + self.loc_filename)
+        with open(self.loc_filename) as fh:
             for line in fh:
                 parts = line.rstrip().split('\t')
-                loc_map[parts[0]] = parts[1:]
+                self.loc_map[parts[0]] = parts[1:]
 
-        return loc_map
+    @staticmethod
+    def reload_timed(timestamp, val, delta, func):
+        now = datetime.datetime.now()
+        if now - timestamp >= delta:
+            func()
+            return now
+
+        return None
+
+    def reload_loc(self):
+        newtime = self.reload_timed(self.loc_update_time,
+                                    self.loc_map,
+                                    self.reload_interval,
+                                    self.load_loc)
+        if newtime:
+            self.loc_update_time = newtime
 
     def lookup_loc(self, part):
         return self.loc_map[part]
 
     def load_cdx(self, params):
+        self.reload_loc()
+
         reader = SeekableTextFileReader(self.summary)
 
         idx_iter = iter_range(reader,
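The core of this change is the timed reload: every load_cdx() call now goes through reload_loc(), which re-reads the .loc file once reload_interval has elapsed, so new shard locations are picked up without restarting the server. A minimal standalone sketch of the same idiom, using illustrative names that are not part of pywb:

import datetime

class TimedReload(object):
    """Illustrative helper: re-run func() once `interval_minutes` have passed."""
    def __init__(self, func, interval_minutes=10):
        self.func = func
        self.interval = datetime.timedelta(minutes=interval_minutes)
        self.last_run = datetime.datetime.now()
        self.func()  # initial load

    def check(self):
        # Called on every request; a cheap no-op until the interval elapses.
        now = datetime.datetime.now()
        if now - self.last_run >= self.interval:
            self.func()
            self.last_run = now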
@@ -83,7 +127,6 @@ class ZipNumCluster(CDXSource):
     def idx_to_cdx(self, idx_iter, params):
         blocks = None
-        max_blocks = 10
         ranges = []
 
         for idx in idx_iter:
@@ -91,7 +134,7 @@ class ZipNumCluster(CDXSource):
             if (blocks and blocks.part == idx['part'] and
                 blocks.offset + blocks.length == idx['offset'] and
-                blocks.count < max_blocks):
+                blocks.count < self.max_blocks):
 
                 blocks.length += idx['length']
                 blocks.count += 1
@@ -134,7 +177,7 @@ class ZipNumCluster(CDXSource):
         msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
         logging.debug(msg.format(b=blocks, loc=location))
 
-        reader = BlockLoader().load(location, blocks.offset, blocks.length)
+        reader = self.blk_loader.load(location, blocks.offset, blocks.length)
 
         def decompress_block(range_):
             decomp = gzip_decompressor()
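Taken together, a ZipNumCluster can now be constructed from the shared config dict instead of an explicit loc path. A hedged usage sketch; the file paths are hypothetical, while the key names are the ones read in __init__ above:

config = {
    'zipnum_loc': '/data/cluster.loc',   # optional; defaults to <summary base> + '.loc'
    'max_blocks': 50,                    # cap on merged gzip blocks per query
    'reload_interval': 10,               # minutes between .loc reloads
    # 'cookie_maker': ...                # optional, passed to BlockLoader
}
cluster = ZipNumCluster('/data/cluster.summary', config)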