From 8ad66249c7ea190cf72f62594763f151687dde60 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 18 May 2016 16:34:58 -0700 Subject: [PATCH] blockloader: support for loader profiles, specified via 'profile+scheme://...' urls. Profiles specify additional settings (eg. credentials) that are not included in the url. To enable custom profiles, call BlockLoader.set_profile_loader(callable) with a callable that will return custom config, addresses #180 --- pywb/utils/loaders.py | 89 ++++++++++++++++++++++----------- pywb/utils/test/test_loaders.py | 4 +- pywb/warc/recordloader.py | 2 +- 3 files changed, 64 insertions(+), 31 deletions(-) diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 376a6224..6ce7bbac 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -236,23 +236,34 @@ def read_last_line(fh, offset=256): #================================================================= -class BlockLoader(object): +class BaseLoader(object): + def __init__(self, **kwargs): + pass + + def load(self, url, offset=0, length=-1): + raise NotImplemented() + + +#================================================================= +class BlockLoader(BaseLoader): """ a loader which can stream blocks of content given a uri, offset and optional length. 
Currently supports: http/https and file/local file system """ - def __init__(self, *args, **kwargs): + loaders = {} + profile_loader = None + + def __init__(self, **kwargs): self.cached = {} - self.args = args self.kwargs = kwargs def load(self, url, offset=0, length=-1): - loader = self._get_loader_for(url) + loader = self._get_loader_for_url(url) return loader.load(url, offset, length) - def _get_loader_for(self, url): + def _get_loader_for_url(self, url): """ Determine loading method based on uri """ @@ -266,14 +277,41 @@ class BlockLoader(object): if loader: return loader - loader_cls = LOADERS.get(type_) - if not loader_cls: - raise IOError('No Loader for type: ' + type_) + if '+' in type_: + profile_name, scheme = type_.split('+', 1) + else: + profile_name = '' + scheme = type_ + + loader_cls = self._get_loader_class_for_type(scheme) + + if not loader_cls: + raise IOError('No Loader for type: ' + scheme) + + profile = self.kwargs + + if self.profile_loader: + profile = self.profile_loader(profile_name, scheme) + + loader = loader_cls(**profile) - loader = loader_cls(*self.args, **self.kwargs) self.cached[type_] = loader return loader + def _get_loader_class_for_type(self, type_): + loader_cls = self.loaders.get(type_) + return loader_cls + + @staticmethod + def init_default_loaders(): + BlockLoader.loaders['http'] = HttpLoader + BlockLoader.loaders['https'] = HttpLoader + BlockLoader.loaders['s3'] = S3Loader + BlockLoader.loaders['file'] = LocalFileLoader + + @staticmethod + def set_profile_loader(src): + BlockLoader.profile_loader = src @staticmethod def _make_range_header(offset, length): @@ -286,10 +324,7 @@ class BlockLoader(object): #================================================================= -class LocalFileLoader(object): - def __init__(self, *args, **kwargs): - pass - +class LocalFileLoader(BaseLoader): def load(self, url, offset=0, length=-1): """ Load a file-like reader from the local file system @@ -329,9 +364,11 @@ class 
LocalFileLoader(object): #================================================================= -class HttpLoader(object): - def __init__(self, cookie_maker=None, *args, **kwargs): - self.cookie_maker = cookie_maker +class HttpLoader(BaseLoader): + def __init__(self, **kwargs): + self.cookie_maker = kwargs.get('cookie_maker') + if not self.cookie_maker: + self.cookie_maker = kwargs.get('cookie') self.session = None def load(self, url, offset, length): @@ -357,17 +394,19 @@ class HttpLoader(object): #================================================================= -class S3Loader(object): - def __init__(self, *args, **kwargs): +class S3Loader(BaseLoader): + def __init__(self, **kwargs): self.s3conn = None + self.aws_access_key_id = kwargs.get('aws_access_key_id') + self.aws_secret_access_key = kwargs.get('aws_secret_access_key') def load(self, url, offset, length): if not s3_avail: #pragma: no cover raise IOError('To load from s3 paths, ' + 'you must install boto: pip install boto') - aws_access_key_id = None - aws_secret_access_key = None + aws_access_key_id = self.aws_access_key_id + aws_secret_access_key = self.aws_secret_access_key parts = urlsplit(url) @@ -495,12 +534,6 @@ class LimitReader(object): return stream - -#================================================================= -LOADERS = {'http': HttpLoader, - 'https': HttpLoader, - 's3': S3Loader, - 'file': LocalFileLoader - } - +# ============================================================================ +BlockLoader.init_default_loaders() diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py index 0a751712..8e8595bd 100644 --- a/pywb/utils/test/test_loaders.py +++ b/pywb/utils/test/test_loaders.py @@ -37,11 +37,11 @@ Traceback (most recent call last): IOError: [Errno 2] No such file or directory: '_x_no_such_file_' # HMAC Cookie Maker ->>> print_str(BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()) +>>> 
print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()) 'Example Domain' # fixed cookie, range request ->>> print_str(BlockLoader('some=value').load('http://example.com', 41, 14).read()) +>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read()) 'Example Domain' # range request diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 06a3c79e..402d1524 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -57,7 +57,7 @@ class ArcWarcRecordLoader(object): def __init__(self, loader=None, cookie_maker=None, block_size=8192, verify_http=True): if not loader: - loader = BlockLoader(cookie_maker) + loader = BlockLoader(cookie_maker=cookie_maker) self.loader = loader self.block_size = block_size