From 1754f1583120fecd5e30e4cbf21ec4ee89c48353 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 22 Feb 2014 11:08:46 -0800 Subject: [PATCH] Combine FileLoader/HttpLoader into a single BlockLoader which delegates based on scheme --- pywb/cdx/zipnum.py | 10 ++++-- pywb/utils/loaders.py | 63 +++++++++++++++++++-------------- pywb/utils/test/loaders_test.py | 8 ++--- pywb/warc/recordloader.py | 30 +++++----------- 4 files changed, 57 insertions(+), 54 deletions(-) diff --git a/pywb/cdx/zipnum.py b/pywb/cdx/zipnum.py index 5582177a..91db9e15 100644 --- a/pywb/cdx/zipnum.py +++ b/pywb/cdx/zipnum.py @@ -7,7 +7,7 @@ from cStringIO import StringIO from cdxsource import CDXSource from cdxobject import IDXObject -from pywb.utils.loaders import FileLoader, LimitReader +from pywb.utils.loaders import BlockLoader from pywb.utils.loaders import SeekableTextFileReader from pywb.utils.bufferedreaders import gzip_decompressor from pywb.utils.binsearch import iter_range, linearsearch @@ -101,7 +101,11 @@ class ZipNumCluster(CDXSource): if blocks: yield self.block_to_cdx_iter(blocks, ranges, params) - blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'], 1) + blocks = ZipBlocks(idx['part'], + idx['offset'], + idx['length'], + 1) + ranges = [blocks.length] if blocks: @@ -130,7 +134,7 @@ class ZipNumCluster(CDXSource): msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}' logging.debug(msg.format(b=blocks, loc=location)) - reader = FileLoader().load(location, blocks.offset, blocks.length) + reader = BlockLoader().load(location, blocks.offset, blocks.length) def decompress_block(range_): decomp = gzip_decompressor() diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py index 5572a3c2..a117f539 100644 --- a/pywb/utils/loaders.py +++ b/pywb/utils/loaders.py @@ -9,18 +9,50 @@ import urllib2 import time +def is_http(filename): + return any(filename.startswith(x) for x in ['http://', 'https://']) + + #================================================================= -# load a reader from http -#================================================================= -class HttpLoader(object): +class BlockLoader(object): """ - Load a file-like reader over http using range requests - and an optional cookie created via a cookie_maker + a loader which can stream blocks of content + given a uri, offset and optional length. + Currently supports: http/https and file/local file system """ def __init__(self, cookie_maker=None): self.cookie_maker = cookie_maker def load(self, url, offset, length): + """ + Determine loading method based on uri + """ + if is_http(url): + return self.load_http(url, offset, length) + else: + return self.load_file(url, offset, length) + + def load_file(self, url, offset, length): + """ + Load a file-like reader from the local file system + """ + + if url.startswith('file://'): + url = url[len('file://'):] + + afile = open(url, 'rb') + afile.seek(offset) + + if length > 0: + return LimitReader(afile, length) + else: + return afile + + def load_http(self, url, offset, length): + """ + Load a file-like reader over http using range requests + and an optional cookie created via a cookie_maker + """ if length > 0: range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1) else: @@ -71,27 +103,6 @@ class HMACCookieMaker(object): return cookie -#================================================================= -# load a reader from local filesystem -#================================================================= -class FileLoader(object): - """ - Load a file-like reader from the local file system - """ - - def load(self, url, offset, length): - if url.startswith('file://'): - url = url[len('file://'):] - - afile = open(url, 'rb') - afile.seek(offset) - - if length > 0: - return LimitReader(afile, length) - else: - return afile - - #================================================================= # Limit Reader #================================================================= diff --git a/pywb/utils/test/loaders_test.py b/pywb/utils/test/loaders_test.py index 1d26ff24..7dc42d83 100644 --- a/pywb/utils/test/loaders_test.py +++ b/pywb/utils/test/loaders_test.py @@ -10,9 +10,9 @@ >>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20]) 'efghji' -# FileLoader Tests (includes LimitReader) +# BlockLoader Tests (includes LimitReader) # Ensure attempt to read more than 100 bytes, reads exactly 100 bytes ->>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) +>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400')) 100 # SeekableTextFileReader Test @@ -34,7 +34,7 @@ >>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline() ' CDX N b a m s k r M S V g\\n' ->>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() +>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read() 'Example Domain' # test very small block size @@ -53,7 +53,7 @@ #================================================================= import os import StringIO -from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker +from pywb.utils.loaders import BlockLoader, HMACCookieMaker from pywb.utils.loaders import LimitReader, SeekableTextFileReader from pywb.utils.bufferedreaders import DecompressingBufferedReader diff --git a/pywb/warc/recordloader.py b/pywb/warc/recordloader.py index 9f595301..446e0da3 100644 --- a/pywb/warc/recordloader.py +++ b/pywb/warc/recordloader.py @@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders from pywb.utils.statusandheaders import StatusAndHeadersParser from pywb.utils.statusandheaders import StatusAndHeadersParserException -from pywb.utils.loaders import FileLoader, HttpLoader +from pywb.utils.loaders import BlockLoader from pywb.utils.bufferedreaders import DecompressingBufferedReader #================================================================= @@ -32,23 +32,11 @@ class ArcWarcRecordLoader: ARC_HEADERS = ["uri", "ip-address", "creation-date", "content-type", "length"] - @staticmethod - def create_default_loaders(cookie_maker=None): - http = HttpLoader(cookie_maker) - file = FileLoader() - return { - 'http': http, - 'https': http, - 'file': file, - '': file - } - - def __init__(self, loaders={}, cookie_maker=None, block_size=8192): - self.loaders = loaders - - if not self.loaders: - self.loaders = self.create_default_loaders(cookie_maker) + def __init__(self, loader=None, cookie_maker=None, block_size=8192): + if not loader: + loader = BlockLoader(cookie_maker) + self.loader = loader self.block_size = block_size self.arc_parser = ARCHeadersParser(self.ARC_HEADERS) @@ -60,16 +48,16 @@ class ArcWarcRecordLoader: def load(self, url, offset, length): url_parts = urlparse.urlsplit(url) - loader = self.loaders.get(url_parts.scheme) - if not loader: - raise ArchiveLoadFailed('Unknown Protocol', url) + #loader = self.loaders.get(url_parts.scheme) + #if not loader: + # raise ArchiveLoadFailed('Unknown Protocol', url) try: length = int(length) except: length = -1 - raw = loader.load(url, long(offset), length) + raw = self.loader.load(url, long(offset), length) decomp_type = 'gzip'