mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
Combine FileLoader/HttpLoader into a single BlockLoader which
delegates based on scheme
This commit is contained in:
parent
434fd23a95
commit
1754f15831
@ -7,7 +7,7 @@ from cStringIO import StringIO
|
||||
from cdxsource import CDXSource
|
||||
from cdxobject import IDXObject
|
||||
|
||||
from pywb.utils.loaders import FileLoader, LimitReader
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
from pywb.utils.loaders import SeekableTextFileReader
|
||||
from pywb.utils.bufferedreaders import gzip_decompressor
|
||||
from pywb.utils.binsearch import iter_range, linearsearch
|
||||
@ -101,7 +101,11 @@ class ZipNumCluster(CDXSource):
|
||||
if blocks:
|
||||
yield self.block_to_cdx_iter(blocks, ranges, params)
|
||||
|
||||
blocks = ZipBlocks(idx['part'], idx['offset'], idx['length'], 1)
|
||||
blocks = ZipBlocks(idx['part'],
|
||||
idx['offset'],
|
||||
idx['length'],
|
||||
1)
|
||||
|
||||
ranges = [blocks.length]
|
||||
|
||||
if blocks:
|
||||
@ -130,7 +134,7 @@ class ZipNumCluster(CDXSource):
|
||||
msg = 'Loading {b.count} blocks from {loc}:{b.offset}+{b.length}'
|
||||
logging.debug(msg.format(b=blocks, loc=location))
|
||||
|
||||
reader = FileLoader().load(location, blocks.offset, blocks.length)
|
||||
reader = BlockLoader().load(location, blocks.offset, blocks.length)
|
||||
|
||||
def decompress_block(range_):
|
||||
decomp = gzip_decompressor()
|
||||
|
@ -9,18 +9,50 @@ import urllib2
|
||||
import time
|
||||
|
||||
|
||||
def is_http(filename):
    """
    Return True if filename is an http:// or https:// url.

    The scheme is compared case-insensitively (url schemes are
    case-insensitive), so 'HTTP://host/file' is treated as http rather
    than falling through to local file loading.
    Anything else, including 'file://' urls, plain paths and the
    empty string, returns False.
    """
    # startswith accepts a tuple of prefixes: one call, no temporary list
    return filename.lower().startswith(('http://', 'https://'))
|
||||
|
||||
|
||||
#=================================================================
|
||||
# load a reader from http/https or the local file system
|
||||
#=================================================================
|
||||
class HttpLoader(object):
|
||||
class BlockLoader(object):
|
||||
"""
|
||||
Load a file-like reader over http using range requests
|
||||
and an optional cookie created via a cookie_maker
|
||||
a loader which can stream blocks of content
|
||||
given a uri, offset and optional length.
|
||||
Currently supports: http/https and file/local file system
|
||||
"""
|
||||
def __init__(self, cookie_maker=None):
|
||||
self.cookie_maker = cookie_maker
|
||||
|
||||
def load(self, url, offset, length):
    """
    Pick the loading method from the uri and delegate to it.

    http/https urls (as decided by is_http) go to load_http;
    everything else goes to load_file. The arguments are passed
    through unchanged and the chosen loader's result is returned.
    """
    loader_method = self.load_http if is_http(url) else self.load_file
    return loader_method(url, offset, length)
|
||||
|
||||
def load_file(self, url, offset, length):
    """
    Load a file-like reader from the local file system.

    url is either a plain path or a path prefixed with 'file://';
    the prefix is stripped before opening. The file is opened in
    binary mode and positioned at offset.

    Returns a LimitReader wrapping the file and capped at length
    when length > 0; otherwise returns the open file object itself.
    The underlying file is returned open; nothing here closes it
    on the success path.

    Errors from open() or seek() propagate to the caller.
    """
    if url.startswith('file://'):
        url = url[len('file://'):]

    afile = open(url, 'rb')
    try:
        afile.seek(offset)
    except Exception:
        # don't leak the handle when the offset can't be applied
        afile.close()
        raise

    if length > 0:
        return LimitReader(afile, length)
    else:
        return afile
|
||||
|
||||
def load_http(self, url, offset, length):
|
||||
"""
|
||||
Load a file-like reader over http using range requests
|
||||
and an optional cookie created via a cookie_maker
|
||||
"""
|
||||
if length > 0:
|
||||
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
|
||||
else:
|
||||
@ -71,27 +103,6 @@ class HMACCookieMaker(object):
|
||||
return cookie
|
||||
|
||||
|
||||
#=================================================================
|
||||
# load a reader from local filesystem
|
||||
#=================================================================
|
||||
class FileLoader(object):
|
||||
"""
|
||||
Load a file-like reader from the local file system
|
||||
"""
|
||||
|
||||
def load(self, url, offset, length):
|
||||
if url.startswith('file://'):
|
||||
url = url[len('file://'):]
|
||||
|
||||
afile = open(url, 'rb')
|
||||
afile.seek(offset)
|
||||
|
||||
if length > 0:
|
||||
return LimitReader(afile, length)
|
||||
else:
|
||||
return afile
|
||||
|
||||
|
||||
#=================================================================
|
||||
# Limit Reader
|
||||
#=================================================================
|
||||
|
@ -10,9 +10,9 @@
|
||||
>>> read_multiple(LimitReader(StringIO.StringIO('abcdefghjiklmnopqrstuvwxyz'), 10), [2, 2, 20])
|
||||
'efghji'
|
||||
|
||||
# FileLoader Tests (includes LimitReader)
|
||||
# BlockLoader Tests (includes LimitReader)
|
||||
# Ensure an attempt to read more than 100 bytes reads exactly 100 bytes
|
||||
>>> len(FileLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
|
||||
>>> len(BlockLoader().load(test_cdx_dir + 'iana.cdx', 0, 100).read('400'))
|
||||
100
|
||||
|
||||
# SeekableTextFileReader Test
|
||||
@ -34,7 +34,7 @@
|
||||
>>> DecompressingBufferedReader(open(test_cdx_dir + 'iana.cdx.gz', 'rb'), decomp_type = 'gzip').readline()
|
||||
' CDX N b a m s k r M S V g\\n'
|
||||
|
||||
>>> HttpLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
||||
>>> BlockLoader(HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read()
|
||||
'Example Domain'
|
||||
|
||||
# test very small block size
|
||||
@ -53,7 +53,7 @@
|
||||
#=================================================================
|
||||
import os
|
||||
import StringIO
|
||||
from pywb.utils.loaders import FileLoader, HttpLoader, HMACCookieMaker
|
||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker
|
||||
from pywb.utils.loaders import LimitReader, SeekableTextFileReader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
|
@ -6,7 +6,7 @@ from pywb.utils.statusandheaders import StatusAndHeaders
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParser
|
||||
from pywb.utils.statusandheaders import StatusAndHeadersParserException
|
||||
|
||||
from pywb.utils.loaders import FileLoader, HttpLoader
|
||||
from pywb.utils.loaders import BlockLoader
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
#=================================================================
|
||||
@ -32,23 +32,11 @@ class ArcWarcRecordLoader:
|
||||
ARC_HEADERS = ["uri", "ip-address", "creation-date",
|
||||
"content-type", "length"]
|
||||
|
||||
@staticmethod
|
||||
def create_default_loaders(cookie_maker=None):
|
||||
http = HttpLoader(cookie_maker)
|
||||
file = FileLoader()
|
||||
return {
|
||||
'http': http,
|
||||
'https': http,
|
||||
'file': file,
|
||||
'': file
|
||||
}
|
||||
|
||||
def __init__(self, loaders={}, cookie_maker=None, block_size=8192):
|
||||
self.loaders = loaders
|
||||
|
||||
if not self.loaders:
|
||||
self.loaders = self.create_default_loaders(cookie_maker)
|
||||
def __init__(self, loader=None, cookie_maker=None, block_size=8192):
|
||||
if not loader:
|
||||
loader = BlockLoader(cookie_maker)
|
||||
|
||||
self.loader = loader
|
||||
self.block_size = block_size
|
||||
|
||||
self.arc_parser = ARCHeadersParser(self.ARC_HEADERS)
|
||||
@ -60,16 +48,16 @@ class ArcWarcRecordLoader:
|
||||
def load(self, url, offset, length):
|
||||
url_parts = urlparse.urlsplit(url)
|
||||
|
||||
loader = self.loaders.get(url_parts.scheme)
|
||||
if not loader:
|
||||
raise ArchiveLoadFailed('Unknown Protocol', url)
|
||||
#loader = self.loaders.get(url_parts.scheme)
|
||||
#if not loader:
|
||||
# raise ArchiveLoadFailed('Unknown Protocol', url)
|
||||
|
||||
try:
|
||||
length = int(length)
|
||||
except:
|
||||
length = -1
|
||||
|
||||
raw = loader.load(url, long(offset), length)
|
||||
raw = self.loader.load(url, long(offset), length)
|
||||
|
||||
decomp_type = 'gzip'
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user