mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-25 23:47:47 +01:00
407 lines
10 KiB
Python
407 lines
10 KiB
Python
"""
|
|
This module provides loaders for local file system and over http
|
|
local and remote access
|
|
"""
|
|
|
|
import os
|
|
import hmac
|
|
import requests
|
|
|
|
import six
|
|
import six.moves.urllib.request as urllib_req
|
|
import six.moves.urllib.parse as urlparse
|
|
|
|
import time
|
|
import pkg_resources
|
|
|
|
from io import open, BytesIO
|
|
|
|
try:
|
|
from boto import connect_s3
|
|
s3_avail = True
|
|
except ImportError: #pragma: no cover
|
|
s3_avail = False
|
|
|
|
|
|
#=================================================================
|
|
def is_http(filename):
|
|
return filename.startswith(('http://', 'https://'))
|
|
|
|
|
|
#=================================================================
|
|
def to_file_url(filename):
|
|
""" Convert a filename to a file:// url
|
|
"""
|
|
url = os.path.abspath(filename)
|
|
url = urlparse.urljoin('file:', urllib_req.pathname2url(url))
|
|
return url
|
|
|
|
|
|
#=================================================================
|
|
def load_yaml_config(config_file):
|
|
import yaml
|
|
configdata = BlockLoader().load(config_file)
|
|
config = yaml.load(configdata)
|
|
return config
|
|
|
|
|
|
#=================================================================
|
|
def extract_post_query(method, mime, length, stream, buffered_stream=None):
|
|
"""
|
|
Extract a url-encoded form POST from stream
|
|
If not a application/x-www-form-urlencoded, or no missing
|
|
content length, return None
|
|
"""
|
|
if method.upper() != 'POST':
|
|
return None
|
|
|
|
if ((not mime or
|
|
not mime.lower().startswith('application/x-www-form-urlencoded'))):
|
|
return None
|
|
|
|
try:
|
|
length = int(length)
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
if length <= 0:
|
|
return None
|
|
|
|
#todo: encoding issues?
|
|
post_query = ''
|
|
|
|
while length > 0:
|
|
buff = stream.read(length)
|
|
length -= len(buff)
|
|
|
|
if not buff:
|
|
break
|
|
|
|
post_query += buff
|
|
|
|
if buffered_stream:
|
|
buffered_stream.write(post_query)
|
|
buffered_stream.seek(0)
|
|
|
|
post_query = urlparse.unquote_plus(post_query)
|
|
return post_query
|
|
|
|
|
|
#=================================================================
|
|
def append_post_query(url, post_query):
|
|
if not post_query:
|
|
return url
|
|
|
|
if '?' not in url:
|
|
url += '?'
|
|
else:
|
|
url += '&'
|
|
|
|
url += post_query
|
|
return url
|
|
|
|
|
|
#=================================================================
|
|
def extract_client_cookie(env, cookie_name):
|
|
cookie_header = env.get('HTTP_COOKIE')
|
|
if not cookie_header:
|
|
return None
|
|
|
|
# attempt to extract cookie_name only
|
|
inx = cookie_header.find(cookie_name)
|
|
if inx < 0:
|
|
return None
|
|
|
|
end_inx = cookie_header.find(';', inx)
|
|
if end_inx > 0:
|
|
value = cookie_header[inx:end_inx]
|
|
else:
|
|
value = cookie_header[inx:]
|
|
|
|
value = value.split('=')
|
|
if len(value) < 2:
|
|
return None
|
|
|
|
value = value[1].strip()
|
|
return value
|
|
|
|
|
|
#=================================================================
|
|
def read_last_line(fh, offset=256):
|
|
""" Read last line from a seekable file. Start reading
|
|
from buff before end of file, and double backwards seek
|
|
until line break is found. If reached beginning of file
|
|
(no lines), just return whole file
|
|
"""
|
|
fh.seek(0, 2)
|
|
size = fh.tell()
|
|
|
|
while offset < size:
|
|
fh.seek(-offset, 2)
|
|
lines = fh.readlines()
|
|
if len(lines) > 1:
|
|
return lines[-1]
|
|
offset *= 2
|
|
|
|
fh.seek(0, 0)
|
|
return fh.readlines()[-1]
|
|
|
|
|
|
#=================================================================
|
|
class BlockLoader(object):
|
|
"""
|
|
a loader which can stream blocks of content
|
|
given a uri, offset and optional length.
|
|
Currently supports: http/https and file/local file system
|
|
"""
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
self.cached = {}
|
|
self.args = args
|
|
self.kwargs = kwargs
|
|
|
|
def load(self, url, offset=0, length=-1):
|
|
loader = self._get_loader_for(url)
|
|
return loader.load(url, offset, length)
|
|
|
|
def _get_loader_for(self, url):
|
|
"""
|
|
Determine loading method based on uri
|
|
"""
|
|
parts = url.split('://', 1)
|
|
if len(parts) < 2:
|
|
type_ = 'file'
|
|
else:
|
|
type_ = parts[0]
|
|
|
|
loader = self.cached.get(type_)
|
|
if loader:
|
|
return loader
|
|
|
|
loader_cls = LOADERS.get(type_)
|
|
if not loader_cls:
|
|
raise IOError('No Loader for type: ' + type_)
|
|
|
|
loader = loader_cls(*self.args, **self.kwargs)
|
|
self.cached[type_] = loader
|
|
return loader
|
|
|
|
|
|
@staticmethod
|
|
def _make_range_header(offset, length):
|
|
if length > 0:
|
|
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
|
|
else:
|
|
range_header = 'bytes={0}-'.format(offset)
|
|
|
|
return range_header
|
|
|
|
|
|
#=================================================================
|
|
class LocalFileLoader(object):
|
|
def __init__(self, *args, **kwargs):
|
|
pass
|
|
|
|
def load(self, url, offset=0, length=-1):
|
|
"""
|
|
Load a file-like reader from the local file system
|
|
"""
|
|
|
|
# if starting with . or /, can only be a file path..
|
|
file_only = url.startswith(('/', '.'))
|
|
|
|
# convert to filename
|
|
if url.startswith('file://'):
|
|
file_only = True
|
|
url = urllib_req.url2pathname(url[len('file://'):])
|
|
|
|
try:
|
|
# first, try as file
|
|
afile = open(url, 'rb')
|
|
|
|
except IOError:
|
|
if file_only:
|
|
raise
|
|
|
|
# then, try as package.path/file
|
|
pkg_split = url.split('/', 1)
|
|
if len(pkg_split) == 1:
|
|
raise
|
|
|
|
afile = pkg_resources.resource_stream(pkg_split[0],
|
|
pkg_split[1])
|
|
|
|
if offset > 0:
|
|
afile.seek(offset)
|
|
|
|
if length >= 0:
|
|
return LimitReader(afile, length)
|
|
else:
|
|
return afile
|
|
|
|
|
|
#=================================================================
|
|
class HttpLoader(object):
|
|
def __init__(self, cookie_maker=None, *args, **kwargs):
|
|
self.cookie_maker = cookie_maker
|
|
self.session = None
|
|
|
|
def load(self, url, offset, length):
|
|
"""
|
|
Load a file-like reader over http using range requests
|
|
and an optional cookie created via a cookie_maker
|
|
"""
|
|
headers = {}
|
|
if offset != 0 or length != -1:
|
|
headers['Range'] = BlockLoader._make_range_header(offset, length)
|
|
|
|
if self.cookie_maker:
|
|
if isinstance(self.cookie_maker, six.string_types):
|
|
headers['Cookie'] = self.cookie_maker
|
|
else:
|
|
headers['Cookie'] = self.cookie_maker.make()
|
|
|
|
if not self.session:
|
|
self.session = requests.Session()
|
|
|
|
r = self.session.get(url, headers=headers, stream=True)
|
|
return r.raw
|
|
|
|
|
|
#=================================================================
|
|
class S3Loader(object):
|
|
def __init__(self, *args, **kwargs):
|
|
self.s3conn = None
|
|
|
|
def load(self, url, offset, length):
|
|
if not s3_avail: #pragma: no cover
|
|
raise IOError('To load from s3 paths, ' +
|
|
'you must install boto: pip install boto')
|
|
|
|
if not self.s3conn:
|
|
try:
|
|
self.s3conn = connect_s3()
|
|
except Exception: #pragma: no cover
|
|
self.s3conn = connect_s3(anon=True)
|
|
|
|
parts = urlparse.urlsplit(url)
|
|
|
|
bucket = self.s3conn.get_bucket(parts.netloc)
|
|
|
|
headers = {'Range': BlockLoader._make_range_header(offset, length)}
|
|
|
|
key = bucket.get_key(parts.path)
|
|
|
|
result = key.get_contents_as_string(headers=headers)
|
|
key.close()
|
|
|
|
return BytesIO(result)
|
|
|
|
|
|
#=================================================================
|
|
# Signed Cookie-Maker
|
|
#=================================================================
|
|
|
|
class HMACCookieMaker(object):
|
|
"""
|
|
Utility class to produce signed HMAC digest cookies
|
|
to be used with each http request
|
|
"""
|
|
def __init__(self, key, name, duration=10):
|
|
self.key = key
|
|
self.name = name
|
|
# duration in seconds
|
|
self.duration = duration
|
|
|
|
def make(self, extra_id=''):
|
|
expire = str(int(time.time() + self.duration))
|
|
|
|
if extra_id:
|
|
msg = extra_id + '-' + expire
|
|
else:
|
|
msg = expire
|
|
|
|
hmacdigest = hmac.new(self.key.encode('utf-8'), msg.encode('utf-8'))
|
|
hexdigest = hmacdigest.hexdigest()
|
|
|
|
if extra_id:
|
|
cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id,
|
|
expire, hexdigest)
|
|
else:
|
|
cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)
|
|
|
|
return cookie
|
|
|
|
|
|
#=================================================================
|
|
# Limit Reader
|
|
#=================================================================
|
|
class LimitReader(object):
|
|
"""
|
|
A reader which will not read more than specified limit
|
|
"""
|
|
|
|
def __init__(self, stream, limit):
|
|
self.stream = stream
|
|
self.limit = limit
|
|
|
|
def read(self, length=None):
|
|
if length is not None:
|
|
length = min(length, self.limit)
|
|
else:
|
|
length = self.limit
|
|
|
|
if length == 0:
|
|
return b''
|
|
|
|
buff = self.stream.read(length)
|
|
self.limit -= len(buff)
|
|
return buff
|
|
|
|
def readline(self, length=None):
|
|
if length is not None:
|
|
length = min(length, self.limit)
|
|
else:
|
|
length = self.limit
|
|
|
|
if length == 0:
|
|
return b''
|
|
|
|
buff = self.stream.readline(length)
|
|
self.limit -= len(buff)
|
|
return buff
|
|
|
|
def close(self):
|
|
self.stream.close()
|
|
|
|
@staticmethod
|
|
def wrap_stream(stream, content_length):
|
|
"""
|
|
If given content_length is an int > 0, wrap the stream
|
|
in a LimitReader. Ottherwise, return the stream unaltered
|
|
"""
|
|
try:
|
|
content_length = int(content_length)
|
|
if content_length >= 0:
|
|
# optimize: if already a LimitStream, set limit to
|
|
# the smaller of the two limits
|
|
if isinstance(stream, LimitReader):
|
|
stream.limit = min(stream.limit, content_length)
|
|
else:
|
|
stream = LimitReader(stream, content_length)
|
|
|
|
except (ValueError, TypeError):
|
|
pass
|
|
|
|
return stream
|
|
|
|
|
|
#=================================================================
|
|
LOADERS = {'http': HttpLoader,
|
|
'https': HttpLoader,
|
|
's3': S3Loader,
|
|
'file': LocalFileLoader
|
|
}
|
|
|
|
|