mirror of https://github.com/webrecorder/pywb.git synced 2025-03-25 23:47:47 +01:00
Ilya Kreymer 2af5a25009 zipnum: support for pagination api! #34 and #83. cdx server now bounded by pageSize (default 10 blocks),
showNumPages=true returns json indicating num pages, page=N can be set to page number 0-numPages - 1
loaders: add read_last_line() to read last line of a seekable file, used to read last line of index file when
at end
tests: additional test for binsearch boundary conditions
zipnum: secondary index output supports json also
2015-03-24 18:56:13 -07:00

311 lines
8.1 KiB

"""
This module provides loaders for local and remote access:
reading from the local file system and over http.
"""
import os
import hmac
import urllib
import urllib2
import urlparse
import time
import pkg_resources
from io import open
def is_http(filename):
    """ Return True if filename starts with an http:// or https:// scheme
    """
    return filename.startswith(('http://', 'https://'))
def to_file_url(filename):
    """ Convert a filename to a file:// url

    The filename is made absolute with os.path.abspath() and then
    converted to a url path with urllib.pathname2url() before being
    joined onto the 'file:' scheme.
    """
    abs_path = os.path.abspath(filename)
    return urlparse.urljoin('file:', urllib.pathname2url(abs_path))
def load_yaml_config(config_file):
    """ Load config_file through a BlockLoader and parse it as YAML

    Returns whatever yaml.load() produces for the document. The
    underlying stream is closed once parsing has finished (or failed).
    """
    import yaml
    configdata = BlockLoader().load(config_file)
    try:
        # NOTE(review): yaml.load() without an explicit safe loader can
        # construct arbitrary python objects -- only use with trusted
        # config files, or consider switching to yaml.safe_load()
        config = yaml.load(configdata)
    finally:
        # the stream was opened here, so it is released here
        configdata.close()
    return config
def extract_post_query(method, mime, length, stream):
    """
    Extract a url-encoded form POST from stream

    Returns the url-decoded query string, or None if the method is
    not POST, the mime type is not application/x-www-form-urlencoded,
    or the content length is missing, not a number, or not positive.
    """
    if method.upper() != 'POST':
        return None

    if ((not mime or
         not mime.lower().startswith('application/x-www-form-urlencoded'))):
        return None

    try:
        length = int(length)
    except (ValueError, TypeError):
        # missing (None) or non-numeric content length
        return None

    if length <= 0:
        return None

    #todo: encoding issues?
    chunks = []

    # a single read() may return less than requested, so keep reading
    # until the declared length is consumed or the stream runs dry
    while length > 0:
        buff = stream.read(length)
        length -= len(buff)

        if not buff:
            break

        chunks.append(buff)

    post_query = ''.join(chunks)
    post_query = urllib.unquote_plus(post_query)
    return post_query
def append_post_query(url, post_query):
    """ Append post_query to url as additional query parameters

    Uses '?' as the separator if url has no query string yet,
    otherwise '&'. If post_query is empty, url is returned unchanged.
    """
    if not post_query:
        return url

    if '?' not in url:
        url += '?'
    else:
        url += '&'

    url += post_query
    return url
def extract_client_cookie(env, cookie_name):
    """ Extract the value of cookie_name from the HTTP_COOKIE header in env

    Returns the stripped cookie value, or None if there is no cookie
    header, cookie_name does not occur in it, or the matching entry
    has no '=' in it.
    """
    cookie_header = env.get('HTTP_COOKIE')
    if not cookie_header:
        return None

    # attempt to extract cookie_name only
    inx = cookie_header.find(cookie_name)
    if inx < 0:
        return None

    # the entry runs up to the next ';', or to the end of the header
    end_inx = cookie_header.find(';', inx)
    if end_inx > 0:
        entry = cookie_header[inx:end_inx]
    else:
        entry = cookie_header[inx:]

    # split on the first '=' only, so that values which themselves
    # contain '=' (eg. base64 padding) are not truncated
    _, sep, value = entry.partition('=')
    if not sep:
        return None

    return value.strip()
def read_last_line(fh, offset=256):
    """ Read last line from a seekable file. Start reading
    from buff before end of file, and double backwards seek
    until line break is found. If reached beginning of file
    (no lines), just return whole file

    For an empty file, an empty read result is returned.
    """
    fh.seek(0, 2)
    size = fh.tell()

    while offset < size:
        fh.seek(-offset, 2)
        lines = fh.readlines()
        # more than one line means the last one was read from its start
        if len(lines) > 1:
            return lines[-1]
        offset *= 2

    fh.seek(0, 0)
    lines = fh.readlines()
    if lines:
        return lines[-1]

    # empty file: read() at end of file yields an empty value of
    # whatever type the file produces, instead of an IndexError
    return fh.read()
class BlockLoader(object):
    """
    a loader which can stream blocks of content
    given a uri, offset and optional length.
    Currently supports: http/https and file/local file system
    """
    def __init__(self, cookie_maker=None):
        # either a ready-made cookie string, or an object with a make()
        # method returning one; only used for http loads
        self.cookie_maker = cookie_maker

    def load(self, url, offset=0, length=-1):
        """
        Determine loading method based on uri
        """
        if is_http(url):
            return self.load_http(url, offset, length)
        else:
            return self.load_file_or_resource(url, offset, length)

    def load_file_or_resource(self, url, offset=0, length=-1):
        """
        Load a file-like reader from the local file system

        If url can not be opened as a file and is not explicitly a
        file path, it is tried as a package resource of the form
        package/path. The reader is positioned at offset and, if
        length >= 0, limited to length bytes.
        """

        # if starting with . or /, can only be a file path..
        file_only = url.startswith(('/', '.'))

        # convert to filename
        if url.startswith('file://'):
            file_only = True
            url = urllib.url2pathname(url[len('file://'):])

        try:
            # first, try as file
            afile = open(url, 'rb')

        except IOError:
            if file_only:
                raise

            # then, try as package.path/file
            pkg_split = url.split('/', 1)
            if len(pkg_split) == 1:
                # no package part to look in, report the original error
                raise

            afile = pkg_resources.resource_stream(pkg_split[0],
                                                  pkg_split[1])

        if offset > 0:
            afile.seek(offset)

        if length >= 0:
            return LimitReader(afile, length)
        else:
            return afile

    def load_http(self, url, offset, length):
        """
        Load a file-like reader over http using range requests
        and an optional cookie created via a cookie_maker
        """
        if length > 0:
            range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
        else:
            # no positive length given: request everything from offset on
            range_header = 'bytes={0}-'.format(offset)

        headers = {}
        headers['Range'] = range_header

        if self.cookie_maker:
            if isinstance(self.cookie_maker, basestring):
                headers['Cookie'] = self.cookie_maker
            else:
                headers['Cookie'] = self.cookie_maker.make()

        request = urllib2.Request(url, headers=headers)
        return urllib2.urlopen(request)
# Signed Cookie-Maker
class HMACCookieMaker(object):
    """
    Utility class to produce signed HMAC digest cookies
    to be used with each http request
    """
    def __init__(self, key, name, duration=10):
        self.key = key
        self.name = name
        # duration in seconds
        self.duration = duration

    def make(self, extra_id=''):
        """
        Create a cookie string which expires self.duration seconds
        from now.

        Without extra_id the cookie is  name=expire-hexdigest
        With extra_id it is  name-extra_id=expire-hexdigest
        and extra_id is also included in the signed message.
        """
        expire = str(long(time.time() + self.duration))

        if extra_id:
            msg = extra_id + '-' + expire
        else:
            msg = expire

        hmacdigest = hmac.new(self.key, msg)
        hexdigest = hmacdigest.hexdigest()

        if extra_id:
            cookie = '{0}-{1}={2}-{3}'.format(self.name, extra_id,
                                              expire, hexdigest)
        else:
            cookie = '{0}={1}-{2}'.format(self.name, expire, hexdigest)

        return cookie
# Limit Reader
class LimitReader(object):
    """
    A reader which will not read more than specified limit
    """

    def __init__(self, stream, limit):
        self.stream = stream
        # number of bytes which may still be read from stream
        self.limit = limit

    def _clamp(self, length):
        """
        Return how much may be read for a request of length:
        the remaining limit if length is None or negative (the usual
        'read everything' values), else the smaller of the two.
        """
        if length is None or length < 0:
            return self.limit

        return min(length, self.limit)

    def read(self, length=None):
        """
        Read up to length bytes, but never past the limit.
        Returns '' once the limit has been used up.
        """
        length = self._clamp(length)

        if length == 0:
            return ''

        buff = self.stream.read(length)
        self.limit -= len(buff)
        return buff

    def readline(self, length=None):
        """
        Read one line of up to length bytes, but never past the limit.
        Returns '' once the limit has been used up.
        """
        length = self._clamp(length)

        if length == 0:
            return ''

        buff = self.stream.readline(length)
        self.limit -= len(buff)
        return buff

    def close(self):
        """
        Close the underlying stream
        """
        self.stream.close()

    # NOTE(review): restored as a static method of LimitReader; the
    # decorator line is not present in this copy of the file -- confirm
    # against callers
    @staticmethod
    def wrap_stream(stream, content_length):
        """
        If given content_length is an int >= 0, wrap the stream
        in a LimitReader. Otherwise, return the stream unaltered
        """
        try:
            content_length = int(content_length)
            if content_length >= 0:
                # optimize: if already a LimitStream, set limit to
                # the smaller of the two limits
                if isinstance(stream, LimitReader):
                    stream.limit = min(stream.limit, content_length)
                else:
                    stream = LimitReader(stream, content_length)

        except (ValueError, TypeError):
            # missing or non-numeric content length: leave stream as is
            pass

        return stream