1
0
mirror of https://github.com/webrecorder/pywb.git synced 2025-03-15 00:03:28 +01:00

loaders: add support for loading from s3:// using boto

if auth connection fails, attempt anon connection, #97
This commit is contained in:
Ilya Kreymer 2015-04-16 16:33:50 -07:00
parent 4db661a09e
commit c8a9a3ddd4
2 changed files with 63 additions and 10 deletions

View File

@ -11,7 +11,7 @@ import requests
import urlparse
import time
import pkg_resources
from io import open
from io import open, BytesIO
#=================================================================
@ -19,6 +19,11 @@ def is_http(filename):
return filename.startswith(('http://', 'https://'))
#=================================================================
def is_s3(filename):
    """ Return True if filename is an s3:// url, False otherwise
    """
    s3_prefix = 's3://'
    return filename.startswith(s3_prefix)
#=================================================================
def to_file_url(filename):
""" Convert a filename to a file:// url
@ -144,6 +149,7 @@ class BlockLoader(object):
def __init__(self, cookie_maker=None):
self.cookie_maker = cookie_maker
self.session = None
self.s3conn = None
def load(self, url, offset=0, length=-1):
    """
    Pick a loading method based on the url and delegate to it:

    - http:// and https:// urls are passed to load_http
    - s3:// urls are passed to load_s3
    - everything else is passed to load_file_or_resource

    offset and length are forwarded unchanged and the result of
    the selected loader is returned as-is.
    """
    if is_http(url):
        loader = self.load_http
    elif is_s3(url):
        loader = self.load_s3
    else:
        loader = self.load_file_or_resource

    return loader(url, offset, length)
@ -191,18 +199,21 @@ class BlockLoader(object):
else:
return afile
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
@staticmethod
def _make_range_header(offset, length):
if length > 0:
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
else:
range_header = 'bytes={0}-'.format(offset)
headers = {}
headers['Range'] = range_header
return range_header
def load_http(self, url, offset, length):
"""
Load a file-like reader over http using range requests
and an optional cookie created via a cookie_maker
"""
headers = {'Range': self._make_range_header(offset, length)}
if self.cookie_maker:
if isinstance(self.cookie_maker, basestring):
@ -215,8 +226,32 @@ class BlockLoader(object):
r = self.session.get(url, headers=headers, stream=True)
return r.raw
#request = urllib2.Request(url, headers=headers)
#return urllib2.urlopen(request)
def load_s3(self, url, offset, length):
    """
    Load a byte range of an s3:// url using boto and return it
    as an in-memory reader.

    The s3 connection is opened on first use and kept on
    self.s3conn for later calls.

    :param url: s3 url; the netloc is used as the bucket name and
                the path as the key name
    :param offset: index of the first byte to read
    :param length: number of bytes to read, or <= 0 to read
                   from offset to the end of the key
    :return: BytesIO wrapping the downloaded bytes
    :raises IOError: if boto is not installed, or if the key
                     does not exist in the bucket
    """
    try:
        import boto
    except ImportError:
        raise IOError('To load from s3 paths, ' +
                      'you must install boto: pip install boto')

    if not self.s3conn:
        try:
            self.s3conn = boto.connect_s3()
        except Exception:
            # deliberate best-effort: if an authenticated connection
            # can not be set up, fall back to an anonymous one
            self.s3conn = boto.connect_s3(anon=True)

    parts = urlparse.urlsplit(url)

    bucket = self.s3conn.get_bucket(parts.netloc)

    key = bucket.get_key(parts.path)

    # boto's get_key() returns None for a missing key; report that
    # explicitly instead of failing with an AttributeError below
    if key is None:
        raise IOError('s3 key not found: ' + url)

    headers = {'Range': self._make_range_header(offset, length)}

    # always close the key, even if the download fails
    try:
        result = key.get_contents_as_string(headers=headers)
    finally:
        key.close()

    return BytesIO(result)
#=================================================================

View File

@ -106,15 +106,20 @@ True
#=================================================================
import re
import os
import pytest
from io import BytesIO
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
from pywb.utils.loaders import read_last_line
from pywb.utils.bufferedreaders import DecompressingBufferedReader
from pywb import get_test_dir
test_cdx_dir = get_test_dir() + 'cdx/'
def read_multiple(reader, inc_reads):
result = None
for x in inc_reads:
@ -127,6 +132,19 @@ def seek_read_full(seekable_reader, offset):
seekable_reader.readline() #skip
return seekable_reader.readline()
def test_s3_read_1():
    """
    Read a single gzipped WARC record from a public s3 bucket by
    offset and length, then check the first two decompressed lines.

    Skipped when boto is not installed.
    """
    pytest.importorskip('boto')

    warc_url = ('s3://aws-publicdatasets/common-crawl/crawl-data/'
                'CC-MAIN-2015-11/segments/1424936462700.28/warc/'
                'CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz')
    record_offset = 53235662
    record_length = 2526

    loader = BlockLoader()
    stream = loader.load(warc_url,
                         offset=record_offset,
                         length=record_length)

    data = stream.read()
    # exactly the requested range should have been returned
    assert len(data) == 2526

    lines = DecompressingBufferedReader(BytesIO(data))
    assert lines.readline() == 'WARC/1.0\r\n'
    assert lines.readline() == 'WARC-Type: response\r\n'
if __name__ == "__main__":
import doctest