mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-15 00:03:28 +01:00
loaders: add support for loading from s3:// using boto
if auth connection fails, attempt anon connection, #97
This commit is contained in:
parent
4db661a09e
commit
c8a9a3ddd4
@ -11,7 +11,7 @@ import requests
|
||||
import urlparse
|
||||
import time
|
||||
import pkg_resources
|
||||
from io import open
|
||||
from io import open, BytesIO
|
||||
|
||||
|
||||
#=================================================================
|
||||
@ -19,6 +19,11 @@ def is_http(filename):
|
||||
return filename.startswith(('http://', 'https://'))
|
||||
|
||||
|
||||
#=================================================================
|
||||
def is_s3(filename):
    """Return True if *filename* refers to an s3:// location."""
    scheme = 's3://'
    return filename[:len(scheme)] == scheme
|
||||
|
||||
|
||||
#=================================================================
|
||||
def to_file_url(filename):
|
||||
""" Convert a filename to a file:// url
|
||||
@ -144,6 +149,7 @@ class BlockLoader(object):
|
||||
def __init__(self, cookie_maker=None):
    """Create a block loader.

    :param cookie_maker: optional cookie string or callable used by
        http loading to attach a Cookie header.
    """
    self.cookie_maker = cookie_maker
    # both handles are created lazily on first use
    self.s3conn = None
    self.session = None
|
||||
|
||||
def load(self, url, offset=0, length=-1):
|
||||
"""
|
||||
@ -151,6 +157,8 @@ class BlockLoader(object):
|
||||
"""
|
||||
if is_http(url):
|
||||
return self.load_http(url, offset, length)
|
||||
elif is_s3(url):
|
||||
return self.load_s3(url, offset, length)
|
||||
else:
|
||||
return self.load_file_or_resource(url, offset, length)
|
||||
|
||||
@ -191,18 +199,21 @@ class BlockLoader(object):
|
||||
else:
|
||||
return afile
|
||||
|
||||
def load_http(self, url, offset, length):
|
||||
"""
|
||||
Load a file-like reader over http using range requests
|
||||
and an optional cookie created via a cookie_maker
|
||||
"""
|
||||
@staticmethod
def _make_range_header(offset, length):
    """Build an HTTP Range header value for *length* bytes at *offset*.

    If *length* is not positive, the range is open-ended, covering
    everything from *offset* to the end of the resource.

    :param offset: starting byte position
    :param length: number of bytes to read, or <= 0 for 'to the end'
    :return: a 'bytes=start-end' string suitable for a Range header

    Note: the original also built a ``headers`` dict here that was never
    used (callers wrap the returned string themselves); that dead code
    has been removed.
    """
    if length > 0:
        range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
    else:
        range_header = 'bytes={0}-'.format(offset)

    return range_header
|
||||
|
||||
def load_http(self, url, offset, length):
|
||||
"""
|
||||
Load a file-like reader over http using range requests
|
||||
and an optional cookie created via a cookie_maker
|
||||
"""
|
||||
headers = {'Range': self._make_range_header(offset, length)}
|
||||
|
||||
if self.cookie_maker:
|
||||
if isinstance(self.cookie_maker, basestring):
|
||||
@ -215,8 +226,32 @@ class BlockLoader(object):
|
||||
|
||||
r = self.session.get(url, headers=headers, stream=True)
|
||||
return r.raw
|
||||
#request = urllib2.Request(url, headers=headers)
|
||||
#return urllib2.urlopen(request)
|
||||
|
||||
def load_s3(self, url, offset, length):
    """Load a range of bytes from an s3:// url, returned as a BytesIO.

    An authenticated boto connection is attempted first; if that fails,
    an anonymous connection is tried instead (#97).  The connection is
    cached on this loader for reuse across calls.

    :param url: an s3://bucket/key style url
    :param offset: starting byte position within the key
    :param length: number of bytes to read, or <= 0 for 'to the end'
    :raises IOError: if boto is not installed
    """
    try:
        import boto
    except ImportError:
        raise IOError('To load from s3 paths, ' +
                      'you must install boto: pip install boto')

    if not self.s3conn:
        try:
            self.s3conn = boto.connect_s3()
        except Exception:
            # credentials missing or invalid -- fall back to anonymous
            self.s3conn = boto.connect_s3(anon=True)

    parts = urlparse.urlsplit(url)

    # netloc is the bucket name; path is the key
    # NOTE(review): parts.path keeps its leading '/' -- presumably boto
    # tolerates this when resolving the key; confirm against boto docs
    bucket = self.s3conn.get_bucket(parts.netloc)
    key = bucket.get_key(parts.path)

    range_headers = {'Range': self._make_range_header(offset, length)}

    contents = key.get_contents_as_string(headers=range_headers)
    key.close()

    return BytesIO(contents)
|
||||
|
||||
|
||||
#=================================================================
|
||||
|
@ -106,15 +106,20 @@ True
|
||||
#=================================================================
|
||||
import re
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from io import BytesIO
|
||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
||||
from pywb.utils.loaders import read_last_line
|
||||
|
||||
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||
|
||||
from pywb import get_test_dir
|
||||
|
||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||
|
||||
|
||||
def read_multiple(reader, inc_reads):
|
||||
result = None
|
||||
for x in inc_reads:
|
||||
@ -127,6 +132,19 @@ def seek_read_full(seekable_reader, offset):
|
||||
seekable_reader.readline() #skip
|
||||
return seekable_reader.readline()
|
||||
|
||||
def test_s3_read_1():
    # skip entirely when boto is not installed
    pytest.importorskip('boto')

    url = ('s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
           'segments/1424936462700.28/warc/'
           'CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz')

    # fetch a single compressed WARC record via a ranged s3 read
    stream = BlockLoader().load(url, offset=53235662, length=2526)

    raw = stream.read()
    assert len(raw) == 2526

    # the range should decompress to a valid WARC response record header
    record = DecompressingBufferedReader(BytesIO(raw))
    assert record.readline() == 'WARC/1.0\r\n'
    assert record.readline() == 'WARC-Type: response\r\n'
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
Loading…
x
Reference in New Issue
Block a user