mirror of
https://github.com/webrecorder/pywb.git
synced 2025-03-24 06:59:52 +01:00
loaders: add support for loading from s3:// using boto
if auth connection fails, attempt anon connection, #97
This commit is contained in:
parent
4db661a09e
commit
c8a9a3ddd4
@ -11,7 +11,7 @@ import requests
|
|||||||
import urlparse
|
import urlparse
|
||||||
import time
|
import time
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
from io import open
|
from io import open, BytesIO
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
@ -19,6 +19,11 @@ def is_http(filename):
|
|||||||
return filename.startswith(('http://', 'https://'))
|
return filename.startswith(('http://', 'https://'))
|
||||||
|
|
||||||
|
|
||||||
|
#=================================================================
|
||||||
|
def is_s3(filename):
    """ Return True if filename starts with the s3:// scheme prefix
    """
    s3_prefix = 's3://'
    return filename.startswith(s3_prefix)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
def to_file_url(filename):
|
def to_file_url(filename):
|
||||||
""" Convert a filename to a file:// url
|
""" Convert a filename to a file:// url
|
||||||
@ -144,6 +149,7 @@ class BlockLoader(object):
|
|||||||
def __init__(self, cookie_maker=None):
|
def __init__(self, cookie_maker=None):
|
||||||
self.cookie_maker = cookie_maker
|
self.cookie_maker = cookie_maker
|
||||||
self.session = None
|
self.session = None
|
||||||
|
self.s3conn = None
|
||||||
|
|
||||||
def load(self, url, offset=0, length=-1):
|
def load(self, url, offset=0, length=-1):
|
||||||
"""
|
"""
|
||||||
@ -151,6 +157,8 @@ class BlockLoader(object):
|
|||||||
"""
|
"""
|
||||||
if is_http(url):
|
if is_http(url):
|
||||||
return self.load_http(url, offset, length)
|
return self.load_http(url, offset, length)
|
||||||
|
elif is_s3(url):
|
||||||
|
return self.load_s3(url, offset, length)
|
||||||
else:
|
else:
|
||||||
return self.load_file_or_resource(url, offset, length)
|
return self.load_file_or_resource(url, offset, length)
|
||||||
|
|
||||||
@ -191,18 +199,21 @@ class BlockLoader(object):
|
|||||||
else:
|
else:
|
||||||
return afile
|
return afile
|
||||||
|
|
||||||
def load_http(self, url, offset, length):
|
@staticmethod
|
||||||
"""
|
def _make_range_header(offset, length):
|
||||||
Load a file-like reader over http using range requests
|
|
||||||
and an optional cookie created via a cookie_maker
|
|
||||||
"""
|
|
||||||
if length > 0:
|
if length > 0:
|
||||||
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
|
range_header = 'bytes={0}-{1}'.format(offset, offset + length - 1)
|
||||||
else:
|
else:
|
||||||
range_header = 'bytes={0}-'.format(offset)
|
range_header = 'bytes={0}-'.format(offset)
|
||||||
|
|
||||||
headers = {}
|
return range_header
|
||||||
headers['Range'] = range_header
|
|
||||||
|
def load_http(self, url, offset, length):
|
||||||
|
"""
|
||||||
|
Load a file-like reader over http using range requests
|
||||||
|
and an optional cookie created via a cookie_maker
|
||||||
|
"""
|
||||||
|
headers = {'Range': self._make_range_header(offset, length)}
|
||||||
|
|
||||||
if self.cookie_maker:
|
if self.cookie_maker:
|
||||||
if isinstance(self.cookie_maker, basestring):
|
if isinstance(self.cookie_maker, basestring):
|
||||||
@ -215,8 +226,32 @@ class BlockLoader(object):
|
|||||||
|
|
||||||
r = self.session.get(url, headers=headers, stream=True)
|
r = self.session.get(url, headers=headers, stream=True)
|
||||||
return r.raw
|
return r.raw
|
||||||
#request = urllib2.Request(url, headers=headers)
|
|
||||||
#return urllib2.urlopen(request)
|
def load_s3(self, url, offset, length):
    """
    Load a byte range from an s3:// url using boto and return it
    as an in-memory BytesIO reader.

    The bucket name is taken from the url's netloc and the key name
    from its path. The range is requested via a Range header built
    by _make_range_header(offset, length).

    The s3 connection is created on first use and cached on
    self.s3conn; if the default connection attempt fails, an
    anonymous connection is attempted instead.

    Raises IOError if boto is not installed or if the key
    does not exist in the bucket.
    """
    # boto is an optional dependency, so import it only when needed
    try:
        import boto
    except ImportError:
        raise IOError('To load from s3 paths, ' +
                      'you must install boto: pip install boto')

    if not self.s3conn:
        try:
            self.s3conn = boto.connect_s3()
        except Exception:
            # deliberate best-effort: fall back to anonymous access
            self.s3conn = boto.connect_s3(anon=True)

    parts = urlparse.urlsplit(url)

    bucket = self.s3conn.get_bucket(parts.netloc)

    # NOTE(review): parts.path keeps its leading '/' and is passed
    # to boto unchanged, as before
    key = bucket.get_key(parts.path)

    # get_key() returns None for a missing key; fail with a clear
    # IOError rather than an AttributeError on None below
    if key is None:
        raise IOError('s3 key not found: {0}'.format(url))

    headers = {'Range': self._make_range_header(offset, length)}

    # always release the key, even if the ranged read fails
    try:
        result = key.get_contents_as_string(headers=headers)
    finally:
        key.close()

    return BytesIO(result)
|
||||||
|
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
|
@ -106,15 +106,20 @@ True
|
|||||||
#=================================================================
|
#=================================================================
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
from pywb.utils.loaders import BlockLoader, HMACCookieMaker, to_file_url
|
||||||
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
from pywb.utils.loaders import LimitReader, extract_client_cookie, extract_post_query
|
||||||
from pywb.utils.loaders import read_last_line
|
from pywb.utils.loaders import read_last_line
|
||||||
|
|
||||||
|
from pywb.utils.bufferedreaders import DecompressingBufferedReader
|
||||||
|
|
||||||
from pywb import get_test_dir
|
from pywb import get_test_dir
|
||||||
|
|
||||||
test_cdx_dir = get_test_dir() + 'cdx/'
|
test_cdx_dir = get_test_dir() + 'cdx/'
|
||||||
|
|
||||||
|
|
||||||
def read_multiple(reader, inc_reads):
|
def read_multiple(reader, inc_reads):
|
||||||
result = None
|
result = None
|
||||||
for x in inc_reads:
|
for x in inc_reads:
|
||||||
@ -127,6 +132,19 @@ def seek_read_full(seekable_reader, offset):
|
|||||||
seekable_reader.readline() #skip
|
seekable_reader.readline() #skip
|
||||||
return seekable_reader.readline()
|
return seekable_reader.readline()
|
||||||
|
|
||||||
|
def test_s3_read_1():
    # skip this test entirely when boto cannot be imported
    pytest.importorskip('boto')

    warc_url = 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz'

    # request a single range of the remote file via offset + length
    loader = BlockLoader()
    stream = loader.load(warc_url, offset=53235662, length=2526)

    raw = stream.read()
    assert len(raw) == 2526

    # read the fetched bytes back through DecompressingBufferedReader
    # and check the first two lines
    record_reader = DecompressingBufferedReader(BytesIO(raw))
    first_line = record_reader.readline()
    assert first_line == 'WARC/1.0\r\n'
    second_line = record_reader.readline()
    assert second_line == 'WARC-Type: response\r\n'
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import doctest
|
import doctest
|
||||||
|
Loading…
x
Reference in New Issue
Block a user