def is_s3(filename):
    """ Return True if the filename begins with the s3:// scheme prefix,
    False otherwise. The comparison is case-sensitive.
    """
    s3_prefix = 's3://'
    has_s3_prefix = filename.startswith(s3_prefix)
    return has_s3_prefix
def load_s3(self, url, offset, length):
    """
    Load a file-like reader over a byte range of an s3:// url.

    boto is imported lazily, so it is only required when an s3 path
    is actually loaded. The connection is created on first use and
    kept in self.s3conn for later calls.

    url: s3 url; the netloc is used as the bucket name and the
         path as the key name
    offset: byte offset at which to start reading
    length: number of bytes to read; if not positive, the range
            is open-ended (from offset to the end of the object)

    Returns a BytesIO wrapping the bytes that were read.

    Raises IOError if boto is not installed, or if the key
    does not exist in the bucket.
    """
    try:
        import boto
    except ImportError:
        raise IOError('To load from s3 paths, ' +
                      'you must install boto: pip install boto')

    if not self.s3conn:
        # Deliberate best-effort: try the default connection first and
        # fall back to anonymous access if it can not be created
        try:
            self.s3conn = boto.connect_s3()
        except Exception:
            self.s3conn = boto.connect_s3(anon=True)

    parts = urlparse.urlsplit(url)

    bucket = self.s3conn.get_bucket(parts.netloc)

    key = bucket.get_key(parts.path)

    # get_key() returns None for a missing key instead of raising;
    # fail with a clear IOError rather than an AttributeError below
    if key is None:
        raise IOError('s3 key not found: ' + url)

    headers = {'Range': self._make_range_header(offset, length)}

    # Always release the key, even if the ranged read fails
    try:
        result = key.get_contents_as_string(headers=headers)
    finally:
        key.close()

    return BytesIO(result)
+ assert len(buff) == 2526 + + reader = DecompressingBufferedReader(BytesIO(buff)) + assert reader.readline() == 'WARC/1.0\r\n' + assert reader.readline() == 'WARC-Type: response\r\n' if __name__ == "__main__": import doctest