From 510c9dc9f11b38c9135eae88debd30f6a7ed54ea Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Tue, 9 Aug 2022 02:25:16 +0200
Subject: [PATCH] S3 loader to use boto3 built-in credential configuration (#723)

* S3Loader: allow authenticated S3 access using boto3 built-in configuration
  methods without explicitly passing credentials, cf.
  https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials
* S3Loader tests: re-enable tests reading from s3://commoncrawl/ in order to
  test authenticated reads. Tests are skipped if no AWS credentials are
  configured.
---
 pywb/utils/loaders.py           | 16 +++++++++-------
 pywb/utils/test/test_loaders.py | 18 ++++++++++++++----
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/pywb/utils/loaders.py b/pywb/utils/loaders.py
index c8623fa6..b0e78cfb 100644
--- a/pywb/utils/loaders.py
+++ b/pywb/utils/loaders.py
@@ -185,7 +185,8 @@ class BlockLoader(BaseLoader):
     """
     a loader which can stream blocks of content
     given a uri, offset and optional length.
-    Currently supports: http/https and file/local file system
+    Currently supports: http/https, file/local file system,
+    pkg, WebHDFS, S3
     """
 
     loaders = {}
@@ -393,14 +394,15 @@ class S3Loader(BaseLoader):
 
         def s3_load(anon=False):
             if not self.client:
+                s3_client_args = {}
                 if anon:
-                    config = Config(signature_version=UNSIGNED)
-                else:
-                    config = None
+                    s3_client_args['config'] = Config(signature_version=UNSIGNED)
+                if aws_access_key_id:
+                    s3_client_args['aws_access_key_id'] = aws_access_key_id
+                    s3_client_args['aws_secret_access_key'] = aws_secret_access_key
+
+                client = boto3.client('s3', **s3_client_args)
 
-                client = boto3.client('s3', aws_access_key_id=aws_access_key_id,
-                                      aws_secret_access_key=aws_secret_access_key,
-                                      config=config)
             else:
                 client = self.client
 
diff --git a/pywb/utils/test/test_loaders.py b/pywb/utils/test/test_loaders.py
index 0366a08d..a0f621f3 100644
--- a/pywb/utils/test/test_loaders.py
+++ b/pywb/utils/test/test_loaders.py
@@ -97,10 +97,19 @@ from pywb import get_test_dir
 
 test_cdx_dir = get_test_dir() + 'cdx/'
 
-@pytest.mark.skip("skip for now, made need different s3 source")
-def test_s3_read_1():
+def s3_authenticated_access_verification(bucket):
+    import boto3, botocore
+    s3_client = boto3.client('s3')
+    try:
+        s3_client.head_bucket(Bucket=bucket)
+    except botocore.exceptions.NoCredentialsError:
+        pytest.skip("Skipping S3Loader test for authenticated reads: no credentials configured")
+
+def test_s3_read_authenticated_1():
     pytest.importorskip('boto3')
 
+    s3_authenticated_access_verification('commoncrawl')
+
     res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/segments/1424936462700.28/warc/CC-MAIN-20150226074102-00159-ip-10-28-5-156.ec2.internal.warc.gz',
                              offset=53235662,
                              length=2526)
@@ -112,10 +121,11 @@ def test_s3_read_1():
     assert reader.readline() == b'WARC/1.0\r\n'
     assert reader.readline() == b'WARC-Type: response\r\n'
 
-@pytest.mark.skip("skip for now, made need different s3 source")
-def test_s3_read_2():
+def test_s3_read_authenticated_2():
     pytest.importorskip('boto3')
 
+    s3_authenticated_access_verification('commoncrawl')
+
     res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')
 
     buff = res.read()