From 5547322ae79053dc7a19e4acec8e8c926adf0ec4 Mon Sep 17 00:00:00 2001 From: stats-dev Date: Sun, 25 Jan 2026 20:12:07 +0900 Subject: [PATCH] feat: Add AWS profile support for GlueCatalog and internal S3FileIO --- mkdocs/docs/configuration.md | 4 +- pyiceberg/catalog/glue.py | 4 +- pyiceberg/io/__init__.py | 2 + pyiceberg/io/fsspec.py | 13 +++- tests/catalog/test_glue_profile.py | 67 ++++++++++++++++++ tests/io/test_fsspec_profile.py | 106 +++++++++++++++++++++++++++++ 6 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 tests/catalog/test_glue_profile.py create mode 100644 tests/io/test_fsspec_profile.py diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index e42ea1da80..efe6ddee7a 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -115,6 +115,7 @@ For the FileIO there are several configuration options available: | s3.access-key-id | admin | Configure the static access key id used to access the FileIO. | | s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. | | s3.session-token | AQoDYXdzEJr... | Configure the static session token used to access the FileIO. | +| s3.profile-name | default | Configure the AWS profile used to access the S3 FileIO. | | s3.role-session-name | session | An optional identifier for the assumed role session. | | s3.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. | | s3.signer | bearer | Configure the signature version of the FileIO. | @@ -720,7 +721,7 @@ catalog: | glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | | glue.skip-archive | true | Configure whether to skip the archival of older table versions. Default to true | | glue.endpoint | | Configure an alternative endpoint of the Glue service for GlueCatalog to access | -| glue.profile-name | default | Configure the static profile used to access the Glue Catalog | +| glue.profile-name | default | Configure the AWS profile used to access the Glue Catalog | | glue.region | us-east-1 | Set the region of the Glue Catalog | | glue.access-key-id | admin | Configure the static access key id used to access the Glue Catalog | | glue.secret-access-key | password | Configure the static secret access key used to access the Glue Catalog | @@ -826,6 +827,7 @@ configures the AWS credentials for both Glue Catalog and S3 FileIO. | client.access-key-id | admin | Configure the static access key id used to access both the Glue/DynamoDB Catalog and the S3 FileIO | | client.secret-access-key | password | Configure the static secret access key used to access both the Glue/DynamoDB Catalog and the S3 FileIO | | client.session-token | AQoDYXdzEJr... | Configure the static session token used to access both the Glue/DynamoDB Catalog and the S3 FileIO | +| client.profile-name | default | Configure the AWS profile used to access both the Glue/DynamoDB Catalog and the S3 FileIO | | client.role-session-name | session | An optional identifier for the assumed role session. | | client.role-arn | arn:aws:... | AWS Role ARN. If provided instead of access_key and secret_key, temporary credentials will be fetched by assuming this role. | diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 7260b29447..3a70fb93b5 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -48,7 +48,7 @@ NoSuchTableError, TableAlreadyExistsError, ) -from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN +from pyiceberg.io import AWS_ACCESS_KEY_ID, AWS_PROFILE_NAME, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema, SchemaVisitor, visit from pyiceberg.serializers import FromInputFile @@ -329,7 +329,7 @@ def __init__(self, name: str, client: Optional["GlueClient"] = None, **propertie retry_mode_prop_value = get_first_property_value(properties, GLUE_RETRY_MODE) session = boto3.Session( - profile_name=properties.get(GLUE_PROFILE_NAME), + profile_name=get_first_property_value(properties, GLUE_PROFILE_NAME, AWS_PROFILE_NAME), region_name=get_first_property_value(properties, GLUE_REGION, AWS_REGION), botocore_session=properties.get(BOTOCORE_SESSION), aws_access_key_id=get_first_property_value(properties, GLUE_ACCESS_KEY_ID, AWS_ACCESS_KEY_ID), diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 87d155a0fd..8486faabdf 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -41,12 +41,14 @@ logger = logging.getLogger(__name__) +AWS_PROFILE_NAME = "client.profile-name" AWS_REGION = "client.region" AWS_ACCESS_KEY_ID = "client.access-key-id" AWS_SECRET_ACCESS_KEY = "client.secret-access-key" AWS_SESSION_TOKEN = "client.session-token" AWS_ROLE_ARN = "client.role-arn" AWS_ROLE_SESSION_NAME = "client.role-session-name" +S3_PROFILE_NAME = "s3.profile-name" S3_ANONYMOUS = "s3.anonymous" S3_ENDPOINT = "s3.endpoint" S3_ACCESS_KEY_ID = "s3.access-key-id" diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index eb5342c99d..6f44501eb6 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -51,6 +51,7 @@ ADLS_TENANT_ID, ADLS_TOKEN, AWS_ACCESS_KEY_ID, + AWS_PROFILE_NAME, AWS_REGION, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, @@ -71,6 +72,7 @@ S3_CONNECT_TIMEOUT, S3_ENDPOINT, S3_FORCE_VIRTUAL_ADDRESSING, + S3_PROFILE_NAME, S3_PROXY_URI, S3_REGION, S3_REQUEST_TIMEOUT, @@ -205,7 +207,16 @@ def _s3(properties: Properties) -> AbstractFileSystem: else: anon = False - fs = S3FileSystem(anon=anon, client_kwargs=client_kwargs, config_kwargs=config_kwargs) + s3_fs_kwargs = { + "anon": anon, + "client_kwargs": client_kwargs, + "config_kwargs": config_kwargs, + } + + if profile_name := get_first_property_value(properties, S3_PROFILE_NAME, AWS_PROFILE_NAME): + s3_fs_kwargs["profile"] = profile_name + + fs = S3FileSystem(**s3_fs_kwargs) for event_name, event_function in register_events.items(): fs.s3.meta.events.unregister(event_name, unique_id=1925) diff --git a/tests/catalog/test_glue_profile.py b/tests/catalog/test_glue_profile.py new file mode 100644 index 0000000000..3d9ee92a66 --- /dev/null +++ b/tests/catalog/test_glue_profile.py @@ -0,0 +1,67 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from unittest import mock + +from moto import mock_aws + +from pyiceberg.catalog.glue import GlueCatalog +from pyiceberg.typedef import Properties +from tests.conftest import UNIFIED_AWS_SESSION_PROPERTIES + + +@mock_aws +def test_passing_client_profile_name_properties_to_glue() -> None: + session_properties: Properties = { + "client.profile-name": "profile_name", + **UNIFIED_AWS_SESSION_PROPERTIES, + } + + with mock.patch("boto3.Session") as mock_session: + test_catalog = GlueCatalog("glue", **session_properties) + + mock_session.assert_called_with( + aws_access_key_id="client.access-key-id", + aws_secret_access_key="client.secret-access-key", + aws_session_token="client.session-token", + region_name="client.region", + profile_name="profile_name", + botocore_session=None, + ) + assert test_catalog.glue is mock_session().client() + + +@mock_aws +def test_glue_profile_precedence() -> None: + session_properties: Properties = { + "glue.profile-name": "glue-profile", + "client.profile-name": "client-profile", + **UNIFIED_AWS_SESSION_PROPERTIES, + } + + with mock.patch("boto3.Session") as mock_session: + test_catalog = GlueCatalog("glue", **session_properties) + + mock_session.assert_called_with( + aws_access_key_id="client.access-key-id", + aws_secret_access_key="client.secret-access-key", + aws_session_token="client.session-token", + region_name="client.region", + profile_name="glue-profile", + botocore_session=None, + ) + assert test_catalog.glue is mock_session().client() diff --git a/tests/io/test_fsspec_profile.py b/tests/io/test_fsspec_profile.py new file mode 100644 index 0000000000..5f4a63f6ff --- /dev/null +++ b/tests/io/test_fsspec_profile.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +import uuid +from unittest import mock + +from pyiceberg.io.fsspec import FsspecFileIO +from pyiceberg.typedef import Properties +from tests.conftest import UNIFIED_AWS_SESSION_PROPERTIES + + +def test_fsspec_s3_session_properties_with_profile() -> None: + session_properties: Properties = { + "s3.profile-name": "test-profile", + "s3.endpoint": "http://localhost:9000", + **UNIFIED_AWS_SESSION_PROPERTIES, + } + + with mock.patch("s3fs.S3FileSystem") as mock_s3fs: + s3_fileio = FsspecFileIO(properties=session_properties) + filename = str(uuid.uuid4()) + + s3_fileio.new_input(location=f"s3://warehouse/{filename}") + + mock_s3fs.assert_called_with( + anon=False, + client_kwargs={ + "endpoint_url": "http://localhost:9000", + "aws_access_key_id": "client.access-key-id", + "aws_secret_access_key": "client.secret-access-key", + "region_name": "client.region", + "aws_session_token": "client.session-token", + }, + config_kwargs={}, + profile="test-profile", + ) + + +def test_fsspec_s3_session_properties_with_client_profile() -> None: + session_properties: Properties = { + "client.profile-name": "test-profile", + "s3.endpoint": "http://localhost:9000", + **UNIFIED_AWS_SESSION_PROPERTIES, + } + + with mock.patch("s3fs.S3FileSystem") as mock_s3fs: + s3_fileio = FsspecFileIO(properties=session_properties) + filename = str(uuid.uuid4()) + + s3_fileio.new_input(location=f"s3://warehouse/{filename}") + + mock_s3fs.assert_called_with( + anon=False, + client_kwargs={ + "endpoint_url": "http://localhost:9000", + "aws_access_key_id": "client.access-key-id", + "aws_secret_access_key": "client.secret-access-key", + "region_name": "client.region", + "aws_session_token": "client.session-token", + }, + config_kwargs={}, + profile="test-profile", + ) + + +def test_fsspec_s3_session_properties_with_s3_and_client_profile() -> None: + session_properties: Properties = { + "s3.profile-name": "s3-profile", + "client.profile-name": "client-profile", + "s3.endpoint": "http://localhost:9000", + **UNIFIED_AWS_SESSION_PROPERTIES, + } + + with mock.patch("s3fs.S3FileSystem") as mock_s3fs: + s3_fileio = FsspecFileIO(properties=session_properties) + filename = str(uuid.uuid4()) + + s3_fileio.new_input(location=f"s3://warehouse/{filename}") + + mock_s3fs.assert_called_with( + anon=False, + client_kwargs={ + "endpoint_url": "http://localhost:9000", + "aws_access_key_id": "client.access-key-id", + "aws_secret_access_key": "client.secret-access-key", + "region_name": "client.region", + "aws_session_token": "client.session-token", + }, + config_kwargs={}, + profile="s3-profile", + )