From c0f76f6bd8b8ca240695a12137a2ffbd5860da9a Mon Sep 17 00:00:00 2001 From: kirtimanmishrazipstack Date: Thu, 12 Mar 2026 10:14:54 +0530 Subject: [PATCH 1/6] code changes to access s3 using iam --- .../filesystems/minio/exceptions.py | 41 ++- .../connectors/filesystems/minio/minio.py | 44 ++- .../filesystems/minio/static/json_schema.json | 10 +- .../tests/filesystems/test_miniofs.py | 291 ++++++++++++++++++ .../src/unstract/sdk1/file_storage/helper.py | 29 +- unstract/sdk1/tests/file_storage/__init__.py | 0 .../sdk1/tests/file_storage/test_helper.py | 97 ++++++ 7 files changed, 478 insertions(+), 34 deletions(-) create mode 100644 unstract/sdk1/tests/file_storage/__init__.py create mode 100644 unstract/sdk1/tests/file_storage/test_helper.py diff --git a/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py b/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py index 0cc1a89772..684260d4bb 100644 --- a/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py +++ b/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py @@ -1,12 +1,27 @@ from unstract.connectors.exceptions import ConnectorError -S3FS_EXC_TO_UNSTRACT_EXC = { +S3FS_EXC_TO_UNSTRACT_EXC_STATIC = { "The AWS Access Key Id you provided does not exist in our records": ( "Invalid Key (Access Key ID) provided, please provide a valid one." ), "The request signature we calculated does not match the signature you provided": ( "Invalid Secret (Secret Access Key) provided, please provide a valid one." ), +} + +_AMBIENT_AUTH_MSG = ( + "AWS authentication failed — verify IAM role permissions " + "or IRSA service account annotation." +) + +S3FS_EXC_TO_UNSTRACT_EXC_AMBIENT = { + "The AWS Access Key Id you provided does not exist in our records": _AMBIENT_AUTH_MSG, + "The request signature we calculated does not match the signature you provided": ( + _AMBIENT_AUTH_MSG + ), +} + +S3FS_EXC_TO_UNSTRACT_EXC_COMMON = { "[Errno 22] S3 API Requests must be made to API port": ( # Minio only "Request made to invalid port, please check the port of the endpoint URL." ), @@ -17,7 +32,9 @@ } -def handle_s3fs_exception(e: Exception) -> ConnectorError: +def handle_s3fs_exception( + e: Exception, using_static_creds: bool = True +) -> ConnectorError: """Parses the exception from S3/MinIO. Helps parse the S3/MinIO error and wraps it with our @@ -25,6 +42,8 @@ def handle_s3fs_exception(e: Exception) -> ConnectorError: Args: e (Exception): Error from S3/MinIO + using_static_creds (bool): Whether static credentials were configured. + Controls the auth error message style (key/secret vs IAM/IRSA). Returns: ConnectorError: Unstract's ConnectorError object @@ -35,9 +54,21 @@ def handle_s3fs_exception(e: Exception) -> ConnectorError: original_exc = str(e) user_msg = "Error from S3 / MinIO while testing connection: " exc_to_append = "" - for s3fs_exc, user_friendly_msg in S3FS_EXC_TO_UNSTRACT_EXC.items(): - if s3fs_exc in original_exc: - exc_to_append = user_friendly_msg + + # Choose auth-error mapping based on credential mode + auth_map = ( + S3FS_EXC_TO_UNSTRACT_EXC_STATIC + if using_static_creds + else S3FS_EXC_TO_UNSTRACT_EXC_AMBIENT + ) + + # Check auth errors first, then common errors + for exc_map in (auth_map, S3FS_EXC_TO_UNSTRACT_EXC_COMMON): + for s3fs_exc, user_friendly_msg in exc_map.items(): + if s3fs_exc in original_exc: + exc_to_append = user_friendly_msg + break + if exc_to_append: break # Generic error handling diff --git a/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py b/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py index bf99e5f747..99d3b45272 100644 --- a/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py +++ b/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py @@ -16,22 +16,30 @@ class MinioFS(UnstractFileSystem): def __init__(self, settings: dict[str, Any]): super().__init__("MinioFS/S3") - key = settings.get("key", "") - secret = settings.get("secret", "") - endpoint_url = settings.get("endpoint_url", "") + key = (settings.get("key") or "").strip() + secret = (settings.get("secret") or "").strip() + endpoint_url = (settings.get("endpoint_url") or "").strip() client_kwargs = {} if "region_name" in settings and settings["region_name"] != "": client_kwargs = {"region_name": settings["region_name"]} + + creds: dict[str, str] = {} + if key and secret: + creds["key"] = key + creds["secret"] = secret + if endpoint_url: + creds["endpoint_url"] = endpoint_url + + self._using_static_creds = bool(key and secret) + self.s3 = S3FileSystem( anon=False, - key=key, - secret=secret, use_listings_cache=False, default_fill_cache=False, default_cache_type="none", skip_instance_cache=True, - endpoint_url=endpoint_url, client_kwargs=client_kwargs, + **creds, ) @staticmethod @@ -92,12 +100,13 @@ def extract_metadata_file_hash(self, metadata: dict[str, Any]) -> str | None: file_hash = file_hash.strip('"') if "-" in file_hash: logger.warning( - f"[S3/MinIO] Multipart upload detected. ETag may not be an " - f"MD5 hash. Full metadata: {metadata}" + "[S3/MinIO] Multipart upload detected. ETag may not be an " + "MD5 hash. Full metadata: %s", + metadata, ) return None return file_hash - logger.error(f"[MinIO] File hash not found for the metadata: {metadata}") + logger.error("[MinIO] File hash not found for the metadata: %s", metadata) return None def is_dir_by_metadata(self, metadata: dict[str, Any]) -> bool: @@ -119,7 +128,8 @@ def _find_modified_date_value(self, metadata: dict[str, Any]) -> Any | None: return last_modified logger.debug( - f"[S3/MinIO] No modified date found in metadata keys: {list(metadata.keys())}" + "[S3/MinIO] No modified date found in metadata keys: %s", + list(metadata.keys()), ) return None @@ -146,7 +156,9 @@ def _parse_string_datetime( return dt.astimezone(UTC) except (ValueError, TypeError): logger.warning( - f"[S3/MinIO] Failed to parse datetime '{date_str}' from metadata keys: {metadata_keys}" + "[S3/MinIO] Failed to parse datetime '%s' from metadata keys: %s", + date_str, + metadata_keys, ) return None @@ -155,7 +167,7 @@ def _parse_numeric_timestamp(self, timestamp: float) -> datetime | None: try: return datetime.fromtimestamp(timestamp, tz=UTC) except (ValueError, OSError): - logger.warning(f"[S3/MinIO] Invalid epoch timestamp: {timestamp}") + logger.warning("[S3/MinIO] Invalid epoch timestamp: %s", timestamp) return None def extract_modified_date(self, metadata: dict[str, Any]) -> datetime | None: @@ -183,7 +195,9 @@ def extract_modified_date(self, metadata: dict[str, Any]) -> datetime | None: return self._parse_numeric_timestamp(last_modified) logger.debug( - f"[S3/MinIO] Unsupported datetime type '{type(last_modified)}' in metadata keys: {list(metadata.keys())}" + "[S3/MinIO] Unsupported datetime type '%s' in metadata keys: %s", + type(last_modified), + list(metadata.keys()), ) return None @@ -195,5 +209,7 @@ def test_credentials(self) -> bool: try: self.get_fsspec_fs().ls("") except Exception as e: - raise handle_s3fs_exception(e) from e + raise handle_s3fs_exception( + e, using_static_creds=self._using_static_creds + ) from e return True diff --git a/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json b/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json index e357f48058..d0ba5ed916 100644 --- a/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json +++ b/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json @@ -4,8 +4,6 @@ "type": "object", "required": [ "connectorName", - "key", - "secret", "endpoint_url", "region_name" ], @@ -19,25 +17,25 @@ "type": "string", "title": "Key", "default": "", - "description": "Access Key ID" + "description": "Access Key ID (leave blank to use IAM role / instance profile)" }, "secret": { "type": "string", "title": "Secret", "format": "password", - "description": "Secret Access Key" + "description": "Secret Access Key (leave blank to use IAM role / instance profile)" }, "endpoint_url": { "type": "string", "title": "Endpoint URL", "default": "https://s3.amazonaws.com", - "description": "Endpoint URL to connect to. (example `https://s3.amazonaws.com`)" + "description": "Endpoint URL (leave blank for default AWS S3)" }, "region_name": { "type": "string", "title": "Region Name", "default": "ap-south", - "description": "Region of the AWS S3 account. For Minio, leave it blank" + "description": "Region of the AWS S3 account (leave blank for Minio)" } } } diff --git a/unstract/connectors/tests/filesystems/test_miniofs.py b/unstract/connectors/tests/filesystems/test_miniofs.py index 28dd00b134..d1632de0a1 100644 --- a/unstract/connectors/tests/filesystems/test_miniofs.py +++ b/unstract/connectors/tests/filesystems/test_miniofs.py @@ -1,6 +1,9 @@ import os import unittest +from unittest.mock import MagicMock, patch +from unstract.connectors.exceptions import ConnectorError +from unstract.connectors.filesystems.minio.exceptions import handle_s3fs_exception from unstract.connectors.filesystems.minio.minio import MinioFS @@ -39,5 +42,293 @@ def test_minio(self) -> None: print(s3.get_fsspec_fs().ls("/minio-test")) # type:ignore +class TestMinioFSCredentials(unittest.TestCase): + """Tests for IRSA / IAM role support — credential omission logic.""" + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_empty_key_secret_not_passed(self, mock_s3fs: MagicMock) -> None: + """Empty key/secret should NOT be forwarded to S3FileSystem.""" + MinioFS({"key": "", "secret": "", "endpoint_url": ""}) + _, kwargs = mock_s3fs.call_args + self.assertNotIn("key", kwargs) + self.assertNotIn("secret", kwargs) + self.assertNotIn("endpoint_url", kwargs) + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_missing_key_secret_not_passed(self, mock_s3fs: MagicMock) -> None: + """Missing key/secret (no keys in settings) should NOT be forwarded.""" + MinioFS({}) + _, kwargs = mock_s3fs.call_args + self.assertNotIn("key", kwargs) + self.assertNotIn("secret", kwargs) + self.assertNotIn("endpoint_url", kwargs) + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_real_credentials_passed_through(self, mock_s3fs: MagicMock) -> None: + """Real credentials should be forwarded to S3FileSystem.""" + MinioFS( + { + "key": "fake-access-key-id", + "secret": "fake-secret-access-key", + "endpoint_url": "https://s3.amazonaws.com", + } + ) + _, kwargs = mock_s3fs.call_args + self.assertEqual(kwargs["key"], "fake-access-key-id") + self.assertEqual(kwargs["secret"], "fake-secret-access-key") + self.assertEqual(kwargs["endpoint_url"], "https://s3.amazonaws.com") + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_endpoint_url_omitted_when_empty(self, mock_s3fs: MagicMock) -> None: + """Empty endpoint_url should NOT be forwarded.""" + MinioFS( + { + "key": "fake-access-key-id", + "secret": "fake-secret-access-key", + "endpoint_url": " ", + } + ) + _, kwargs = mock_s3fs.call_args + self.assertIn("key", kwargs) + self.assertNotIn("endpoint_url", kwargs) + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_endpoint_url_passed_when_present(self, mock_s3fs: MagicMock) -> None: + """Non-empty endpoint_url should be forwarded.""" + MinioFS( + { + "key": "fake-access-key-id", + "secret": "fake-secret-access-key", + "endpoint_url": "http://localhost:9000", + } + ) + _, kwargs = mock_s3fs.call_args + self.assertEqual(kwargs["endpoint_url"], "http://localhost:9000") + + # --- Partial credentials tests --- + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_key_only_without_secret_uses_ambient( + self, mock_s3fs: MagicMock + ) -> None: + """Key present but secret absent should fall back to ambient path.""" + MinioFS({"key": "fake-access-key-id", "secret": ""}) + _, kwargs = mock_s3fs.call_args + self.assertNotIn("key", kwargs) + self.assertNotIn("secret", kwargs) + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_secret_only_without_key_uses_ambient( + self, mock_s3fs: MagicMock + ) -> None: + """Secret present but key absent should fall back to ambient path.""" + MinioFS({"key": "", "secret": "fake-secret-access-key"}) + _, kwargs = mock_s3fs.call_args + self.assertNotIn("key", kwargs) + self.assertNotIn("secret", kwargs) + + # --- Whitespace-only credentials tests --- + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_whitespace_only_key_secret_uses_ambient( + self, mock_s3fs: MagicMock + ) -> None: + """Whitespace-only key/secret should fall back to ambient path.""" + MinioFS({"key": " ", "secret": " \t "}) + _, kwargs = mock_s3fs.call_args + self.assertNotIn("key", kwargs) + self.assertNotIn("secret", kwargs) + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_whitespace_key_with_valid_secret_uses_ambient( + self, mock_s3fs: MagicMock + ) -> None: + """Whitespace-only key with valid secret should fall back to ambient.""" + MinioFS( + {"key": " ", "secret": "fake-secret-access-key"} + ) + _, kwargs = mock_s3fs.call_args + self.assertNotIn("key", kwargs) + self.assertNotIn("secret", kwargs) + + # --- _using_static_creds flag tests --- + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_using_static_creds_true_when_creds_present( + self, mock_s3fs: MagicMock + ) -> None: + """_using_static_creds should be True when both key and secret are provided.""" + fs = MinioFS( + { + "key": "fake-access-key-id", + "secret": "fake-secret-access-key", + } + ) + self.assertTrue(fs._using_static_creds) + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_using_static_creds_false_when_creds_absent( + self, mock_s3fs: MagicMock + ) -> None: + """_using_static_creds should be False when key/secret are empty.""" + fs = MinioFS({"key": "", "secret": ""}) + self.assertFalse(fs._using_static_creds) + + @patch( + "unstract.connectors.filesystems.minio.minio.S3FileSystem", + return_value=MagicMock(), + ) + def test_using_static_creds_false_for_partial_creds( + self, mock_s3fs: MagicMock + ) -> None: + """_using_static_creds should be False when only one of key/secret is set.""" + fs = MinioFS({"key": "fake-access-key-id", "secret": ""}) + self.assertFalse(fs._using_static_creds) + + +class TestHandleS3fsException(unittest.TestCase): + """Tests for context-aware auth error messages in exceptions.py.""" + + def test_invalid_key_static_creds_message(self) -> None: + """Static creds mode: invalid key error references key/secret.""" + exc = Exception( + "The AWS Access Key Id you provided does not exist in our records" + ) + result = handle_s3fs_exception(exc, using_static_creds=True) + self.assertIsInstance(result, ConnectorError) + self.assertIn("Invalid Key", str(result)) + + def test_invalid_secret_static_creds_message(self) -> None: + """Static creds mode: signature mismatch references secret.""" + exc = Exception( + "The request signature we calculated does not match " + "the signature you provided" + ) + result = handle_s3fs_exception(exc, using_static_creds=True) + self.assertIsInstance(result, ConnectorError) + self.assertIn("Invalid Secret", str(result)) + + def test_invalid_key_ambient_creds_message(self) -> None: + """Ambient creds mode: invalid key error references IAM/IRSA.""" + exc = Exception( + "The AWS Access Key Id you provided does not exist in our records" + ) + result = handle_s3fs_exception(exc, using_static_creds=False) + self.assertIsInstance(result, ConnectorError) + msg = str(result) + self.assertIn("IAM role", msg) + self.assertIn("IRSA", msg) + self.assertNotIn("Invalid Key", msg) + + def test_invalid_secret_ambient_creds_message(self) -> None: + """Ambient creds mode: signature mismatch references IAM/IRSA.""" + exc = Exception( + "The request signature we calculated does not match " + "the signature you provided" + ) + result = handle_s3fs_exception(exc, using_static_creds=False) + self.assertIsInstance(result, ConnectorError) + msg = str(result) + self.assertIn("IAM role", msg) + self.assertIn("IRSA", msg) + self.assertNotIn("Invalid Secret", msg) + + def test_common_error_unaffected_by_creds_mode(self) -> None: + """Common errors (port, endpoint) should be the same regardless.""" + exc = Exception("[Errno 22] S3 API Requests must be made to API port") + result_static = handle_s3fs_exception(exc, using_static_creds=True) + result_ambient = handle_s3fs_exception(exc, using_static_creds=False) + self.assertIn("invalid port", str(result_static)) + self.assertIn("invalid port", str(result_ambient)) + + def test_unknown_error_passes_through(self) -> None: + """Unknown errors should pass through the original message.""" + exc = Exception("Something completely unexpected happened") + result = handle_s3fs_exception(exc, using_static_creds=True) + self.assertIn("Something completely unexpected happened", str(result)) + + def test_connector_error_passes_through(self) -> None: + """ConnectorError should be returned as-is.""" + exc = ConnectorError(message="Already wrapped") + result = handle_s3fs_exception(exc, using_static_creds=True) + self.assertIs(result, exc) + + def test_default_using_static_creds_is_true(self) -> None: + """Default value for using_static_creds should be True.""" + exc = Exception( + "The AWS Access Key Id you provided does not exist in our records" + ) + result = handle_s3fs_exception(exc) + self.assertIn("Invalid Key", str(result)) + + +# --- JSON schema validation tests --- + + +class TestMinioJsonSchema(unittest.TestCase): + """Tests for the JSON schema configuration.""" + + def test_schema_required_fields(self) -> None: + """Schema should require connectorName, endpoint_url, region_name.""" + import json + + schema_str = MinioFS.get_json_schema() + schema = json.loads(schema_str) + required = schema["required"] + self.assertIn("connectorName", required) + self.assertIn("endpoint_url", required) + self.assertIn("region_name", required) + # key and secret should NOT be required (IRSA support) + self.assertNotIn("key", required) + self.assertNotIn("secret", required) + + def test_schema_has_default_for_endpoint_url(self) -> None: + """endpoint_url should have a default value.""" + import json + + schema = json.loads(MinioFS.get_json_schema()) + self.assertIn("default", schema["properties"]["endpoint_url"]) + + def test_schema_has_default_for_region_name(self) -> None: + """region_name should have a default value.""" + import json + + schema = json.loads(MinioFS.get_json_schema()) + self.assertIn("default", schema["properties"]["region_name"]) + + if __name__ == "__main__": unittest.main() diff --git a/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py b/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py index 17e614b00a..4f32c515a4 100644 --- a/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py +++ b/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py @@ -1,5 +1,4 @@ import logging -from typing import Any import fsspec from fsspec import AbstractFileSystem @@ -12,7 +11,7 @@ class FileStorageHelper: @staticmethod def file_storage_init( - provider: FileStorageProvider, **storage_config: dict[str, Any] + provider: FileStorageProvider, **storage_config: object ) -> AbstractFileSystem: """Initialises file storage based on provider. @@ -28,24 +27,34 @@ def file_storage_init( protocol = provider.value if provider == FileStorageProvider.LOCAL: # Hard set auto_mkdir to True as default - storage_config.update({"auto_mkdir": True}) # type: ignore + storage_config.update({"auto_mkdir": True}) elif provider in [FileStorageProvider.MINIO]: # Initialise using s3 for Minio protocol = FileStorageProvider.S3.value + if provider in (FileStorageProvider.S3, FileStorageProvider.MINIO): + # Strip empty string values so boto3's credential chain + # can work (e.g., IRSA on EKS) + storage_config = { + k: v + for k, v in storage_config.items() + if not (isinstance(v, str) and v.strip() == "") + } + fs = fsspec.filesystem( protocol=protocol, **storage_config, ) - logger.debug(f"Connected to {provider.value} file system") + logger.debug("Connected to %s file system", provider.value) except KeyError as e: logger.error( - f"Error in initialising {provider.value} " - f"file system because of missing config {e}" + "Error in initialising %s file system because of missing config %s", + provider.value, + e, ) raise FileStorageError(str(e)) from e except Exception as e: - logger.error(f"Error in initialising {provider.value} file system {e}") + logger.error("Error in initialising %s file system %s", provider.value, e) raise FileStorageError(str(e)) from e return fs @@ -58,11 +67,13 @@ def local_file_system_init() -> AbstractFileSystem: """ try: fs = fsspec.filesystem(protocol=FileStorageProvider.LOCAL.value) - logger.debug(f"Connected to {FileStorageProvider.LOCAL.value} file system") + logger.debug("Connected to %s file system", FileStorageProvider.LOCAL.value) return fs except Exception as e: logger.error( - f"Error in initialising {FileStorageProvider.GCS.value} file system {e}" + "Error in initialising %s file system %s", + FileStorageProvider.GCS.value, + e, ) raise FileStorageError(str(e)) from e diff --git a/unstract/sdk1/tests/file_storage/__init__.py b/unstract/sdk1/tests/file_storage/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/unstract/sdk1/tests/file_storage/test_helper.py b/unstract/sdk1/tests/file_storage/test_helper.py new file mode 100644 index 0000000000..4375bd8223 --- /dev/null +++ b/unstract/sdk1/tests/file_storage/test_helper.py @@ -0,0 +1,97 @@ +from collections.abc import Generator +from unittest.mock import MagicMock, patch + +import pytest +from unstract.sdk1.file_storage.helper import FileStorageHelper +from unstract.sdk1.file_storage.provider import FileStorageProvider + + +@pytest.fixture +def mock_fsspec() -> Generator[MagicMock, None, None]: + with patch("unstract.sdk1.file_storage.helper.fsspec") as mock: + mock.filesystem.return_value = MagicMock() + yield mock + + +class TestFileStorageHelperStripping: + """Tests for empty string stripping in file_storage_init.""" + + def test_s3_empty_strings_stripped(self, mock_fsspec: MagicMock) -> None: + """Empty string credentials should be stripped for S3 provider.""" + FileStorageHelper.file_storage_init( + provider=FileStorageProvider.S3, + key="", + secret=" ", + endpoint_url="https://s3.us-east-1.amazonaws.com/", + ) + _, kwargs = mock_fsspec.filesystem.call_args + assert "key" not in kwargs + assert "secret" not in kwargs + assert kwargs["endpoint_url"] == "https://s3.us-east-1.amazonaws.com/" + + def test_minio_empty_strings_stripped(self, mock_fsspec: MagicMock) -> None: + """Empty string credentials should be stripped for MINIO provider.""" + FileStorageHelper.file_storage_init( + provider=FileStorageProvider.MINIO, + key="", + secret="", + endpoint_url="http://minio:9000", + ) + _, kwargs = mock_fsspec.filesystem.call_args + assert "key" not in kwargs + assert "secret" not in kwargs + assert kwargs["endpoint_url"] == "http://minio:9000" + + def test_s3_nonempty_credentials_preserved(self, mock_fsspec: MagicMock) -> None: + """Non-empty credentials should be passed through unchanged.""" + FileStorageHelper.file_storage_init( + provider=FileStorageProvider.S3, + key="fake-access-key-id", + secret="fake-secret-access-key", + ) + _, kwargs = mock_fsspec.filesystem.call_args + assert kwargs["key"] == "fake-access-key-id" + assert kwargs["secret"] == "fake-secret-access-key" + + def test_local_provider_unaffected(self, mock_fsspec: MagicMock) -> None: + """LOCAL provider should not have stripping applied.""" + FileStorageHelper.file_storage_init( + provider=FileStorageProvider.LOCAL, + ) + _, kwargs = mock_fsspec.filesystem.call_args + # LOCAL adds auto_mkdir, verify it's there + assert kwargs["auto_mkdir"] is True + + def test_s3_protocol_used(self, mock_fsspec: MagicMock) -> None: + """S3 provider should use 's3' protocol.""" + FileStorageHelper.file_storage_init( + provider=FileStorageProvider.S3, + key="test-key", + secret="test-secret", + ) + _, kwargs = mock_fsspec.filesystem.call_args + assert kwargs["protocol"] == "s3" + + def test_minio_uses_s3_protocol(self, mock_fsspec: MagicMock) -> None: + """MINIO provider should use 's3' protocol.""" + FileStorageHelper.file_storage_init( + provider=FileStorageProvider.MINIO, + key="minio", + secret="minio123", + endpoint_url="http://minio:9000", + ) + _, kwargs = mock_fsspec.filesystem.call_args + assert kwargs["protocol"] == "s3" + + def test_s3_non_string_values_preserved(self, mock_fsspec: MagicMock) -> None: + """Non-string config values (booleans, ints) should not be stripped.""" + FileStorageHelper.file_storage_init( + provider=FileStorageProvider.S3, + key="test-key", + secret="test-secret", + anon=False, + max_retries=3, + ) + _, kwargs = mock_fsspec.filesystem.call_args + assert kwargs["anon"] is False + assert kwargs["max_retries"] == 3 From 49d1a37731ea5c5913403f827abfd48783745773 Mon Sep 17 00:00:00 2001 From: kirtimanmishrazipstack Date: Thu, 12 Mar 2026 15:55:11 +0530 Subject: [PATCH 2/6] iam roles support in helm changes --- .../filesystems/minio/exceptions.py | 21 ++++++++++ .../tests/filesystems/test_miniofs.py | 39 +++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py b/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py index 684260d4bb..f83715174e 100644 --- a/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py +++ b/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py @@ -14,11 +14,32 @@ "or IRSA service account annotation." ) +_IRSA_NO_CREDS_MSG = ( + "No AWS credentials found. Ensure the pod's ServiceAccount is annotated " + "with an IAM role ARN for IRSA, or provide static access key/secret." +) + +_IRSA_ASSUME_ROLE_MSG = ( + "Failed to assume IAM role via IRSA. Verify the IAM role exists, has the " + "correct trust policy for the EKS OIDC provider, and the ServiceAccount " + "is annotated correctly." +) + +_IRSA_ACCESS_DENIED_MSG = ( + "Access denied — the IAM role does not have sufficient S3 permissions. " + "Ensure the role policy grants s3:GetObject, s3:PutObject, s3:ListBucket, " + "and s3:DeleteObject on the target bucket." +) + S3FS_EXC_TO_UNSTRACT_EXC_AMBIENT = { "The AWS Access Key Id you provided does not exist in our records": _AMBIENT_AUTH_MSG, "The request signature we calculated does not match the signature you provided": ( _AMBIENT_AUTH_MSG ), + "Unable to locate credentials": _IRSA_NO_CREDS_MSG, + "AssumeRoleWithWebIdentity": _IRSA_ASSUME_ROLE_MSG, + "InvalidIdentityToken": _IRSA_ASSUME_ROLE_MSG, + "Access Denied": _IRSA_ACCESS_DENIED_MSG, } S3FS_EXC_TO_UNSTRACT_EXC_COMMON = { diff --git a/unstract/connectors/tests/filesystems/test_miniofs.py b/unstract/connectors/tests/filesystems/test_miniofs.py index d1632de0a1..2e9c4a220b 100644 --- a/unstract/connectors/tests/filesystems/test_miniofs.py +++ b/unstract/connectors/tests/filesystems/test_miniofs.py @@ -286,6 +286,45 @@ def test_connector_error_passes_through(self) -> None: result = handle_s3fs_exception(exc, using_static_creds=True) self.assertIs(result, exc) + def test_ambient_no_credentials(self) -> None: + """Ambient mode: missing credentials gives IRSA guidance.""" + exc = Exception("Unable to locate credentials") + result = handle_s3fs_exception(exc, using_static_creds=False) + self.assertIsInstance(result, ConnectorError) + msg = str(result) + self.assertIn("No AWS credentials found", msg) + self.assertIn("ServiceAccount", msg) + + def test_ambient_assume_role_failure(self) -> None: + """Ambient mode: AssumeRoleWithWebIdentity error gives trust policy guidance.""" + exc = Exception( + "An error occurred (AccessDenied) when calling the " + "AssumeRoleWithWebIdentity operation" + ) + result = handle_s3fs_exception(exc, using_static_creds=False) + self.assertIsInstance(result, ConnectorError) + msg = str(result) + self.assertIn("Failed to assume IAM role via IRSA", msg) + self.assertIn("trust policy", msg) + + def test_ambient_invalid_identity_token(self) -> None: + """Ambient mode: InvalidIdentityToken gives same trust policy guidance.""" + exc = Exception("InvalidIdentityToken: Token is expired or invalid") + result = handle_s3fs_exception(exc, using_static_creds=False) + self.assertIsInstance(result, ConnectorError) + msg = str(result) + self.assertIn("Failed to assume IAM role via IRSA", msg) + self.assertIn("trust policy", msg) + + def test_ambient_access_denied(self) -> None: + """Ambient mode: Access Denied gives S3 permissions guidance.""" + exc = Exception("Access Denied") + result = handle_s3fs_exception(exc, using_static_creds=False) + self.assertIsInstance(result, ConnectorError) + msg = str(result) + self.assertIn("Access denied", msg) + self.assertIn("s3:GetObject", msg) + def test_default_using_static_creds_is_true(self) -> None: """Default value for using_static_creds should be True.""" exc = Exception( From 8c3acae1056b5f69fe8871da15e2c6293f9bad8e Mon Sep 17 00:00:00 2001 From: kirtimanmishrazipstack Date: Thu, 12 Mar 2026 16:03:25 +0530 Subject: [PATCH 3/6] removed test file --- .../tests/filesystems/test_miniofs.py | 330 ------------------ .../sdk1/tests/file_storage/test_helper.py | 97 ----- 2 files changed, 427 deletions(-) delete mode 100644 unstract/sdk1/tests/file_storage/test_helper.py diff --git a/unstract/connectors/tests/filesystems/test_miniofs.py b/unstract/connectors/tests/filesystems/test_miniofs.py index 2e9c4a220b..28dd00b134 100644 --- a/unstract/connectors/tests/filesystems/test_miniofs.py +++ b/unstract/connectors/tests/filesystems/test_miniofs.py @@ -1,9 +1,6 @@ import os import unittest -from unittest.mock import MagicMock, patch -from unstract.connectors.exceptions import ConnectorError -from unstract.connectors.filesystems.minio.exceptions import handle_s3fs_exception from unstract.connectors.filesystems.minio.minio import MinioFS @@ -42,332 +39,5 @@ def test_minio(self) -> None: print(s3.get_fsspec_fs().ls("/minio-test")) # type:ignore -class TestMinioFSCredentials(unittest.TestCase): - """Tests for IRSA / IAM role support — credential omission logic.""" - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_empty_key_secret_not_passed(self, mock_s3fs: MagicMock) -> None: - """Empty key/secret should NOT be forwarded to S3FileSystem.""" - MinioFS({"key": "", "secret": "", "endpoint_url": ""}) - _, kwargs = mock_s3fs.call_args - self.assertNotIn("key", kwargs) - self.assertNotIn("secret", kwargs) - self.assertNotIn("endpoint_url", kwargs) - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_missing_key_secret_not_passed(self, mock_s3fs: MagicMock) -> None: - """Missing key/secret (no keys in settings) should NOT be forwarded.""" - MinioFS({}) - _, kwargs = mock_s3fs.call_args - self.assertNotIn("key", kwargs) - self.assertNotIn("secret", kwargs) - self.assertNotIn("endpoint_url", kwargs) - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_real_credentials_passed_through(self, mock_s3fs: MagicMock) -> None: - """Real credentials should be forwarded to S3FileSystem.""" - MinioFS( - { - "key": "fake-access-key-id", - "secret": "fake-secret-access-key", - "endpoint_url": "https://s3.amazonaws.com", - } - ) - _, kwargs = mock_s3fs.call_args - self.assertEqual(kwargs["key"], "fake-access-key-id") - self.assertEqual(kwargs["secret"], "fake-secret-access-key") - self.assertEqual(kwargs["endpoint_url"], "https://s3.amazonaws.com") - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_endpoint_url_omitted_when_empty(self, mock_s3fs: MagicMock) -> None: - """Empty endpoint_url should NOT be forwarded.""" - MinioFS( - { - "key": "fake-access-key-id", - "secret": "fake-secret-access-key", - "endpoint_url": " ", - } - ) - _, kwargs = mock_s3fs.call_args - self.assertIn("key", kwargs) - self.assertNotIn("endpoint_url", kwargs) - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_endpoint_url_passed_when_present(self, mock_s3fs: MagicMock) -> None: - """Non-empty endpoint_url should be forwarded.""" - MinioFS( - { - "key": "fake-access-key-id", - "secret": "fake-secret-access-key", - "endpoint_url": "http://localhost:9000", - } - ) - _, kwargs = mock_s3fs.call_args - self.assertEqual(kwargs["endpoint_url"], "http://localhost:9000") - - # --- Partial credentials tests --- - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_key_only_without_secret_uses_ambient( - self, mock_s3fs: MagicMock - ) -> None: - """Key present but secret absent should fall back to ambient path.""" - MinioFS({"key": "fake-access-key-id", "secret": ""}) - _, kwargs = mock_s3fs.call_args - self.assertNotIn("key", kwargs) - self.assertNotIn("secret", kwargs) - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_secret_only_without_key_uses_ambient( - self, mock_s3fs: MagicMock - ) -> None: - """Secret present but key absent should fall back to ambient path.""" - MinioFS({"key": "", "secret": "fake-secret-access-key"}) - _, kwargs = mock_s3fs.call_args - self.assertNotIn("key", kwargs) - self.assertNotIn("secret", kwargs) - - # --- Whitespace-only credentials tests --- - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_whitespace_only_key_secret_uses_ambient( - self, mock_s3fs: MagicMock - ) -> None: - """Whitespace-only key/secret should fall back to ambient path.""" - MinioFS({"key": " ", "secret": " \t "}) - _, kwargs = mock_s3fs.call_args - self.assertNotIn("key", kwargs) - self.assertNotIn("secret", kwargs) - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_whitespace_key_with_valid_secret_uses_ambient( - self, mock_s3fs: MagicMock - ) -> None: - """Whitespace-only key with valid secret should fall back to ambient.""" - MinioFS( - {"key": " ", "secret": "fake-secret-access-key"} - ) - _, kwargs = mock_s3fs.call_args - self.assertNotIn("key", kwargs) - self.assertNotIn("secret", kwargs) - - # --- _using_static_creds flag tests --- - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_using_static_creds_true_when_creds_present( - self, mock_s3fs: MagicMock - ) -> None: - """_using_static_creds should be True when both key and secret are provided.""" - fs = MinioFS( - { - "key": "fake-access-key-id", - "secret": "fake-secret-access-key", - } - ) - self.assertTrue(fs._using_static_creds) - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_using_static_creds_false_when_creds_absent( - self, mock_s3fs: MagicMock - ) -> None: - """_using_static_creds should be False when key/secret are empty.""" - fs = MinioFS({"key": "", "secret": ""}) - self.assertFalse(fs._using_static_creds) - - @patch( - "unstract.connectors.filesystems.minio.minio.S3FileSystem", - return_value=MagicMock(), - ) - def test_using_static_creds_false_for_partial_creds( - self, mock_s3fs: MagicMock - ) -> None: - """_using_static_creds should be False when only one of key/secret is set.""" - fs = MinioFS({"key": "fake-access-key-id", "secret": ""}) - self.assertFalse(fs._using_static_creds) - - -class TestHandleS3fsException(unittest.TestCase): - """Tests for context-aware auth error messages in exceptions.py.""" - - def test_invalid_key_static_creds_message(self) -> None: - """Static creds mode: invalid key error references key/secret.""" - exc = Exception( - "The AWS Access Key Id you provided does not exist in our records" - ) - result = handle_s3fs_exception(exc, using_static_creds=True) - self.assertIsInstance(result, ConnectorError) - self.assertIn("Invalid Key", str(result)) - - def test_invalid_secret_static_creds_message(self) -> None: - """Static creds mode: signature mismatch references secret.""" - exc = Exception( - "The request signature we calculated does not match " - "the signature you provided" - ) - result = handle_s3fs_exception(exc, using_static_creds=True) - self.assertIsInstance(result, ConnectorError) - self.assertIn("Invalid Secret", str(result)) - - def test_invalid_key_ambient_creds_message(self) -> None: - """Ambient creds mode: invalid key error references IAM/IRSA.""" - exc = Exception( - "The AWS Access Key Id you provided does not exist in our records" - ) - result = handle_s3fs_exception(exc, using_static_creds=False) - self.assertIsInstance(result, ConnectorError) - msg = str(result) - self.assertIn("IAM role", msg) - self.assertIn("IRSA", msg) - self.assertNotIn("Invalid Key", msg) - - def test_invalid_secret_ambient_creds_message(self) -> None: - """Ambient creds mode: signature mismatch references IAM/IRSA.""" - exc = Exception( - "The request signature we calculated does not match " - "the signature you provided" - ) - result = handle_s3fs_exception(exc, using_static_creds=False) - self.assertIsInstance(result, ConnectorError) - msg = str(result) - self.assertIn("IAM role", msg) - self.assertIn("IRSA", msg) - self.assertNotIn("Invalid Secret", msg) - - def test_common_error_unaffected_by_creds_mode(self) -> None: - """Common errors (port, endpoint) should be the same regardless.""" - exc = Exception("[Errno 22] S3 API Requests must be made to API port") - result_static = handle_s3fs_exception(exc, using_static_creds=True) - result_ambient = handle_s3fs_exception(exc, using_static_creds=False) - self.assertIn("invalid port", str(result_static)) - self.assertIn("invalid port", str(result_ambient)) - - def test_unknown_error_passes_through(self) -> None: - """Unknown errors should pass through the original message.""" - exc = Exception("Something completely unexpected happened") - result = handle_s3fs_exception(exc, using_static_creds=True) - self.assertIn("Something completely unexpected happened", str(result)) - - def test_connector_error_passes_through(self) -> None: - """ConnectorError should be returned as-is.""" - exc = ConnectorError(message="Already wrapped") - result = handle_s3fs_exception(exc, using_static_creds=True) - self.assertIs(result, exc) - - def test_ambient_no_credentials(self) -> None: - """Ambient mode: missing credentials gives IRSA guidance.""" - exc = Exception("Unable to locate credentials") - result = handle_s3fs_exception(exc, using_static_creds=False) - self.assertIsInstance(result, ConnectorError) - msg = str(result) - self.assertIn("No AWS credentials found", msg) - self.assertIn("ServiceAccount", msg) - - def test_ambient_assume_role_failure(self) -> None: - """Ambient mode: AssumeRoleWithWebIdentity error gives trust policy guidance.""" - exc = Exception( - "An error occurred (AccessDenied) when calling the " - "AssumeRoleWithWebIdentity operation" - ) - result = handle_s3fs_exception(exc, using_static_creds=False) - self.assertIsInstance(result, ConnectorError) - msg = str(result) - self.assertIn("Failed to assume IAM role via IRSA", msg) - self.assertIn("trust policy", msg) - - def test_ambient_invalid_identity_token(self) -> None: - """Ambient mode: InvalidIdentityToken gives same trust policy guidance.""" - exc = Exception("InvalidIdentityToken: Token is expired or invalid") - result = handle_s3fs_exception(exc, using_static_creds=False) - self.assertIsInstance(result, ConnectorError) - msg = str(result) - self.assertIn("Failed to assume IAM role via IRSA", msg) - self.assertIn("trust policy", msg) - - def test_ambient_access_denied(self) -> None: - """Ambient mode: Access Denied gives S3 permissions guidance.""" - exc = Exception("Access Denied") - result = handle_s3fs_exception(exc, using_static_creds=False) - self.assertIsInstance(result, ConnectorError) - msg = str(result) - self.assertIn("Access denied", msg) - self.assertIn("s3:GetObject", msg) - - def test_default_using_static_creds_is_true(self) -> None: - """Default value for using_static_creds should be True.""" - exc = Exception( - "The AWS Access Key Id you provided does not exist in our records" - ) - result = handle_s3fs_exception(exc) - self.assertIn("Invalid Key", str(result)) - - -# --- JSON schema validation tests --- - - -class TestMinioJsonSchema(unittest.TestCase): - """Tests for the JSON schema configuration.""" - - def test_schema_required_fields(self) -> None: - """Schema should require connectorName, endpoint_url, region_name.""" - import json - - schema_str = MinioFS.get_json_schema() - schema = json.loads(schema_str) - required = schema["required"] - self.assertIn("connectorName", required) - self.assertIn("endpoint_url", required) - self.assertIn("region_name", required) - # key and secret should NOT be required (IRSA support) - self.assertNotIn("key", required) - self.assertNotIn("secret", required) - - def test_schema_has_default_for_endpoint_url(self) -> None: - """endpoint_url should have a default value.""" - import json - - schema = json.loads(MinioFS.get_json_schema()) - self.assertIn("default", schema["properties"]["endpoint_url"]) - - def test_schema_has_default_for_region_name(self) -> None: - """region_name should have a default value.""" - import json - - schema = json.loads(MinioFS.get_json_schema()) - self.assertIn("default", schema["properties"]["region_name"]) - - if __name__ == "__main__": unittest.main() diff --git a/unstract/sdk1/tests/file_storage/test_helper.py b/unstract/sdk1/tests/file_storage/test_helper.py deleted file mode 100644 index 4375bd8223..0000000000 --- a/unstract/sdk1/tests/file_storage/test_helper.py +++ /dev/null @@ -1,97 +0,0 @@ -from collections.abc import Generator -from unittest.mock import MagicMock, patch - -import pytest -from unstract.sdk1.file_storage.helper import FileStorageHelper -from unstract.sdk1.file_storage.provider import FileStorageProvider - - -@pytest.fixture -def mock_fsspec() -> Generator[MagicMock, None, None]: - with patch("unstract.sdk1.file_storage.helper.fsspec") as mock: - mock.filesystem.return_value = MagicMock() - yield mock - - -class TestFileStorageHelperStripping: - """Tests for empty string stripping in file_storage_init.""" - - def test_s3_empty_strings_stripped(self, mock_fsspec: MagicMock) -> None: - """Empty string credentials should be stripped for S3 provider.""" - FileStorageHelper.file_storage_init( - provider=FileStorageProvider.S3, - key="", - secret=" ", - endpoint_url="https://s3.us-east-1.amazonaws.com/", - ) - _, kwargs = mock_fsspec.filesystem.call_args - assert "key" not in kwargs - assert "secret" not in kwargs - assert kwargs["endpoint_url"] == "https://s3.us-east-1.amazonaws.com/" - - def test_minio_empty_strings_stripped(self, mock_fsspec: MagicMock) -> None: - """Empty string credentials should be stripped for MINIO provider.""" - FileStorageHelper.file_storage_init( - provider=FileStorageProvider.MINIO, - key="", - secret="", - endpoint_url="http://minio:9000", - ) - _, kwargs = mock_fsspec.filesystem.call_args - assert "key" not in kwargs - assert "secret" not in kwargs - assert kwargs["endpoint_url"] == "http://minio:9000" - - def test_s3_nonempty_credentials_preserved(self, mock_fsspec: MagicMock) -> None: - """Non-empty credentials should be passed through unchanged.""" - FileStorageHelper.file_storage_init( - provider=FileStorageProvider.S3, - key="fake-access-key-id", - secret="fake-secret-access-key", - ) - _, kwargs = mock_fsspec.filesystem.call_args - assert kwargs["key"] == "fake-access-key-id" - assert kwargs["secret"] == "fake-secret-access-key" - - def test_local_provider_unaffected(self, mock_fsspec: MagicMock) -> None: - """LOCAL provider should not have stripping applied.""" - FileStorageHelper.file_storage_init( - provider=FileStorageProvider.LOCAL, - ) - _, kwargs = mock_fsspec.filesystem.call_args - # LOCAL adds auto_mkdir, verify it's there - assert kwargs["auto_mkdir"] is True - - def test_s3_protocol_used(self, mock_fsspec: MagicMock) -> None: - """S3 provider should use 's3' protocol.""" - FileStorageHelper.file_storage_init( - provider=FileStorageProvider.S3, - key="test-key", - secret="test-secret", - ) - _, kwargs = mock_fsspec.filesystem.call_args - assert kwargs["protocol"] == "s3" - - def test_minio_uses_s3_protocol(self, mock_fsspec: MagicMock) -> None: - """MINIO provider should use 's3' protocol.""" - FileStorageHelper.file_storage_init( - provider=FileStorageProvider.MINIO, - key="minio", - secret="minio123", - endpoint_url="http://minio:9000", - ) - _, kwargs = mock_fsspec.filesystem.call_args - assert kwargs["protocol"] == "s3" - - def test_s3_non_string_values_preserved(self, mock_fsspec: MagicMock) -> None: - """Non-string config values (booleans, ints) should not be stripped.""" - FileStorageHelper.file_storage_init( - provider=FileStorageProvider.S3, - key="test-key", - secret="test-secret", - anon=False, - max_retries=3, - ) - _, kwargs = mock_fsspec.filesystem.call_args - assert kwargs["anon"] is False - assert kwargs["max_retries"] == 3 From cc21ab10ddad9f7290f66c6108145ca4ab6417ea Mon Sep 17 00:00:00 2001 From: kirtimanmishrazipstack Date: Wed, 18 Mar 2026 18:01:27 +0530 Subject: [PATCH 4/6] passing region_name as top level argument in sdk --- unstract/sdk1/src/unstract/sdk1/file_storage/helper.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py b/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py index 4f32c515a4..912bf282a5 100644 --- a/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py +++ b/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py @@ -40,6 +40,15 @@ def file_storage_init( for k, v in storage_config.items() if not (isinstance(v, str) and v.strip() == "") } + # s3fs expects region_name inside client_kwargs, + # not as a top-level arg + region_name = storage_config.pop("region_name", None) + if region_name: + client_kwargs: dict[str, object] = storage_config.get( # type: ignore[assignment] + "client_kwargs", {} + ) + client_kwargs["region_name"] = region_name + storage_config["client_kwargs"] = client_kwargs fs = fsspec.filesystem( protocol=protocol, From b8da3981b79daa85f1a11b0c13bf072842c7f98d Mon Sep 17 00:00:00 2001 From: kirtimanmishrazipstack Date: Wed, 18 Mar 2026 18:46:38 +0530 Subject: [PATCH 5/6] removing connector changes --- .../filesystems/minio/exceptions.py | 62 ++----------------- .../connectors/filesystems/minio/minio.py | 44 +++++-------- .../filesystems/minio/static/json_schema.json | 10 +-- 3 files changed, 25 insertions(+), 91 deletions(-) diff --git a/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py b/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py index f83715174e..0cc1a89772 100644 --- a/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py +++ b/unstract/connectors/src/unstract/connectors/filesystems/minio/exceptions.py @@ -1,48 +1,12 @@ from unstract.connectors.exceptions import ConnectorError -S3FS_EXC_TO_UNSTRACT_EXC_STATIC = { +S3FS_EXC_TO_UNSTRACT_EXC = { "The AWS Access Key Id you provided does not exist in our records": ( "Invalid Key (Access Key ID) provided, please provide a valid one." ), "The request signature we calculated does not match the signature you provided": ( "Invalid Secret (Secret Access Key) provided, please provide a valid one." ), -} - -_AMBIENT_AUTH_MSG = ( - "AWS authentication failed — verify IAM role permissions " - "or IRSA service account annotation." -) - -_IRSA_NO_CREDS_MSG = ( - "No AWS credentials found. Ensure the pod's ServiceAccount is annotated " - "with an IAM role ARN for IRSA, or provide static access key/secret." -) - -_IRSA_ASSUME_ROLE_MSG = ( - "Failed to assume IAM role via IRSA. Verify the IAM role exists, has the " - "correct trust policy for the EKS OIDC provider, and the ServiceAccount " - "is annotated correctly." -) - -_IRSA_ACCESS_DENIED_MSG = ( - "Access denied — the IAM role does not have sufficient S3 permissions. " - "Ensure the role policy grants s3:GetObject, s3:PutObject, s3:ListBucket, " - "and s3:DeleteObject on the target bucket." -) - -S3FS_EXC_TO_UNSTRACT_EXC_AMBIENT = { - "The AWS Access Key Id you provided does not exist in our records": _AMBIENT_AUTH_MSG, - "The request signature we calculated does not match the signature you provided": ( - _AMBIENT_AUTH_MSG - ), - "Unable to locate credentials": _IRSA_NO_CREDS_MSG, - "AssumeRoleWithWebIdentity": _IRSA_ASSUME_ROLE_MSG, - "InvalidIdentityToken": _IRSA_ASSUME_ROLE_MSG, - "Access Denied": _IRSA_ACCESS_DENIED_MSG, -} - -S3FS_EXC_TO_UNSTRACT_EXC_COMMON = { "[Errno 22] S3 API Requests must be made to API port": ( # Minio only "Request made to invalid port, please check the port of the endpoint URL." ), @@ -53,9 +17,7 @@ } -def handle_s3fs_exception( - e: Exception, using_static_creds: bool = True -) -> ConnectorError: +def handle_s3fs_exception(e: Exception) -> ConnectorError: """Parses the exception from S3/MinIO. Helps parse the S3/MinIO error and wraps it with our @@ -63,8 +25,6 @@ def handle_s3fs_exception( Args: e (Exception): Error from S3/MinIO - using_static_creds (bool): Whether static credentials were configured. - Controls the auth error message style (key/secret vs IAM/IRSA). Returns: ConnectorError: Unstract's ConnectorError object @@ -75,21 +35,9 @@ def handle_s3fs_exception( original_exc = str(e) user_msg = "Error from S3 / MinIO while testing connection: " exc_to_append = "" - - # Choose auth-error mapping based on credential mode - auth_map = ( - S3FS_EXC_TO_UNSTRACT_EXC_STATIC - if using_static_creds - else S3FS_EXC_TO_UNSTRACT_EXC_AMBIENT - ) - - # Check auth errors first, then common errors - for exc_map in (auth_map, S3FS_EXC_TO_UNSTRACT_EXC_COMMON): - for s3fs_exc, user_friendly_msg in exc_map.items(): - if s3fs_exc in original_exc: - exc_to_append = user_friendly_msg - break - if exc_to_append: + for s3fs_exc, user_friendly_msg in S3FS_EXC_TO_UNSTRACT_EXC.items(): + if s3fs_exc in original_exc: + exc_to_append = user_friendly_msg break # Generic error handling diff --git a/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py b/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py index 99d3b45272..bf99e5f747 100644 --- a/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py +++ b/unstract/connectors/src/unstract/connectors/filesystems/minio/minio.py @@ -16,30 +16,22 @@ class MinioFS(UnstractFileSystem): def __init__(self, settings: dict[str, Any]): super().__init__("MinioFS/S3") - key = (settings.get("key") or "").strip() - secret = (settings.get("secret") or "").strip() - endpoint_url = (settings.get("endpoint_url") or "").strip() + key = settings.get("key", "") + secret = settings.get("secret", "") + endpoint_url = settings.get("endpoint_url", "") client_kwargs = {} if "region_name" in settings and settings["region_name"] != "": client_kwargs = {"region_name": settings["region_name"]} - - creds: dict[str, str] = {} - if key and secret: - creds["key"] = key - creds["secret"] = secret - if endpoint_url: - creds["endpoint_url"] = endpoint_url - - self._using_static_creds = bool(key and secret) - self.s3 = S3FileSystem( anon=False, + key=key, + secret=secret, use_listings_cache=False, default_fill_cache=False, default_cache_type="none", skip_instance_cache=True, + endpoint_url=endpoint_url, client_kwargs=client_kwargs, - **creds, ) @staticmethod @@ -100,13 +92,12 @@ def extract_metadata_file_hash(self, metadata: dict[str, Any]) -> str | None: file_hash = file_hash.strip('"') if "-" in file_hash: logger.warning( - "[S3/MinIO] Multipart upload detected. ETag may not be an " - "MD5 hash. Full metadata: %s", - metadata, + f"[S3/MinIO] Multipart upload detected. ETag may not be an " + f"MD5 hash. Full metadata: {metadata}" ) return None return file_hash - logger.error("[MinIO] File hash not found for the metadata: %s", metadata) + logger.error(f"[MinIO] File hash not found for the metadata: {metadata}") return None def is_dir_by_metadata(self, metadata: dict[str, Any]) -> bool: @@ -128,8 +119,7 @@ def _find_modified_date_value(self, metadata: dict[str, Any]) -> Any | None: return last_modified logger.debug( - "[S3/MinIO] No modified date found in metadata keys: %s", - list(metadata.keys()), + f"[S3/MinIO] No modified date found in metadata keys: {list(metadata.keys())}" ) return None @@ -156,9 +146,7 @@ def _parse_string_datetime( return dt.astimezone(UTC) except (ValueError, TypeError): logger.warning( - "[S3/MinIO] Failed to parse datetime '%s' from metadata keys: %s", - date_str, - metadata_keys, + f"[S3/MinIO] Failed to parse datetime '{date_str}' from metadata keys: {metadata_keys}" ) return None @@ -167,7 +155,7 @@ def _parse_numeric_timestamp(self, timestamp: float) -> datetime | None: try: return datetime.fromtimestamp(timestamp, tz=UTC) except (ValueError, OSError): - logger.warning("[S3/MinIO] Invalid epoch timestamp: %s", timestamp) + logger.warning(f"[S3/MinIO] Invalid epoch timestamp: {timestamp}") return None def extract_modified_date(self, metadata: dict[str, Any]) -> datetime | None: @@ -195,9 +183,7 @@ def extract_modified_date(self, metadata: dict[str, Any]) -> datetime | None: return self._parse_numeric_timestamp(last_modified) logger.debug( - "[S3/MinIO] Unsupported datetime type '%s' in metadata keys: %s", - type(last_modified), - list(metadata.keys()), + f"[S3/MinIO] Unsupported datetime type '{type(last_modified)}' in metadata keys: {list(metadata.keys())}" ) return None @@ -209,7 +195,5 @@ def test_credentials(self) -> bool: try: self.get_fsspec_fs().ls("") except Exception as e: - raise handle_s3fs_exception( - e, using_static_creds=self._using_static_creds - ) from e + raise handle_s3fs_exception(e) from e return True diff --git a/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json b/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json index d0ba5ed916..e357f48058 100644 --- a/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json +++ b/unstract/connectors/src/unstract/connectors/filesystems/minio/static/json_schema.json @@ -4,6 +4,8 @@ "type": "object", "required": [ "connectorName", + "key", + "secret", "endpoint_url", "region_name" ], @@ -17,25 +19,25 @@ "type": "string", "title": "Key", "default": "", - "description": "Access Key ID (leave blank to use IAM role / instance profile)" + "description": "Access Key ID" }, "secret": { "type": "string", "title": "Secret", "format": "password", - "description": "Secret Access Key (leave blank to use IAM role / instance profile)" + "description": "Secret Access Key" }, "endpoint_url": { "type": "string", "title": "Endpoint URL", "default": "https://s3.amazonaws.com", - "description": "Endpoint URL (leave blank for default AWS S3)" + "description": "Endpoint URL to connect to. (example `https://s3.amazonaws.com`)" }, "region_name": { "type": "string", "title": "Region Name", "default": "ap-south", - "description": "Region of the AWS S3 account (leave blank for Minio)" + "description": "Region of the AWS S3 account. For Minio, leave it blank" } } } From ea91d6fd01cbd8dd19b66dacf49a92a234199143 Mon Sep 17 00:00:00 2001 From: kirtimanmishrazipstack Date: Thu, 19 Mar 2026 10:29:35 +0530 Subject: [PATCH 6/6] greptile reviews --- unstract/sdk1/src/unstract/sdk1/file_storage/helper.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py b/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py index 912bf282a5..8fdf7e1069 100644 --- a/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py +++ b/unstract/sdk1/src/unstract/sdk1/file_storage/helper.py @@ -44,9 +44,10 @@ def file_storage_init( # not as a top-level arg region_name = storage_config.pop("region_name", None) if region_name: - client_kwargs: dict[str, object] = storage_config.get( # type: ignore[assignment] - "client_kwargs", {} - ) + existing_kwargs = storage_config.get("client_kwargs", {}) + if not isinstance(existing_kwargs, dict): + existing_kwargs = {} + client_kwargs: dict[str, object] = existing_kwargs client_kwargs["region_name"] = region_name storage_config["client_kwargs"] = client_kwargs @@ -81,7 +82,7 @@ def local_file_system_init() -> AbstractFileSystem: except Exception as e: logger.error( "Error in initialising %s file system %s", - FileStorageProvider.GCS.value, + FileStorageProvider.LOCAL.value, e, ) raise FileStorageError(str(e)) from e