Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,19 +1,63 @@
from unstract.connectors.exceptions import ConnectorError

# Maps a fragment of an S3 / MinIO error message to a user-facing explanation.
# NOTE(review): the lookup code is not visible here; keys look like substrings
# to be searched for in the raised exception's text - confirm against the
# caller before relying on insertion order or on exact-match semantics.
S3FS_EXC_TO_UNSTRACT_EXC: dict[str, str] = {
    # Auth errors
    "The AWS Access Key Id you provided does not exist in our records": (
        "Invalid Key (Access Key ID) provided, please provide a valid one."
    ),
    "The request signature we calculated does not match the signature you provided": (
        "Invalid Secret (Secret Access Key) provided, please provide a valid one."
    ),
    "Unable to locate credentials": (
        "No AWS credentials found. Provide a valid access key/secret or ensure "
        "the instance/pod has an IAM role attached."
    ),
    "AssumeRoleWithWebIdentity": (
        "Failed to assume IAM role via web identity. Verify the IAM role exists "
        "and has the correct trust policy."
    ),
    "InvalidIdentityToken": (
        "The identity token provided is invalid. Verify the OIDC provider "
        "and ServiceAccount configuration."
    ),
    "ExpiredToken": (
        "AWS security token has expired. Refresh your credentials or ensure "
        "IAM role session duration is sufficient."
    ),
    # Permission errors
    "AccessDenied": (
        "Access denied. The IAM user or role does not have sufficient S3 "
        "permissions. Ensure the policy grants the required S3 actions "
        "(s3:ListAllMyBuckets, s3:ListBucket, s3:GetObject, s3:PutObject) "
        "on the target bucket."
    ),
    # Bucket errors
    "NoSuchBucket": (
        "The specified bucket does not exist. Please check the bucket name."
    ),
    # Endpoint / connectivity errors
    "[Errno 22] S3 API Requests must be made to API port": (  # Minio only
        "Request made to invalid port, please check the port of the endpoint URL."
    ),
    "Invalid endpoint": (
        "Could not connect to the endpoint URL. Please check if the URL is correct "
        "and accessible."
    ),
    "timed out": (
        "Connection timed out. Check network connectivity and the endpoint URL."
    ),
    "SSL: CERTIFICATE_VERIFY_FAILED": (
        "SSL certificate verification failed. If using a self-signed certificate "
        "(e.g. MinIO), check your endpoint configuration."
    ),
    "Name or service not known": (
        "Could not resolve the endpoint hostname. Please check the endpoint URL."
    ),
    # Clock / request errors
    "RequestTimeTooSkewed": (
        "The system clock is out of sync with AWS. Ensure the host's clock "
        "is accurate (max allowed skew is 15 minutes)."
    ),
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,28 @@
class MinioFS(UnstractFileSystem):
def __init__(self, settings: dict[str, Any]):
    """Build the underlying ``S3FileSystem`` from connector settings.

    Args:
        settings: Connector configuration. The keys read here are
            ``key``, ``secret``, ``endpoint_url`` and ``region_name``;
            each may be missing, ``None`` or blank.

    Explicit credentials and the endpoint URL are forwarded to
    ``S3FileSystem`` only when they are non-empty, so that a blank
    configuration leaves credential / endpoint resolution to the
    underlying client instead of sending empty strings.
    """
    super().__init__("MinioFS/S3")

    # `or ""` guards against explicit None values; strip() drops stray
    # whitespace that commonly sneaks in when values are pasted into a form.
    key = (settings.get("key") or "").strip()
    secret = (settings.get("secret") or "").strip()
    endpoint_url = (settings.get("endpoint_url") or "").strip()
    region_name = (settings.get("region_name") or "").strip()

    client_kwargs: dict[str, Any] = {}
    if region_name:
        client_kwargs["region_name"] = region_name

    # Only pass static credentials when BOTH halves are present; a lone key
    # or lone secret is ignored rather than sent half-configured.
    creds: dict[str, str] = {}
    if key and secret:
        creds["key"] = key
        creds["secret"] = secret
    if endpoint_url:
        creds["endpoint_url"] = endpoint_url

    # key / secret / endpoint_url are supplied exclusively through **creds;
    # naming them again as explicit keywords would raise a duplicate-keyword
    # TypeError whenever creds is populated.
    self.s3 = S3FileSystem(
        anon=False,
        use_listings_cache=False,
        default_fill_cache=False,
        default_cache_type="none",
        skip_instance_cache=True,
        client_kwargs=client_kwargs,
        **creds,
    )

@staticmethod
Expand Down Expand Up @@ -87,17 +93,18 @@ def extract_metadata_file_hash(self, metadata: dict[str, Any]) -> str | None:
Optional[str]: The file hash in hexadecimal format or None if not found.
"""
# Extracts ETag for MinIO
file_hash = metadata.get("ETag")
file_hash: str | None = metadata.get("ETag")
if file_hash:
file_hash = file_hash.strip('"')
if "-" in file_hash:
logger.warning(
f"[S3/MinIO] Multipart upload detected. ETag may not be an "
f"MD5 hash. Full metadata: {metadata}"
"[S3/MinIO] Multipart upload detected. ETag may not be an "
"MD5 hash. Full metadata: %s",
metadata,
)
return None
return file_hash
logger.error(f"[MinIO] File hash not found for the metadata: {metadata}")
logger.error("[MinIO] File hash not found for the metadata: %s", metadata)
return None

def is_dir_by_metadata(self, metadata: dict[str, Any]) -> bool:
Expand All @@ -119,7 +126,8 @@ def _find_modified_date_value(self, metadata: dict[str, Any]) -> Any | None:
return last_modified

logger.debug(
f"[S3/MinIO] No modified date found in metadata keys: {list(metadata.keys())}"
"[S3/MinIO] No modified date found in metadata keys: %s",
list(metadata.keys()),
)
return None

Expand All @@ -146,7 +154,9 @@ def _parse_string_datetime(
return dt.astimezone(UTC)
except (ValueError, TypeError):
logger.warning(
f"[S3/MinIO] Failed to parse datetime '{date_str}' from metadata keys: {metadata_keys}"
"[S3/MinIO] Failed to parse datetime '%s' from metadata keys: %s",
date_str,
metadata_keys,
)
return None

Expand All @@ -155,7 +165,7 @@ def _parse_numeric_timestamp(self, timestamp: float) -> datetime | None:
try:
return datetime.fromtimestamp(timestamp, tz=UTC)
except (ValueError, OSError):
logger.warning(f"[S3/MinIO] Invalid epoch timestamp: {timestamp}")
logger.warning("[S3/MinIO] Invalid epoch timestamp: %s", timestamp)
return None

def extract_modified_date(self, metadata: dict[str, Any]) -> datetime | None:
Expand Down Expand Up @@ -183,7 +193,9 @@ def extract_modified_date(self, metadata: dict[str, Any]) -> datetime | None:
return self._parse_numeric_timestamp(last_modified)

logger.debug(
f"[S3/MinIO] Unsupported datetime type '{type(last_modified)}' in metadata keys: {list(metadata.keys())}"
"[S3/MinIO] Unsupported datetime type '%s' in metadata keys: %s",
type(last_modified),
list(metadata.keys()),
)
return None

Expand Down
Comment thread
kirtimanmishrazipstack marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
"type": "object",
"required": [
"connectorName",
"key",
"secret",
"endpoint_url",
"region_name"
],
Comment thread
kirtimanmishrazipstack marked this conversation as resolved.
Expand All @@ -19,25 +17,25 @@
"type": "string",
"title": "Key",
"default": "",
"description": "Access Key ID"
"description": "Access Key ID (leave blank to use IAM role / instance profile)"
},
"secret": {
"type": "string",
"title": "Secret",
"format": "password",
"description": "Secret Access Key"
"description": "Secret Access Key (leave blank to use IAM role / instance profile)"
},
Comment thread
kirtimanmishrazipstack marked this conversation as resolved.
"endpoint_url": {
"type": "string",
"title": "Endpoint URL",
"default": "https://s3.amazonaws.com",
"description": "Endpoint URL to connect to. (example `https://s3.amazonaws.com`)"
"description": "Endpoint URL (leave blank for default AWS S3)"
},
"region_name": {
"type": "string",
"title": "Region Name",
"default": "ap-south",
"description": "Region of the AWS S3 account. For Minio, leave it blank"
"description": "Region of the AWS S3 account (leave blank for Minio)"
}
Comment thread
kirtimanmishrazipstack marked this conversation as resolved.
}
}
Loading