From 9ae2438ae8c4d3ec41e9afc388f0cac038312381 Mon Sep 17 00:00:00 2001 From: Manohar Reddy Date: Tue, 15 Jul 2025 00:50:28 +0530 Subject: [PATCH 1/5] update AWS Terraform code as per the latest changes --- .github/workflows/e2e.yaml | 24 ---- aws/bootstrap-cluster.sh | 12 +- aws/eks.tf | 107 ++++++++--------- aws/main.tf | 98 +-------------- aws/provider.tf | 2 +- aws/tfengine.tf | 240 ------------------------------------- 6 files changed, 63 insertions(+), 420 deletions(-) delete mode 100644 aws/tfengine.tf diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index b3aff5e..8cc558d 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -132,25 +132,6 @@ jobs: env: SBCLI_CMD: ${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }} - - name: Bootstrap K3s - run: | - CMD="$GITHUB_WORKSPACE/aws/bootstrap-k3s.sh" - if ${{ github.event.inputs.k8s-snode }}; then - CMD="$CMD --k8s-snode" - fi - echo "Running: $CMD" - eval $CMD - id: bootstrap_k3s - - - name: copying Kubeconfig file - run: | - mkdir -p ${HOME}/.kube - scp -o StrictHostKeyChecking=no -i ${{ steps.bootstrap_k3s.outputs.KEY }} ec2-user@${{ steps.bootstrap_k3s.outputs.extra_node_ip }}:/etc/rancher/k3s/k3s.yaml ${HOME}/.kube/config - - - name: update .kube/config address - run: | - sed -i "s/127.0.0.1/${{ steps.bootstrap_k3s.outputs.extra_node_ip }}/g" ${HOME}/.kube/config - - name: Checkout code uses: actions/checkout@v4 with: @@ -186,11 +167,6 @@ jobs: chmod +x config-apply.sh ./config-apply.sh - - name: Reboot Workers - if: ${{ github.event.inputs.k8s-snode == 'true' }} - run: | - $GITHUB_WORKSPACE/aws/reboot-worker.sh - - name: Install SIMPLYBLK-CONTROLLER using Helm if: ${{ github.event.inputs.k8s-snode == 'true' }} run: | diff --git a/aws/bootstrap-cluster.sh b/aws/bootstrap-cluster.sh index 7d07ee5..cbb15ff 100755 --- a/aws/bootstrap-cluster.sh +++ b/aws/bootstrap-cluster.sh @@ -32,6 +32,8 @@ print_help() { echo " --disable-ha-jm Disable HA JM for distrib creation 
(optional)" echo " --data-nics Set Storage network interface name(s). Can be more than one. (optional)" echo " --id-device-by-nqn Use device nqn to identify it instead of serial number. (optional)" + echo " --max-lvol Set Maximum lvols (optional)" + echo " --number-of-devices Set number of devices (optional)" echo " --help Print this help message" exit 0 } @@ -70,6 +72,14 @@ HA_JM_COUNT="" while [[ $# -gt 0 ]]; do arg="$1" case $arg in + --max-lvol) + MAX_LVOL="$2" + shift + ;; + --number-of-devices) + NO_DEVICE="$2" + shift + ;; --max-snap) MAX_SNAPSHOT="$2" shift @@ -241,7 +251,7 @@ echo "" echo "Deploying management node..." echo "" -command="sudo docker swarm leave --force ; ${SBCLI_CMD} --dev -d cluster create" +command="${SBCLI_CMD} --dev -d cluster create" if [[ -n "$LOG_DEL_INTERVAL" ]]; then command+=" --log-del-interval $LOG_DEL_INTERVAL" fi diff --git a/aws/eks.tf b/aws/eks.tf index 3c62554..6565624 100644 --- a/aws/eks.tf +++ b/aws/eks.tf @@ -121,10 +121,10 @@ resource "aws_security_group" "eks_nodes_sg" { module "eks" { count = var.enable_eks source = "terraform-aws-modules/eks/aws" - version = "19.16.0" + version = "~> 20.31" cluster_name = "${terraform.workspace}-${var.cluster_name}" - cluster_version = "1.31" + cluster_version = "1.33" cluster_endpoint_private_access = true # default is true cluster_endpoint_public_access = true @@ -144,8 +144,6 @@ module "eks" { } } - enable_irsa = true - eks_managed_node_group_defaults = { disk_size = 30 iam_role_attach_cni_policy = true @@ -157,9 +155,6 @@ module "eks" { eks_managed_node_groups = { - # FIXME: Caching-node not working properly with bottlerocket ami_type - # https://simplyblock.atlassian.net/browse/SFAM-865 - bottlerocket = { instance_types = ["m6id.large"] ami_type = "BOTTLEROCKET_x86_64" @@ -167,7 +162,7 @@ module "eks" { use_custom_launch_template = false vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] min_size = 0 - max_size = 2 + max_size = 3 desired_size = 0 key_name = 
local.selected_key_name enable_bootstrap_user_data = true @@ -208,75 +203,43 @@ module "eks" { } storage-nodes = { - desired_size = 0 - min_size = 0 + desired_size = 3 + min_size = 3 max_size = 3 labels = { type = "simplyblock-storage-plane" } - taints = { - dedicated = { - key = "dedicated" - value = "simplyblock-storage-plane" - effect = "NO_SCHEDULE" - } - } - - ami_type = "AL2023_x86_64_STANDARD" instance_types = ["i3en.2xlarge"] capacity_type = "ON_DEMAND" key_name = local.selected_key_name vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] - pre_bootstrap_user_data = <<-EOT - echo "installing nvme-cli.." - sudo yum install -y nvme-cli - sudo modprobe nvme-tcp - sudo dnf install tuned - EOT - } + bootstrap_extra_args = <<-EOT + # The admin host container provides SSH access and runs with "superpowers". + # It is disabled by default, but can be disabled explicitly. + [settings.host-containers.admin] + enabled = true - eks-nodes = { - desired_size = 2 - min_size = 2 - max_size = 2 + # The control host container provides out-of-band access via SSM. + # It is enabled by default, and can be disabled if you do not expect to use SSM. + # This could leave you with no way to access the API and change settings on an existing node! + [settings.host-containers.control] + enabled = true - labels = { - role = "general" - } + # extra args added + [settings.kernel] + lockdown = "integrity" - ami_type = "AL2023_x86_64_STANDARD" - instance_types = ["t3.xlarge"] - capacity_type = "ON_DEMAND" - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] - pre_bootstrap_user_data = <<-EOT - echo "installing nvme-cli.." 
- sudo yum install -y nvme-cli - sudo modprobe nvme-tcp - sudo dnf install tuned + [settings.kubernetes.node-labels] + type = "simplyblock-storage-plane" EOT - } - - cache-nodes = { - desired_size = 0 - min_size = 0 - max_size = 1 - labels = { - role = "cache" - } - - ami_type = "AL2023_x86_64_STANDARD" - instance_types = ["m6id.large"] - capacity_type = "ON_DEMAND" - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] pre_bootstrap_user_data = <<-EOT echo "installing nvme-cli.." sudo yum install -y nvme-cli sudo modprobe nvme-tcp + sudo dnf install tuned EOT } } @@ -286,3 +249,31 @@ module "eks" { Environment = "${terraform.workspace}-dev" } } + +# resource "aws_eks_access_entry" "example" { +# cluster_name = var.cluster_name +# principal_arn = aws_iam_role.example.arn +# kubernetes_groups = ["group-1", "group-2"] +# type = "STANDARD" +# } + +# resource "aws_eks_access_policy_association" "eksadmin" { +# cluster_name = var.cluster_name +# policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSAdminPolicy" +# principal_arn = aws_iam_user.example.arn + +# access_scope { +# type = "cluster" +# } +# } + +# resource "aws_eks_access_policy_association" "eksclusteradmin" { +# cluster_name = var.cluster_name +# policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy" +# principal_arn = aws_iam_user.example.arn + +# access_scope { +# type = "cluster" +# } +# } + diff --git a/aws/main.tf b/aws/main.tf index 5ddfab7..a84efec 100644 --- a/aws/main.tf +++ b/aws/main.tf @@ -1,6 +1,7 @@ module "vpc" { source = "terraform-aws-modules/vpc/aws" + version = "5.21.0" name = "${terraform.workspace}-storage-vpc-sb" cidr = "10.0.0.0/16" @@ -545,97 +546,7 @@ data "aws_iam_policy_document" "assume_role_policy" { } } -# create a policy -resource "aws_iam_policy" "mgmt_policy" { - name = "${terraform.workspace}-mgmt_node_policy" - description = "Policy for allowing EC2 to communicate with other resources" - - 
policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - "Effect" : "Allow", - "Action" : [ - "ec2:DescribeAvailabilityZones", - "ec2:DescribeSubnets", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribeInstances", - "ec2:DescribeInstanceAttribute", - "ec2:DescribeSecurityGroups", - "ec2:DescribeTags", - "ec2:DescribeVolumes", - "ec2:RunInstances", - "ec2:CreateVolume", - "ec2:AttachVolume", - "ec2:DetachVolume", - "ec2:CreateTags" - ], - "Resource" : "*" - }, - { - "Effect" : "Allow", - "Action" : "sts:GetServiceBearerToken", - "Resource" : "*" - }, - { - "Effect" : "Allow", - "Action" : "iam:PassRole", - "Resource" : "*" - }, - { - Action = [ - "codeartifact:GetAuthorizationToken", - "codeartifact:GetRepositoryEndpoint", - "codeartifact:ReadFromRepository", - ], - Effect = "Allow", - Resource = [ - "arn:aws:codeartifact:eu-west-1:${local.account_id}:repository/simplyblock/sbcli", - "arn:aws:codeartifact:eu-west-1:${local.account_id}:domain/simplyblock" - ] - }, - { - Action = [ - "ssm:SendCommand", - ], - Effect = "Allow", - Resource = [ - "arn:aws:ec2:${var.region}:${local.account_id}:instance/*", - "arn:aws:ssm:${var.region}::document/AWS-RunShellScript", - "arn:aws:ssm:${var.region}:${local.account_id}:*" - ] - }, - { - Action = [ - "ssm:GetCommandInvocation" - ], - Effect = "Allow", - Resource = [ - "arn:aws:ssm:${var.region}:${local.account_id}:*" - ] - }, - { - "Effect" : "Allow", - "Action" : [ - "s3:GetObject" - ], - "Resource" : [ - "${aws_s3_bucket.tfengine_logs.arn}/*", - "arn:aws:s3:::${var.tf_state_bucket_name}/*" - ] - }, - { - "Effect" : "Allow", - "Action" : [ - "s3:ListBucket" - ], - "Resource" : [ - "arn:aws:s3:::${var.tf_state_bucket_name}" - ] - } - ] - }) -} + # create a role with an assumed policy resource "aws_iam_role" "role" { @@ -643,11 +554,6 @@ resource "aws_iam_role" "role" { assume_role_policy = data.aws_iam_policy_document.assume_role_policy.json } -# attach policy to the role -resource "aws_iam_role_policy_attachment" 
"s3_get_object_attachment" { - role = aws_iam_role.role.name - policy_arn = aws_iam_policy.mgmt_policy.arn -} # create instance profile resource "aws_iam_instance_profile" "inst_profile" { diff --git a/aws/provider.tf b/aws/provider.tf index 05e296a..1cbf639 100644 --- a/aws/provider.tf +++ b/aws/provider.tf @@ -8,7 +8,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "5.80.0" + version = "5.95.0" } } } diff --git a/aws/tfengine.tf b/aws/tfengine.tf deleted file mode 100644 index 470d4ca..0000000 --- a/aws/tfengine.tf +++ /dev/null @@ -1,240 +0,0 @@ - - -data "aws_ami" "this" { - most_recent = true - owners = ["amazon"] - - filter { - name = "name" - values = ["al2023-ami-2023*"] - } - - filter { - name = "architecture" - values = ["x86_64"] - } -} - -resource "aws_autoscaling_group" "tfengine_asg" { - min_size = 0 - max_size = 1 - desired_capacity = 0 - vpc_zone_identifier = [module.vpc.private_subnets[0]] - tag { - key = "Name" - value = "${terraform.workspace}-tfengine" - propagate_at_launch = true - } - tag { - key = "long-term-test" - value = "true" - propagate_at_launch = true - } - lifecycle { - create_before_destroy = true - } - launch_template { - id = aws_launch_template.tfengine_lc.id - version = "$Latest" - } -} - -resource "aws_launch_template" "tfengine_lc" { - name_prefix = "tfengine" - image_id = data.aws_ami.this.id - instance_type = "t3.medium" - - lifecycle { - create_before_destroy = true - } - - network_interfaces { - associate_public_ip_address = false - security_groups = [aws_security_group.tfengine_sg.id] - } - - iam_instance_profile { - name = aws_iam_instance_profile.tfengine.name - } - - user_data = base64encode(< ~/.docker/config.json -EOF - ) - - tag_specifications { - resource_type = "instance" - - tags = { - Name = "${terraform.workspace}-tfengine" - } - } -} - -resource "aws_security_group" "tfengine_sg" { - description = "tfEngine security group" - vpc_id = module.vpc.vpc_id - - egress { - from_port 
= 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } -} - -resource "aws_iam_instance_profile" "tfengine" { - role = aws_iam_role.tfengine.name -} - -resource "aws_iam_role" "tfengine" { - - assume_role_policy = < Date: Tue, 15 Jul 2025 11:13:42 +0530 Subject: [PATCH 2/5] cleanup un used sections --- .github/workflows/e2e.yaml | 5 +- README.md | 8 + aws/README.md | 29 --- aws/bootstrap-cluster.sh | 40 +--- aws/data.tf | 4 - aws/dev.tfvars | 4 - aws/efs.tf | 23 -- aws/eks.tf | 16 +- aws/locals.tf | 22 +- aws/main.tf | 119 +--------- aws/outputs.tf | 29 --- aws/prod.tfvars | 6 +- aws/variables.tf | 69 +----- .../.github/workflows/docker-image.yaml | 35 --- bare-metal/.github/workflows/e2e.yaml | 215 ------------------ 15 files changed, 31 insertions(+), 593 deletions(-) create mode 100644 README.md delete mode 100644 aws/efs.tf delete mode 100644 bare-metal/.github/workflows/docker-image.yaml delete mode 100644 bare-metal/.github/workflows/e2e.yaml diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 8cc558d..c365f13 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -102,14 +102,11 @@ jobs: -var \"storage_nodes_instance_type=m6i.2xlarge\" \ -var \"sec_storage_nodes_instance_type=m6i.2xlarge\" \ -var \"sec_storage_nodes=0\" \ - -var \"extra_nodes=1\" \ - -var \"extra_nodes_instance_type=m6id.xlarge\" \ - -var \"extra_nodes_arch=amd64\" \ -var \"region=us-east-2\" \ -var \"sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}\"" if ${{ github.event.inputs.k8s-snode }}; then - TF_CMD="$TF_CMD -var \"snode_deploy_on_k8s=true\"" + TF_CMD="$TF_CMD -var \"enable_eks=1\"" fi TF_CMD="$TF_CMD -out=tfplan" diff --git a/README.md b/README.md new file mode 100644 index 0000000..71c2439 --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +### Introduction + +Simplyblock storage solution can be deployed to multiple environments. 
This repo hosts code to automatically deploy +the setup in AWS and bare metal + +### AWS Deployment + + diff --git a/aws/README.md b/aws/README.md index f9f44c1..0e7d841 100644 --- a/aws/README.md +++ b/aws/README.md @@ -19,10 +19,6 @@ Follow the installation instructions for AWS CLI based on your operating system: After installing AWS CLI, configure it with your AWS credentials by running the `aws configure` command and providing your Access Key ID, Secret Access Key, region, and output format. -# Intro - -Terraform template to set up a simple cluster. - # Deploy Infrastructure ## Change Node Counts @@ -62,12 +58,6 @@ terraform plan ## Apply Configurations -### Basic Deployment - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 --auto-approve -``` - ### Deploying with EKS ```bash @@ -80,13 +70,6 @@ terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var enable_eks=1 --auto- terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var az=us-east-2b --auto-approve ``` -### Specifying the Arch type to Deploy - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var extra_nodes_arch=arm64 \ - -var extra_nodes_instance_type="m6gd.xlarge" --auto-approve -``` - ### Specifying Instance Types ```bash @@ -94,12 +77,6 @@ terraform apply -var mgmt_nodes=1 -var storage_nodes=3 \ -var mgmt_nodes_instance_type="m5.large" -var storage_nodes_instance_type="m5.large" --auto-approve ``` -### Specifying the Number of EBS Volumes - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var volumes_per_storage_nodes=2 --auto-approve -``` - ### Specifying the Size of EBS Volumes ```bash @@ -172,12 +149,6 @@ chmod +x ./bootstrap-cluster.sh ### Deploy Storage-node to K8s -#### Set Terraform variable snode_deploy_on_k8s to true - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var snode_deploy_on_k8s="true" --auto-approve -``` - #### Set cluster argument --k8s-snode ```bash diff --git a/aws/bootstrap-cluster.sh 
b/aws/bootstrap-cluster.sh index cbb15ff..609354f 100755 --- a/aws/bootstrap-cluster.sh +++ b/aws/bootstrap-cluster.sh @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -KEY="$HOME/.ssh/simplyblock-ohio.pem" +KEY="${KEY:-$HOME/.ssh/id_ed25519}" print_help() { echo "Usage: $0 [options]" @@ -194,37 +194,14 @@ while [[ $# -gt 0 ]]; do shift done -SECRET_VALUE=$(terraform output -raw secret_value) -KEY_NAME=$(terraform output -raw key_name) +echo "reading terraform outputs..." BASTION_IP=$(terraform output -raw bastion_public_ip) GRAFANA_ENDPOINT=$(terraform output -raw grafana_invoke_url) - -ssh_dir="$HOME/.ssh" - -if [ ! -d "$ssh_dir" ]; then - mkdir -p "$ssh_dir" - echo "Directory $ssh_dir created." -else - echo "Directory $ssh_dir already exists." -fi - -if [[ -n "$SECRET_VALUE" ]]; then - KEY="$HOME/.ssh/$KEY_NAME" - if [ -f "$HOME/.ssh/$KEY_NAME" ]; then - echo "the ssh key: ${KEY} already exits on local" - else - echo "$SECRET_VALUE" >"$KEY" - chmod 400 "$KEY" - fi -else - echo "Failed to retrieve secret value. Falling back to default key." -fi - mnodes=$(terraform output -raw mgmt_private_ips) +storage_private_ips=$(terraform output -raw storage_private_ips) + echo "mgmt_private_ips: ${mnodes}" IFS=' ' read -ra mnodes <<<"$mnodes" -storage_private_ips=$(terraform output -raw storage_private_ips) -sec_storage_private_ips=$(terraform output -raw sec_storage_private_ips) echo "bootstrapping cluster..." 
@@ -410,15 +387,6 @@ else echo "add node command: \${add_node_command}" \$add_node_command sleep 3 - done - - for node in ${sec_storage_private_ips}; do - echo "" - echo "joining secondary node \${node}" - add_node_command=\"${command} --is-secondary-node ${CLUSTER_ID} \${node}:5000 eth0\" - echo "add node command: \${add_node_command}" - \$add_node_command - sleep 3 done" echo "" diff --git a/aws/data.tf b/aws/data.tf index bfe7d18..ced6e5e 100644 --- a/aws/data.tf +++ b/aws/data.tf @@ -2,10 +2,6 @@ data "aws_availability_zones" "available" { state = "available" } -data "aws_secretsmanager_secret_version" "simply" { - secret_id = local.selected_key_name -} - data "aws_ami" "rhel9" { most_recent = true owners = ["309956199498"] # Red Hat diff --git a/aws/dev.tfvars b/aws/dev.tfvars index 50375d7..d5a080d 100644 --- a/aws/dev.tfvars +++ b/aws/dev.tfvars @@ -2,10 +2,6 @@ region = "us-east-1" sbcli_cmd = "sbcli-dev" mgmt_nodes = 1 storage_nodes = 3 -extra_nodes = 0 mgmt_nodes_instance_type = "m5.large" storage_nodes_instance_type = "m5.large" -extra_nodes_instance_type = "m6id.large" -volumes_per_storage_nodes = 1 -enable_apigateway = 0 env = "dev" diff --git a/aws/efs.tf b/aws/efs.tf deleted file mode 100644 index a128fdb..0000000 --- a/aws/efs.tf +++ /dev/null @@ -1,23 +0,0 @@ -# resource "aws_efs_file_system" "efs" { -# for_each = local.efs_file_systems - -# performance_mode = "generalPurpose" - -# tags = { -# Name = "${aws_instance.mgmt_nodes[0].id}/monitoring_${each.value}" -# } -# } - -# resource "aws_efs_mount_target" "efs_mt" { -# for_each = local.efs_file_systems - -# file_system_id = aws_efs_file_system.efs[each.key].id -# subnet_id = module.vpc.private_subnets[local.az_index] -# security_groups = [aws_security_group.mgmt_node_sg.id] - -# lifecycle { -# ignore_changes = [ -# subnet_id, -# ] -# } -# } diff --git a/aws/eks.tf b/aws/eks.tf index 6565624..944b5ee 100644 --- a/aws/eks.tf +++ b/aws/eks.tf @@ -147,14 +147,11 @@ module "eks" { 
eks_managed_node_group_defaults = { disk_size = 30 iam_role_attach_cni_policy = true - # remote_access = { - # ec2_ssh_key = local.selected_key_name - # source_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] - # } } eks_managed_node_groups = { + # bottlerock storage nodes bottlerocket = { instance_types = ["m6id.large"] ami_type = "BOTTLEROCKET_x86_64" @@ -202,17 +199,20 @@ module "eks" { EOT } - storage-nodes = { - desired_size = 3 + # AL2023 storage nodes + # TODO: add support for ARM images + al2023 = { + desired_size = var.storage_nodes min_size = 3 - max_size = 3 + max_size = 10 labels = { type = "simplyblock-storage-plane" } ami_type = "AL2023_x86_64_STANDARD" - instance_types = ["i3en.2xlarge"] + instance_types = [var.storage_nodes_instance_type] + architecture = var.storage_nodes_arch capacity_type = "ON_DEMAND" key_name = local.selected_key_name vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] diff --git a/aws/locals.tf b/aws/locals.tf index 0377f25..1053158 100644 --- a/aws/locals.tf +++ b/aws/locals.tf @@ -1,17 +1,7 @@ data "aws_caller_identity" "current" {} locals { - volume_device_names = ["/dev/sdi", "/dev/sdj", "/dev/sdk", "/dev/sdl", "/dev/sdm", "/dev/sdn", "/dev/sdo"] - - snodes = toset([for n in range(var.storage_nodes) : tostring(n)]) - - sec_snodes = toset([for n in range(var.sec_storage_nodes) : tostring(n)]) - - node_disks = { for pair in setproduct(local.snodes, slice(local.volume_device_names, 0, var.volumes_per_storage_nodes)) : "${pair[0]}:${pair[1]}" => { - node_name = pair[0] - disk_dev_path = pair[1] - } } - + snodes = toset([for n in range(var.enable_eks == 1 ? 
0 : var.storage_nodes) : tostring(n)]) key_name = { "us-east-1" = "simplyblock-us-east-1.pem" "us-east-2" = "simplyblock-us-east-2.pem" @@ -58,15 +48,5 @@ locals { az_suffix = substr(var.az, -1, 1) az_index = lookup(local.az_suffix_to_number, local.az_suffix, -1) - efs_file_systems = { - mongodb_data = "mongodb_data" - os_data = "os_data" - graylog_data = "graylog_data" - graylog_journal = "graylog_journal" - graylog_config = "graylog_config" - grafana_data = "grafana_data" - prometheus_data = "prometheus_data" - } - account_id = data.aws_caller_identity.current.account_id } diff --git a/aws/main.tf b/aws/main.tf index a84efec..ad6075c 100644 --- a/aws/main.tf +++ b/aws/main.tf @@ -31,7 +31,7 @@ module "vpc" { } module "apigatewayendpoint" { - count = var.enable_apigateway == 1 && var.mgmt_nodes > 0 ? 1 : 0 + count = var.mgmt_nodes > 0 ? 1 : 0 source = "./modules/apigateway" region = var.region mgmt_node_instance_ids = aws_instance.mgmt_nodes[*].id @@ -547,7 +547,6 @@ data "aws_iam_policy_document" "assume_role_policy" { } - # create a role with an assumed policy resource "aws_iam_role" "role" { path = "/" @@ -599,6 +598,7 @@ resource "aws_instance" "mgmt_nodes" { user_data = <> /home/ec2-user/.ssh/authorized_keys echo "installing sbcli.." 
sudo yum install -y pip jq pip install ${local.sbcli_pkg} @@ -618,7 +618,6 @@ resource "aws_instance" "storage_nodes" { ami = local.ami_map[var.storage_nodes_arch][var.region] # RHEL 9 // use this outside simplyblock aws acccount data.aws_ami.rhel9.id instance_type = var.storage_nodes_instance_type - key_name = local.selected_key_name vpc_security_group_ids = [aws_security_group.storage_nodes_sg.id] subnet_id = module.vpc.private_subnets[local.az_index] iam_instance_profile = aws_iam_instance_profile.inst_profile.name @@ -637,6 +636,7 @@ resource "aws_instance" "storage_nodes" { user_data = <> /home/ec2-user/.ssh/authorized_keys sudo sysctl -w vm.nr_hugepages=${var.nr_hugepages} cat /proc/meminfo | grep -i hug echo "installing sbcli.." @@ -645,123 +645,10 @@ pip install ${local.sbcli_pkg} curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" unzip awscliv2.zip sudo ./aws/install -if [ "${var.snode_deploy_on_k8s}" = "false" ]; then - ${var.sbcli_cmd} storage-node configure --max-lvol ${var.max_lvol} --max-size ${var.max_size} \ - --nodes-per-socket ${var.nodes_per_socket} --sockets-to-use ${var.socket_to_use} \ - --pci-allowed "${join(",", var.pci_allowed)}" --pci-blocked "${join(",", var.pci_blocked)}" - - ${var.sbcli_cmd} storage-node deploy -fi -EOF -} - -resource "aws_instance" "sec_storage_nodes" { - for_each = local.sec_snodes - ami = local.ami_map[var.storage_nodes_arch][var.region] # RHEL 9 - instance_type = var.sec_storage_nodes_instance_type - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.storage_nodes_sg.id] - subnet_id = module.vpc.private_subnets[local.az_index] - iam_instance_profile = aws_iam_instance_profile.inst_profile.name - root_block_device { - volume_size = 45 - } - tags = { - Name = "${terraform.workspace}-sec-storage-${each.value + 1}" - } - - lifecycle { - ignore_changes = [ - subnet_id, - ] - } - - user_data = < 0 && var.storage_nodes > 0 ? 
var.storage_nodes : 0 - availability_zone = data.aws_availability_zones.available.names[local.az_index] - size = var.storage_nodes_ebs_size1 - - tags = { - Name = "simplyblock-jm" - } - - lifecycle { - ignore_changes = [ - availability_zone, - ] - } -} - -resource "aws_ebs_volume" "storage_nodes_ebs2" { - for_each = var.storage_nodes > 0 ? local.node_disks : {} - - availability_zone = data.aws_availability_zones.available.names[local.az_index] - size = var.storage_nodes_ebs_size2 - - tags = { - Name = "simplyblock-storage" - } - - lifecycle { - ignore_changes = [ - availability_zone, - ] - } -} - -resource "aws_volume_attachment" "attach_sn2" { - for_each = var.storage_nodes > 0 ? local.node_disks : {} - - device_name = each.value.disk_dev_path - volume_id = aws_ebs_volume.storage_nodes_ebs2[each.key].id - instance_id = aws_instance.storage_nodes[each.value.node_name].id -} - -resource "aws_volume_attachment" "attach_sn" { - count = var.volumes_per_storage_nodes > 0 && var.storage_nodes > 0 ? 
var.storage_nodes : 0 - device_name = "/dev/sdh" - volume_id = aws_ebs_volume.storage_nodes_ebs[count.index].id - instance_id = aws_instance.storage_nodes[count.index].id -} - -# can be used for testing caching nodes -resource "aws_instance" "extra_nodes" { - count = var.extra_nodes - ami = local.ami_map[var.extra_nodes_arch][var.region] # RHEL 9 - instance_type = var.extra_nodes_instance_type - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.extra_nodes_sg.id] - subnet_id = module.vpc.public_subnets[1] - iam_instance_profile = aws_iam_instance_profile.inst_profile.name - root_block_device { - volume_size = 45 - } - tags = { - Name = "${terraform.workspace}-k8scluster-${count.index + 1}" - } - user_data = < { @@ -58,18 +41,6 @@ output "storage_public_ips" { value = join(" ", [for inst in aws_instance.storage_nodes : inst.public_ip]) } -output "sec_storage_node_details" { - value = { for i, instance in aws_instance.sec_storage_nodes : - instance.tags["Name"] => { - type = instance.instance_type - public_ip = instance.public_ip - private_ip = instance.private_ip - availability_zone = instance.availability_zone - } - } - description = "Details of the secondary storage node nodes." 
-} - output "bastion_public_ip" { value = aws_instance.bastion.public_ip } diff --git a/aws/prod.tfvars b/aws/prod.tfvars index 43aeb84..86d8480 100644 --- a/aws/prod.tfvars +++ b/aws/prod.tfvars @@ -2,10 +2,6 @@ region = " eu-central-1" sbcli_cmd = "sbcli-release" mgmt_nodes = 1 storage_nodes = 3 -extra_nodes = 0 mgmt_nodes_instance_type = "m5.large" -storage_nodes_instance_type = "i3en.large" -extra_nodes_instance_type = "m6id.large" -volumes_per_storage_nodes = 1 -enable_apigateway = 0 +storage_nodes_instance_type = "i3en.2xlarge" env = "prod" diff --git a/aws/variables.tf b/aws/variables.tf index 6c2c6fb..5b2b05c 100644 --- a/aws/variables.tf +++ b/aws/variables.tf @@ -52,11 +52,6 @@ variable "mgmt_nodes" { } variable "storage_nodes" { - default = 3 - type = number -} - -variable "sec_storage_nodes" { default = 0 type = number } @@ -72,65 +67,16 @@ variable "mgmt_nodes_instance_type" { } variable "storage_nodes_instance_type" { - default = "m5.large" - type = string -} - -variable "sec_storage_nodes_instance_type" { - default = "m5.large" - type = string -} - -variable "extra_nodes_instance_type" { - default = "m5.large" + default = "i3en.2xlarge" # Simplyblock requires atleast 6 VPCs per storage node type = string } -variable "storage_nodes_ebs_size1" { - default = 2 - type = number -} - -variable "storage_nodes_ebs_size2" { - default = 50 - type = number -} - -variable "volumes_per_storage_nodes" { - default = 1 - type = number - validation { - condition = var.volumes_per_storage_nodes <= 6 - error_message = "The number of volumes per storage node must not exceed 6." - } -} - variable "nr_hugepages" { default = 2048 - description = "number of huge pages" + description = "number of huge pages. 
To be used when eks is not enabled" type = number } -variable "enable_apigateway" { - default = 1 - type = number -} - -variable "tf_state_bucket_name" { - default = "simplyblock-terraform-state-bucket" - type = string -} - -variable "extra_nodes_arch" { - type = string - default = "amd64" - - validation { - condition = contains(["arm64", "amd64"], var.extra_nodes_arch) - error_message = "The architecture type must be either 'arm64' or 'amd64'." - } -} - variable "storage_nodes_arch" { type = string default = "amd64" @@ -141,17 +87,12 @@ variable "storage_nodes_arch" { } } -variable "snode_deploy_on_k8s" { +variable "ssh_key_path" { + description = "Path to the public SSH key" type = string - default = "false" - - validation { - condition = contains(["false", "true"], var.snode_deploy_on_k8s) - error_message = "The value must be either 'true' or 'false'." - } + default = "~/.ssh/id_ed25519.pub" } - variable "max_lvol" { type = number default = 20 diff --git a/bare-metal/.github/workflows/docker-image.yaml b/bare-metal/.github/workflows/docker-image.yaml deleted file mode 100644 index ef31e87..0000000 --- a/bare-metal/.github/workflows/docker-image.yaml +++ /dev/null @@ -1,35 +0,0 @@ -name: Docker Image Build - -on: - push: - branches: - - 'main' - -jobs: - - build: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@master - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v2 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-1 - - - name: Login to Amazon ECR - id: login-ecr - uses: aws-actions/amazon-ecr-login@v2 - - - name: Push docker image - env: - REGISTRY: ${{ steps.login-ecr.outputs.registry }} - REPOSITORY: simplyblockdeploy - IMAGE_TAG: latest - run: | - docker build -t $REGISTRY/$REPOSITORY:$IMAGE_TAG . 
- docker push $REGISTRY/$REPOSITORY:$IMAGE_TAG diff --git a/bare-metal/.github/workflows/e2e.yaml b/bare-metal/.github/workflows/e2e.yaml deleted file mode 100644 index 8be5d82..0000000 --- a/bare-metal/.github/workflows/e2e.yaml +++ /dev/null @@ -1,215 +0,0 @@ -name: E2E Tests - -on: - push: - branches: - - main - schedule: - - cron: '0 5 * * *' # Runs every day at 5 AM UTC - workflow_dispatch: - inputs: - simplyblock_csi_branch: - description: '' - required: true - default: 'master' - sbcli_cmd: - description: '' - required: true - default: 'sbcli-dev' - upload_logs: - description: 'Upload logs to AWS' - required: false - default: false - type: boolean -jobs: - e2e: - runs-on: self-hosted - concurrency: - group: ${{ github.workflow }} - cancel-in-progress: false - steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Set upload_logs for scheduled and push events - if: github.event_name == 'schedule' || github.event_name == 'push' - run: echo "upload_logs=true" >> $GITHUB_ENV - - - name: Set upload_logs for manual workflow_dispatch - if: github.event_name == 'workflow_dispatch' - run: echo "upload_logs=${{ github.event.inputs.upload_logs }}" >> $GITHUB_ENV - - - uses: actions/setup-go@v5 - with: - go-version: '1.22' - - run: go version - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v2 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.AWS_REGION }} - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v2 - with: - terraform_wrapper: false - - - name: Install kubectl - run: | - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl - - - name: Install Helm - run: | - curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 - chmod 700 get_helm.sh 
- ./get_helm.sh - - - name: Initialize Terraform - run: | - export TFSTATE_BUCKET=simplyblock-terraform-state-bucket - export TFSTATE_KEY=csi - export TFSTATE_REGION=us-east-2 - export TFSTATE_DYNAMODB_TABLE=terraform-up-and-running-locks - - terraform init -reconfigure \ - -backend-config="bucket=${TFSTATE_BUCKET}" \ - -backend-config="key=${TFSTATE_KEY}" \ - -backend-config="region=${TFSTATE_REGION}" \ - -backend-config="dynamodb_table=${TFSTATE_DYNAMODB_TABLE}" \ - -backend-config="encrypt=true" - - - name: select or create workspace - run: terraform workspace select -or-create githubactions - - - name: Validate Terraform Configuration - run: terraform validate - - - name: Plan Terraform Changes - run: | - terraform plan \ - -var "mgmt_nodes=1" \ - -var "storage_nodes=3" \ - -var "storage_nodes_arch=arm64" \ - -var "storage_nodes_instance_type=m6g.2xlarge" \ - -var "sec_storage_nodes_instance_type=m6g.2xlarge" \ - -var "snode_deploy_on_k8s=true" \ - -var "extra_nodes=1" \ - -var "extra_nodes_instance_type=m6gd.xlarge" \ - -var "extra_nodes_arch=arm64" \ - -var "region=us-east-2" \ - -var "sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}" \ - -out=tfplan - - - name: Apply Terraform Changes - run: terraform apply tfplan - - - name: Bootstrap Cluster - run: $GITHUB_WORKSPACE/bootstrap-cluster.sh --max-lvol 10 --max-snap 10 --max-prov 150g --number-of-devices 1 --k8s-snode - id: bootstrap_cluster - env: - SBCLI_CMD: ${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }} - - - name: Bootstrap K3s - run: $GITHUB_WORKSPACE/bootstrap-k3s.sh --k8s-snode - id: bootstrap_k3s - - - name: copying Kubeconfig file - run: | - mkdir -p ${HOME}/.kube - scp -o StrictHostKeyChecking=no -i ${{ steps.bootstrap_k3s.outputs.KEY }} ec2-user@${{ steps.bootstrap_k3s.outputs.extra_node_ip }}:/etc/rancher/k3s/k3s.yaml ${HOME}/.kube/config - - - name: update .kube/config address - run: | - sed -i "s/127.0.0.1/${{ steps.bootstrap_k3s.outputs.extra_node_ip }}/g" ${HOME}/.kube/config 
- - - name: Clone simplyblock-csi repo - run: git clone -b ${{ github.event.inputs.simplyblock_csi_branch || 'master'}} https://github.com/simplyblock-io/simplyblock-csi.git - - - name: Install SPDK-CSI using Helm - run: | - cd simplyblock-csi/charts/latest/spdk-csi - helm install -n spdk-csi --create-namespace spdk-csi ./ \ - --set csiConfig.simplybk.uuid=${{ steps.bootstrap_cluster.outputs.cluster_id }} \ - --set csiConfig.simplybk.ip=${{ steps.bootstrap_cluster.outputs.cluster_api_gateway_endpoint }} \ - --set csiSecret.simplybk.secret=${{ steps.bootstrap_cluster.outputs.cluster_secret }} \ - --set logicalVolume.pool_name=testing1 \ - --set image.simplyblock.tag=main \ - --set image.spdkcsi.tag=master-arm64 \ - --set storagenode.create=true \ - --set cachingnode.create=true \ - --set logicalVolume.encryption=true - - - name: Check Cluster Status - run: | - CLUSTER_API_GATEWAY_ENDPOINT=${{ steps.bootstrap_cluster.outputs.cluster_api_gateway_endpoint }} - CLUSTER_UUID=${{ steps.bootstrap_cluster.outputs.cluster_id }} - CLUSTER_SECRET=${{ steps.bootstrap_cluster.outputs.cluster_secret }} - n=0 - until [ "$n" -ge 60 ] - do - response=$(curl -s -X GET "$CLUSTER_API_GATEWAY_ENDPOINT/cluster/$CLUSTER_UUID" \ - -H "Content-Type: application/json" \ - -H "Authorization: $CLUSTER_UUID $CLUSTER_SECRET") - - status=$(echo $response | jq -r '.results[0].status') - - if [ "$status" != "active" ]; then - echo "Cluster status is not active, current status: $status, retrying" - n=$((n+1)) - sleep 10 - else - echo "Cluster status is active" - exit 0 - fi - done - echo "Cluster status is not active" - exit 1 - - - - name: Run tests - run: | - cd simplyblock-csi - echo "Running tests in namespace ${{ steps.get-namespace.outputs.namespace }}" - export CSI_NAMESPACE=spdk-csi - export CGO_ENABLED=1 - make e2e-test - - - name: Upload docker logs to s3 - run: | - if [[ "${{ github.event_name }}" == 'schedule' || "${{ env.upload_logs }}" == 'true' ]]; then - 
          $GITHUB_WORKSPACE/upload_docker_logs_to_s3.sh --k8s --namespace "spdk-csi" - else - $GITHUB_WORKSPACE/upload_docker_logs_to_s3.sh - fi - if: always() - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_REGION: ${{ secrets.AWS_REGION }} - S3_BUCKET_NAME: "simplyblock-e2e-test-logs" - RUN_ID: ${{ github.run_id }} - - - name: Send Slack Notification - if: always() - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - run: | - if [[ ${{ job.status }} == 'success' ]]; then - curl -X POST -H 'Content-type: application/json' --data '{"text":"Kubernetes E2E tests successfully completed!"}' $SLACK_WEBHOOK_URL - else - curl -X POST -H 'Content-type: application/json' --data '{"text":"Kubernetes E2E tests failed!"}' $SLACK_WEBHOOK_URL - fi - - - name: Destroy Cluster - if: always() - run: terraform destroy --auto-approve - - - name: 'Cleanup build folder' - run: | - ls -la ./ - rm -rf ./* || true - rm -rf ./.??* || true - ls -la ./ From d0aa950b68ea4c0119192b9bf46f4d87e9fdce7f Mon Sep 17 00:00:00 2001 From: Manohar Reddy Date: Tue, 15 Jul 2025 12:50:16 +0530 Subject: [PATCH 3/5] update user data for bastion server --- aws/main.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aws/main.tf b/aws/main.tf index ad6075c..e99bcb4 100644 --- a/aws/main.tf +++ b/aws/main.tf @@ -573,6 +573,10 @@ resource "aws_instance" "bastion" { tags = { Name = "${terraform.workspace}-bastion" } + user_data = <<EOF +#!/bin/bash +echo "${var.ssh_public_key}" >> /home/ec2-user/.ssh/authorized_keys +EOF } resource "aws_instance" "mgmt_nodes" { From 966d7093a199061be3cec3f9ceeab00e748cd67a Mon Sep 17 00:00:00 2001 From: Manohar Reddy Date: Wed, 16 Jul 2025 11:28:52 +0530 Subject: [PATCH 4/5] allow current user to access the EKS cluster --- aws/eks.tf | 41 +++++++++++++++-------------------------- aws/locals.tf | 1 + aws/variables.tf | 2 +- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/aws/eks.tf b/aws/eks.tf index 944b5ee..83fe0fb 100644 --- 
a/aws/eks.tf +++ b/aws/eks.tf @@ -250,30 +250,19 @@ module "eks" { } } -# resource "aws_eks_access_entry" "example" { -# cluster_name = var.cluster_name -# principal_arn = aws_iam_role.example.arn -# kubernetes_groups = ["group-1", "group-2"] -# type = "STANDARD" -# } - -# resource "aws_eks_access_policy_association" "eksadmin" { -# cluster_name = var.cluster_name -# policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSAdminPolicy" -# principal_arn = aws_iam_user.example.arn - -# access_scope { -# type = "cluster" -# } -# } - -# resource "aws_eks_access_policy_association" "eksclusteradmin" { -# cluster_name = var.cluster_name -# policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy" -# principal_arn = aws_iam_user.example.arn - -# access_scope { -# type = "cluster" -# } -# } +resource "aws_eks_access_entry" "user1" { + cluster_name = module.eks[0].cluster_name + principal_arn = data.aws_caller_identity.current.arn + type = "STANDARD" +} + +resource "aws_eks_access_policy_association" "eksclusteradmin" { + cluster_name = module.eks[0].cluster_name + policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy" + principal_arn = data.aws_caller_identity.current.arn + + access_scope { + type = "cluster" + } +} diff --git a/aws/locals.tf b/aws/locals.tf index 1053158..77e7ae6 100644 --- a/aws/locals.tf +++ b/aws/locals.tf @@ -49,4 +49,5 @@ locals { az_index = lookup(local.az_suffix_to_number, local.az_suffix, -1) account_id = data.aws_caller_identity.current.account_id + current_user_arn = data.aws_caller_identity.current.arn } diff --git a/aws/variables.tf b/aws/variables.tf index 5b2b05c..9dbd4bc 100644 --- a/aws/variables.tf +++ b/aws/variables.tf @@ -20,7 +20,7 @@ variable "env" { } variable "sbcli_cmd" { - default = "sbctl" + default = "sbcli-pre" description = "sbcli command to be used" type = string } From ab36186cfed48eed1de337eb1161d6c9359e85e7 Mon Sep 17 00:00:00 2001 From: Manohar Reddy Date: Wed, 16 
Jul 2025 11:30:45 +0530 Subject: [PATCH 5/5] update cluster name --- aws/eks.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws/eks.tf b/aws/eks.tf index 83fe0fb..ec5b49d 100644 --- a/aws/eks.tf +++ b/aws/eks.tf @@ -252,13 +252,13 @@ module "eks" { resource "aws_eks_access_entry" "user1" { - cluster_name = module.eks[0].cluster_name + cluster_name = "${terraform.workspace}-${var.cluster_name}" principal_arn = data.aws_caller_identity.current.arn type = "STANDARD" } resource "aws_eks_access_policy_association" "eksclusteradmin" { - cluster_name = module.eks[0].cluster_name + cluster_name = "${terraform.workspace}-${var.cluster_name}" policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy" principal_arn = data.aws_caller_identity.current.arn