diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index b3aff5ed..c365f137 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -102,14 +102,11 @@ jobs: -var \"storage_nodes_instance_type=m6i.2xlarge\" \ -var \"sec_storage_nodes_instance_type=m6i.2xlarge\" \ -var \"sec_storage_nodes=0\" \ - -var \"extra_nodes=1\" \ - -var \"extra_nodes_instance_type=m6id.xlarge\" \ - -var \"extra_nodes_arch=amd64\" \ -var \"region=us-east-2\" \ -var \"sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}\"" if ${{ github.event.inputs.k8s-snode }}; then - TF_CMD="$TF_CMD -var \"snode_deploy_on_k8s=true\"" + TF_CMD="$TF_CMD -var \"enable_eks=1\"" fi TF_CMD="$TF_CMD -out=tfplan" @@ -132,25 +129,6 @@ jobs: env: SBCLI_CMD: ${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }} - - name: Bootstrap K3s - run: | - CMD="$GITHUB_WORKSPACE/aws/bootstrap-k3s.sh" - if ${{ github.event.inputs.k8s-snode }}; then - CMD="$CMD --k8s-snode" - fi - echo "Running: $CMD" - eval $CMD - id: bootstrap_k3s - - - name: copying Kubeconfig file - run: | - mkdir -p ${HOME}/.kube - scp -o StrictHostKeyChecking=no -i ${{ steps.bootstrap_k3s.outputs.KEY }} ec2-user@${{ steps.bootstrap_k3s.outputs.extra_node_ip }}:/etc/rancher/k3s/k3s.yaml ${HOME}/.kube/config - - - name: update .kube/config address - run: | - sed -i "s/127.0.0.1/${{ steps.bootstrap_k3s.outputs.extra_node_ip }}/g" ${HOME}/.kube/config - - name: Checkout code uses: actions/checkout@v4 with: @@ -186,11 +164,6 @@ jobs: chmod +x config-apply.sh ./config-apply.sh - - name: Reboot Workers - if: ${{ github.event.inputs.k8s-snode == 'true' }} - run: | - $GITHUB_WORKSPACE/aws/reboot-worker.sh - - name: Install SIMPLYBLK-CONTROLLER using Helm if: ${{ github.event.inputs.k8s-snode == 'true' }} run: | diff --git a/README.md b/README.md new file mode 100644 index 00000000..71c2439e --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +### Introduction + +Simplyblock storage solution can be deployed to multiple 
environments. This repo hosts code to automatically deploy +the setup in AWS and bare metal + +### AWS Deployment + + diff --git a/aws/README.md b/aws/README.md index f9f44c1e..0e7d8418 100644 --- a/aws/README.md +++ b/aws/README.md @@ -19,10 +19,6 @@ Follow the installation instructions for AWS CLI based on your operating system: After installing AWS CLI, configure it with your AWS credentials by running the `aws configure` command and providing your Access Key ID, Secret Access Key, region, and output format. -# Intro - -Terraform template to set up a simple cluster. - # Deploy Infrastructure ## Change Node Counts @@ -62,12 +58,6 @@ terraform plan ## Apply Configurations -### Basic Deployment - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 --auto-approve -``` - ### Deploying with EKS ```bash @@ -80,13 +70,6 @@ terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var enable_eks=1 --auto- terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var az=us-east-2b --auto-approve ``` -### Specifying the Arch type to Deploy - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var extra_nodes_arch=arm64 \ - -var extra_nodes_instance_type="m6gd.xlarge" --auto-approve -``` - ### Specifying Instance Types ```bash @@ -94,12 +77,6 @@ terraform apply -var mgmt_nodes=1 -var storage_nodes=3 \ -var mgmt_nodes_instance_type="m5.large" -var storage_nodes_instance_type="m5.large" --auto-approve ``` -### Specifying the Number of EBS Volumes - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var volumes_per_storage_nodes=2 --auto-approve -``` - ### Specifying the Size of EBS Volumes ```bash @@ -172,12 +149,6 @@ chmod +x ./bootstrap-cluster.sh ### Deploy Storage-node to K8s -#### Set Terraform variable snode_deploy_on_k8s to true - -```bash -terraform apply -var mgmt_nodes=1 -var storage_nodes=3 -var snode_deploy_on_k8s="true" --auto-approve -``` - #### Set cluster argument --k8s-snode ```bash diff --git a/aws/bootstrap-cluster.sh 
b/aws/bootstrap-cluster.sh index 7d07ee5b..609354fb 100755 --- a/aws/bootstrap-cluster.sh +++ b/aws/bootstrap-cluster.sh @@ -1,7 +1,7 @@ #!/bin/bash set -euo pipefail -KEY="$HOME/.ssh/simplyblock-ohio.pem" +KEY="${KEY:-$HOME/.ssh/id_ed25519}" print_help() { echo "Usage: $0 [options]" @@ -32,6 +32,8 @@ print_help() { echo " --disable-ha-jm Disable HA JM for distrib creation (optional)" echo " --data-nics Set Storage network interface name(s). Can be more than one. (optional)" echo " --id-device-by-nqn Use device nqn to identify it instead of serial number. (optional)" + echo " --max-lvol Set Maximum lvols (optional)" + echo " --number-of-devices Set number of devices (optional)" echo " --help Print this help message" exit 0 } @@ -70,6 +72,14 @@ HA_JM_COUNT="" while [[ $# -gt 0 ]]; do arg="$1" case $arg in + --max-lvol) + MAX_LVOL="$2" + shift + ;; + --number-of-devices) + NO_DEVICE="$2" + shift + ;; --max-snap) MAX_SNAPSHOT="$2" shift @@ -184,37 +194,14 @@ while [[ $# -gt 0 ]]; do shift done -SECRET_VALUE=$(terraform output -raw secret_value) -KEY_NAME=$(terraform output -raw key_name) +echo "reading terraform outputs..." BASTION_IP=$(terraform output -raw bastion_public_ip) GRAFANA_ENDPOINT=$(terraform output -raw grafana_invoke_url) - -ssh_dir="$HOME/.ssh" - -if [ ! -d "$ssh_dir" ]; then - mkdir -p "$ssh_dir" - echo "Directory $ssh_dir created." -else - echo "Directory $ssh_dir already exists." -fi - -if [[ -n "$SECRET_VALUE" ]]; then - KEY="$HOME/.ssh/$KEY_NAME" - if [ -f "$HOME/.ssh/$KEY_NAME" ]; then - echo "the ssh key: ${KEY} already exits on local" - else - echo "$SECRET_VALUE" >"$KEY" - chmod 400 "$KEY" - fi -else - echo "Failed to retrieve secret value. Falling back to default key." 
-fi - mnodes=$(terraform output -raw mgmt_private_ips) +storage_private_ips=$(terraform output -raw storage_private_ips) + echo "mgmt_private_ips: ${mnodes}" IFS=' ' read -ra mnodes <<<"$mnodes" -storage_private_ips=$(terraform output -raw storage_private_ips) -sec_storage_private_ips=$(terraform output -raw sec_storage_private_ips) echo "bootstrapping cluster..." @@ -241,7 +228,7 @@ echo "" echo "Deploying management node..." echo "" -command="sudo docker swarm leave --force ; ${SBCLI_CMD} --dev -d cluster create" +command="${SBCLI_CMD} --dev -d cluster create" if [[ -n "$LOG_DEL_INTERVAL" ]]; then command+=" --log-del-interval $LOG_DEL_INTERVAL" fi @@ -400,15 +387,6 @@ else echo "add node command: \${add_node_command}" \$add_node_command sleep 3 - done - - for node in ${sec_storage_private_ips}; do - echo "" - echo "joining secondary node \${node}" - add_node_command=\"${command} --is-secondary-node ${CLUSTER_ID} \${node}:5000 eth0\" - echo "add node command: \${add_node_command}" - \$add_node_command - sleep 3 done" echo "" diff --git a/aws/data.tf b/aws/data.tf index bfe7d18c..ced6e5e0 100644 --- a/aws/data.tf +++ b/aws/data.tf @@ -2,10 +2,6 @@ data "aws_availability_zones" "available" { state = "available" } -data "aws_secretsmanager_secret_version" "simply" { - secret_id = local.selected_key_name -} - data "aws_ami" "rhel9" { most_recent = true owners = ["309956199498"] # Red Hat diff --git a/aws/dev.tfvars b/aws/dev.tfvars index 50375d7a..d5a080d1 100644 --- a/aws/dev.tfvars +++ b/aws/dev.tfvars @@ -2,10 +2,6 @@ region = "us-east-1" sbcli_cmd = "sbcli-dev" mgmt_nodes = 1 storage_nodes = 3 -extra_nodes = 0 mgmt_nodes_instance_type = "m5.large" storage_nodes_instance_type = "m5.large" -extra_nodes_instance_type = "m6id.large" -volumes_per_storage_nodes = 1 -enable_apigateway = 0 env = "dev" diff --git a/aws/efs.tf b/aws/efs.tf deleted file mode 100644 index a128fdbc..00000000 --- a/aws/efs.tf +++ /dev/null @@ -1,23 +0,0 @@ -# resource "aws_efs_file_system" 
"efs" { -# for_each = local.efs_file_systems - -# performance_mode = "generalPurpose" - -# tags = { -# Name = "${aws_instance.mgmt_nodes[0].id}/monitoring_${each.value}" -# } -# } - -# resource "aws_efs_mount_target" "efs_mt" { -# for_each = local.efs_file_systems - -# file_system_id = aws_efs_file_system.efs[each.key].id -# subnet_id = module.vpc.private_subnets[local.az_index] -# security_groups = [aws_security_group.mgmt_node_sg.id] - -# lifecycle { -# ignore_changes = [ -# subnet_id, -# ] -# } -# } diff --git a/aws/eks.tf b/aws/eks.tf index 3c625547..ec5b49de 100644 --- a/aws/eks.tf +++ b/aws/eks.tf @@ -121,10 +121,10 @@ resource "aws_security_group" "eks_nodes_sg" { module "eks" { count = var.enable_eks source = "terraform-aws-modules/eks/aws" - version = "19.16.0" + version = "~> 20.31" cluster_name = "${terraform.workspace}-${var.cluster_name}" - cluster_version = "1.31" + cluster_version = "1.33" cluster_endpoint_private_access = true # default is true cluster_endpoint_public_access = true @@ -144,22 +144,14 @@ module "eks" { } } - enable_irsa = true - eks_managed_node_group_defaults = { disk_size = 30 iam_role_attach_cni_policy = true - # remote_access = { - # ec2_ssh_key = local.selected_key_name - # source_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] - # } } eks_managed_node_groups = { - # FIXME: Caching-node not working properly with bottlerocket ami_type - # https://simplyblock.atlassian.net/browse/SFAM-865 - + # bottlerock storage nodes bottlerocket = { instance_types = ["m6id.large"] ami_type = "BOTTLEROCKET_x86_64" @@ -167,7 +159,7 @@ module "eks" { use_custom_launch_template = false vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] min_size = 0 - max_size = 2 + max_size = 3 desired_size = 0 key_name = local.selected_key_name enable_bootstrap_user_data = true @@ -207,76 +199,47 @@ module "eks" { EOT } - storage-nodes = { - desired_size = 0 - min_size = 0 - max_size = 3 + # AL2023 storage nodes + # TODO: add support for 
ARM images + al2023 = { + desired_size = var.storage_nodes + min_size = 3 + max_size = 10 labels = { type = "simplyblock-storage-plane" } - taints = { - dedicated = { - key = "dedicated" - value = "simplyblock-storage-plane" - effect = "NO_SCHEDULE" - } - } - - ami_type = "AL2023_x86_64_STANDARD" - instance_types = ["i3en.2xlarge"] + instance_types = [var.storage_nodes_instance_type] + architecture = var.storage_nodes_arch capacity_type = "ON_DEMAND" key_name = local.selected_key_name vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] - pre_bootstrap_user_data = <<-EOT - echo "installing nvme-cli.." - sudo yum install -y nvme-cli - sudo modprobe nvme-tcp - sudo dnf install tuned - EOT - } + bootstrap_extra_args = <<-EOT + # The admin host container provides SSH access and runs with "superpowers". + # It is disabled by default, but can be enabled explicitly. + [settings.host-containers.admin] + enabled = true - eks-nodes = { - desired_size = 2 - min_size = 2 - max_size = 2 + # The control host container provides out-of-band access via SSM. + # It is enabled by default, and can be disabled if you do not expect to use SSM. + # This could leave you with no way to access the API and change settings on an existing node! + [settings.host-containers.control] + enabled = true - labels = { - role = "general" - } + # extra args added + [settings.kernel] + lockdown = "integrity" - ami_type = "AL2023_x86_64_STANDARD" - instance_types = ["t3.xlarge"] - capacity_type = "ON_DEMAND" - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] - pre_bootstrap_user_data = <<-EOT - echo "installing nvme-cli.." 
- sudo yum install -y nvme-cli - sudo modprobe nvme-tcp - sudo dnf install tuned + [settings.kubernetes.node-labels] + type = "simplyblock-storage-plane" EOT - } - - cache-nodes = { - desired_size = 0 - min_size = 0 - max_size = 1 - labels = { - role = "cache" - } - - ami_type = "AL2023_x86_64_STANDARD" - instance_types = ["m6id.large"] - capacity_type = "ON_DEMAND" - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.eks_nodes_sg[0].id] pre_bootstrap_user_data = <<-EOT echo "installing nvme-cli.." sudo yum install -y nvme-cli sudo modprobe nvme-tcp + sudo dnf install tuned EOT } } @@ -286,3 +249,20 @@ module "eks" { Environment = "${terraform.workspace}-dev" } } + + +resource "aws_eks_access_entry" "user1" { + cluster_name = "${terraform.workspace}-${var.cluster_name}" + principal_arn = data.aws_caller_identity.current.arn + type = "STANDARD" +} + +resource "aws_eks_access_policy_association" "eksclusteradmin" { + cluster_name = "${terraform.workspace}-${var.cluster_name}" + policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy" + principal_arn = data.aws_caller_identity.current.arn + + access_scope { + type = "cluster" + } +} diff --git a/aws/locals.tf b/aws/locals.tf index 0377f251..77e7ae61 100644 --- a/aws/locals.tf +++ b/aws/locals.tf @@ -1,17 +1,7 @@ data "aws_caller_identity" "current" {} locals { - volume_device_names = ["/dev/sdi", "/dev/sdj", "/dev/sdk", "/dev/sdl", "/dev/sdm", "/dev/sdn", "/dev/sdo"] - - snodes = toset([for n in range(var.storage_nodes) : tostring(n)]) - - sec_snodes = toset([for n in range(var.sec_storage_nodes) : tostring(n)]) - - node_disks = { for pair in setproduct(local.snodes, slice(local.volume_device_names, 0, var.volumes_per_storage_nodes)) : "${pair[0]}:${pair[1]}" => { - node_name = pair[0] - disk_dev_path = pair[1] - } } - + snodes = toset([for n in range(var.enable_eks == 1 ? 
0 : var.storage_nodes) : tostring(n)]) key_name = { "us-east-1" = "simplyblock-us-east-1.pem" "us-east-2" = "simplyblock-us-east-2.pem" @@ -58,15 +48,6 @@ locals { az_suffix = substr(var.az, -1, 1) az_index = lookup(local.az_suffix_to_number, local.az_suffix, -1) - efs_file_systems = { - mongodb_data = "mongodb_data" - os_data = "os_data" - graylog_data = "graylog_data" - graylog_journal = "graylog_journal" - graylog_config = "graylog_config" - grafana_data = "grafana_data" - prometheus_data = "prometheus_data" - } - account_id = data.aws_caller_identity.current.account_id + current_user_arn = data.aws_caller_identity.current.arn } diff --git a/aws/main.tf b/aws/main.tf index 5ddfab7b..e99bcb42 100644 --- a/aws/main.tf +++ b/aws/main.tf @@ -1,6 +1,7 @@ module "vpc" { source = "terraform-aws-modules/vpc/aws" + version = "5.21.0" name = "${terraform.workspace}-storage-vpc-sb" cidr = "10.0.0.0/16" @@ -30,7 +31,7 @@ module "vpc" { } module "apigatewayendpoint" { - count = var.enable_apigateway == 1 && var.mgmt_nodes > 0 ? 1 : 0 + count = var.mgmt_nodes > 0 ? 
1 : 0 source = "./modules/apigateway" region = var.region mgmt_node_instance_ids = aws_instance.mgmt_nodes[*].id @@ -545,97 +546,6 @@ data "aws_iam_policy_document" "assume_role_policy" { } } -# create a policy -resource "aws_iam_policy" "mgmt_policy" { - name = "${terraform.workspace}-mgmt_node_policy" - description = "Policy for allowing EC2 to communicate with other resources" - - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - "Effect" : "Allow", - "Action" : [ - "ec2:DescribeAvailabilityZones", - "ec2:DescribeSubnets", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribeInstances", - "ec2:DescribeInstanceAttribute", - "ec2:DescribeSecurityGroups", - "ec2:DescribeTags", - "ec2:DescribeVolumes", - "ec2:RunInstances", - "ec2:CreateVolume", - "ec2:AttachVolume", - "ec2:DetachVolume", - "ec2:CreateTags" - ], - "Resource" : "*" - }, - { - "Effect" : "Allow", - "Action" : "sts:GetServiceBearerToken", - "Resource" : "*" - }, - { - "Effect" : "Allow", - "Action" : "iam:PassRole", - "Resource" : "*" - }, - { - Action = [ - "codeartifact:GetAuthorizationToken", - "codeartifact:GetRepositoryEndpoint", - "codeartifact:ReadFromRepository", - ], - Effect = "Allow", - Resource = [ - "arn:aws:codeartifact:eu-west-1:${local.account_id}:repository/simplyblock/sbcli", - "arn:aws:codeartifact:eu-west-1:${local.account_id}:domain/simplyblock" - ] - }, - { - Action = [ - "ssm:SendCommand", - ], - Effect = "Allow", - Resource = [ - "arn:aws:ec2:${var.region}:${local.account_id}:instance/*", - "arn:aws:ssm:${var.region}::document/AWS-RunShellScript", - "arn:aws:ssm:${var.region}:${local.account_id}:*" - ] - }, - { - Action = [ - "ssm:GetCommandInvocation" - ], - Effect = "Allow", - Resource = [ - "arn:aws:ssm:${var.region}:${local.account_id}:*" - ] - }, - { - "Effect" : "Allow", - "Action" : [ - "s3:GetObject" - ], - "Resource" : [ - "${aws_s3_bucket.tfengine_logs.arn}/*", - "arn:aws:s3:::${var.tf_state_bucket_name}/*" - ] - }, - { - "Effect" : "Allow", - "Action" 
: [ - "s3:ListBucket" - ], - "Resource" : [ - "arn:aws:s3:::${var.tf_state_bucket_name}" - ] - } - ] - }) -} # create a role with an assumed policy resource "aws_iam_role" "role" { @@ -643,11 +553,6 @@ resource "aws_iam_role" "role" { assume_role_policy = data.aws_iam_policy_document.assume_role_policy.json } -# attach policy to the role -resource "aws_iam_role_policy_attachment" "s3_get_object_attachment" { - role = aws_iam_role.role.name - policy_arn = aws_iam_policy.mgmt_policy.arn -} # create instance profile resource "aws_iam_instance_profile" "inst_profile" { @@ -668,6 +573,10 @@ resource "aws_instance" "bastion" { tags = { Name = "${terraform.workspace}-bastion" } + user_data = <> /home/ec2-user/.ssh/authorized_keys +EOF } resource "aws_instance" "mgmt_nodes" { @@ -693,6 +602,7 @@ resource "aws_instance" "mgmt_nodes" { user_data = <> /home/ec2-user/.ssh/authorized_keys echo "installing sbcli.." sudo yum install -y pip jq pip install ${local.sbcli_pkg} @@ -712,7 +622,6 @@ resource "aws_instance" "storage_nodes" { ami = local.ami_map[var.storage_nodes_arch][var.region] # RHEL 9 // use this outside simplyblock aws acccount data.aws_ami.rhel9.id instance_type = var.storage_nodes_instance_type - key_name = local.selected_key_name vpc_security_group_ids = [aws_security_group.storage_nodes_sg.id] subnet_id = module.vpc.private_subnets[local.az_index] iam_instance_profile = aws_iam_instance_profile.inst_profile.name @@ -731,6 +640,7 @@ resource "aws_instance" "storage_nodes" { user_data = <> /home/ec2-user/.ssh/authorized_keys sudo sysctl -w vm.nr_hugepages=${var.nr_hugepages} cat /proc/meminfo | grep -i hug echo "installing sbcli.." 
@@ -739,123 +649,10 @@ pip install ${local.sbcli_pkg} curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" unzip awscliv2.zip sudo ./aws/install -if [ "${var.snode_deploy_on_k8s}" = "false" ]; then - ${var.sbcli_cmd} storage-node configure --max-lvol ${var.max_lvol} --max-size ${var.max_size} \ - --nodes-per-socket ${var.nodes_per_socket} --sockets-to-use ${var.socket_to_use} \ - --pci-allowed "${join(",", var.pci_allowed)}" --pci-blocked "${join(",", var.pci_blocked)}" - - ${var.sbcli_cmd} storage-node deploy -fi -EOF -} - -resource "aws_instance" "sec_storage_nodes" { - for_each = local.sec_snodes - ami = local.ami_map[var.storage_nodes_arch][var.region] # RHEL 9 - instance_type = var.sec_storage_nodes_instance_type - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.storage_nodes_sg.id] - subnet_id = module.vpc.private_subnets[local.az_index] - iam_instance_profile = aws_iam_instance_profile.inst_profile.name - root_block_device { - volume_size = 45 - } - tags = { - Name = "${terraform.workspace}-sec-storage-${each.value + 1}" - } - - lifecycle { - ignore_changes = [ - subnet_id, - ] - } - - user_data = < 0 && var.storage_nodes > 0 ? var.storage_nodes : 0 - availability_zone = data.aws_availability_zones.available.names[local.az_index] - size = var.storage_nodes_ebs_size1 - - tags = { - Name = "simplyblock-jm" - } - - lifecycle { - ignore_changes = [ - availability_zone, - ] - } -} - -resource "aws_ebs_volume" "storage_nodes_ebs2" { - for_each = var.storage_nodes > 0 ? local.node_disks : {} - - availability_zone = data.aws_availability_zones.available.names[local.az_index] - size = var.storage_nodes_ebs_size2 - - tags = { - Name = "simplyblock-storage" - } - - lifecycle { - ignore_changes = [ - availability_zone, - ] - } -} - -resource "aws_volume_attachment" "attach_sn2" { - for_each = var.storage_nodes > 0 ? 
local.node_disks : {} - - device_name = each.value.disk_dev_path - volume_id = aws_ebs_volume.storage_nodes_ebs2[each.key].id - instance_id = aws_instance.storage_nodes[each.value.node_name].id -} - -resource "aws_volume_attachment" "attach_sn" { - count = var.volumes_per_storage_nodes > 0 && var.storage_nodes > 0 ? var.storage_nodes : 0 - device_name = "/dev/sdh" - volume_id = aws_ebs_volume.storage_nodes_ebs[count.index].id - instance_id = aws_instance.storage_nodes[count.index].id -} - -# can be used for testing caching nodes -resource "aws_instance" "extra_nodes" { - count = var.extra_nodes - ami = local.ami_map[var.extra_nodes_arch][var.region] # RHEL 9 - instance_type = var.extra_nodes_instance_type - key_name = local.selected_key_name - vpc_security_group_ids = [aws_security_group.extra_nodes_sg.id] - subnet_id = module.vpc.public_subnets[1] - iam_instance_profile = aws_iam_instance_profile.inst_profile.name - root_block_device { - volume_size = 45 - } - tags = { - Name = "${terraform.workspace}-k8scluster-${count.index + 1}" - } - user_data = < { @@ -58,18 +41,6 @@ output "storage_public_ips" { value = join(" ", [for inst in aws_instance.storage_nodes : inst.public_ip]) } -output "sec_storage_node_details" { - value = { for i, instance in aws_instance.sec_storage_nodes : - instance.tags["Name"] => { - type = instance.instance_type - public_ip = instance.public_ip - private_ip = instance.private_ip - availability_zone = instance.availability_zone - } - } - description = "Details of the secondary storage node nodes." 
-} - output "bastion_public_ip" { value = aws_instance.bastion.public_ip } diff --git a/aws/prod.tfvars b/aws/prod.tfvars index 43aeb844..86d8480e 100644 --- a/aws/prod.tfvars +++ b/aws/prod.tfvars @@ -2,10 +2,6 @@ region = " eu-central-1" sbcli_cmd = "sbcli-release" mgmt_nodes = 1 storage_nodes = 3 -extra_nodes = 0 mgmt_nodes_instance_type = "m5.large" -storage_nodes_instance_type = "i3en.large" -extra_nodes_instance_type = "m6id.large" -volumes_per_storage_nodes = 1 -enable_apigateway = 0 +storage_nodes_instance_type = "i3en.2xlarge" env = "prod" diff --git a/aws/provider.tf b/aws/provider.tf index 05e296a7..1cbf6399 100644 --- a/aws/provider.tf +++ b/aws/provider.tf @@ -8,7 +8,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "5.80.0" + version = "5.95.0" } } } diff --git a/aws/tfengine.tf b/aws/tfengine.tf deleted file mode 100644 index 470d4caa..00000000 --- a/aws/tfengine.tf +++ /dev/null @@ -1,240 +0,0 @@ - - -data "aws_ami" "this" { - most_recent = true - owners = ["amazon"] - - filter { - name = "name" - values = ["al2023-ami-2023*"] - } - - filter { - name = "architecture" - values = ["x86_64"] - } -} - -resource "aws_autoscaling_group" "tfengine_asg" { - min_size = 0 - max_size = 1 - desired_capacity = 0 - vpc_zone_identifier = [module.vpc.private_subnets[0]] - tag { - key = "Name" - value = "${terraform.workspace}-tfengine" - propagate_at_launch = true - } - tag { - key = "long-term-test" - value = "true" - propagate_at_launch = true - } - lifecycle { - create_before_destroy = true - } - launch_template { - id = aws_launch_template.tfengine_lc.id - version = "$Latest" - } -} - -resource "aws_launch_template" "tfengine_lc" { - name_prefix = "tfengine" - image_id = data.aws_ami.this.id - instance_type = "t3.medium" - - lifecycle { - create_before_destroy = true - } - - network_interfaces { - associate_public_ip_address = false - security_groups = [aws_security_group.tfengine_sg.id] - } - - iam_instance_profile { - name = 
aws_iam_instance_profile.tfengine.name - } - - user_data = base64encode(< ~/.docker/config.json -EOF - ) - - tag_specifications { - resource_type = "instance" - - tags = { - Name = "${terraform.workspace}-tfengine" - } - } -} - -resource "aws_security_group" "tfengine_sg" { - description = "tfEngine security group" - vpc_id = module.vpc.vpc_id - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } -} - -resource "aws_iam_instance_profile" "tfengine" { - role = aws_iam_role.tfengine.name -} - -resource "aws_iam_role" "tfengine" { - - assume_role_policy = <> $GITHUB_ENV - - - name: Set upload_logs for manual workflow_dispatch - if: github.event_name == 'workflow_dispatch' - run: echo "upload_logs=${{ github.event.inputs.upload_logs }}" >> $GITHUB_ENV - - - uses: actions/setup-go@v5 - with: - go-version: '1.22' - - run: go version - - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v2 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.AWS_REGION }} - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v2 - with: - terraform_wrapper: false - - - name: Install kubectl - run: | - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" - sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl - - - name: Install Helm - run: | - curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 - chmod 700 get_helm.sh - ./get_helm.sh - - - name: Initialize Terraform - run: | - export TFSTATE_BUCKET=simplyblock-terraform-state-bucket - export TFSTATE_KEY=csi - export TFSTATE_REGION=us-east-2 - export TFSTATE_DYNAMODB_TABLE=terraform-up-and-running-locks - - terraform init -reconfigure \ - -backend-config="bucket=${TFSTATE_BUCKET}" \ - -backend-config="key=${TFSTATE_KEY}" \ - 
-backend-config="region=${TFSTATE_REGION}" \ - -backend-config="dynamodb_table=${TFSTATE_DYNAMODB_TABLE}" \ - -backend-config="encrypt=true" - - - name: select or create workspace - run: terraform workspace select -or-create githubactions - - - name: Validate Terraform Configuration - run: terraform validate - - - name: Plan Terraform Changes - run: | - terraform plan \ - -var "mgmt_nodes=1" \ - -var "storage_nodes=3" \ - -var "storage_nodes_arch=arm64" \ - -var "storage_nodes_instance_type=m6g.2xlarge" \ - -var "sec_storage_nodes_instance_type=m6g.2xlarge" \ - -var "snode_deploy_on_k8s=true" \ - -var "extra_nodes=1" \ - -var "extra_nodes_instance_type=m6gd.xlarge" \ - -var "extra_nodes_arch=arm64" \ - -var "region=us-east-2" \ - -var "sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}" \ - -out=tfplan - - - name: Apply Terraform Changes - run: terraform apply tfplan - - - name: Bootstrap Cluster - run: $GITHUB_WORKSPACE/bootstrap-cluster.sh --max-lvol 10 --max-snap 10 --max-prov 150g --number-of-devices 1 --k8s-snode - id: bootstrap_cluster - env: - SBCLI_CMD: ${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }} - - - name: Bootstrap K3s - run: $GITHUB_WORKSPACE/bootstrap-k3s.sh --k8s-snode - id: bootstrap_k3s - - - name: copying Kubeconfig file - run: | - mkdir -p ${HOME}/.kube - scp -o StrictHostKeyChecking=no -i ${{ steps.bootstrap_k3s.outputs.KEY }} ec2-user@${{ steps.bootstrap_k3s.outputs.extra_node_ip }}:/etc/rancher/k3s/k3s.yaml ${HOME}/.kube/config - - - name: update .kube/config address - run: | - sed -i "s/127.0.0.1/${{ steps.bootstrap_k3s.outputs.extra_node_ip }}/g" ${HOME}/.kube/config - - - name: Clone simplyblock-csi repo - run: git clone -b ${{ github.event.inputs.simplyblock_csi_branch || 'master'}} https://github.com/simplyblock-io/simplyblock-csi.git - - - name: Install SPDK-CSI using Helm - run: | - cd simplyblock-csi/charts/latest/spdk-csi - helm install -n spdk-csi --create-namespace spdk-csi ./ \ - --set csiConfig.simplybk.uuid=${{ 
steps.bootstrap_cluster.outputs.cluster_id }} \ - --set csiConfig.simplybk.ip=${{ steps.bootstrap_cluster.outputs.cluster_api_gateway_endpoint }} \ - --set csiSecret.simplybk.secret=${{ steps.bootstrap_cluster.outputs.cluster_secret }} \ - --set logicalVolume.pool_name=testing1 \ - --set image.simplyblock.tag=main \ - --set image.spdkcsi.tag=master-arm64 \ - --set storagenode.create=true \ - --set cachingnode.create=true \ - --set logicalVolume.encryption=true - - - name: Check Cluster Status - run: | - CLUSTER_API_GATEWAY_ENDPOINT=${{ steps.bootstrap_cluster.outputs.cluster_api_gateway_endpoint }} - CLUSTER_UUID=${{ steps.bootstrap_cluster.outputs.cluster_id }} - CLUSTER_SECRET=${{ steps.bootstrap_cluster.outputs.cluster_secret }} - n=0 - until [ "$n" -ge 60 ] - do - response=$(curl -s -X GET "$CLUSTER_API_GATEWAY_ENDPOINT/cluster/$CLUSTER_UUID" \ - -H "Content-Type: application/json" \ - -H "Authorization: $CLUSTER_UUID $CLUSTER_SECRET") - - status=$(echo $response | jq -r '.results[0].status') - - if [ "$status" != "active" ]; then - echo "Cluster status is not active, current status: $status, retrying" - n=$((n+1)) - sleep 10 - else - echo "Cluster status is active" - exit 0 - fi - done - echo "Cluster status is not active" - exit 1 - - - - name: Run tests - run: | - cd simplyblock-csi - echo "Running tests in namespace ${{ steps.get-namespace.outputs.namespace }}" - export CSI_NAMESPACE=spdk-csi - export CGO_ENABLED=1 - make e2e-test - - - name: Upload docker logs to s3 - run: | - if [[ "${{ github.event_name }}" == 'schedule' || "${{ env.upload_logs }}" == 'true' ]]; then - $GITHUB_WORKSPACE/upload_docker_logs_to_s3.sh --k8s --namespace "spdk-csi" - else - $GITHUB_WORKSPACE/upload_docker_logs_to_s3.sh - fi - if: always() - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_REGION: ${{ secrets.AWS_REGION }} - S3_BUCKET_NAME: "simplyblock-e2e-test-logs" - RUN_ID: ${{ github.run_id }} - 
- - name: Send Slack Notification - if: always() - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - run: | - if [[ ${{ job.status }} == 'success' ]]; then - curl -X POST -H 'Content-type: application/json' --data '{"text":"Kubernetes E2E tests successfully completed!"}' $SLACK_WEBHOOK_URL - else - curl -X POST -H 'Content-type: application/json' --data '{"text":"Kubernetes E2E tests failed!"}' $SLACK_WEBHOOK_URL - fi - - - name: Destroy Cluster - if: always() - run: terraform destroy --auto-approve - - - name: 'Cleanup build folder' - run: | - ls -la ./ - rm -rf ./* || true - rm -rf ./.??* || true - ls -la ./