From 271f81f6551d96675966244027fa18985def4e5f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 03:18:42 +0000
Subject: [PATCH 1/6] Initial plan

From e50bf0d0f4211bc22170252eafc364796464c676 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 11 Feb 2026 03:23:01 +0000
Subject: [PATCH 2/6] Add Docker Compose SLURM environment with configuration and examples

Co-authored-by: mihow <158175+mihow@users.noreply.github.com>
---
 .github/workflows/test-slurm-jobs.yml    | 126 +++++++++
 README.md                                |  24 ++
 docker-compose.slurm.yml                 |  61 ++++
 docker/slurm/Dockerfile                  |  87 ++++++
 docker/slurm/README.md                   | 345 +++++++++++++++++++++++
 docker/slurm/cgroup.conf                 |  11 +
 docker/slurm/entrypoint.sh               |  52 ++++
 docker/slurm/examples/job_gpu_test.sh    |  47 +++
 docker/slurm/examples/job_hello.sh       |  20 ++
 docker/slurm/examples/job_test_ami_ml.sh |  43 +++
 docker/slurm/examples/job_test_env.sh    |  36 +++
 docker/slurm/gres.conf                   |   2 +
 docker/slurm/slurm.conf                  |  76 +++++
 13 files changed, 930 insertions(+)
 create mode 100644 .github/workflows/test-slurm-jobs.yml
 create mode 100644 docker-compose.slurm.yml
 create mode 100644 docker/slurm/Dockerfile
 create mode 100644 docker/slurm/README.md
 create mode 100644 docker/slurm/cgroup.conf
 create mode 100644 docker/slurm/entrypoint.sh
 create mode 100644 docker/slurm/examples/job_gpu_test.sh
 create mode 100644 docker/slurm/examples/job_hello.sh
 create mode 100644 docker/slurm/examples/job_test_ami_ml.sh
 create mode 100644 docker/slurm/examples/job_test_env.sh
 create mode 100644 docker/slurm/gres.conf
 create mode 100644 docker/slurm/slurm.conf

diff --git a/.github/workflows/test-slurm-jobs.yml b/.github/workflows/test-slurm-jobs.yml
new file mode 100644
index 0000000..0bf0db3
--- /dev/null
+++ b/.github/workflows/test-slurm-jobs.yml
@@ -0,0 +1,126 @@
+name: Test SLURM Jobs
+
+on:
+  push:
+    branches: [ main, develop ]
+    paths:
+      - 'scripts/job_*.sh'
+      - 'research/**/job_*.sh'
+      - 'docker/slurm/**'
+      - '.github/workflows/test-slurm-jobs.yml'
+  pull_request:
+    branches: [ main, develop ]
+    paths:
+      - 'scripts/job_*.sh'
+      - 'research/**/job_*.sh'
+      - 'docker/slurm/**'
+      - '.github/workflows/test-slurm-jobs.yml'
+  workflow_dispatch:
+
+jobs:
+  test-slurm-environment:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build SLURM Docker images
+        run: |
+          docker compose -f docker-compose.slurm.yml build
+
+      - name: Start SLURM cluster
+        run: |
+          docker compose -f docker-compose.slurm.yml up -d
+          # Wait for SLURM to be ready
+          sleep 30
+
+      - name: Check SLURM cluster status
+        run: |
+          docker exec ami-ml-slurmctld sinfo
+          docker exec ami-ml-slurmctld scontrol show nodes
+
+      - name: Test basic SLURM job submission
+        run: |
+          # Submit test job
+          docker exec ami-ml-slurmctld bash -c "cd /workspace && sbatch docker/slurm/examples/job_hello.sh"
+
+          # Wait for job to complete
+          sleep 10
+
+          # Check job status
+          docker exec ami-ml-slurmctld squeue
+
+          # Display job output
+          docker exec ami-ml-slurmctld bash -c "cd /workspace && cat hello_slurm_*.out || echo 'Job output not found yet'"
+
+      - name: Test environment setup job
+        run: |
+          # Submit environment test job
+          JOB_ID=$(docker exec ami-ml-slurmctld bash -c "cd /workspace && sbatch docker/slurm/examples/job_test_env.sh" | grep -oP '\d+')
+          echo "Submitted job ID: $JOB_ID"
+
+          # Wait for job to complete (with timeout)
+          timeout=120
+          elapsed=0
+          while [ "$elapsed" -lt "$timeout" ]; do
+            status=$(docker exec ami-ml-slurmctld squeue -j "$JOB_ID" -h -o "%T" 2>/dev/null || echo "COMPLETED")
+            if [ "$status" = "COMPLETED" ] || [ -z "$status" ]; then
+              echo "Job $JOB_ID completed"
+              break
+            fi
+            echo "Job $JOB_ID status: $status (waiting...)"
+            sleep 10
+            elapsed=$((elapsed + 10))
+          done
+          [ "$elapsed" -lt "$timeout" ] || { echo "Timed out waiting for job $JOB_ID" >&2; exit 1; }
+          # Check if job completed successfully
+          docker exec ami-ml-slurmctld bash -c "sacct -j $JOB_ID --format=JobID,State,ExitCode"
+
+          # Display job output
+          docker exec ami-ml-slurmctld bash -c "cd /workspace && cat test_env_setup_*.out"
+
+      - name: Test ami-ml package availability
+        run: |
+          # Submit ami-ml test job
+          JOB_ID=$(docker exec ami-ml-slurmctld bash -c "cd /workspace && sbatch docker/slurm/examples/job_test_ami_ml.sh" | grep -oP '\d+')
+          echo "Submitted job ID: $JOB_ID"
+
+          # Wait for job to complete (with timeout)
+          timeout=300
+          elapsed=0
+          while [ "$elapsed" -lt "$timeout" ]; do
+            status=$(docker exec ami-ml-slurmctld squeue -j "$JOB_ID" -h -o "%T" 2>/dev/null || echo "COMPLETED")
+            if [ "$status" = "COMPLETED" ] || [ -z "$status" ]; then
+              echo "Job $JOB_ID completed"
+              break
+            fi
+            echo "Job $JOB_ID status: $status (waiting...)"
+            sleep 15
+            elapsed=$((elapsed + 15))
+          done
+          [ "$elapsed" -lt "$timeout" ] || { echo "Timed out waiting for job $JOB_ID" >&2; exit 1; }
+          # Check if job completed successfully
+          docker exec ami-ml-slurmctld bash -c "sacct -j $JOB_ID --format=JobID,State,ExitCode"
+
+          # Display job output
+          docker exec ami-ml-slurmctld bash -c "cd /workspace && cat test_ami_ml_training_*.out"
+
+      - name: Collect SLURM logs on failure
+        if: failure()
+        run: |
+          echo "=== SLURM Controller Logs ==="
+          docker compose -f docker-compose.slurm.yml logs slurm-controller
+          echo "=== SLURM Compute Node Logs ==="
+          docker compose -f docker-compose.slurm.yml logs slurm-compute
+          echo "=== All job outputs ==="
+          docker exec ami-ml-slurmctld bash -c "cd /workspace && ls -la *.out 2>/dev/null || echo 'No job outputs found'"
+          docker exec ami-ml-slurmctld bash -c "cd /workspace && cat *.out 2>/dev/null || echo 'No job outputs to display'"
+
+      - name: Stop SLURM cluster
+        if: always()
+        run: |
+          docker compose -f docker-compose.slurm.yml down -v
diff --git a/README.md b/README.md
index 504ddaf..d6a8e72 100644
--- a/README.md
+++ b/README.md
@@ -58,3 +58,27 @@ Alternatively, one can run the scripts without activating poetry's shell:
 ```bash
 poetry run python