From a942d9dc60f2c10b190c351cfab95411655d8c55 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 19 Feb 2026 14:59:43 -0500
Subject: [PATCH 01/10] Add test sharding, proactive clean, and retry logic for
 self-hosted CI

- Shard Frontier GPU tests into 2 parts for faster parallel execution
- Add proactive ./mfc.sh clean in Phoenix test scripts to prevent
  cross-compiler contamination from stale build artifacts
- Add --requeue to Phoenix SLURM jobs for preemption recovery
- Add lint-gate job that must pass before self-hosted tests run
- Add retry logic for GitHub runner tests (rerun only the failed tests,
  and only when 5 or fewer failed)
- Add Frontier AMD test support with dedicated submit/test scripts
- Restructure self-hosted matrix with explicit cluster names

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/frontier/submit.sh     |  8 +--
 .github/workflows/frontier/test.sh       |  7 ++-
 .github/workflows/frontier_amd/submit.sh |  8 +--
 .github/workflows/frontier_amd/test.sh   |  7 ++-
 .github/workflows/phoenix/submit.sh      |  1 +
 .github/workflows/phoenix/test.sh        |  4 ++
 .github/workflows/test.yml               | 67 ++++++++++++++++++++----
 7 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh
index d5b416c65a..4c3e0e3e27 100644
--- a/.github/workflows/frontier/submit.sh
+++ b/.github/workflows/frontier/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
 submit_output=$(sbatch <<EOT
 #!/bin/bash
 #SBATCH -J MFC-$job_slug            # Job name
-#SBATCH -A ENG160                  # charge account
+#SBATCH -A CFD154                  # charge account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
-#SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)
+#SBATCH -t 01:59:00                # Duration of the job
 #SBATCH -o$output_file             # Combined output and error messages file
-#SBATCH -p extended                # Extended partition for shorter queues
+#SBATCH -p batch                   # Batch partition (concurrent jobs)
+#SBATCH --qos=hackathon            # Hackathon QOS for batch access
 
 set -e
 set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
 job_slug="$job_slug"
 job_device="$2"
 job_interface="$3"
+job_shard="$4"
 
 . ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
 
diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
index 17fbbaf8e5..ad109c6478 100644
--- a/.github/workflows/frontier/test.sh
+++ b/.github/workflows/frontier/test.sh
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+shard_opts=""
+if [ -n "$job_shard" ]; then
+    shard_opts="--shard $job_shard"
+fi
+
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
+    ./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier
 else
     ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
 fi
diff --git a/.github/workflows/frontier_amd/submit.sh b/.github/workflows/frontier_amd/submit.sh
index 551e0056b8..df73db5807 100644
--- a/.github/workflows/frontier_amd/submit.sh
+++ b/.github/workflows/frontier_amd/submit.sh
@@ -34,12 +34,13 @@ output_file="$job_slug.out"
 submit_output=$(sbatch <<EOT
 #!/bin/bash
 #SBATCH -J MFC-$job_slug            # Job name
-#SBATCH -A ENG160                  # charge account
+#SBATCH -A CFD154                  # charge account
 #SBATCH -N 1                       # Number of nodes required
 $sbatch_device_opts
-#SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)
+#SBATCH -t 01:59:00                # Duration of the job
 #SBATCH -o$output_file             # Combined output and error messages file
-#SBATCH -p extended                # Extended partition for shorter queues
+#SBATCH -p batch                   # Batch partition (concurrent jobs)
+#SBATCH --qos=hackathon            # Hackathon QOS for batch access
 
 set -e
 set -x
@@ -50,6 +51,7 @@ echo "Running in $(pwd):"
 job_slug="$job_slug"
 job_device="$2"
 job_interface="$3"
+job_shard="$4"
 
 . ./mfc.sh load -c famd -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
 
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
index ff65aa2b0e..c051144b2d 100644
--- a/.github/workflows/frontier_amd/test.sh
+++ b/.github/workflows/frontier_amd/test.sh
@@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
     fi
 fi
 
+shard_opts=""
+if [ -n "$job_shard" ]; then
+    shard_opts="--shard $job_shard"
+fi
+
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
+    ./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier_amd
 else
     ./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier_amd
 fi
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 06a03e465a..874f5afa44 100755
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -48,6 +48,7 @@ submit_output=$(sbatch <<EOT
 $sbatch_device_opts
 #SBATCH -t 03:00:00                # Duration of the job (Ex: 15 mins)
 #SBATCH -q embers                  # QOS Name
+#SBATCH --requeue                  # Auto-requeue on preemption
 #SBATCH -o$output_file             # Combined output and error messages file
 
 set -e
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 74c31c9fba..e6912f70b6 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+# Clean stale build artifacts from previous CI runs to prevent
+# cross-compiler contamination (e.g. gfortran LAPACK linked by NVHPC)
+./mfc.sh clean
+
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts="--gpu"
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index aae87c7204..fbb2839806 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -143,14 +143,33 @@ jobs:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
 
       - name: Test
-        run:  |
-          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
+        run: |
+          rm -f tests/failed_uuids.txt
+          TEST_EXIT=0
+          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?
+
+          # Retry only if a small number of tests failed (sporadic failures)
+          if [ -f tests/failed_uuids.txt ]; then
+            NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
+            if [ "$NUM_FAILED" -le 5 ]; then
+              FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
+              echo ""
+              echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
+              echo ""
+              /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $?
+            else
+              echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
+              exit 1
+            fi
+          elif [ "$TEST_EXIT" -ne 0 ]; then
+            exit $TEST_EXIT
+          fi
         env:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
           TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}
 
   self:
-    name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
+    name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
     if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
     needs: [lint-gate, file-changes]
     continue-on-error: false
@@ -164,50 +183,74 @@ jobs:
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'gpu'
             interface: 'acc'
+            shard: ''
           - runner:       'gt'
             cluster:      'phoenix'
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'gpu'
             interface: 'omp'
+            shard: ''
           - runner:       'gt'
             cluster:      'phoenix'
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'cpu'
             interface: 'none'
-          # Frontier (ORNL) — build on login node, test via SLURM
+            shard: ''
+          # Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
+          - runner:       'frontier'
+            cluster:      'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'acc'
+            shard: '1/2'
           - runner:       'frontier'
             cluster:      'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'gpu'
             interface: 'acc'
+            shard: '2/2'
           - runner:       'frontier'
             cluster:      'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'gpu'
             interface: 'omp'
+            shard: '1/2'
+          - runner:       'frontier'
+            cluster:      'frontier'
+            cluster_name: 'Oak Ridge | Frontier'
+            device: 'gpu'
+            interface: 'omp'
+            shard: '2/2'
           - runner:       'frontier'
             cluster:      'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'cpu'
             interface: 'none'
-          # Frontier AMD — build on login node, test via SLURM
+            shard: ''
+          # Frontier AMD — build on login node, GPU tests sharded for batch partition
           - runner:       'frontier'
             cluster:      'frontier_amd'
             cluster_name: 'Oak Ridge | Frontier (AMD)'
             device: 'gpu'
             interface: 'omp'
+            shard: '1/2'
+          - runner:       'frontier'
+            cluster:      'frontier_amd'
+            cluster_name: 'Oak Ridge | Frontier (AMD)'
+            device: 'gpu'
+            interface: 'omp'
+            shard: '2/2'
           - runner:       'frontier'
             cluster:      'frontier_amd'
             cluster_name: 'Oak Ridge | Frontier (AMD)'
             device: 'cpu'
             interface: 'none'
+            shard: ''
     runs-on:
       group:  phoenix
       labels: ${{ matrix.runner }}
     env:
       NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
-      ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
-      ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
     steps:
       - name: Clone
         uses: actions/checkout@v4
@@ -216,10 +259,16 @@ jobs:
 
       - name: Build
         if:   matrix.cluster != 'phoenix'
-        run:  bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3
+          retry_wait_seconds: 60
+          timeout_minutes: 480
+          command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
+          on_retry_command: ./mfc.sh clean
 
       - name: Test
-        run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
+        run:  bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}
 
       - name: Print Logs
         if:   always()

From 97d724d9ab1f5da8c28baf2bf249dec50d15977b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 19 Feb 2026 16:58:06 -0500
Subject: [PATCH 02/10] Add --shard and failed_uuids.txt support to test
 toolchain

The CI test scripts use --shard for splitting Frontier GPU tests across
multiple jobs, and failed_uuids.txt for retry logic. These toolchain
changes were missing from the cherry-pick.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolchain/mfc/cli/commands.py |  6 ++++++
 toolchain/mfc/test/test.py    | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py
index 8ad8c4bd07..018e3cef83 100644
--- a/toolchain/mfc/cli/commands.py
+++ b/toolchain/mfc/cli/commands.py
@@ -452,6 +452,12 @@
             default=False,
             dest="dry_run",
         ),
+        Argument(
+            name="shard",
+            help="Run only a subset of tests (e.g., '1/2' for first half, '2/2' for second half).",
+            type=str,
+            default=None,
+        ),
     ],
     mutually_exclusive=[
         MutuallyExclusiveGroup(arguments=[
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 31a3771cb9..54e00186dd 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -99,6 +99,14 @@ def __filter(cases_) -> typing.List[TestCase]:
         skipped_cases += example_cases
         cases = [case for case in cases if case not in example_cases]
 
+    if ARG("shard") is not None:
+        parts = ARG("shard").split("/")
+        if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]):
+            raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').")
+        shard_idx, shard_count = int(parts[0]), int(parts[1])
+        skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
+        cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]
+
     if ARG("percent") == 100:
         return cases, skipped_cases
 
@@ -206,6 +214,15 @@ def test():
     # Build the summary report
     _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)
 
+    # Write failed UUIDs to file for CI retry logic
+    failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
+    if failed_tests:
+        with open(failed_uuids_path, "w") as f:
+            for test_info in failed_tests:
+                f.write(test_info['uuid'] + "\n")
+    elif os.path.exists(failed_uuids_path):
+        os.remove(failed_uuids_path)
+
     exit(nFAIL)
 
 

From 59de55bb8ef4d43fefe46bbcba701f6d789e0d2a Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 20 Feb 2026 14:43:00 -0500
Subject: [PATCH 03/10] Fix stale failed_uuids.txt on abort, guard empty retry,
 quote nproc

- Clean up failed_uuids.txt on early abort path so CI doesn't retry
  stale UUIDs from a previous run
- Guard retry condition with NUM_FAILED > 0 to prevent full-suite
  rerun when the file exists but is empty
- Quote $(nproc) to silence shellcheck SC2046 warnings

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 10 +++++-----
 toolchain/mfc/test/test.py |  5 +++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index fbb2839806..f8ca756053 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -28,7 +28,7 @@ jobs:
 
       - name: Check Formatting
         run: |
-          ./mfc.sh format -j $(nproc)
+          ./mfc.sh format -j "$(nproc)"
           git diff --exit-code || (echo "::error::Code is not formatted. Run './mfc.sh format' locally." && exit 1)
 
       - name: Spell Check
@@ -138,7 +138,7 @@ jobs:
 
       - name: Build
         run:  |
-          /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
+          /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
         env:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
 
@@ -146,17 +146,17 @@ jobs:
         run: |
           rm -f tests/failed_uuids.txt
           TEST_EXIT=0
-          /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$?
+          /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$?
 
           # Retry only if a small number of tests failed (sporadic failures)
           if [ -f tests/failed_uuids.txt ]; then
             NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
-            if [ "$NUM_FAILED" -le 5 ]; then
+            if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then
               FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
               echo ""
               echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
               echo ""
-              /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $?
+              /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $TEST_ALL || exit $?
             else
               echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
               exit 1
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 54e00186dd..681f59f6ae 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -190,6 +190,11 @@ def test():
 
     # Check if we aborted due to high failure rate
     if abort_tests.is_set():
+        # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests
+        failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
+        if os.path.exists(failed_uuids_path):
+            os.remove(failed_uuids_path)
+
         total_completed = nFAIL + nPASS
         cons.print()
         cons.unindent()

From fbbe1658f896553a002b81ea0fb52611b036b839 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 20 Feb 2026 15:36:28 -0500
Subject: [PATCH 04/10] Remove proactive clean from Phoenix test script

The build system should handle compiler changes correctly. Proactive
clean forces full rebuilds of FFTW/LAPACK from scratch every run,
which is slow and exposes builds to transient filesystem failures
(CMake TryCompile errors on Phoenix scratch).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/phoenix/test.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index e6912f70b6..74c31c9fba 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -1,9 +1,5 @@
 #!/bin/bash
 
-# Clean stale build artifacts from previous CI runs to prevent
-# cross-compiler contamination (e.g. gfortran LAPACK linked by NVHPC)
-./mfc.sh clean
-
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts="--gpu"

From 788e88a0ac5dff1a4da16361f7548f0725618d28 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Fri, 20 Feb 2026 22:20:18 -0500
Subject: [PATCH 05/10] Skip benchmark workflow for bot review events

Bot reviews (AI code reviewers) were triggering the benchmark workflow,
and the concurrency group was cancelling the real benchmark run from
the pull_request event. Gate the workflow early by skipping it when the
review author's account type is Bot.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/bench.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 382b2fee77..4ec557a3a7 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -13,6 +13,9 @@ concurrency:
 jobs:
   file-changes:
     name: Detect File Changes
+    if: >
+      github.event_name != 'pull_request_review' ||
+      github.event.review.user.type != 'Bot'
     runs-on: 'ubuntu-latest'
     outputs:
       checkall: ${{ steps.changes.outputs.checkall }}

From bceca04779ace97c42ef19b7e84f75d545150276 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Sat, 21 Feb 2026 00:16:32 -0500
Subject: [PATCH 06/10] Fix CI edge cases: guard os.remove, skip bare -- flag,
 use -s for empty file check

- Wrap os.remove() in try/except OSError on abort path so permission errors
  don't mask the real MFCException
- Only pass --precision flag when matrix.precision is non-empty to avoid
  invalid bare -- argument
- Use -s instead of -f for failed_uuids.txt to skip retry when file exists
  but is empty

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/workflows/test.yml | 4 ++--
 toolchain/mfc/test/test.py | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index f8ca756053..0c8efb5662 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -138,7 +138,7 @@ jobs:
 
       - name: Build
         run:  |
-          /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
+          /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} $TEST_ALL
         env:
           TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
 
@@ -149,7 +149,7 @@ jobs:
           /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$?
 
           # Retry only if a small number of tests failed (sporadic failures)
-          if [ -f tests/failed_uuids.txt ]; then
+          if [ -s tests/failed_uuids.txt ]; then
             NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
             if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then
               FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 681f59f6ae..26be08fb8a 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -192,8 +192,11 @@ def test():
     if abort_tests.is_set():
         # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests
         failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
-        if os.path.exists(failed_uuids_path):
-            os.remove(failed_uuids_path)
+        try:
+            if os.path.exists(failed_uuids_path):
+                os.remove(failed_uuids_path)
+        except OSError:
+            pass
 
         total_completed = nFAIL + nPASS
         cons.print()

From 491b27ba4c38710e9490c2e0813fd92fe7ec4975 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 23 Feb 2026 09:28:33 -0500
Subject: [PATCH 07/10] Fix --only filter silently matching zero tests with
 multiple UUIDs

The subset check required ALL passed UUIDs to match a single test
case's trace, which is impossible since each case has one UUID.
With 2+ failed tests, the CI retry selected 0 tests and exited 0,
silently masking real failures. Changed to intersection so each
case is kept if ANY of the passed UUIDs matches.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolchain/mfc/test/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 26be08fb8a..9fb0bd8eaf 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -71,7 +71,7 @@ def __filter(cases_) -> typing.List[TestCase]:
 
             checkCase = case.trace.split(" -> ")
             checkCase.append(case.get_uuid())
-            if not set(ARG("only")).issubset(set(checkCase)):
+            if not set(ARG("only")).intersection(set(checkCase)):
                 cases.remove(case)
                 skipped_cases.append(case)
 

From e55de30a0229f7279d56e2e497cffef77da4a001 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 23 Feb 2026 17:36:28 -0500
Subject: [PATCH 08/10] Print cache hit/miss status in HPC build cache setup

Reports whether existing build artifacts were found in the persistent
cache directory, along with the last build timestamp. Helps diagnose
whether Phoenix and Frontier runners are getting cache hits.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/scripts/setup-build-cache.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/scripts/setup-build-cache.sh b/.github/scripts/setup-build-cache.sh
index 7da3912c38..6742ea7b44 100755
--- a/.github/scripts/setup-build-cache.sh
+++ b/.github/scripts/setup-build-cache.sh
@@ -36,4 +36,13 @@ fi
 ln -s "$_cache_dir" "build"
 
 echo "  Symlink: build -> $_cache_dir"
+
+# Report cache hit/miss based on whether compiled artifacts exist
+_cache_sim=$(find "$_cache_dir" -name 'simulation' -type f 2>/dev/null | head -1)
+if [ -n "$_cache_sim" ]; then
+    _cache_age=$(stat -c '%y' "$_cache_sim" 2>/dev/null || stat -f '%Sm' "$_cache_sim" 2>/dev/null || echo "unknown")
+    echo "  Cache status: HIT (found existing build artifacts, last built: $_cache_age)"
+else
+    echo "  Cache status: MISS (no existing build artifacts)"
+fi
 echo "========================="

From 8f161bbaf578d1fe648276857ec79dd6386a1107 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 23 Feb 2026 17:54:02 -0500
Subject: [PATCH 09/10] Hoist failed_uuids_path to single assignment in test()

Eliminates duplicated os.path.join computation in abort and normal
code paths, reducing divergence risk.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 toolchain/mfc/test/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 9fb0bd8eaf..5a9f71bac4 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -179,6 +179,8 @@ def test():
     cons.print("  Progress      Test Name                                        Time(s)   UUID")
     cons.print()
 
+    failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
+
     # Select the correct number of threads to use to launch test cases
     # We can't use ARG("jobs") when the --case-optimization option is set
     # because running a test case may cause it to rebuild, and thus
@@ -191,7 +193,6 @@ def test():
     # Check if we aborted due to high failure rate
     if abort_tests.is_set():
         # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests
-        failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
         try:
             if os.path.exists(failed_uuids_path):
                 os.remove(failed_uuids_path)
@@ -223,7 +224,6 @@ def test():
     _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)
 
     # Write failed UUIDs to file for CI retry logic
-    failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")
     if failed_tests:
         with open(failed_uuids_path, "w") as f:
             for test_info in failed_tests:

From 3ce4f399fb1550c5fe38a84c559a2928ddd88faa Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 23 Feb 2026 21:19:15 -0500
Subject: [PATCH 10/10] Gitignore failed_uuids.txt, restore AND semantics for
 --only labels

- Add /tests/failed_uuids.txt to .gitignore so local test failures
  don't pollute git status
- Detect whether --only tokens are UUIDs or trace labels: when every
  token looks like a UUID (8 alphanumeric characters, not all letters),
  match with OR (any match); otherwise match with AND (all tokens must
  match). This preserves the documented behavior of --only 2D Bubbles
  while supporting the CI retry path --only UUID1 UUID2.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gitignore                 |  1 +
 toolchain/mfc/test/test.py | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index e80d14a6f9..02ece7fb86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ docs/documentation/parameters.md
 /tests/*/**
 !/tests/*/golden.txt
 !/tests/*/golden-metadata.txt
+/tests/failed_uuids.txt
 
 # NVIDIA Nsight Compute
 *.nsys-rep
diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py
index 5a9f71bac4..c6d5d114ec 100644
--- a/toolchain/mfc/test/test.py
+++ b/toolchain/mfc/test/test.py
@@ -42,7 +42,7 @@
 class TestTimeoutError(MFCException):
     pass
 
-# pylint: disable=too-many-branches, trailing-whitespace
+# pylint: disable=too-many-branches, too-many-locals, too-many-statements, trailing-whitespace
 def __filter(cases_) -> typing.List[TestCase]:
     cases = cases_[:]
     selected_cases = []
@@ -66,12 +66,19 @@ def __filter(cases_) -> typing.List[TestCase]:
         raise MFCException("Testing: Your specified range [--from,--to] is incorrect. Please ensure both IDs exist and are in the correct order.")
 
     if len(ARG("only")) > 0:
+        # UUIDs are 8-char hex (CRC32): use OR so --only UUID1 UUID2 selects
+        # any matching test. Labels use AND so --only 2D Bubbles selects both.
+        _uuid_mode = all(len(t) == 8 and t.isalnum() and not t.isalpha() for t in ARG("only"))
         for case in cases[:]:
             case: TestCase
 
             checkCase = case.trace.split(" -> ")
             checkCase.append(case.get_uuid())
-            if not set(ARG("only")).intersection(set(checkCase)):
+            if _uuid_mode:
+                if not set(ARG("only")).intersection(set(checkCase)):
+                    cases.remove(case)
+                    skipped_cases.append(case)
+            elif not set(ARG("only")).issubset(set(checkCase)):
                 cases.remove(case)
                 skipped_cases.append(case)