From a942d9dc60f2c10b190c351cfab95411655d8c55 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 19 Feb 2026 14:59:43 -0500 Subject: [PATCH 01/10] Add test sharding, proactive clean, and retry logic for self-hosted CI - Shard Frontier GPU tests into 2 parts for faster parallel execution - Add proactive ./mfc.sh clean in Phoenix test scripts to prevent cross-compiler contamination from stale build artifacts - Add --requeue to Phoenix SLURM jobs for preemption recovery - Add lint-gate job that must pass before self-hosted tests run - Add retry logic for GitHub runner tests (retry <=5 failures) - Add Frontier AMD test support with dedicated submit/test scripts - Restructure self-hosted matrix with explicit cluster names Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/submit.sh | 8 +-- .github/workflows/frontier/test.sh | 7 ++- .github/workflows/frontier_amd/submit.sh | 8 +-- .github/workflows/frontier_amd/test.sh | 7 ++- .github/workflows/phoenix/submit.sh | 1 + .github/workflows/phoenix/test.sh | 4 ++ .github/workflows/test.yml | 67 ++++++++++++++++++++---- 7 files changed, 85 insertions(+), 17 deletions(-) diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index d5b416c65a..4c3e0e3e27 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -34,12 +34,13 @@ output_file="$job_slug.out" submit_output=$(sbatch < Date: Thu, 19 Feb 2026 16:58:06 -0500 Subject: [PATCH 02/10] Add --shard and failed_uuids.txt support to test toolchain The CI test scripts use --shard for splitting Frontier GPU tests across multiple jobs, and failed_uuids.txt for retry logic. These toolchain changes were missing from the cherry-pick. 
Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/cli/commands.py | 6 ++++++ toolchain/mfc/test/test.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py index 8ad8c4bd07..018e3cef83 100644 --- a/toolchain/mfc/cli/commands.py +++ b/toolchain/mfc/cli/commands.py @@ -452,6 +452,12 @@ default=False, dest="dry_run", ), + Argument( + name="shard", + help="Run only a subset of tests (e.g., '1/2' for first half, '2/2' for second half).", + type=str, + default=None, + ), ], mutually_exclusive=[ MutuallyExclusiveGroup(arguments=[ diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 31a3771cb9..54e00186dd 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -99,6 +99,14 @@ def __filter(cases_) -> typing.List[TestCase]: skipped_cases += example_cases cases = [case for case in cases if case not in example_cases] + if ARG("shard") is not None: + parts = ARG("shard").split("/") + if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]): + raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').") + shard_idx, shard_count = int(parts[0]), int(parts[1]) + skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1] + cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1] + if ARG("percent") == 100: return cases, skipped_cases @@ -206,6 +214,15 @@ def test(): # Build the summary report _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) + # Write failed UUIDs to file for CI retry logic + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + if failed_tests: + with open(failed_uuids_path, "w") as f: + for test_info in failed_tests: + f.write(test_info['uuid'] + "\n") + elif os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + 
exit(nFAIL) From 59de55bb8ef4d43fefe46bbcba701f6d789e0d2a Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 20 Feb 2026 14:43:00 -0500 Subject: [PATCH 03/10] Fix stale failed_uuids.txt on abort, guard empty retry, quote nproc - Clean up failed_uuids.txt on early abort path so CI doesn't retry stale UUIDs from a previous run - Guard retry condition with NUM_FAILED > 0 to prevent full-suite rerun when the file exists but is empty - Quote $(nproc) to silence shellcheck SC2046 warnings Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 10 +++++----- toolchain/mfc/test/test.py | 5 +++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fbb2839806..f8ca756053 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -28,7 +28,7 @@ jobs: - name: Check Formatting run: | - ./mfc.sh format -j $(nproc) + ./mfc.sh format -j "$(nproc)" git diff --exit-code || (echo "::error::Code is not formatted. Run './mfc.sh format' locally." && exit 1) - name: Spell Check @@ -138,7 +138,7 @@ jobs: - name: Build run: | - /bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL + /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} @@ -146,17 +146,17 @@ jobs: run: | rm -f tests/failed_uuids.txt TEST_EXIT=0 - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT || TEST_EXIT=$? + /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$? 
# Retry only if a small number of tests failed (sporadic failures) if [ -f tests/failed_uuids.txt ]; then NUM_FAILED=$(wc -l < tests/failed_uuids.txt) - if [ "$NUM_FAILED" -le 5 ]; then + if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') echo "" echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ===" echo "" - /bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) --only $FAILED $TEST_ALL || exit $? + /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $TEST_ALL || exit $? else echo "Too many failures ($NUM_FAILED) to retry — likely a real issue." exit 1 diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 54e00186dd..681f59f6ae 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -190,6 +190,11 @@ def test(): # Check if we aborted due to high failure rate if abort_tests.is_set(): + # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + if os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + total_completed = nFAIL + nPASS cons.print() cons.unindent() From fbbe1658f896553a002b81ea0fb52611b036b839 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 20 Feb 2026 15:36:28 -0500 Subject: [PATCH 04/10] Remove proactive clean from Phoenix test script The build system should handle compiler changes correctly. Proactive clean forces full rebuilds of FFTW/LAPACK from scratch every run, which is slow and exposes builds to transient filesystem failures (CMake TryCompile errors on Phoenix scratch). 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/phoenix/test.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index e6912f70b6..74c31c9fba 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -1,9 +1,5 @@ #!/bin/bash -# Clean stale build artifacts from previous CI runs to prevent -# cross-compiler contamination (e.g. gfortran LAPACK linked by NVHPC) -./mfc.sh clean - build_opts="" if [ "$job_device" = "gpu" ]; then build_opts="--gpu" From 788e88a0ac5dff1a4da16361f7548f0725618d28 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 20 Feb 2026 22:20:18 -0500 Subject: [PATCH 05/10] Skip benchmark workflow for bot review events Bot reviews (AI code reviewers) were triggering the benchmark workflow, and the concurrency group was cancelling the real benchmark run from the pull_request event. Gate the workflow early by skipping when the review author is a Bot account type. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 382b2fee77..4ec557a3a7 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -13,6 +13,9 @@ concurrency: jobs: file-changes: name: Detect File Changes + if: > + github.event_name != 'pull_request_review' || + github.event.review.user.type != 'Bot' runs-on: 'ubuntu-latest' outputs: checkall: ${{ steps.changes.outputs.checkall }} From bceca04779ace97c42ef19b7e84f75d545150276 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 21 Feb 2026 00:16:32 -0500 Subject: [PATCH 06/10] Fix CI edge cases: guard os.remove, skip bare -- flag, use -s for empty file check - Wrap os.remove() in try/except OSError on abort path so permission errors don't mask the real MFCException - Only pass --precision flag when matrix.precision is non-empty to avoid invalid bare -- argument - Use -s instead of -f for failed_uuids.txt to skip retry when file exists but is empty Co-Authored-By: Claude Opus 4.6 --- .github/workflows/test.yml | 4 ++-- toolchain/mfc/test/test.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f8ca756053..0c8efb5662 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -138,7 +138,7 @@ jobs: - name: Build run: | - /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL + /bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} $TEST_ALL env: TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }} @@ -149,7 +149,7 @@ jobs: /bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$? 
# Retry only if a small number of tests failed (sporadic failures) - if [ -f tests/failed_uuids.txt ]; then + if [ -s tests/failed_uuids.txt ]; then NUM_FAILED=$(wc -l < tests/failed_uuids.txt) if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ') diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 681f59f6ae..26be08fb8a 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -192,8 +192,11 @@ def test(): if abort_tests.is_set(): # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") - if os.path.exists(failed_uuids_path): - os.remove(failed_uuids_path) + try: + if os.path.exists(failed_uuids_path): + os.remove(failed_uuids_path) + except OSError: + pass total_completed = nFAIL + nPASS cons.print() From 491b27ba4c38710e9490c2e0813fd92fe7ec4975 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 23 Feb 2026 09:28:33 -0500 Subject: [PATCH 07/10] Fix --only filter silently matching zero tests with multiple UUIDs The subset check required ALL passed UUIDs to match a single test case's trace, which is impossible since each case has one UUID. With 2+ failed tests, the CI retry selected 0 tests and exited 0, silently masking real failures. Changed to intersection so each case is kept if ANY of the passed UUIDs matches. 
Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/test/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 26be08fb8a..9fb0bd8eaf 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -71,7 +71,7 @@ def __filter(cases_) -> typing.List[TestCase]: checkCase = case.trace.split(" -> ") checkCase.append(case.get_uuid()) - if not set(ARG("only")).issubset(set(checkCase)): + if not set(ARG("only")).intersection(set(checkCase)): cases.remove(case) skipped_cases.append(case) From e55de30a0229f7279d56e2e497cffef77da4a001 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 23 Feb 2026 17:36:28 -0500 Subject: [PATCH 08/10] Print cache hit/miss status in HPC build cache setup Reports whether existing build artifacts were found in the persistent cache directory, along with the last build timestamp. Helps diagnose whether Phoenix and Frontier runners are getting cache hits. Co-Authored-By: Claude Opus 4.6 --- .github/scripts/setup-build-cache.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/scripts/setup-build-cache.sh b/.github/scripts/setup-build-cache.sh index 7da3912c38..6742ea7b44 100755 --- a/.github/scripts/setup-build-cache.sh +++ b/.github/scripts/setup-build-cache.sh @@ -36,4 +36,13 @@ fi ln -s "$_cache_dir" "build" echo " Symlink: build -> $_cache_dir" + +# Report cache hit/miss based on whether compiled artifacts exist +_cache_sim=$(find "$_cache_dir" -name 'simulation' -type f 2>/dev/null | head -1) +if [ -n "$_cache_sim" ]; then + _cache_age=$(stat -c '%y' "$_cache_sim" 2>/dev/null || stat -f '%Sm' "$_cache_sim" 2>/dev/null || echo "unknown") + echo " Cache status: HIT (found existing build artifacts, last built: $_cache_age)" +else + echo " Cache status: MISS (no existing build artifacts)" +fi echo "=========================" From 8f161bbaf578d1fe648276857ec79dd6386a1107 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 
23 Feb 2026 17:54:02 -0500 Subject: [PATCH 09/10] Hoist failed_uuids_path to single assignment in test() Eliminates duplicated os.path.join computation in abort and normal code paths, reducing divergence risk. Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/test/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 9fb0bd8eaf..5a9f71bac4 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -179,6 +179,8 @@ def test(): cons.print(" Progress Test Name Time(s) UUID") cons.print() + failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") + # Select the correct number of threads to use to launch test cases # We can't use ARG("jobs") when the --case-optimization option is set # because running a test case may cause it to rebuild, and thus @@ -191,7 +193,6 @@ def test(): # Check if we aborted due to high failure rate if abort_tests.is_set(): # Clean up stale failed_uuids.txt so CI doesn't retry wrong tests - failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") try: if os.path.exists(failed_uuids_path): os.remove(failed_uuids_path) @@ -223,7 +224,6 @@ def test(): _print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases) # Write failed UUIDs to file for CI retry logic - failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt") if failed_tests: with open(failed_uuids_path, "w") as f: for test_info in failed_tests: From 3ce4f399fb1550c5fe38a84c559a2928ddd88faa Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 23 Feb 2026 21:19:15 -0500 Subject: [PATCH 10/10] Gitignore failed_uuids.txt, restore AND semantics for --only labels - Add /tests/failed_uuids.txt to .gitignore so local test failures don't pollute git status - Detect whether --only tokens are UUIDs (8-char hex) or trace labels: UUIDs use OR (any match), labels use AND (all must match). 
This preserves the documented behavior of --only 2D Bubbles while supporting the CI retry path --only UUID1 UUID2. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + toolchain/mfc/test/test.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index e80d14a6f9..02ece7fb86 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ docs/documentation/parameters.md /tests/*/** !/tests/*/golden.txt !/tests/*/golden-metadata.txt +/tests/failed_uuids.txt # NVIDIA Nsight Compute *.nsys-rep diff --git a/toolchain/mfc/test/test.py b/toolchain/mfc/test/test.py index 5a9f71bac4..c6d5d114ec 100644 --- a/toolchain/mfc/test/test.py +++ b/toolchain/mfc/test/test.py @@ -42,7 +42,7 @@ class TestTimeoutError(MFCException): pass -# pylint: disable=too-many-branches, trailing-whitespace +# pylint: disable=too-many-branches, too-many-locals, too-many-statements, trailing-whitespace def __filter(cases_) -> typing.List[TestCase]: cases = cases_[:] selected_cases = [] @@ -66,12 +66,19 @@ def __filter(cases_) -> typing.List[TestCase]: raise MFCException("Testing: Your specified range [--from,--to] is incorrect. Please ensure both IDs exist and are in the correct order.") if len(ARG("only")) > 0: + # UUIDs are 8-char hex (CRC32), detected here as 8 alphanumeric chars that are not all letters: use OR so + # --only UUID1 UUID2 selects any matching test. Labels use AND so --only 2D Bubbles selects only tests with both labels. + _uuid_mode = all(len(t) == 8 and t.isalnum() and not t.isalpha() for t in ARG("only")) for case in cases[:]: case: TestCase checkCase = case.trace.split(" -> ") checkCase.append(case.get_uuid()) - if not set(ARG("only")).intersection(set(checkCase)): + if _uuid_mode: + if not set(ARG("only")).intersection(set(checkCase)): + cases.remove(case) + skipped_cases.append(case) + elif not set(ARG("only")).issubset(set(checkCase)): cases.remove(case) skipped_cases.append(case)