Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/scripts/setup-build-cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,13 @@ fi
ln -s "$_cache_dir" "build"

echo " Symlink: build -> $_cache_dir"

# Report cache hit/miss based on whether a regular file named 'simulation' already exists under the cache dir
_cache_sim=$(find "$_cache_dir" -name 'simulation' -type f 2>/dev/null | head -1)
if [ -n "$_cache_sim" ]; then
_cache_age=$(stat -c '%y' "$_cache_sim" 2>/dev/null || stat -f '%Sm' "$_cache_sim" 2>/dev/null || echo "unknown")
echo " Cache status: HIT (found existing build artifacts, last built: $_cache_age)"
else
echo " Cache status: MISS (no existing build artifacts)"
fi
echo "========================="
3 changes: 3 additions & 0 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ concurrency:
jobs:
file-changes:
name: Detect File Changes
if: >
github.event_name != 'pull_request_review' ||
github.event.review.user.type != 'Bot'
runs-on: 'ubuntu-latest'
outputs:
checkall: ${{ steps.changes.outputs.checkall }}
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/frontier/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,13 @@ output_file="$job_slug.out"
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -t 01:59:00 # Duration of the job
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -p batch # Batch partition (concurrent jobs)
#SBATCH --qos=hackathon # Hackathon QOS for batch access

set -e
set -x
Expand All @@ -50,6 +51,7 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"

. ./mfc.sh load -c f -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

shard_opts=""
if [ -n "$job_shard" ]; then
shard_opts="--shard $job_shard"
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts -- -c frontier
./mfc.sh test -v -a --rdma-mpi --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier
fi
8 changes: 5 additions & 3 deletions .github/workflows/frontier_amd/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,13 @@ output_file="$job_slug.out"
submit_output=$(sbatch <<EOT
#!/bin/bash
#SBATCH -J MFC-$job_slug # Job name
#SBATCH -A ENG160 # charge account
#SBATCH -A CFD154 # charge account
#SBATCH -N 1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 05:59:00 # Duration of the job (Ex: 15 mins)
#SBATCH -t 01:59:00 # Duration of the job
#SBATCH -o$output_file # Combined output and error messages file
#SBATCH -p extended # Extended partition for shorter queues
#SBATCH -p batch # Batch partition (concurrent jobs)
#SBATCH --qos=hackathon # Hackathon QOS for batch access

set -e
set -x
Expand All @@ -50,6 +51,7 @@ echo "Running in $(pwd):"
job_slug="$job_slug"
job_device="$2"
job_interface="$3"
job_shard="$4"

. ./mfc.sh load -c famd -m $([ "$2" = "gpu" ] && echo "g" || echo "c")

Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/frontier_amd/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

shard_opts=""
if [ -n "$job_shard" ]; then
shard_opts="--shard $job_shard"
fi

if [ "$job_device" = "gpu" ]; then
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
./mfc.sh test -v -a --max-attempts 3 -j $ngpus $device_opts $shard_opts -- -c frontier_amd
else
./mfc.sh test -v -a --max-attempts 3 -j 32 --no-gpu -- -c frontier_amd
fi
1 change: 1 addition & 0 deletions .github/workflows/phoenix/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ submit_output=$(sbatch <<EOT
$sbatch_device_opts
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
#SBATCH -q embers # QOS Name
#SBATCH --requeue # Auto-requeue on preemption
#SBATCH -o$output_file # Combined output and error messages file

set -e
Expand Down
71 changes: 60 additions & 11 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:

- name: Check Formatting
run: |
./mfc.sh format -j $(nproc)
./mfc.sh format -j "$(nproc)"
git diff --exit-code || (echo "::error::Code is not formatted. Run './mfc.sh format' locally." && exit 1)

- name: Spell Check
Expand Down Expand Up @@ -138,19 +138,38 @@ jobs:

- name: Build
run: |
/bin/bash mfc.sh test -v --dry-run -j $(nproc) --${{ matrix.debug }} --${{ matrix.mpi }} --${{ matrix.precision }} $TEST_ALL
/bin/bash mfc.sh test -v --dry-run -j "$(nproc)" --${{ matrix.debug }} --${{ matrix.mpi }} ${{ matrix.precision != '' && format('--{0}', matrix.precision) || '' }} $TEST_ALL
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}

- name: Test
run: |
/bin/bash mfc.sh test -v --max-attempts 3 -j $(nproc) $TEST_ALL $TEST_PCT
run: |
rm -f tests/failed_uuids.txt
TEST_EXIT=0
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" $TEST_ALL $TEST_PCT || TEST_EXIT=$?

# Retry only if a small number of tests failed (sporadic failures)
if [ -s tests/failed_uuids.txt ]; then
NUM_FAILED=$(wc -l < tests/failed_uuids.txt)
if [ "$NUM_FAILED" -gt 0 ] && [ "$NUM_FAILED" -le 5 ]; then
FAILED=$(cat tests/failed_uuids.txt | tr '\n' ' ')
echo ""
echo "=== Retrying $NUM_FAILED failed test(s): $FAILED ==="
echo ""
/bin/bash mfc.sh test -v --max-attempts 3 -j "$(nproc)" --only $FAILED $TEST_ALL || exit $?
else
echo "Too many failures ($NUM_FAILED) to retry — likely a real issue."
exit 1
fi
elif [ "$TEST_EXIT" -ne 0 ]; then
exit $TEST_EXIT
fi
env:
TEST_ALL: ${{ matrix.mpi == 'mpi' && '--test-all' || '' }}
TEST_PCT: ${{ matrix.debug == 'debug' && '-% 20' || '' }}

self:
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }})"
name: "${{ matrix.cluster_name }} (${{ matrix.device }}${{ matrix.interface != 'none' && format('-{0}', matrix.interface) || '' }}${{ matrix.shard != '' && format(' [{0}]', matrix.shard) || '' }})"
if: github.repository == 'MFlowCode/MFC' && needs.file-changes.outputs.checkall == 'true'
needs: [lint-gate, file-changes]
continue-on-error: false
Expand All @@ -164,50 +183,74 @@ jobs:
cluster_name: 'Georgia Tech | Phoenix'
device: 'gpu'
interface: 'acc'
shard: ''
- runner: 'gt'
cluster: 'phoenix'
cluster_name: 'Georgia Tech | Phoenix'
device: 'gpu'
interface: 'omp'
shard: ''
- runner: 'gt'
cluster: 'phoenix'
cluster_name: 'Georgia Tech | Phoenix'
device: 'cpu'
interface: 'none'
# Frontier (ORNL) — build on login node, test via SLURM
shard: ''
# Frontier (ORNL) — build on login node, GPU tests sharded for batch partition
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'acc'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'acc'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'omp'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'gpu'
interface: 'omp'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier'
cluster_name: 'Oak Ridge | Frontier'
device: 'cpu'
interface: 'none'
# Frontier AMD — build on login node, test via SLURM
shard: ''
# Frontier AMD — build on login node, GPU tests sharded for batch partition
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'gpu'
interface: 'omp'
shard: '1/2'
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'gpu'
interface: 'omp'
shard: '2/2'
- runner: 'frontier'
cluster: 'frontier_amd'
cluster_name: 'Oak Ridge | Frontier (AMD)'
device: 'cpu'
interface: 'none'
shard: ''
runs-on:
group: phoenix
labels: ${{ matrix.runner }}
env:
NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
- name: Clone
uses: actions/checkout@v4
Expand All @@ -216,10 +259,16 @@ jobs:

- name: Build
if: matrix.cluster != 'phoenix'
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
uses: nick-fields/retry@v3
with:
max_attempts: 3
retry_wait_seconds: 60
timeout_minutes: 480
command: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
on_retry_command: ./mfc.sh clean

- name: Test
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }}
run: bash .github/workflows/${{ matrix.cluster }}/submit.sh .github/workflows/${{ matrix.cluster }}/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.shard }}

- name: Print Logs
if: always()
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ docs/documentation/parameters.md
/tests/*/**
!/tests/*/golden.txt
!/tests/*/golden-metadata.txt
/tests/failed_uuids.txt

# NVIDIA Nsight Compute
*.nsys-rep
Expand Down
6 changes: 6 additions & 0 deletions toolchain/mfc/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,12 @@
default=False,
dest="dry_run",
),
Argument(
    name="shard",
    # Value has the form "i/n". The test filter keeps the cases whose position
    # modulo n equals i-1, i.e. an interleaved (round-robin) split rather than
    # contiguous halves, so the help text describes it that way.
    help="Run only shard i of n, given as 'i/n' with 1 <= i <= n (e.g., '1/2'). Tests are split round-robin: '1/2' runs the 1st, 3rd, 5th, ... selected tests and '2/2' runs the 2nd, 4th, 6th, ...",
    type=str,
    default=None,
),
],
mutually_exclusive=[
MutuallyExclusiveGroup(arguments=[
Expand Down
36 changes: 34 additions & 2 deletions toolchain/mfc/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
class TestTimeoutError(MFCException):
pass

# pylint: disable=too-many-branches, trailing-whitespace
# pylint: disable=too-many-branches, too-many-locals, too-many-statements, trailing-whitespace
def __filter(cases_) -> typing.List[TestCase]:
cases = cases_[:]
selected_cases = []
Expand All @@ -66,12 +66,19 @@ def __filter(cases_) -> typing.List[TestCase]:
raise MFCException("Testing: Your specified range [--from,--to] is incorrect. Please ensure both IDs exist and are in the correct order.")

if len(ARG("only")) > 0:
# UUIDs are 8-char hex (CRC32): use OR so `--only UUID1 UUID2` selects
# every test whose UUID matches any of them. Labels use AND so
# `--only 2D Bubbles` selects only tests that carry both labels.
_uuid_mode = all(len(t) == 8 and t.isalnum() and not t.isalpha() for t in ARG("only"))
for case in cases[:]:
case: TestCase

checkCase = case.trace.split(" -> ")
checkCase.append(case.get_uuid())
if not set(ARG("only")).issubset(set(checkCase)):
if _uuid_mode:
if not set(ARG("only")).intersection(set(checkCase)):
cases.remove(case)
skipped_cases.append(case)
elif not set(ARG("only")).issubset(set(checkCase)):
cases.remove(case)
skipped_cases.append(case)

Expand Down Expand Up @@ -99,6 +106,14 @@ def __filter(cases_) -> typing.List[TestCase]:
skipped_cases += example_cases
cases = [case for case in cases if case not in example_cases]

if ARG("shard") is not None:
parts = ARG("shard").split("/")
if len(parts) != 2 or not all(p.isdigit() for p in parts) or int(parts[1]) < 1 or not 1 <= int(parts[0]) <= int(parts[1]):
raise MFCException(f"Invalid --shard '{ARG('shard')}': expected 'i/n' with 1 <= i <= n (e.g., '1/2').")
shard_idx, shard_count = int(parts[0]), int(parts[1])
skipped_cases += [c for i, c in enumerate(cases) if i % shard_count != shard_idx - 1]
cases = [c for i, c in enumerate(cases) if i % shard_count == shard_idx - 1]

if ARG("percent") == 100:
return cases, skipped_cases

Expand Down Expand Up @@ -171,6 +186,8 @@ def test():
cons.print(" Progress Test Name Time(s) UUID")
cons.print()

failed_uuids_path = os.path.join(common.MFC_TEST_DIR, "failed_uuids.txt")

# Select the correct number of threads to use to launch test cases
# We can't use ARG("jobs") when the --case-optimization option is set
# because running a test case may cause it to rebuild, and thus
Expand All @@ -182,6 +199,13 @@ def test():

# Check if we aborted due to high failure rate
if abort_tests.is_set():
# Clean up stale failed_uuids.txt so CI doesn't retry wrong tests
try:
if os.path.exists(failed_uuids_path):
os.remove(failed_uuids_path)
except OSError:
pass

total_completed = nFAIL + nPASS
cons.print()
cons.unindent()
Expand All @@ -206,6 +230,14 @@ def test():
# Build the summary report
_print_test_summary(nPASS, nFAIL, nSKIP, minutes, seconds, failed_tests, skipped_cases)

# Write failed UUIDs to file for CI retry logic
if failed_tests:
with open(failed_uuids_path, "w") as f:
for test_info in failed_tests:
f.write(test_info['uuid'] + "\n")
elif os.path.exists(failed_uuids_path):
os.remove(failed_uuids_path)

exit(nFAIL)


Expand Down
Loading