From 489c71dd1cc4edce6fa8a60cc5a937c38e13a310 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Tue, 14 Apr 2026 22:06:33 +0100 Subject: [PATCH 1/6] DAOS-17427 ftest: Auto-restart after self-terminate tests Functional tests for the automatic engine restart feature introduced in the control plane. These tests verify that engines automatically restart after self-termination when excluded from the system, with cases to verify disabling, rate-limiting and configuration support. Test-tag: hw,medium,dmg,control,engine_auto_restart Signed-off-by: Tom Nabarro --- .../ftest/control/engine_auto_restart.py | 215 +++++++++++++++ .../ftest/control/engine_auto_restart.yaml | 25 ++ .../control/engine_auto_restart_advanced.py | 253 ++++++++++++++++++ .../control/engine_auto_restart_advanced.yaml | 27 ++ .../control/engine_auto_restart_disabled.py | 181 +++++++++++++ .../control/engine_auto_restart_disabled.yaml | 27 ++ 6 files changed, 728 insertions(+) create mode 100644 src/tests/ftest/control/engine_auto_restart.py create mode 100644 src/tests/ftest/control/engine_auto_restart.yaml create mode 100644 src/tests/ftest/control/engine_auto_restart_advanced.py create mode 100644 src/tests/ftest/control/engine_auto_restart_advanced.yaml create mode 100644 src/tests/ftest/control/engine_auto_restart_disabled.py create mode 100644 src/tests/ftest/control/engine_auto_restart_disabled.yaml diff --git a/src/tests/ftest/control/engine_auto_restart.py b/src/tests/ftest/control/engine_auto_restart.py new file mode 100644 index 00000000000..d6d5b85655b --- /dev/null +++ b/src/tests/ftest/control/engine_auto_restart.py @@ -0,0 +1,215 @@ +""" + (C) Copyright 2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import time + +from control_test_base import ControlTestBase +from general_utils import report_errors + + +class EngineAutoRestartTest(ControlTestBase): + """Test automatic engine restart on self-termination. 
+ + Test Class Description: + Verify automatic engine restart behavior when engines self-terminate + after being excluded from the system. + + :avocado: recursive + """ + + def setUp(self): + """Set up each test case.""" + super().setUp() + self.dmg = self.get_dmg_command() + + def get_all_ranks(self): + """Get list of all ranks in the system.""" + return list(self.server_managers[0].ranks.keys()) + + def get_rank_state(self, rank): + data = self.dmg.system_query(ranks=f"{rank}") + if data["status"] != 0: + self.fail("Cmd dmg system query failed") + if "response" in data and "members" in data["response"]: + if data["response"]["members"] is None: + self.fail("No members returned from dmg system query") + for member in data["response"]["members"]: + return member["state"].lower() + self.fail("No member state returned from dmg system query") + + def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): + """Exclude a rank and wait for it to self-terminate and potentially restart. 
+ + Args: + rank (int): Rank to exclude + expect_restart (bool): Whether automatic restart is expected + timeout (int): Maximum seconds to wait for restart + + Returns: + tuple: (restarted, final_state) - whether rank restarted and its final state + """ + self.log_step(f"Excluding rank {rank}") + self.dmg.system_exclude(ranks=[rank], rank_hosts=None) + + # Wait for rank to self-terminate (should go to Excluded state) + self.log_step(f"Waiting for rank {rank} to self-terminate") + time.sleep(5) + + # Check if rank is excluded + failed_ranks = self.server_managers[0].check_rank_state( + ranks=[rank], valid_states=["excluded"], max_checks=10) + if failed_ranks: + self.fail(f"Rank {rank} did not reach Excluded state after exclusion") + + if expect_restart: + # Wait for automatic restart (rank should go to Joined state) + self.log_step(f"Waiting for rank {rank} to automatically restart") + start_time = time.time() + restarted = False + + while time.time() - start_time < timeout: + time.sleep(2) + # Check if rank has rejoined + failed_ranks = self.server_managers[0].check_rank_state( + ranks=[rank], valid_states=["joined"], max_checks=1) + if not failed_ranks: + restarted = True + break + + if restarted: + self.log.info(f"Rank {rank} automatically restarted and rejoined") + return (True, "joined") + else: + state = self.get_rank_state(rank) + self.log.error(f"Rank {rank} ({state}) did not restart within {timeout}s") + return (False, state) + else: + # Verify rank stays excluded (no automatic restart) + self.log_step(f"Verifying rank {rank} does not automatically restart") + time.sleep(timeout) + + failed_ranks = self.server_managers[0].check_rank_state( + ranks=[rank], valid_states=["excluded"], max_checks=1) + if failed_ranks: + state = self.get_rank_state(rank) + self.log.error(f"Rank {rank} ({state}) unexpectedly restarted") + return (True, state) + else: + return (False, "excluded") + + def test_auto_restart_basic(self): + """Test basic automatic engine restart after 
self-termination. + + Test Description: + 1. Exclude a rank from the system + 2. Wait for rank to self-terminate + 3. Verify rank automatically restarts and rejoins the system + + :avocado: tags=all,pr,daily_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart + :avocado: tags=EngineAutoRestartTest,test_auto_restart_basic + """ + all_ranks = self.get_all_ranks() + if len(all_ranks) < 2: + self.skipTest("Test requires at least 2 ranks") + + test_rank = self.random.choice(all_ranks) + + self.log_step(f"Testing automatic restart of rank {test_rank}") + + restarted, final_state = self.exclude_rank_and_wait_restart(test_rank) + + if not restarted: + self.fail(f"Rank {test_rank} did not automatically restart. Final state: {final_state}") + + self.log.info(f"SUCCESS: Rank {test_rank} automatically restarted after self-termination") + + def test_auto_restart_multiple_ranks(self): + """Test automatic restart of multiple ranks. + + Test Description: + 1. Exclude multiple ranks simultaneously + 2. Wait for all to self-terminate + 3. Verify all automatically restart and rejoin + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart + :avocado: tags=EngineAutoRestartTest,test_auto_restart_multiple_ranks + """ + all_ranks = self.get_all_ranks() + if len(all_ranks) < 3: + self.skipTest("Test requires at least 3 ranks") + + # Exclude half the ranks (but keep at least one for quorum) + num_to_exclude = max(1, len(all_ranks) // 2) + ranks_to_test = self.random.sample(all_ranks, num_to_exclude) + + self.log_step(f"Testing automatic restart of multiple ranks: {ranks_to_test}") + + errors = [] + results = {} + + for test_rank in ranks_to_test: + restarted, final_state = self.exclude_rank_and_wait_restart(test_rank) + results[test_rank] = (restarted, final_state) + + if not restarted: + errors.append( + f"Rank {test_rank} did not automatically restart. 
State: {final_state}") + + # Report results + self.log.info("=== Multiple Rank Restart Results ===") + for rank, (restarted, state) in results.items(): + status = "PASS" if restarted else "FAIL" + self.log.info(f"Rank {rank}: {status} (final state: {state})") + + report_errors(test=self, errors=errors) + + def test_auto_restart_with_pool(self): + """Test automatic restart works with active pools. + + Test Description: + 1. Create a pool + 2. Exclude a rank (not in pool service) + 3. Verify rank automatically restarts + 4. Verify pool remains accessible + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart,pool + :avocado: tags=EngineAutoRestartTest,test_auto_restart_with_pool + """ + all_ranks = self.get_all_ranks() + if len(all_ranks) < 2: + self.skipTest("Test requires at least 2 ranks") + + # Create pool first + self.add_pool(connect=False) + + # Get pool service ranks to avoid excluding them + pool_svc_ranks = self.pool.svc_ranks + self.log.info(f"Pool service ranks: {pool_svc_ranks}") + + # Find a rank not in pool service + non_svc_ranks = [r for r in all_ranks if r not in pool_svc_ranks] + if not non_svc_ranks: + self.skipTest("All ranks are pool service ranks") + + test_rank = self.random.choice(non_svc_ranks) + + self.log_step(f"Excluding non-service rank {test_rank} while pool is active") + + restarted, final_state = self.exclude_rank_and_wait_restart(test_rank) + + if not restarted: + self.fail(f"Rank {test_rank} did not restart. 
State: {final_state}") + + # Verify pool is still accessible + self.log_step("Verifying pool is still accessible after rank restart") + self.pool.query() + + self.log.info(f"SUCCESS: Rank {test_rank} restarted and pool remains accessible") diff --git a/src/tests/ftest/control/engine_auto_restart.yaml b/src/tests/ftest/control/engine_auto_restart.yaml new file mode 100644 index 00000000000..aefee54e589 --- /dev/null +++ b/src/tests/ftest/control/engine_auto_restart.yaml @@ -0,0 +1,25 @@ +hosts: + test_servers: 1 +server_config: + name: daos_server + engines_per_host: 2 + engines: + 0: + log_file: daos_server0.log + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos0 + 1: + log_file: daos_server1.log + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos1 +pool: + size: 2G +timeout: 300 diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.py b/src/tests/ftest/control/engine_auto_restart_advanced.py new file mode 100644 index 00000000000..59bfdea511b --- /dev/null +++ b/src/tests/ftest/control/engine_auto_restart_advanced.py @@ -0,0 +1,253 @@ +""" + (C) Copyright 2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import time + +from control_test_base import ControlTestBase + + +class EngineAutoRestartAdvanced(ControlTestBase): + """Test advanced automatic engine restart scenarios. + + Test Class Description: + Verify automatic engine restart with custom configurations including + rate-limiting, deferred restarts, and disabled restart behavior. 
+ + :avocado: recursive + """ + + def setUp(self): + """Set up each test case.""" + super().setUp() + self.dmg = self.get_dmg_command() + + def get_all_ranks(self): + """Get list of all ranks in the system.""" + return list(self.server_managers[0].ranks.keys()) + + def get_rank_state(self, rank): + data = self.dmg.system_query(ranks=f"{rank}") + if data["status"] != 0: + self.fail("Cmd dmg system query failed") + if "response" in data and "members" in data["response"]: + if data["response"]["members"] is None: + self.fail("No members returned from dmg system query") + for member in data["response"]["members"]: + return member["state"].lower() + self.fail("No member state returned from dmg system query") + + def wait_for_rank_state(self, rank, expected_state, timeout=30, check_interval=2): + """Wait for a rank to reach expected state. + + Args: + rank (int): Rank number + expected_state (str): Expected state + timeout (int): Maximum seconds to wait + check_interval (int): Seconds between state checks + + Returns: + bool: True if state reached, False if timeout + """ + start_time = time.time() + + while time.time() - start_time < timeout: + failed_ranks = self.server_managers[0].check_rank_state( + ranks=[rank], valid_states=[expected_state], max_checks=1) + + if not failed_ranks: + self.log.info(f"Rank {rank} reached state '{expected_state}' after " + f"{time.time() - start_time:.1f}s") + return True + + time.sleep(check_interval) + + current_state = self.get_rank_state(rank) + self.log.warning(f"Rank {rank} did not reach '{expected_state}' within {timeout}s. " + f"Current state: {current_state}") + return False + + def test_deferred_restart(self): + """Test deferred restart when multiple self-terminations occur rapidly. + + Test Description: + This test requires custom server configuration with a short + engine_auto_restart_min_delay (e.g., 15 seconds) to avoid long test runtime. + + 1. Exclude rank and wait for automatic restart (first restart) + 2. 
Immediately exclude same rank again (second self-termination) + 3. Verify restart is deferred, not immediate + 4. Wait for deferred restart to execute after delay expires + 5. Verify rank successfully rejoins + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart + :avocado: tags=EngineAutoRestartAdvanced,test_deferred_restart + """ + # Get configured restart delay from test params + restart_delay = self.params.get("engine_auto_restart_min_delay", "/run/server_config/*", 15) + + all_ranks = self.get_all_ranks() + if len(all_ranks) < 2: + self.skipTest("Test requires at least 2 ranks") + + test_rank = self.random.choice(all_ranks) + + # First exclusion - should restart immediately (no previous restart) + self.log_step(f"Step 1: First exclusion of rank {test_rank}") + self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + + # Wait for self-termination + if not self.wait_for_rank_state(test_rank, "excluded", timeout=10): + self.fail(f"Rank {test_rank} did not self-terminate") + + # Wait for automatic restart + self.log_step(f"Step 2: Waiting for first automatic restart of rank {test_rank}") + if not self.wait_for_rank_state(test_rank, "joined", timeout=30): + self.fail(f"Rank {test_rank} did not automatically restart on first exclusion") + + first_restart_time = time.time() + self.log.info(f"First restart completed at T={first_restart_time:.1f}") + + # Second exclusion - should be deferred due to rate-limiting + self.log_step(f"Step 3: Second exclusion of rank {test_rank} (should be deferred)") + self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + + # Wait for self-termination + if not self.wait_for_rank_state(test_rank, "excluded", timeout=10): + self.fail(f"Rank {test_rank} did not self-terminate on second exclusion") + + # Verify restart is NOT immediate (should be deferred) + self.log_step("Step 4: Verifying restart is deferred (not immediate)") + time.sleep(5) # Wait a bit + + 
current_state = self.get_rank_state(test_rank) + if current_state == "joined": + self.fail(f"Rank {test_rank} restarted immediately - rate-limiting not working") + + self.log.info(f"Confirmed: Restart is deferred (rank still in '{current_state}' state)") + + # Wait for deferred restart to execute (after delay expires) + # Add buffer time for processing + wait_time = restart_delay + 10 + self.log_step(f"Step 5: Waiting {wait_time}s for deferred restart to execute") + + if not self.wait_for_rank_state(test_rank, "joined", timeout=wait_time): + self.fail(f"Rank {test_rank} did not restart after rate-limit delay") + + deferred_restart_time = time.time() + actual_delay = deferred_restart_time - first_restart_time + + self.log.info(f"SUCCESS: Deferred restart executed after {actual_delay:.1f}s " + f"(expected ~{restart_delay}s)") + + # Verify delay was approximately correct (within tolerance) + if actual_delay < restart_delay * 0.8: + self.fail(f"Restart occurred too early: {actual_delay:.1f}s < {restart_delay}s") + + def test_custom_restart_delay(self): + """Test custom engine_auto_restart_min_delay configuration. + + Test Description: + This test requires server configuration with custom + engine_auto_restart_min_delay value. + + 1. Exclude rank and wait for first restart + 2. Exclude same rank again + 3. Measure time until deferred restart executes + 4. 
Verify delay matches configured value + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart + :avocado: tags=EngineAutoRestartAdvanced,test_custom_restart_delay + """ + # Get configured delay from test parameters + expected_delay = self.params.get("engine_auto_restart_min_delay", + "/run/server_config/*", 20) + + all_ranks = self.get_all_ranks() + if len(all_ranks) < 2: + self.skipTest("Test requires at least 2 ranks") + + test_rank = self.random.choice(all_ranks) + + self.log_step(f"Testing custom restart delay of {expected_delay}s for rank {test_rank}") + + # First restart to establish baseline + self.log_step("Step 1: First exclusion and restart") + self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.wait_for_rank_state(test_rank, "excluded", timeout=10) + self.wait_for_rank_state(test_rank, "joined", timeout=30) + + first_restart_time = time.time() + + # Second restart to measure delay + self.log_step("Step 2: Second exclusion to trigger deferred restart") + self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.wait_for_rank_state(test_rank, "excluded", timeout=10) + + # Wait for deferred restart + self.log_step(f"Step 3: Waiting for deferred restart (expected delay: {expected_delay}s)") + wait_timeout = expected_delay + 20 # Add buffer + + if not self.wait_for_rank_state(test_rank, "joined", timeout=wait_timeout): + self.fail(f"Rank {test_rank} did not restart within expected time") + + second_restart_time = time.time() + actual_delay = second_restart_time - first_restart_time + + self.log.info(f"Measured delay: {actual_delay:.1f}s (expected: ~{expected_delay}s)") + + # Verify delay is within acceptable range (80% to 120% of expected) + min_delay = expected_delay * 0.8 + max_delay = expected_delay * 1.2 + + if actual_delay < min_delay: + self.fail(f"Restart too early: {actual_delay:.1f}s < {min_delay:.1f}s") + elif actual_delay > max_delay: + self.log.warning(f"Restart 
delayed beyond expected: {actual_delay:.1f}s > " + f"{max_delay:.1f}s (may be acceptable depending on system load)") + else: + self.log.info(f"SUCCESS: Restart delay within expected range " + f"[{min_delay:.1f}s, {max_delay:.1f}s]") + + def test_restart_after_clear_exclude(self): + """Test interaction between auto-restart and manual clear-exclude. + + Test Description: + 1. Exclude rank, wait for self-termination + 2. Clear exclusion before auto-restart triggers + 3. Verify rank rejoins successfully + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart + :avocado: tags=EngineAutoRestartAdvanced,test_restart_after_clear_exclude + """ + all_ranks = self.get_all_ranks() + if len(all_ranks) < 2: + self.skipTest("Test requires at least 2 ranks") + + test_rank = self.random.choice(all_ranks) + + self.log_step(f"Step 1: Excluding rank {test_rank}") + self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + + # Wait for self-termination + # FIXME: should this be checking for "adminexcluded" state? 
+ if not self.wait_for_rank_state(test_rank, "excluded", timeout=10): + self.fail(f"Rank {test_rank} did not self-terminate") + + # Clear exclusion before auto-restart + self.log_step(f"Step 2: Clearing exclusion for rank {test_rank}") + self.dmg.system_clear_exclude(ranks=[test_rank], rank_hosts=None) + + # Verify rank rejoins + if not self.wait_for_rank_state(test_rank, "joined", timeout=30): + self.fail(f"Rank {test_rank} did not rejoin after manual start") + self.fail(f"Rank {test_rank} did not automatically restart on admin exclusion") + + self.log.info(f"SUCCESS: Rank {test_rank} successfully rejoined after clear-exclude") diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.yaml b/src/tests/ftest/control/engine_auto_restart_advanced.yaml new file mode 100644 index 00000000000..f45bcf57618 --- /dev/null +++ b/src/tests/ftest/control/engine_auto_restart_advanced.yaml @@ -0,0 +1,27 @@ +hosts: + test_servers: 1 +server_config: + name: daos_server + engines_per_host: 2 + # Custom restart delay for faster testing (15 seconds instead of default 300) + engine_auto_restart_min_delay: 15 + engines: + 0: + log_file: daos_server0.log + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos0 + 1: + log_file: daos_server1.log + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos1 +pool: + size: 8G +timeout: 400 diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.py b/src/tests/ftest/control/engine_auto_restart_disabled.py new file mode 100644 index 00000000000..cf66474c06d --- /dev/null +++ b/src/tests/ftest/control/engine_auto_restart_disabled.py @@ -0,0 +1,181 @@ +""" + (C) Copyright 2026 Hewlett Packard Enterprise Development LP + + SPDX-License-Identifier: BSD-2-Clause-Patent +""" +import time + +from control_test_base import ControlTestBase +from general_utils import report_errors + + +class EngineAutoRestartDisabled(ControlTestBase): + """Test automatic engine restart disabled 
configuration. + + Test Class Description: + Verify that automatic engine restart can be disabled and that + excluded ranks stay excluded when auto-restart is disabled. + + :avocado: recursive + """ + + def setUp(self): + """Set up each test case.""" + super().setUp() + self.dmg = self.get_dmg_command() + + def get_all_ranks(self): + """Get list of all ranks in the system.""" + return list(self.server_managers[0].ranks.keys()) + + def test_no_restart_when_disabled(self): + """Test that engines do not automatically restart when feature is disabled. + + Test Description: + Server is configured with disable_engine_auto_restart: true. + + 1. Exclude a rank from the system + 2. Wait for rank to self-terminate + 3. Wait additional time to verify NO automatic restart occurs + 4. Manually start the rank to verify it can still be started + 5. Verify manual start succeeds + + :avocado: tags=all,daily_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart + :avocado: tags=EngineAutoRestartDisabled,test_no_restart_when_disabled + """ + all_ranks = self.get_all_ranks() + if len(all_ranks) < 2: + self.skipTest("Test requires at least 2 ranks") + + test_rank = self.random.choice(all_ranks) + + self.log_step(f"Step 1: Excluding rank {test_rank} (auto-restart is DISABLED)") + self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + + # Step 2: Wait for self-termination + self.log_step(f"Step 2: Waiting for rank {test_rank} to self-terminate") + time.sleep(5) + + failed_ranks = self.server_managers[0].check_rank_state( + ranks=[test_rank], valid_states=["excluded"], max_checks=10) + if failed_ranks: + self.fail(f"Rank {test_rank} did not reach Excluded state") + + # Step 3: Wait to verify NO automatic restart + wait_time = 20 # Wait 20 seconds + self.log_step(f"Step 3: Waiting {wait_time}s to verify NO automatic restart occurs") + time.sleep(wait_time) + + # Verify rank is still excluded + failed_ranks = self.server_managers[0].check_rank_state( 
+ ranks=[test_rank], valid_states=["excluded"], max_checks=1) + + if failed_ranks: + # Rank is NOT excluded, check if it restarted + check_joined = self.server_managers[0].check_rank_state( + ranks=[test_rank], valid_states=["joined"], max_checks=1) + if not check_joined: + self.fail(f"Rank {test_rank} unexpectedly restarted when auto-restart disabled!") + else: + self.fail(f"Rank {test_rank} in unexpected state (not excluded or joined)") + + self.log.info(f"Confirmed: Rank {test_rank} did NOT automatically restart (as expected)") + + # Step 4: Manually clear exclusion + self.log_step(f"Step 4: Manually clearing exclusion for rank {test_rank}") + self.dmg.system_clear_exclude(ranks=[test_rank], rank_hosts=None) + + # Step 5: Manually start the rank + self.log_step(f"Step 5: Manually starting rank {test_rank}") + self.dmg.system_start(ranks=f"{test_rank}") + + # Verify manual start succeeds + failed_ranks = self.server_managers[0].check_rank_state( + ranks=[test_rank], valid_states=["joined"], max_checks=15) + if failed_ranks: + self.fail(f"Manual start of rank {test_rank} failed") + + self.log.info(f"SUCCESS: Rank {test_rank} stayed excluded when auto-restart disabled, " + f"and manual start succeeded") + + def test_multiple_ranks_no_restart(self): + """Test that multiple excluded ranks stay excluded when auto-restart disabled. + + Test Description: + Server configured with disable_engine_auto_restart: true. + + 1. Exclude multiple ranks + 2. Verify all self-terminate and reach Excluded state + 3. Wait to confirm none automatically restart + 4. Manually restart all ranks + 5. 
Verify all successfully rejoin + + :avocado: tags=all,full_regression + :avocado: tags=hw,medium + :avocado: tags=dmg,control,engine_auto_restart + :avocado: tags=EngineAutoRestartDisabled,test_multiple_ranks_no_restart + """ + all_ranks = self.get_all_ranks() + if len(all_ranks) < 3: + self.skipTest("Test requires at least 3 ranks") + + # Exclude half the ranks + num_to_test = max(2, len(all_ranks) // 2) + test_ranks = self.random.sample(all_ranks, num_to_test) + + self.log_step(f"Step 1: Excluding {num_to_test} ranks: {test_ranks}") + + for rank in test_ranks: + self.dmg.system_exclude(ranks=[rank], rank_hosts=None) + time.sleep(1) # Small delay between exclusions + + # Step 2: Verify all reach Excluded state + self.log_step("Step 2: Verifying all ranks self-terminate") + time.sleep(10) + + for rank in test_ranks: + failed = self.server_managers[0].check_rank_state( + ranks=[rank], valid_states=["excluded"], max_checks=5) + if failed: + self.fail(f"Rank {rank} did not self-terminate") + + # Step 3: Wait and verify none restart + wait_time = 20 + self.log_step(f"Step 3: Waiting {wait_time}s to verify no automatic restarts") + time.sleep(wait_time) + + errors = [] + for rank in test_ranks: + failed = self.server_managers[0].check_rank_state( + ranks=[rank], valid_states=["excluded"], max_checks=1) + if failed: + errors.append(f"Rank {rank} unexpectedly restarted when auto-restart disabled") + + if errors: + self.fail("\n".join(errors)) + + self.log.info(f"Confirmed: None of {test_ranks} automatically restarted") + + # Step 4: Manually clear and restart all + self.log_step("Step 4: Manually clearing exclusion and restarting ranks") + self.dmg.system_clear_exclude(ranks=test_ranks, rank_hosts=None) + + for rank in test_ranks: + self.dmg.system_start(ranks=f"{rank}") + + # Step 5: Verify all rejoin + self.log_step("Step 5: Verifying all ranks successfully rejoin") + time.sleep(10) + + for rank in test_ranks: + failed = self.server_managers[0].check_rank_state( + 
ranks=[rank], valid_states=["joined"], max_checks=10) + if failed: + errors.append(f"Manual restart of rank {rank} failed") + + report_errors(test=self, errors=errors) + + self.log.info(f"SUCCESS: All {num_to_test} ranks stayed excluded and " + f"manual restart succeeded") diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.yaml b/src/tests/ftest/control/engine_auto_restart_disabled.yaml new file mode 100644 index 00000000000..e986bc12115 --- /dev/null +++ b/src/tests/ftest/control/engine_auto_restart_disabled.yaml @@ -0,0 +1,27 @@ +hosts: + test_servers: 1 +server_config: + name: daos_server + engines_per_host: 2 + # Disable automatic engine restart + disable_engine_auto_restart: true + engines: + 0: + log_file: daos_server0.log + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos0 + 1: + log_file: daos_server1.log + targets: 4 + nr_xs_helpers: 0 + storage: + 0: + class: ram + scm_mount: /mnt/daos1 +pool: + size: 8G +timeout: 300 From 1dceec3eec26c96e89326c19a49607ee26e2ed43 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 15 Apr 2026 17:30:52 +0000 Subject: [PATCH 2/6] lint fixes Signed-off-by: Dalton Bohning --- .../ftest/control/engine_auto_restart.py | 31 ++++++++++++------- .../ftest/control/engine_auto_restart.yaml | 5 ++- .../control/engine_auto_restart_advanced.yaml | 5 ++- .../control/engine_auto_restart_disabled.yaml | 8 +++-- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/src/tests/ftest/control/engine_auto_restart.py b/src/tests/ftest/control/engine_auto_restart.py index d6d5b85655b..0f577668291 100644 --- a/src/tests/ftest/control/engine_auto_restart.py +++ b/src/tests/ftest/control/engine_auto_restart.py @@ -29,6 +29,14 @@ def get_all_ranks(self): return list(self.server_managers[0].ranks.keys()) def get_rank_state(self, rank): + """Get the current state of a rank from dmg system query. 
+ + Args: + rank (int): Rank to query + + Returns: + str: State of the rank + """ data = self.dmg.system_query(ranks=f"{rank}") if data["status"] != 0: self.fail("Cmd dmg system query failed") @@ -38,6 +46,7 @@ def get_rank_state(self, rank): for member in data["response"]["members"]: return member["state"].lower() self.fail("No member state returned from dmg system query") + return None # to appease pylint def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): """Exclude a rank and wait for it to self-terminate and potentially restart. @@ -81,10 +90,10 @@ def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): if restarted: self.log.info(f"Rank {rank} automatically restarted and rejoined") return (True, "joined") - else: - state = self.get_rank_state(rank) - self.log.error(f"Rank {rank} ({state}) did not restart within {timeout}s") - return (False, state) + + state = self.get_rank_state(rank) + self.log.error("Rank %s (%s) did not restart within %ss", rank, state, timeout) + return (False, state) else: # Verify rank stays excluded (no automatic restart) self.log_step(f"Verifying rank {rank} does not automatically restart") @@ -94,10 +103,10 @@ def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): ranks=[rank], valid_states=["excluded"], max_checks=1) if failed_ranks: state = self.get_rank_state(rank) - self.log.error(f"Rank {rank} ({state}) unexpectedly restarted") + self.log.error("Rank %s (%s) unexpectedly restarted", rank, state) return (True, state) - else: - return (False, "excluded") + + return (False, "excluded") def test_auto_restart_basic(self): """Test basic automatic engine restart after self-termination. @@ -125,7 +134,7 @@ def test_auto_restart_basic(self): if not restarted: self.fail(f"Rank {test_rank} did not automatically restart. 
Final state: {final_state}") - self.log.info(f"SUCCESS: Rank {test_rank} automatically restarted after self-termination") + self.log.info("SUCCESS: Rank %s automatically restarted after self-termination", test_rank) def test_auto_restart_multiple_ranks(self): """Test automatic restart of multiple ranks. @@ -165,7 +174,7 @@ def test_auto_restart_multiple_ranks(self): self.log.info("=== Multiple Rank Restart Results ===") for rank, (restarted, state) in results.items(): status = "PASS" if restarted else "FAIL" - self.log.info(f"Rank {rank}: {status} (final state: {state})") + self.log.info("Rank %s: %s (final state: %s)", rank, status, state) report_errors(test=self, errors=errors) @@ -192,7 +201,7 @@ def test_auto_restart_with_pool(self): # Get pool service ranks to avoid excluding them pool_svc_ranks = self.pool.svc_ranks - self.log.info(f"Pool service ranks: {pool_svc_ranks}") + self.log.info("Pool service ranks: %s", pool_svc_ranks) # Find a rank not in pool service non_svc_ranks = [r for r in all_ranks if r not in pool_svc_ranks] @@ -201,7 +210,7 @@ def test_auto_restart_with_pool(self): test_rank = self.random.choice(non_svc_ranks) - self.log_step(f"Excluding non-service rank {test_rank} while pool is active") + self.log_step(f"Excluding non-service rank {test_rank} while pool is active") restarted, final_state = self.exclude_rank_and_wait_restart(test_rank) diff --git a/src/tests/ftest/control/engine_auto_restart.yaml b/src/tests/ftest/control/engine_auto_restart.yaml index aefee54e589..5923f25e671 100644 --- a/src/tests/ftest/control/engine_auto_restart.yaml +++ b/src/tests/ftest/control/engine_auto_restart.yaml @@ -1,5 +1,8 @@ hosts: test_servers: 1 + +timeout: 300 + server_config: name: daos_server engines_per_host: 2 @@ -20,6 +23,6 @@ server_config: 0: class: ram scm_mount: /mnt/daos1 + pool: size: 2G -timeout: 300 diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.yaml b/src/tests/ftest/control/engine_auto_restart_advanced.yaml index 
f45bcf57618..4a7a1eb8d95 100644 --- a/src/tests/ftest/control/engine_auto_restart_advanced.yaml +++ b/src/tests/ftest/control/engine_auto_restart_advanced.yaml @@ -1,5 +1,8 @@ hosts: test_servers: 1 + +timeout: 400 + server_config: name: daos_server engines_per_host: 2 @@ -22,6 +25,6 @@ server_config: 0: class: ram scm_mount: /mnt/daos1 + pool: size: 8G -timeout: 400 diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.yaml b/src/tests/ftest/control/engine_auto_restart_disabled.yaml index e986bc12115..c374a452945 100644 --- a/src/tests/ftest/control/engine_auto_restart_disabled.yaml +++ b/src/tests/ftest/control/engine_auto_restart_disabled.yaml @@ -1,10 +1,12 @@ hosts: test_servers: 1 + +timeout: 300 + server_config: name: daos_server engines_per_host: 2 - # Disable automatic engine restart - disable_engine_auto_restart: true + disable_engine_auto_restart: true # Disable automatic engine restart engines: 0: log_file: daos_server0.log @@ -22,6 +24,6 @@ server_config: 0: class: ram scm_mount: /mnt/daos1 + pool: size: 8G -timeout: 300 From 09cc52ebaef5206ad9860e85013460dceda4c9fa Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 15 Apr 2026 18:58:31 +0000 Subject: [PATCH 3/6] Skip test for now doc-only: true Signed-off-by: Dalton Bohning From ef3bcf32810a527254cd7f6bc0c66c04ea91146f Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 15 Apr 2026 19:22:12 +0000 Subject: [PATCH 4/6] move get_rank_state to server manager Doc-only: true Signed-off-by: Dalton Bohning --- .../ftest/control/engine_auto_restart.py | 32 ++++--------------- .../control/engine_auto_restart_advanced.py | 21 +++--------- .../control/engine_auto_restart_disabled.py | 4 +-- src/tests/ftest/util/server_utils.py | 17 ++++++++++ 4 files changed, 30 insertions(+), 44 deletions(-) diff --git a/src/tests/ftest/control/engine_auto_restart.py b/src/tests/ftest/control/engine_auto_restart.py index 0f577668291..68d9fb5ed0f 100644 --- a/src/tests/ftest/control/engine_auto_restart.py 
+++ b/src/tests/ftest/control/engine_auto_restart.py @@ -28,26 +28,6 @@ def get_all_ranks(self): """Get list of all ranks in the system.""" return list(self.server_managers[0].ranks.keys()) - def get_rank_state(self, rank): - """Get the current state of a rank from dmg system query. - - Args: - rank (int): Rank to query - - Returns: - str: State of the rank - """ - data = self.dmg.system_query(ranks=f"{rank}") - if data["status"] != 0: - self.fail("Cmd dmg system query failed") - if "response" in data and "members" in data["response"]: - if data["response"]["members"] is None: - self.fail("No members returned from dmg system query") - for member in data["response"]["members"]: - return member["state"].lower() - self.fail("No member state returned from dmg system query") - return None # to appease pylint - def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): """Exclude a rank and wait for it to self-terminate and potentially restart. @@ -91,7 +71,7 @@ def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): self.log.info(f"Rank {rank} automatically restarted and rejoined") return (True, "joined") - state = self.get_rank_state(rank) + state = self.server_managers[0].get_rank_state(rank) self.log.error("Rank %s (%s) did not restart within %ss", rank, state, timeout) return (False, state) else: @@ -102,7 +82,7 @@ def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): failed_ranks = self.server_managers[0].check_rank_state( ranks=[rank], valid_states=["excluded"], max_checks=1) if failed_ranks: - state = self.get_rank_state(rank) + state = self.server_managers[0].get_rank_state(rank) self.log.error("Rank %s (%s) unexpectedly restarted", rank, state) return (True, state) @@ -123,7 +103,7 @@ def test_auto_restart_basic(self): """ all_ranks = self.get_all_ranks() if len(all_ranks) < 2: - self.skipTest("Test requires at least 2 ranks") + self.fail("Test requires at least 2 ranks") test_rank = 
self.random.choice(all_ranks) @@ -151,7 +131,7 @@ def test_auto_restart_multiple_ranks(self): """ all_ranks = self.get_all_ranks() if len(all_ranks) < 3: - self.skipTest("Test requires at least 3 ranks") + self.fail("Test requires at least 3 ranks") # Exclude half the ranks (but keep at least one for quorum) num_to_exclude = max(1, len(all_ranks) // 2) @@ -194,7 +174,7 @@ def test_auto_restart_with_pool(self): """ all_ranks = self.get_all_ranks() if len(all_ranks) < 2: - self.skipTest("Test requires at least 2 ranks") + self.fail("Test requires at least 2 ranks") # Create pool first self.add_pool(connect=False) @@ -206,7 +186,7 @@ def test_auto_restart_with_pool(self): # Find a rank not in pool service non_svc_ranks = [r for r in all_ranks if r not in pool_svc_ranks] if not non_svc_ranks: - self.skipTest("All ranks are pool service ranks") + self.fail("All ranks are pool service ranks") test_rank = self.random.choice(non_svc_ranks) diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.py b/src/tests/ftest/control/engine_auto_restart_advanced.py index 59bfdea511b..3e00e927839 100644 --- a/src/tests/ftest/control/engine_auto_restart_advanced.py +++ b/src/tests/ftest/control/engine_auto_restart_advanced.py @@ -27,17 +27,6 @@ def get_all_ranks(self): """Get list of all ranks in the system.""" return list(self.server_managers[0].ranks.keys()) - def get_rank_state(self, rank): - data = self.dmg.system_query(ranks=f"{rank}") - if data["status"] != 0: - self.fail("Cmd dmg system query failed") - if "response" in data and "members" in data["response"]: - if data["response"]["members"] is None: - self.fail("No members returned from dmg system query") - for member in data["response"]["members"]: - return member["state"].lower() - self.fail("No member state returned from dmg system query") - def wait_for_rank_state(self, rank, expected_state, timeout=30, check_interval=2): """Wait for a rank to reach expected state. 
@@ -63,7 +52,7 @@ def wait_for_rank_state(self, rank, expected_state, timeout=30, check_interval=2 time.sleep(check_interval) - current_state = self.get_rank_state(rank) + current_state = self.server_managers[0].get_rank_state(rank) self.log.warning(f"Rank {rank} did not reach '{expected_state}' within {timeout}s. " f"Current state: {current_state}") return False @@ -91,7 +80,7 @@ def test_deferred_restart(self): all_ranks = self.get_all_ranks() if len(all_ranks) < 2: - self.skipTest("Test requires at least 2 ranks") + self.fail("Test requires at least 2 ranks") test_rank = self.random.choice(all_ranks) @@ -123,7 +112,7 @@ def test_deferred_restart(self): self.log_step("Step 4: Verifying restart is deferred (not immediate)") time.sleep(5) # Wait a bit - current_state = self.get_rank_state(test_rank) + current_state = self.server_managers[0].get_rank_state(test_rank) if current_state == "joined": self.fail(f"Rank {test_rank} restarted immediately - rate-limiting not working") @@ -170,7 +159,7 @@ def test_custom_restart_delay(self): all_ranks = self.get_all_ranks() if len(all_ranks) < 2: - self.skipTest("Test requires at least 2 ranks") + self.fail("Test requires at least 2 ranks") test_rank = self.random.choice(all_ranks) @@ -229,7 +218,7 @@ def test_restart_after_clear_exclude(self): """ all_ranks = self.get_all_ranks() if len(all_ranks) < 2: - self.skipTest("Test requires at least 2 ranks") + self.fail("Test requires at least 2 ranks") test_rank = self.random.choice(all_ranks) diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.py b/src/tests/ftest/control/engine_auto_restart_disabled.py index cf66474c06d..331ea0a632b 100644 --- a/src/tests/ftest/control/engine_auto_restart_disabled.py +++ b/src/tests/ftest/control/engine_auto_restart_disabled.py @@ -47,7 +47,7 @@ def test_no_restart_when_disabled(self): """ all_ranks = self.get_all_ranks() if len(all_ranks) < 2: - self.skipTest("Test requires at least 2 ranks") + self.fail("Test requires at least 2 
ranks") test_rank = self.random.choice(all_ranks) @@ -119,7 +119,7 @@ def test_multiple_ranks_no_restart(self): """ all_ranks = self.get_all_ranks() if len(all_ranks) < 3: - self.skipTest("Test requires at least 3 ranks") + self.fail("Test requires at least 3 ranks") # Exclude half the ranks num_to_test = max(2, len(all_ranks) // 2) diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index d36900dc54f..513976039e0 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -760,6 +760,23 @@ def get_single_system_state(self): raise ServerFailed("Multiple system states ({}) detected:\n {}".format(states, data)) return states.pop() + def get_rank_state(self, rank): + """Get the current state of a rank from dmg system query. + + Args: + rank (int): Rank to query + + Returns: + str: Lowercase state of the rank + """ + data = self.dmg.system_query(ranks=str(rank)) + if data["status"] != 0: + raise CommandFailure(f"dmg system query failed with status {data['status']}") + try: + return data["response"]["members"][0]["state"].lower() + except (KeyError, IndexError, TypeError) as error: + raise CommandFailure("Failed to get rank state from dmg system query") from error + def check_rank_state(self, ranks, valid_states, max_checks=1): """Check the states of list of ranks in DAOS system. 
From 724f10897370bd2d21c2c4a46894833c8f82a383 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 15 Apr 2026 19:29:48 +0000 Subject: [PATCH 5/6] do not pass rank_hosts=None since it is default Doc-only: true Signed-off-by: Dalton Bohning --- src/tests/ftest/control/engine_auto_restart.py | 2 +- .../ftest/control/engine_auto_restart_advanced.py | 12 ++++++------ .../ftest/control/engine_auto_restart_disabled.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/tests/ftest/control/engine_auto_restart.py b/src/tests/ftest/control/engine_auto_restart.py index 68d9fb5ed0f..3eb17846243 100644 --- a/src/tests/ftest/control/engine_auto_restart.py +++ b/src/tests/ftest/control/engine_auto_restart.py @@ -40,7 +40,7 @@ def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): tuple: (restarted, final_state) - whether rank restarted and its final state """ self.log_step(f"Excluding rank {rank}") - self.dmg.system_exclude(ranks=[rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[rank]) # Wait for rank to self-terminate (should go to Excluded state) self.log_step(f"Waiting for rank {rank} to self-terminate") diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.py b/src/tests/ftest/control/engine_auto_restart_advanced.py index 3e00e927839..60f915fa2b5 100644 --- a/src/tests/ftest/control/engine_auto_restart_advanced.py +++ b/src/tests/ftest/control/engine_auto_restart_advanced.py @@ -86,7 +86,7 @@ def test_deferred_restart(self): # First exclusion - should restart immediately (no previous restart) self.log_step(f"Step 1: First exclusion of rank {test_rank}") - self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[test_rank]) # Wait for self-termination if not self.wait_for_rank_state(test_rank, "excluded", timeout=10): @@ -102,7 +102,7 @@ def test_deferred_restart(self): # Second exclusion - should be deferred due to rate-limiting self.log_step(f"Step 3: Second exclusion 
of rank {test_rank} (should be deferred)") - self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[test_rank]) # Wait for self-termination if not self.wait_for_rank_state(test_rank, "excluded", timeout=10): @@ -167,7 +167,7 @@ def test_custom_restart_delay(self): # First restart to establish baseline self.log_step("Step 1: First exclusion and restart") - self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[test_rank]) self.wait_for_rank_state(test_rank, "excluded", timeout=10) self.wait_for_rank_state(test_rank, "joined", timeout=30) @@ -175,7 +175,7 @@ def test_custom_restart_delay(self): # Second restart to measure delay self.log_step("Step 2: Second exclusion to trigger deferred restart") - self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[test_rank]) self.wait_for_rank_state(test_rank, "excluded", timeout=10) # Wait for deferred restart @@ -223,7 +223,7 @@ def test_restart_after_clear_exclude(self): test_rank = self.random.choice(all_ranks) self.log_step(f"Step 1: Excluding rank {test_rank}") - self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[test_rank]) # Wait for self-termination # FIXME: should this be checking for "adminexcluded" state? 
@@ -232,7 +232,7 @@ def test_restart_after_clear_exclude(self): # Clear exclusion before auto-restart self.log_step(f"Step 2: Clearing exclusion for rank {test_rank}") - self.dmg.system_clear_exclude(ranks=[test_rank], rank_hosts=None) + self.dmg.system_clear_exclude(ranks=[test_rank]) # Verify rank rejoins if not self.wait_for_rank_state(test_rank, "joined", timeout=30): diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.py b/src/tests/ftest/control/engine_auto_restart_disabled.py index 331ea0a632b..070f400d7e2 100644 --- a/src/tests/ftest/control/engine_auto_restart_disabled.py +++ b/src/tests/ftest/control/engine_auto_restart_disabled.py @@ -52,7 +52,7 @@ def test_no_restart_when_disabled(self): test_rank = self.random.choice(all_ranks) self.log_step(f"Step 1: Excluding rank {test_rank} (auto-restart is DISABLED)") - self.dmg.system_exclude(ranks=[test_rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[test_rank]) # Step 2: Wait for self-termination self.log_step(f"Step 2: Waiting for rank {test_rank} to self-terminate") @@ -128,7 +128,7 @@ def test_multiple_ranks_no_restart(self): self.log_step(f"Step 1: Excluding {num_to_test} ranks: {test_ranks}") for rank in test_ranks: - self.dmg.system_exclude(ranks=[rank], rank_hosts=None) + self.dmg.system_exclude(ranks=[rank]) time.sleep(1) # Small delay between exclusions # Step 2: Verify all reach Excluded state From 005cf86619517defefe3bcb0c92b59942c3234fe Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 15 Apr 2026 20:16:44 +0000 Subject: [PATCH 6/6] more lint fixes Signed-off-by: Dalton Bohning --- .../ftest/control/engine_auto_restart.py | 26 +++++++++---------- .../control/engine_auto_restart_advanced.py | 14 +++++----- .../control/engine_auto_restart_disabled.py | 13 +++++----- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/src/tests/ftest/control/engine_auto_restart.py b/src/tests/ftest/control/engine_auto_restart.py index 3eb17846243..6ed0e126779 100644 --- 
a/src/tests/ftest/control/engine_auto_restart.py +++ b/src/tests/ftest/control/engine_auto_restart.py @@ -68,25 +68,25 @@ def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30): break if restarted: - self.log.info(f"Rank {rank} automatically restarted and rejoined") + self.log.info("Rank %s automatically restarted and rejoined", rank) return (True, "joined") state = self.server_managers[0].get_rank_state(rank) self.log.error("Rank %s (%s) did not restart within %ss", rank, state, timeout) return (False, state) - else: - # Verify rank stays excluded (no automatic restart) - self.log_step(f"Verifying rank {rank} does not automatically restart") - time.sleep(timeout) - failed_ranks = self.server_managers[0].check_rank_state( - ranks=[rank], valid_states=["excluded"], max_checks=1) - if failed_ranks: - state = self.server_managers[0].get_rank_state(rank) - self.log.error("Rank %s (%s) unexpectedly restarted", rank, state) - return (True, state) + # Otherwise, verify rank stays excluded (no automatic restart) + self.log_step(f"Verifying rank {rank} does not automatically restart") + time.sleep(timeout) - return (False, "excluded") + failed_ranks = self.server_managers[0].check_rank_state( + ranks=[rank], valid_states=["excluded"], max_checks=1) + if failed_ranks: + state = self.server_managers[0].get_rank_state(rank) + self.log.error("Rank %s (%s) unexpectedly restarted", rank, state) + return (True, state) + + return (False, "excluded") def test_auto_restart_basic(self): """Test basic automatic engine restart after self-termination. 
@@ -201,4 +201,4 @@ def test_auto_restart_with_pool(self): self.log_step("Verifying pool is still accessible after rank restart") self.pool.query() - self.log.info(f"SUCCESS: Rank {test_rank} restarted and pool remains accessible") + self.log.info("SUCCESS: Rank %s restarted and pool remains accessible", test_rank) diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.py b/src/tests/ftest/control/engine_auto_restart_advanced.py index 60f915fa2b5..2147094f081 100644 --- a/src/tests/ftest/control/engine_auto_restart_advanced.py +++ b/src/tests/ftest/control/engine_auto_restart_advanced.py @@ -46,8 +46,9 @@ def wait_for_rank_state(self, rank, expected_state, timeout=30, check_interval=2 ranks=[rank], valid_states=[expected_state], max_checks=1) if not failed_ranks: - self.log.info(f"Rank {rank} reached state '{expected_state}' after " - f"{time.time() - start_time:.1f}s") + self.log.info( + "Rank %s reached state '%s' after %.1fs", + rank, expected_state, time.time() - start_time) return True time.sleep(check_interval) @@ -98,7 +99,7 @@ def test_deferred_restart(self): self.fail(f"Rank {test_rank} did not automatically restart on first exclusion") first_restart_time = time.time() - self.log.info(f"First restart completed at T={first_restart_time:.1f}") + self.log.info("First restart completed at T=%.1f", first_restart_time) # Second exclusion - should be deferred due to rate-limiting self.log_step(f"Step 3: Second exclusion of rank {test_rank} (should be deferred)") @@ -116,7 +117,7 @@ def test_deferred_restart(self): if current_state == "joined": self.fail(f"Rank {test_rank} restarted immediately - rate-limiting not working") - self.log.info(f"Confirmed: Restart is deferred (rank still in '{current_state}' state)") + self.log.info("Confirmed: Restart is deferred (rank still in '%s' state)", current_state) # Wait for deferred restart to execute (after delay expires) # Add buffer time for processing @@ -129,8 +130,9 @@ def test_deferred_restart(self): 
deferred_restart_time = time.time() actual_delay = deferred_restart_time - first_restart_time - self.log.info(f"SUCCESS: Deferred restart executed after {actual_delay:.1f}s " - f"(expected ~{restart_delay}s)") + self.log.info( + "SUCCESS: Deferred restart executed after %.1fs (expected ~%ss)", + actual_delay, restart_delay) # Verify delay was approximately correct (within tolerance) if actual_delay < restart_delay * 0.8: diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.py b/src/tests/ftest/control/engine_auto_restart_disabled.py index 070f400d7e2..5d22e8965f9 100644 --- a/src/tests/ftest/control/engine_auto_restart_disabled.py +++ b/src/tests/ftest/control/engine_auto_restart_disabled.py @@ -81,7 +81,7 @@ def test_no_restart_when_disabled(self): else: self.fail(f"Rank {test_rank} in unexpected state (not excluded or joined)") - self.log.info(f"Confirmed: Rank {test_rank} did NOT automatically restart (as expected)") + self.log.info("Confirmed: Rank %s did NOT automatically restart (as expected)", test_rank) # Step 4: Manually clear exclusion self.log_step(f"Step 4: Manually clearing exclusion for rank {test_rank}") @@ -97,8 +97,9 @@ def test_no_restart_when_disabled(self): if failed_ranks: self.fail(f"Manual start of rank {test_rank} failed") - self.log.info(f"SUCCESS: Rank {test_rank} stayed excluded when auto-restart disabled, " - f"and manual start succeeded") + self.log.info( + "SUCCESS: Rank %s stayed excluded when auto-restart disabled, " + "and manual start succeeded", test_rank) def test_multiple_ranks_no_restart(self): """Test that multiple excluded ranks stay excluded when auto-restart disabled. 
@@ -156,7 +157,7 @@ def test_multiple_ranks_no_restart(self): if errors: self.fail("\n".join(errors)) - self.log.info(f"Confirmed: None of {test_ranks} automatically restarted") + self.log.info("Confirmed: None of %s automatically restarted", test_ranks) # Step 4: Manually clear and restart all self.log_step("Step 4: Manually clearing exclusion and restarting ranks") @@ -177,5 +178,5 @@ def test_multiple_ranks_no_restart(self): report_errors(test=self, errors=errors) - self.log.info(f"SUCCESS: All {num_to_test} ranks stayed excluded and " - f"manual restart succeeded") + self.log.info( + "SUCCESS: All %s ranks stayed excluded and manual restart succeeded", num_to_test)