diff --git a/src/tests/ftest/control/engine_auto_restart.py b/src/tests/ftest/control/engine_auto_restart.py
new file mode 100644
index 00000000000..6ed0e126779
--- /dev/null
+++ b/src/tests/ftest/control/engine_auto_restart.py
@@ -0,0 +1,204 @@
+"""
+  (C) Copyright 2026 Hewlett Packard Enterprise Development LP
+
+  SPDX-License-Identifier: BSD-2-Clause-Patent
+"""
+import time
+
+from control_test_base import ControlTestBase
+from general_utils import report_errors
+
+
+class EngineAutoRestartTest(ControlTestBase):
+    """Test automatic engine restart on self-termination.
+
+    Test Class Description:
+        Verify automatic engine restart behavior when engines self-terminate
+        after being excluded from the system.
+
+    :avocado: recursive
+    """
+
+    def setUp(self):
+        """Set up each test case."""
+        super().setUp()
+        self.dmg = self.get_dmg_command()
+
+    def get_all_ranks(self):
+        """Get list of all ranks in the system."""
+        return list(self.server_managers[0].ranks.keys())
+
+    def exclude_rank_and_wait_restart(self, rank, expect_restart=True, timeout=30):
+        """Exclude a rank and wait for it to self-terminate and potentially restart.
+
+        Args:
+            rank (int): Rank to exclude
+            expect_restart (bool): Whether automatic restart is expected
+            timeout (int): Maximum seconds to wait for restart
+
+        Returns:
+            tuple: (restarted, final_state) - whether rank restarted and its final state
+        """
+        self.log_step(f"Excluding rank {rank}")
+        self.dmg.system_exclude(ranks=[rank])
+
+        # Wait for rank to self-terminate (should go to Excluded state)
+        self.log_step(f"Waiting for rank {rank} to self-terminate")
+        time.sleep(5)
+
+        # Check if rank is excluded
+        failed_ranks = self.server_managers[0].check_rank_state(
+            ranks=[rank], valid_states=["excluded"], max_checks=10)
+        if failed_ranks:
+            self.fail(f"Rank {rank} did not reach Excluded state after exclusion")
+
+        if expect_restart:
+            # Wait for automatic restart (rank should go to Joined state)
+            self.log_step(f"Waiting for rank {rank} to automatically restart")
+            start_time = time.time()
+            restarted = False
+
+            while time.time() - start_time < timeout:
+                time.sleep(2)
+                # Check if rank has rejoined
+                failed_ranks = self.server_managers[0].check_rank_state(
+                    ranks=[rank], valid_states=["joined"], max_checks=1)
+                if not failed_ranks:
+                    restarted = True
+                    break
+
+            if restarted:
+                self.log.info("Rank %s automatically restarted and rejoined", rank)
+                return (True, "joined")
+
+            state = self.server_managers[0].get_rank_state(rank)
+            self.log.error("Rank %s (%s) did not restart within %ss", rank, state, timeout)
+            return (False, state)
+
+        # Otherwise, verify rank stays excluded (no automatic restart)
+        self.log_step(f"Verifying rank {rank} does not automatically restart")
+        time.sleep(timeout)
+
+        failed_ranks = self.server_managers[0].check_rank_state(
+            ranks=[rank], valid_states=["excluded"], max_checks=1)
+        if failed_ranks:
+            state = self.server_managers[0].get_rank_state(rank)
+            self.log.error("Rank %s (%s) unexpectedly restarted", rank, state)
+            return (True, state)
+
+        return (False, "excluded")
+
+    def test_auto_restart_basic(self):
+        """Test basic automatic engine restart after self-termination.
+
+        Test Description:
+            1. Exclude a rank from the system
+            2. Wait for rank to self-terminate
+            3. Verify rank automatically restarts and rejoins the system
+
+        :avocado: tags=all,pr,daily_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart
+        :avocado: tags=EngineAutoRestartTest,test_auto_restart_basic
+        """
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 2:
+            self.fail("Test requires at least 2 ranks")
+
+        test_rank = self.random.choice(all_ranks)
+
+        self.log_step(f"Testing automatic restart of rank {test_rank}")
+
+        restarted, final_state = self.exclude_rank_and_wait_restart(test_rank)
+
+        if not restarted:
+            self.fail(f"Rank {test_rank} did not automatically restart. Final state: {final_state}")
+
+        self.log.info("SUCCESS: Rank %s automatically restarted after self-termination", test_rank)
+
+    def test_auto_restart_multiple_ranks(self):
+        """Test automatic restart of multiple ranks.
+
+        Test Description:
+            1. Exclude the selected ranks one at a time
+            2. Wait for each excluded rank to self-terminate
+            3. Verify each one automatically restarts and rejoins
+
+        :avocado: tags=all,full_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart
+        :avocado: tags=EngineAutoRestartTest,test_auto_restart_multiple_ranks
+        """
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 3:
+            self.fail("Test requires at least 3 ranks")
+
+        # Exclude half the ranks (but keep at least one for quorum)
+        num_to_exclude = max(1, len(all_ranks) // 2)
+        ranks_to_test = self.random.sample(all_ranks, num_to_exclude)
+
+        self.log_step(f"Testing automatic restart of multiple ranks: {ranks_to_test}")
+
+        errors = []
+        results = {}
+
+        for test_rank in ranks_to_test:
+            restarted, final_state = self.exclude_rank_and_wait_restart(test_rank)
+            results[test_rank] = (restarted, final_state)
+
+            if not restarted:
+                errors.append(
+                    f"Rank {test_rank} did not automatically restart. State: {final_state}")
+
+        # Report results
+        self.log.info("=== Multiple Rank Restart Results ===")
+        for rank, (restarted, state) in results.items():
+            status = "PASS" if restarted else "FAIL"
+            self.log.info("Rank %s: %s (final state: %s)", rank, status, state)
+
+        report_errors(test=self, errors=errors)
+
+    def test_auto_restart_with_pool(self):
+        """Test automatic restart works with active pools.
+
+        Test Description:
+            1. Create a pool
+            2. Exclude a rank (not in pool service)
+            3. Verify rank automatically restarts
+            4. Verify pool remains accessible
+
+        :avocado: tags=all,daily_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart,pool
+        :avocado: tags=EngineAutoRestartTest,test_auto_restart_with_pool
+        """
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 2:
+            self.fail("Test requires at least 2 ranks")
+
+        # Create pool first
+        self.add_pool(connect=False)
+
+        # Get pool service ranks to avoid excluding them
+        pool_svc_ranks = self.pool.svc_ranks
+        self.log.info("Pool service ranks: %s", pool_svc_ranks)
+
+        # Find a rank not in pool service
+        non_svc_ranks = [r for r in all_ranks if r not in pool_svc_ranks]
+        if not non_svc_ranks:
+            self.fail("All ranks are pool service ranks")
+
+        test_rank = self.random.choice(non_svc_ranks)
+
+        self.log_step(f"Excluding non-service rank {test_rank} while pool is active")
+
+        restarted, final_state = self.exclude_rank_and_wait_restart(test_rank)
+
+        if not restarted:
+            self.fail(f"Rank {test_rank} did not restart. State: {final_state}")
+
+        # Verify pool is still accessible
+        self.log_step("Verifying pool is still accessible after rank restart")
+        self.pool.query()
+
+        self.log.info("SUCCESS: Rank %s restarted and pool remains accessible", test_rank)
diff --git a/src/tests/ftest/control/engine_auto_restart.yaml b/src/tests/ftest/control/engine_auto_restart.yaml
new file mode 100644
index 00000000000..5923f25e671
--- /dev/null
+++ b/src/tests/ftest/control/engine_auto_restart.yaml
@@ -0,0 +1,30 @@
+hosts:
+  # NOTE(review): 1 server x 2 engines presumably yields only 2 ranks, yet
+  # test_auto_restart_multiple_ranks fails below 3 ranks - confirm the server count.
+  test_servers: 1
+
+timeout: 300
+
+server_config:
+  name: daos_server
+  engines_per_host: 2
+  engines:
+    0:
+      log_file: daos_server0.log
+      targets: 4
+      nr_xs_helpers: 0
+      storage:
+        0:
+          class: ram
+          scm_mount: /mnt/daos0
+    1:
+      log_file: daos_server1.log
+      targets: 4
+      nr_xs_helpers: 0
+      storage:
+        0:
+          class: ram
+          scm_mount: /mnt/daos1
+
+pool:
+  size: 2G
diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.py b/src/tests/ftest/control/engine_auto_restart_advanced.py
new file mode 100644
index 00000000000..2147094f081
--- /dev/null
+++ b/src/tests/ftest/control/engine_auto_restart_advanced.py
@@ -0,0 +1,250 @@
+"""
+  (C) Copyright 2026 Hewlett Packard Enterprise Development LP
+
+  SPDX-License-Identifier: BSD-2-Clause-Patent
+"""
+import time
+
+from control_test_base import ControlTestBase
+
+
+class EngineAutoRestartAdvanced(ControlTestBase):
+    """Test advanced automatic engine restart scenarios.
+
+    Test Class Description:
+        Verify automatic engine restart with custom configurations including
+        rate-limiting, deferred restarts, and manual clear-exclude handling.
+
+    :avocado: recursive
+    """
+
+    def setUp(self):
+        """Set up each test case."""
+        super().setUp()
+        self.dmg = self.get_dmg_command()
+
+    def get_all_ranks(self):
+        """Get list of all ranks in the system."""
+        return list(self.server_managers[0].ranks.keys())
+
+    def wait_for_rank_state(self, rank, expected_state, timeout=30, check_interval=2):
+        """Wait for a rank to reach expected state.
+
+        Args:
+            rank (int): Rank number
+            expected_state (str): Expected state
+            timeout (int): Maximum seconds to wait
+            check_interval (int): Seconds between state checks
+
+        Returns:
+            bool: True if state reached, False if timeout
+        """
+        start_time = time.time()
+
+        while time.time() - start_time < timeout:
+            failed_ranks = self.server_managers[0].check_rank_state(
+                ranks=[rank], valid_states=[expected_state], max_checks=1)
+
+            if not failed_ranks:
+                self.log.info(
+                    "Rank %s reached state '%s' after %.1fs",
+                    rank, expected_state, time.time() - start_time)
+                return True
+
+            time.sleep(check_interval)
+
+        current_state = self.server_managers[0].get_rank_state(rank)
+        self.log.warning(
+            "Rank %s did not reach '%s' within %ss. Current state: %s",
+            rank, expected_state, timeout, current_state)
+        return False
+
+    def test_deferred_restart(self):
+        """Test deferred restart when multiple self-terminations occur rapidly.
+
+        Test Description:
+            This test requires custom server configuration with a short
+            engine_auto_restart_min_delay (e.g., 15 seconds) to avoid long test runtime.
+
+            1. Exclude rank and wait for automatic restart (first restart)
+            2. Immediately exclude same rank again (second self-termination)
+            3. Verify restart is deferred, not immediate
+            4. Wait for deferred restart to execute after delay expires
+            5. Verify rank successfully rejoins
+
+        :avocado: tags=all,full_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart
+        :avocado: tags=EngineAutoRestartAdvanced,test_deferred_restart
+        """
+        # Get configured restart delay from test params
+        restart_delay = self.params.get("engine_auto_restart_min_delay", "/run/server_config/*", 15)
+
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 2:
+            self.fail("Test requires at least 2 ranks")
+
+        test_rank = self.random.choice(all_ranks)
+
+        # First exclusion - should restart immediately (no previous restart)
+        self.log_step(f"Step 1: First exclusion of rank {test_rank}")
+        self.dmg.system_exclude(ranks=[test_rank])
+
+        # Wait for self-termination
+        if not self.wait_for_rank_state(test_rank, "excluded", timeout=10):
+            self.fail(f"Rank {test_rank} did not self-terminate")
+
+        # Wait for automatic restart
+        self.log_step(f"Step 2: Waiting for first automatic restart of rank {test_rank}")
+        if not self.wait_for_rank_state(test_rank, "joined", timeout=30):
+            self.fail(f"Rank {test_rank} did not automatically restart on first exclusion")
+
+        first_restart_time = time.time()
+        self.log.info("First restart completed at T=%.1f", first_restart_time)
+
+        # Second exclusion - should be deferred due to rate-limiting
+        self.log_step(f"Step 3: Second exclusion of rank {test_rank} (should be deferred)")
+        self.dmg.system_exclude(ranks=[test_rank])
+
+        # Wait for self-termination
+        if not self.wait_for_rank_state(test_rank, "excluded", timeout=10):
+            self.fail(f"Rank {test_rank} did not self-terminate on second exclusion")
+
+        # Verify restart is NOT immediate (should be deferred)
+        self.log_step("Step 4: Verifying restart is deferred (not immediate)")
+        time.sleep(5)  # Wait a bit
+
+        current_state = self.server_managers[0].get_rank_state(test_rank)
+        if current_state == "joined":
+            self.fail(f"Rank {test_rank} restarted immediately - rate-limiting not working")
+
+        self.log.info("Confirmed: Restart is deferred (rank still in '%s' state)", current_state)
+
+        # Wait for deferred restart to execute (after delay expires)
+        # Add buffer time for processing
+        wait_time = restart_delay + 10
+        self.log_step(f"Step 5: Waiting {wait_time}s for deferred restart to execute")
+
+        if not self.wait_for_rank_state(test_rank, "joined", timeout=wait_time):
+            self.fail(f"Rank {test_rank} did not restart after rate-limit delay")
+
+        deferred_restart_time = time.time()
+        actual_delay = deferred_restart_time - first_restart_time
+
+        self.log.info(
+            "SUCCESS: Deferred restart executed after %.1fs (expected ~%ss)",
+            actual_delay, restart_delay)
+
+        # Verify delay was approximately correct (within tolerance)
+        if actual_delay < restart_delay * 0.8:
+            self.fail(f"Restart occurred too early: {actual_delay:.1f}s < {restart_delay}s")
+
+    def test_custom_restart_delay(self):
+        """Test custom engine_auto_restart_min_delay configuration.
+
+        Test Description:
+            This test requires server configuration with custom
+            engine_auto_restart_min_delay value.
+
+            1. Exclude rank and wait for first restart
+            2. Exclude same rank again
+            3. Measure time until deferred restart executes
+            4. Verify delay matches configured value
+
+        :avocado: tags=all,full_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart
+        :avocado: tags=EngineAutoRestartAdvanced,test_custom_restart_delay
+        """
+        # Get configured delay from test parameters
+        expected_delay = self.params.get("engine_auto_restart_min_delay",
+                                         "/run/server_config/*", 20)
+
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 2:
+            self.fail("Test requires at least 2 ranks")
+
+        test_rank = self.random.choice(all_ranks)
+
+        self.log_step(f"Testing custom restart delay of {expected_delay}s for rank {test_rank}")
+
+        # First restart to establish baseline
+        self.log_step("Step 1: First exclusion and restart")
+        self.dmg.system_exclude(ranks=[test_rank])
+        # Fail fast: the delay measured below is meaningless without a real first restart
+        if not self.wait_for_rank_state(test_rank, "excluded", timeout=10):
+            self.fail(f"Rank {test_rank} did not self-terminate")
+        if not self.wait_for_rank_state(test_rank, "joined", timeout=30):
+            self.fail(f"Rank {test_rank} did not automatically restart on first exclusion")
+
+        first_restart_time = time.time()
+
+        # Second restart to measure delay
+        self.log_step("Step 2: Second exclusion to trigger deferred restart")
+        self.dmg.system_exclude(ranks=[test_rank])
+        if not self.wait_for_rank_state(test_rank, "excluded", timeout=10):
+            self.fail(f"Rank {test_rank} did not self-terminate on second exclusion")
+
+        # Wait for deferred restart
+        self.log_step(f"Step 3: Waiting for deferred restart (expected delay: {expected_delay}s)")
+        wait_timeout = expected_delay + 20  # Add buffer
+
+        if not self.wait_for_rank_state(test_rank, "joined", timeout=wait_timeout):
+            self.fail(f"Rank {test_rank} did not restart within expected time")
+
+        second_restart_time = time.time()
+        actual_delay = second_restart_time - first_restart_time
+
+        self.log.info("Measured delay: %.1fs (expected: ~%ss)", actual_delay, expected_delay)
+
+        # Verify delay is within acceptable range (80% to 120% of expected)
+        min_delay = expected_delay * 0.8
+        max_delay = expected_delay * 1.2
+
+        if actual_delay < min_delay:
+            self.fail(f"Restart too early: {actual_delay:.1f}s < {min_delay:.1f}s")
+        elif actual_delay > max_delay:
+            self.log.warning(
+                "Restart delayed beyond expected: %.1fs > %.1fs "
+                "(may be acceptable depending on system load)", actual_delay, max_delay)
+        else:
+            self.log.info(
+                "SUCCESS: Restart delay within expected range [%.1fs, %.1fs]",
+                min_delay, max_delay)
+
+    def test_restart_after_clear_exclude(self):
+        """Test interaction between auto-restart and manual clear-exclude.
+
+        Test Description:
+            1. Exclude rank, wait for self-termination
+            2. Clear exclusion before auto-restart triggers
+            3. Verify rank rejoins successfully
+
+        :avocado: tags=all,daily_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart
+        :avocado: tags=EngineAutoRestartAdvanced,test_restart_after_clear_exclude
+        """
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 2:
+            self.fail("Test requires at least 2 ranks")
+
+        test_rank = self.random.choice(all_ranks)
+
+        self.log_step(f"Step 1: Excluding rank {test_rank}")
+        self.dmg.system_exclude(ranks=[test_rank])
+
+        # Wait for self-termination
+        # FIXME: should this be checking for "adminexcluded" state?
+        if not self.wait_for_rank_state(test_rank, "excluded", timeout=10):
+            self.fail(f"Rank {test_rank} did not self-terminate")
+
+        # Clear exclusion before auto-restart
+        self.log_step(f"Step 2: Clearing exclusion for rank {test_rank}")
+        self.dmg.system_clear_exclude(ranks=[test_rank])
+
+        # Verify rank rejoins
+        if not self.wait_for_rank_state(test_rank, "joined", timeout=30):
+            self.fail(f"Rank {test_rank} did not rejoin after clear-exclude")
+
+        self.log.info("SUCCESS: Rank %s successfully rejoined after clear-exclude", test_rank)
diff --git a/src/tests/ftest/control/engine_auto_restart_advanced.yaml b/src/tests/ftest/control/engine_auto_restart_advanced.yaml
new file mode 100644
index 00000000000..4a7a1eb8d95
--- /dev/null
+++ b/src/tests/ftest/control/engine_auto_restart_advanced.yaml
@@ -0,0 +1,30 @@
+hosts:
+  test_servers: 1
+
+timeout: 400
+
+server_config:
+  name: daos_server
+  engines_per_host: 2
+  # Custom restart delay for faster testing (15 seconds instead of default 300)
+  engine_auto_restart_min_delay: 15
+  engines:
+    0:
+      log_file: daos_server0.log
+      targets: 4
+      nr_xs_helpers: 0
+      storage:
+        0:
+          class: ram
+          scm_mount: /mnt/daos0
+    1:
+      log_file: daos_server1.log
+      targets: 4
+      nr_xs_helpers: 0
+      storage:
+        0:
+          class: ram
+          scm_mount: /mnt/daos1
+
+pool:
+  size: 8G
diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.py b/src/tests/ftest/control/engine_auto_restart_disabled.py
new file mode 100644
index 00000000000..5d22e8965f9
--- /dev/null
+++ b/src/tests/ftest/control/engine_auto_restart_disabled.py
@@ -0,0 +1,182 @@
+"""
+  (C) Copyright 2026 Hewlett Packard Enterprise Development LP
+
+  SPDX-License-Identifier: BSD-2-Clause-Patent
+"""
+import time
+
+from control_test_base import ControlTestBase
+from general_utils import report_errors
+
+
+class EngineAutoRestartDisabled(ControlTestBase):
+    """Test automatic engine restart disabled configuration.
+
+    Test Class Description:
+        Verify that automatic engine restart can be disabled and that
+        excluded ranks stay excluded when auto-restart is disabled.
+
+    :avocado: recursive
+    """
+
+    def setUp(self):
+        """Set up each test case."""
+        super().setUp()
+        self.dmg = self.get_dmg_command()
+
+    def get_all_ranks(self):
+        """Get list of all ranks in the system."""
+        return list(self.server_managers[0].ranks.keys())
+
+    def test_no_restart_when_disabled(self):
+        """Test that engines do not automatically restart when feature is disabled.
+
+        Test Description:
+            Server is configured with disable_engine_auto_restart: true.
+
+            1. Exclude a rank from the system
+            2. Wait for rank to self-terminate
+            3. Wait additional time to verify NO automatic restart occurs
+            4. Manually clear the exclusion and start the rank
+            5. Verify manual start succeeds
+
+        :avocado: tags=all,daily_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart
+        :avocado: tags=EngineAutoRestartDisabled,test_no_restart_when_disabled
+        """
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 2:
+            self.fail("Test requires at least 2 ranks")
+
+        test_rank = self.random.choice(all_ranks)
+
+        self.log_step(f"Step 1: Excluding rank {test_rank} (auto-restart is DISABLED)")
+        self.dmg.system_exclude(ranks=[test_rank])
+
+        # Step 2: Wait for self-termination
+        self.log_step(f"Step 2: Waiting for rank {test_rank} to self-terminate")
+        time.sleep(5)
+
+        failed_ranks = self.server_managers[0].check_rank_state(
+            ranks=[test_rank], valid_states=["excluded"], max_checks=10)
+        if failed_ranks:
+            self.fail(f"Rank {test_rank} did not reach Excluded state")
+
+        # Step 3: Wait to verify NO automatic restart
+        wait_time = 20  # Wait 20 seconds
+        self.log_step(f"Step 3: Waiting {wait_time}s to verify NO automatic restart occurs")
+        time.sleep(wait_time)
+
+        # Verify rank is still excluded
+        failed_ranks = self.server_managers[0].check_rank_state(
+            ranks=[test_rank], valid_states=["excluded"], max_checks=1)
+
+        if failed_ranks:
+            # Rank is NOT excluded; an empty result below means it is in the joined state
+            check_joined = self.server_managers[0].check_rank_state(
+                ranks=[test_rank], valid_states=["joined"], max_checks=1)
+            if not check_joined:
+                self.fail(f"Rank {test_rank} unexpectedly restarted when auto-restart disabled!")
+            else:
+                self.fail(f"Rank {test_rank} in unexpected state (not excluded or joined)")
+
+        self.log.info("Confirmed: Rank %s did NOT automatically restart (as expected)", test_rank)
+
+        # Step 4: Manually clear exclusion
+        self.log_step(f"Step 4: Manually clearing exclusion for rank {test_rank}")
+        self.dmg.system_clear_exclude(ranks=[test_rank], rank_hosts=None)
+
+        # Step 5: Manually start the rank
+        self.log_step(f"Step 5: Manually starting rank {test_rank}")
+        self.dmg.system_start(ranks=f"{test_rank}")
+
+        # Verify manual start succeeds
+        failed_ranks = self.server_managers[0].check_rank_state(
+            ranks=[test_rank], valid_states=["joined"], max_checks=15)
+        if failed_ranks:
+            self.fail(f"Manual start of rank {test_rank} failed")
+
+        self.log.info(
+            "SUCCESS: Rank %s stayed excluded when auto-restart disabled, "
+            "and manual start succeeded", test_rank)
+
+    def test_multiple_ranks_no_restart(self):
+        """Test that multiple excluded ranks stay excluded when auto-restart disabled.
+
+        Test Description:
+            Server configured with disable_engine_auto_restart: true.
+
+            1. Exclude multiple ranks
+            2. Verify all self-terminate and reach Excluded state
+            3. Wait to confirm none automatically restart
+            4. Manually clear the exclusions and restart all ranks
+            5. Verify all successfully rejoin
+
+        :avocado: tags=all,full_regression
+        :avocado: tags=hw,medium
+        :avocado: tags=dmg,control,engine_auto_restart
+        :avocado: tags=EngineAutoRestartDisabled,test_multiple_ranks_no_restart
+        """
+        all_ranks = self.get_all_ranks()
+        if len(all_ranks) < 3:
+            self.fail("Test requires at least 3 ranks")
+
+        # Exclude half the ranks
+        num_to_test = max(2, len(all_ranks) // 2)
+        test_ranks = self.random.sample(all_ranks, num_to_test)
+
+        self.log_step(f"Step 1: Excluding {num_to_test} ranks: {test_ranks}")
+
+        for rank in test_ranks:
+            self.dmg.system_exclude(ranks=[rank])
+            time.sleep(1)  # Small delay between exclusions
+
+        # Step 2: Verify all reach Excluded state
+        self.log_step("Step 2: Verifying all ranks self-terminate")
+        time.sleep(10)
+
+        for rank in test_ranks:
+            failed = self.server_managers[0].check_rank_state(
+                ranks=[rank], valid_states=["excluded"], max_checks=5)
+            if failed:
+                self.fail(f"Rank {rank} did not self-terminate")
+
+        # Step 3: Wait and verify none restart
+        wait_time = 20
+        self.log_step(f"Step 3: Waiting {wait_time}s to verify no automatic restarts")
+        time.sleep(wait_time)
+
+        errors = []
+        for rank in test_ranks:
+            failed = self.server_managers[0].check_rank_state(
+                ranks=[rank], valid_states=["excluded"], max_checks=1)
+            if failed:
+                errors.append(f"Rank {rank} unexpectedly restarted when auto-restart disabled")
+
+        if errors:
+            self.fail("\n".join(errors))
+
+        self.log.info("Confirmed: None of %s automatically restarted", test_ranks)
+
+        # Step 4: Manually clear and restart all
+        self.log_step("Step 4: Manually clearing exclusion and restarting ranks")
+        self.dmg.system_clear_exclude(ranks=test_ranks, rank_hosts=None)
+
+        for rank in test_ranks:
+            self.dmg.system_start(ranks=f"{rank}")
+
+        # Step 5: Verify all rejoin
+        self.log_step("Step 5: Verifying all ranks successfully rejoin")
+        time.sleep(10)
+
+        for rank in test_ranks:
+            failed = self.server_managers[0].check_rank_state(
+                ranks=[rank], valid_states=["joined"], max_checks=10)
+            if failed:
+                errors.append(f"Manual restart of rank {rank} failed")
+
+        report_errors(test=self, errors=errors)
+
+        self.log.info(
+            "SUCCESS: All %s ranks stayed excluded and manual restart succeeded", num_to_test)
diff --git a/src/tests/ftest/control/engine_auto_restart_disabled.yaml b/src/tests/ftest/control/engine_auto_restart_disabled.yaml
new file mode 100644
index 00000000000..c374a452945
--- /dev/null
+++ b/src/tests/ftest/control/engine_auto_restart_disabled.yaml
@@ -0,0 +1,31 @@
+hosts:
+  # NOTE(review): 1 server x 2 engines presumably yields only 2 ranks, yet
+  # test_multiple_ranks_no_restart fails below 3 ranks - confirm the server count.
+  test_servers: 1
+
+timeout: 300
+
+server_config:
+  name: daos_server
+  engines_per_host: 2
+  disable_engine_auto_restart: true  # Disable automatic engine restart
+  engines:
+    0:
+      log_file: daos_server0.log
+      targets: 4
+      nr_xs_helpers: 0
+      storage:
+        0:
+          class: ram
+          scm_mount: /mnt/daos0
+    1:
+      log_file: daos_server1.log
+      targets: 4
+      nr_xs_helpers: 0
+      storage:
+        0:
+          class: ram
+          scm_mount: /mnt/daos1
+
+pool:
+  size: 8G
diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py
index d36900dc54f..513976039e0 100644
--- a/src/tests/ftest/util/server_utils.py
+++ b/src/tests/ftest/util/server_utils.py
@@ -760,6 +760,27 @@ def get_single_system_state(self):
             raise ServerFailed("Multiple system states ({}) detected:\n {}".format(states, data))
         return states.pop()
 
+    def get_rank_state(self, rank):
+        """Get the current state of a rank from dmg system query.
+
+        Args:
+            rank (int): Rank to query
+
+        Returns:
+            str: State of the rank
+
+        Raises:
+            CommandFailure: if the query fails or its response has no state for the rank
+        """
+        data = self.dmg.system_query(ranks=str(rank))
+        if data["status"] != 0:
+            raise CommandFailure(f"dmg system query failed with status {data['status']}")
+        try:
+            return data["response"]["members"][0]["state"]
+        # TypeError: presumably a null "response"/"members" value when no rank matches
+        except (KeyError, IndexError, TypeError) as error:
+            raise CommandFailure("Failed to get rank state from dmg system query") from error
+
     def check_rank_state(self, ranks, valid_states, max_checks=1):
         """Check the states of list of ranks in DAOS system.
 