From 053c88894ce11611df049de7b130eddd60548897 Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Fri, 13 Mar 2026 15:12:53 +0100 Subject: [PATCH 1/7] Add two-node-regression test suite with 5 etcd resilience tests Introduces the openshift/two-node-regression suite with 5 regression tests that validate podman-etcd resource agent behavior under disruptive conditions: - OCP-88178: learner_node CRM attribute cleanup during stop/start - OCP-88179: active resource count excludes stopping resources - OCP-88180: simultaneous stop delay prevents WAL corruption - OCP-88181: coordinated recovery after etcd container kill - OCP-88213: attribute retry during force-new-cluster recovery Also adds shared pacemaker/CRM utilities to utils/common.go and updates the openshift/two-node suite qualifier to exclude regression tests. Co-Authored-By: Claude Opus 4.6 --- pkg/testsuites/standard_suites.go | 16 +- test/extended/two_node/tnf_resilience.go | 709 +++++++++++++++++++++++ test/extended/two_node/utils/common.go | 73 +++ 3 files changed, 797 insertions(+), 1 deletion(-) create mode 100644 test/extended/two_node/tnf_resilience.go diff --git a/pkg/testsuites/standard_suites.go b/pkg/testsuites/standard_suites.go index f7d51800f167..fefbafd15352 100644 --- a/pkg/testsuites/standard_suites.go +++ b/pkg/testsuites/standard_suites.go @@ -426,12 +426,26 @@ var staticSuites = []ginkgo.TestSuite{ This test suite runs tests to validate two-node. `), Qualifiers: []string{ - `name.contains("[Suite:openshift/two-node") || name.contains("[OCPFeatureGate:DualReplica]") || name.contains("[OCPFeatureGate:HighlyAvailableArbiter]")`, + `(name.contains("[Suite:openshift/two-node") || name.contains("[OCPFeatureGate:DualReplica]") || name.contains("[OCPFeatureGate:HighlyAvailableArbiter]")) && !name.contains("[Suite:openshift/two-node-regression]")`, }, TestTimeout: 60 * time.Minute, Parallelism: 1, // Tests must run serially as they involve node reboots and fencing ClusterStabilityDuringTest: ginkgo.Disruptive, }, + { + Name: "openshift/two-node-regression", + Description: templates.LongDesc(` + This test suite runs regression tests for two-node clusters with fencing topology. + These tests validate resource agent behavior under disruptive conditions + such as etcd restarts, container kills, and force-new-cluster recovery. + `), + Qualifiers: []string{ + `name.contains("[Suite:openshift/two-node-regression]")`, + }, + TestTimeout: 60 * time.Minute, + Parallelism: 1, + ClusterStabilityDuringTest: ginkgo.Disruptive, + }, { Name: "openshift/auth/external-oidc", Description: templates.LongDesc(` diff --git a/test/extended/two_node/tnf_resilience.go b/test/extended/two_node/tnf_resilience.go new file mode 100644 index 000000000000..3b1be36168b7 --- /dev/null +++ b/test/extended/two_node/tnf_resilience.go @@ -0,0 +1,709 @@ +package two_node + +import ( + "context" + "fmt" + "strings" + "time" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + v1 "github.com/openshift/api/config/v1" + "github.com/openshift/origin/test/extended/etcd/helpers" + "github.com/openshift/origin/test/extended/two_node/utils" + exutil "github.com/openshift/origin/test/extended/util" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + nodeutil "k8s.io/kubernetes/pkg/util/node" + "k8s.io/kubernetes/test/e2e/framework" +) + +const ( + etcdResourceRecoveryTimeout = 5 * time.Minute // Time for etcd-clone to restart and stabilize + longRecoveryTimeout = 10 * time.Minute // Time for container kill or standby/unstandby recovery + + crmAttributeName = "learner_node" // The CRM attribute under test + pcsWaitTimeout = 120 // Seconds for pcs --wait flag + etcdCloneResource = "etcd-clone" // Pacemaker clone resource name + + // activeCountLogPattern is the pacemaker log message emitted when get_truly_active_resources_count() + // is called during the start action. + activeCountLogPattern = "active etcd resources" + // unexpectedCountError is the error message that should NOT appear after a disable/enable cycle. + unexpectedCountError = "Unexpected active resource count" + + // stoppingResourcesLogPattern is the pacemaker log message emitted by leave_etcd_member_list() + // when it counts how many etcd resources are stopping concurrently. + stoppingResourcesLogPattern = "stopping etcd resources" + // delayStopLogPattern is the pacemaker log message emitted when the alphabetically second + // node delays its stop to prevent simultaneous etcd member removal and WAL corruption. + delayStopLogPattern = "delaying stop for" + +) + +// learnerCleanupResult holds the parsed output from the disable/enable cycle script. +type learnerCleanupResult struct { + // StopQueryRC is the return code of crm_attribute --query after the stop operation. + // RC=6 with "No such device or address" means the attribute was successfully cleared. + StopQueryRC string + StopQueryResult string + // StartQueryRC is the return code of crm_attribute --query after the start operation. + StartQueryRC string + StartQueryResult string + // RawOutput is the full script output for diagnostics. + RawOutput string +} + +// isAttributeCleared returns true if the crm_attribute query indicates the attribute was deleted. +// When the attribute doesn't exist, crm_attribute returns RC=6 and prints "No such device or address". +func isAttributeCleared(rc, result string) bool { + return rc == "6" || strings.Contains(result, "No such device or address") +} + +// pcsDisableScript returns a bash snippet that disables a resource and exits on failure. +// On failure it re-enables the resource as a safety net before exiting. +func pcsDisableScript(resource string, timeout int) string { + return fmt.Sprintf(`sudo pcs resource disable %[1]s --wait=%[2]d + DISABLE_RC=$? + if [ $DISABLE_RC -ne 0 ]; then + echo "DISABLE_FAILED" + sudo pcs resource enable %[1]s --wait=%[2]d 2>/dev/null || true + exit 1 + fi`, resource, timeout) +} + +// pcsEnableScript returns a bash snippet that enables a resource and exits on failure. +func pcsEnableScript(resource string, timeout int) string { + return fmt.Sprintf(`sudo pcs resource enable %s --wait=%d + ENABLE_RC=$? + if [ $ENABLE_RC -ne 0 ]; then + echo "ENABLE_FAILED" + exit 1 + fi`, resource, timeout) +} + +// queryCRMAttributeScript returns a bash snippet that queries an attribute and echoes +// the result with the given label prefix (e.g. "STOP" → "STOP_RC=...", "STOP_RESULT=..."). +func queryCRMAttributeScript(attr, label string) string { + return fmt.Sprintf(`%[1]s_RESULT=$(sudo crm_attribute --query --name %[2]s 2>&1); %[1]s_RC=$? + echo "%[1]s_RC=${%[1]s_RC}" + echo "%[1]s_RESULT=${%[1]s_RESULT}"`, label, attr) +} + +// injectCRMAttributeScript returns a bash snippet that sets a CRM attribute to the given value. +func injectCRMAttributeScript(attr, value string) string { + return fmt.Sprintf(`sudo crm_attribute --name %s --update %s`, attr, value) +} + +// runDisableEnableCycle executes the full disable/enable cycle as a single compound command. +// +// This must run as one bash invocation because disabling etcd-clone stops etcd, which brings +// down the API server — no new debug containers can be created until etcd is re-enabled. +// The debug pod is created while the API is still up; the bash process then runs locally on +// the node and does not need the API for subsequent commands. +// +// The initial inject is also included in the compound command because the resource agent's +// monitor action calls reconcile_member_state() which clears learner_node every few seconds. +// A separate inject would be race-conditioned by the monitor. +// +// The script performs: +// 1. Inject stale learner_node attribute +// 2. Disable etcd-clone (waits for stop to complete) +// 3. Query learner_node attribute (should be cleared by the resource agent's stop action) +// 4. Re-inject the stale learner_node attribute +// 5. Enable etcd-clone (waits for start to complete) +// 6. Query learner_node attribute (should be cleared by the resource agent's start action) +func runDisableEnableCycle(oc *exutil.CLI, nodeName string) (learnerCleanupResult, error) { + script := strings.Join([]string{ + injectCRMAttributeScript(crmAttributeName, nodeName), + pcsDisableScript(etcdCloneResource, pcsWaitTimeout), + queryCRMAttributeScript(crmAttributeName, "STOP"), + injectCRMAttributeScript(crmAttributeName, nodeName), + pcsEnableScript(etcdCloneResource, pcsWaitTimeout), + queryCRMAttributeScript(crmAttributeName, "START"), + }, "\n") + + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", script) + framework.Logf("Disable/enable cycle output:\n%s", output) + + // err may be non-nil if the debug container cleanup fails while etcd is down. + // The actual test results are captured in stdout. + if err != nil { + framework.Logf("Disable/enable cycle returned error (may be expected due to API disruption): %v", err) + } + + return learnerCleanupResult{ + StopQueryRC: extractValue(output, "STOP_RC="), + StopQueryResult: extractValue(output, "STOP_RESULT="), + StartQueryRC: extractValue(output, "START_RC="), + StartQueryResult: extractValue(output, "START_RESULT="), + RawOutput: output, + }, err +} + +// extractValue finds a line starting with the given prefix and returns the value after it. +func extractValue(output, prefix string) string { + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, prefix) { + return strings.TrimPrefix(line, prefix) + } + } + return "" +} + +// waitForAllNodesReady checks that the expected number of nodes exist and all are Ready. +func waitForAllNodesReady(oc *exutil.CLI, expectedCount int) error { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + if err != nil { + return fmt.Errorf("failed to retrieve nodes: %v", err) + } + if len(nodeList.Items) != expectedCount { + return fmt.Errorf("expected %d nodes, found %d", expectedCount, len(nodeList.Items)) + } + for _, node := range nodeList.Items { + nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get( + context.Background(), node.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get node %s: %v", node.Name, err) + } + if !nodeutil.IsNodeReady(nodeObj) { + return fmt.Errorf("node %s is not Ready", node.Name) + } + } + return nil +} + +// verifyEtcdCloneStartedOnAllNodes checks that pcs status shows etcd-clone Started on all given nodes. +// Clone resources use the format "Started: [ node1 node2 ]", so we extract the etcd-clone section +// and look for each node name on a "Started" line within that section. +func verifyEtcdCloneStartedOnAllNodes(oc *exutil.CLI, execNodeName string, nodes []corev1.Node) error { + statusOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNodeName, "default", "bash", "-c", "sudo pcs status") + if err != nil { + return fmt.Errorf("failed to get pcs status: %v", err) + } + etcdIdx := strings.Index(statusOutput, "etcd-clone") + if etcdIdx == -1 { + return fmt.Errorf("etcd-clone not found in pcs status:\n%s", statusOutput) + } + etcdSection := statusOutput[etcdIdx:] + for _, node := range nodes { + found := false + for _, line := range strings.Split(etcdSection, "\n") { + if strings.Contains(line, "Started") && strings.Contains(line, node.Name) { + found = true + break + } + } + if !found { + return fmt.Errorf("etcd-clone not Started on %s, status:\n%s", node.Name, statusOutput) + } + } + framework.Logf("Final pcs status:\n%s", statusOutput) + return nil +} + +// getPacemakerLogGrep runs a grep against /var/log/pacemaker/pacemaker.log on the given node +// and returns the matching lines. Returns empty string if no matches found. +func getPacemakerLogGrep(oc *exutil.CLI, nodeName, pattern string) (string, error) { + cmd := fmt.Sprintf(`grep "%s" /var/log/pacemaker/pacemaker.log | tail -5`, pattern) + return exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeName, "default", "bash", "-c", cmd) +} + +// extractFailedActionsSection extracts everything after "Failed Resource Actions:" from pcs status output. +// In pacemaker, this section lists historical failures that haven't been cleared with `pcs resource cleanup`. +func extractFailedActionsSection(pcsOutput string) string { + for _, marker := range []string{"Failed Resource Actions:", "Failed Resource Actions"} { + idx := strings.Index(pcsOutput, marker) + if idx != -1 { + return pcsOutput[idx:] + } + } + return "" +} + +// runSimpleDisableEnableCycle disables and re-enables etcd-clone as a single compound command. +// Returns the combined output. The error may be non-nil due to API disruption while etcd is down. +func runSimpleDisableEnableCycle(oc *exutil.CLI, nodeName string) string { + script := strings.Join([]string{ + pcsDisableScript(etcdCloneResource, pcsWaitTimeout), + pcsEnableScript(etcdCloneResource, pcsWaitTimeout), + }, "\n") + + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", script) + framework.Logf("Disable/enable cycle output:\n%s", output) + + if err != nil { + framework.Logf("Disable/enable cycle returned error (may be expected due to API disruption): %v", err) + } + + o.Expect(output).NotTo(o.ContainSubstring("DISABLE_FAILED"), + "pcs resource disable should succeed") + o.Expect(output).NotTo(o.ContainSubstring("ENABLE_FAILED"), + "pcs resource enable should succeed") + + return output +} + +// expectPacemakerLogFound verifies that at least one node's pacemaker log contains the given pattern. +func expectPacemakerLogFound(oc *exutil.CLI, nodes []corev1.Node, pattern, description string) { + var found bool + for _, node := range nodes { + logOutput, logErr := getPacemakerLogGrep(oc, node.Name, pattern) + if logErr != nil { + framework.Logf("Warning: failed to grep pacemaker log on %s: %v", node.Name, logErr) + continue + } + if strings.TrimSpace(logOutput) != "" { + framework.Logf("%s on %s:\n%s", description, node.Name, logOutput) + found = true + } + } + o.Expect(found).To(o.BeTrue(), + fmt.Sprintf("Expected at least one node's pacemaker log to contain %s", description)) +} + +// verifyFinalClusterHealth runs the common end-of-test health checks: etcd cluster status, +// etcd-clone started on both nodes, all nodes ready, and essential operators available. +func verifyFinalClusterHealth(oc *exutil.CLI, execNodeName string, nodes []corev1.Node, + etcdClientFactory *helpers.EtcdClientFactoryImpl, label string, timeout time.Duration) { + + g.By("Verifying etcd cluster health") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, label, etcdClientFactory) + }, timeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should be healthy") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNodeName, nodes) + }, timeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes") + + g.By("Verifying both nodes are Ready") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, timeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready") + + g.By("Verifying essential operators are available") + o.Eventually(func() error { + return utils.ValidateEssentialOperatorsAvailable(oc) + }, timeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Essential operators should be available") +} + +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node-regression][Serial][Disruptive] Two Node with Fencing etcd regression", func() { + defer g.GinkgoRecover() + + var ( + oc = exutil.NewCLIWithoutNamespace("two-node-regression").AsAdmin() + etcdClientFactory *helpers.EtcdClientFactoryImpl + setupCompleted bool + ) + + g.BeforeEach(func() { + utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode) + + etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient()) + + utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory) + setupCompleted = true + }) + + g.AfterEach(func() { + if !setupCompleted { + framework.Logf("Test was skipped before setup completed, skipping AfterEach cleanup") + return + } + + nodeList, _ := utils.GetNodes(oc, utils.AllNodes) + if len(nodeList.Items) == 0 { + framework.Logf("Warning: Could not retrieve nodes during cleanup") + return + } + cleanupNode := nodeList.Items[0] + + g.By("Cleanup: Ensuring all nodes are unstandby") + for _, node := range nodeList.Items { + if _, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, cleanupNode.Name, "default", "bash", "-c", + fmt.Sprintf("sudo pcs node unstandby %s 2>/dev/null; true", node.Name)); err != nil { + framework.Logf("Warning: Failed to unstandby %s: %v", node.Name, err) + } + } + + g.By("Cleanup: Ensuring etcd-clone is enabled") + if err := utils.EnablePacemakerResource(oc, cleanupNode.Name, etcdCloneResource); err != nil { + framework.Logf("Warning: Failed to enable etcd-clone during cleanup: %v", err) + } + + g.By("Cleanup: Clearing any stale learner_node CRM attribute") + utils.DeleteCRMAttribute(oc, cleanupNode.Name, crmAttributeName) + + g.By("Cleanup: Running pcs resource cleanup to clear failed actions") + if output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, cleanupNode.Name, "default", "bash", "-c", "sudo pcs resource cleanup"); err != nil { + framework.Logf("Warning: Failed to run pcs resource cleanup during AfterEach: %v", err) + } else { + framework.Logf("PCS resource cleanup output: %s", output) + } + + g.By("Cleanup: Waiting for both nodes to become Ready") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes must be Ready after cleanup") + + g.By("Cleanup: Validating etcd cluster health") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "AfterEach cleanup", etcdClientFactory) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Etcd cluster must be healthy after cleanup") + }) + + // This test verifies that the resource agent's stop and start actions both clear + // a stale learner_node CRM attribute. A stale attribute would prevent a node from + // completing its etcd rejoin because the start action polls this attribute. + g.It("should clean up stale learner_node attribute during etcd-clone stop and start operations", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Run inject + disable/enable cycle as a single compound command. + // The inject must be part of the compound command because the resource agent's + // monitor action calls reconcile_member_state() which clears learner_node + // every few seconds — a separate inject would be race-conditioned. + g.By("Running inject + disable/enable cycle to verify learner_node cleanup on stop and start") + result, _ := runDisableEnableCycle(oc, execNode.Name) + + // Verify the disable/enable completed successfully + o.Expect(result.RawOutput).NotTo(o.ContainSubstring("DISABLE_FAILED"), + "pcs resource disable should succeed") + o.Expect(result.RawOutput).NotTo(o.ContainSubstring("ENABLE_FAILED"), + "pcs resource enable should succeed") + + // Verify: attribute was cleared by the resource agent's stop action + g.By("Verifying learner_node attribute was cleared after etcd-clone stop") + o.Expect(result.StopQueryRC).NotTo(o.BeEmpty(), + fmt.Sprintf("Expected STOP_RC in script output, raw output:\n%s", result.RawOutput)) + o.Expect(isAttributeCleared(result.StopQueryRC, result.StopQueryResult)).To(o.BeTrue(), + fmt.Sprintf("Expected learner_node to be cleared after stop (RC=%s, result=%s)", + result.StopQueryRC, result.StopQueryResult)) + framework.Logf("STOP path verified: learner_node was cleared by the resource agent stop action") + + g.By("Verifying learner_node attribute was cleared after etcd-clone start") + o.Expect(result.StartQueryRC).NotTo(o.BeEmpty(), + fmt.Sprintf("Expected START_RC in script output, raw output:\n%s", result.RawOutput)) + o.Expect(isAttributeCleared(result.StartQueryRC, result.StartQueryResult)).To(o.BeTrue(), + fmt.Sprintf("Expected learner_node to be cleared after start (RC=%s, result=%s)", + result.StartQueryRC, result.StartQueryResult)) + framework.Logf("START path verified: learner_node was cleared by the resource agent start action") + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after learner cleanup test", etcdResourceRecoveryTimeout) + }) + + // This test verifies that get_truly_active_resources_count() in the podman-etcd resource agent + // correctly differentiates truly active resources from those being stopped. + // + // A disable/enable cycle triggers this code path because both instances restart cleanly + // without force-new-cluster being pre-set, entering the branch that calls the function + // and logs the active resource count. + g.It("should exclude stopping resources from active count during etcd-clone disable/enable cycle", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + g.By("Running etcd-clone disable/enable cycle to trigger active resource count logic") + runSimpleDisableEnableCycle(oc, execNode.Name) + + g.By("Waiting for etcd cluster to recover after disable/enable cycle") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "after disable/enable cycle", etcdClientFactory) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should recover after disable/enable cycle") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") + + g.By("Checking pacemaker logs for correct active resource count logic") + expectPacemakerLogFound(oc, nodes, activeCountLogPattern, "Active count log entries") + + g.By("Verifying no 'Unexpected active resource count' errors in pacemaker logs") + for _, node := range nodes { + errorOutput, logErr := getPacemakerLogGrep(oc, node.Name, unexpectedCountError) + if logErr != nil { + framework.Logf("Warning: failed to grep pacemaker log on %s: %v", node.Name, logErr) + continue + } + o.Expect(strings.TrimSpace(errorOutput)).To(o.BeEmpty(), + fmt.Sprintf("Expected no 'Unexpected active resource count' errors on %s", node.Name)) + } + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after active count test", etcdResourceRecoveryTimeout) + }) + + // This test verifies that podman-etcd prevents simultaneous etcd member removal + // when both nodes receive a graceful shutdown request. + // + // When etcd-clone is disabled, both nodes stop concurrently. The leave_etcd_member_list() + // function detects this by counting the stopping resources. The alphabetically second node + // is delayed by DELAY_SECOND_NODE_LEAVE_SEC (10s) to prevent WAL corruption. + g.It("should delay the second node stop to prevent simultaneous etcd member removal", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + g.By("Running etcd-clone disable/enable cycle to trigger simultaneous stop logic") + runSimpleDisableEnableCycle(oc, execNode.Name) + + g.By("Waiting for etcd cluster to recover after disable/enable cycle") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "after disable/enable cycle", etcdClientFactory) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should recover after disable/enable cycle") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") + + g.By("Checking pacemaker logs for stopping resource count detection") + expectPacemakerLogFound(oc, nodes, stoppingResourcesLogPattern, "Stopping resources log entries") + + g.By("Verifying delay intervention was applied to prevent simultaneous member removal") + expectPacemakerLogFound(oc, nodes, delayStopLogPattern, "Delay intervention log") + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after simultaneous stop test", etcdResourceRecoveryTimeout) + }) + + // This test verifies that an abrupt termination of the etcd container triggers a + // coordinated "Error occurred" monitor state on both nodes before the cluster + // self-heals automatically. + // + // When a local etcd container is killed, the podman-etcd resource agent must + // coordinate recovery with the peer node. The surviving node sets force_new_cluster + // and the killed node's etcd restarts and joins as a learner. During this process, + // both nodes briefly enter a coordinated failed state visible in pcs status as + // "Failed Resource Actions". + g.It("should coordinate recovery with peer when local etcd container is killed", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + targetNode := nodes[1] // Kill etcd on the second node + execNode := nodes[0] // Use first node for pcs status checks after recovery + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Kill etcd container on the target node. + g.By(fmt.Sprintf("Killing etcd container on %s", targetNode.Name)) + _, err = exutil.DebugNodeRetryWithOptionsAndChroot( + oc, targetNode.Name, "openshift-etcd", + "bash", "-c", "podman kill etcd 2>/dev/null") + o.Expect(err).To(o.BeNil(), "Expected to kill etcd container without command errors") + + // Wait for the cluster to self-heal. + g.By("Waiting for etcd cluster to self-heal after container kill") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "after container kill", etcdClientFactory) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should self-heal after container kill") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") + + // Verify that the coordinated failure was observed. + g.By("Checking pcs status for coordinated 'Failed Resource Actions' on both nodes") + pcsOutput, statusErr := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", "sudo pcs status") + o.Expect(statusErr).ShouldNot(o.HaveOccurred(), "Expected to get pcs status without error") + framework.Logf("PCS status after recovery:\n%s", pcsOutput) + + failedSection := extractFailedActionsSection(pcsOutput) + o.Expect(failedSection).NotTo(o.BeEmpty(), + "Expected pcs status to contain 'Failed Resource Actions' section after container kill") + framework.Logf("Failed Resource Actions section:\n%s", failedSection) + + o.Expect(failedSection).To(o.ContainSubstring("etcd"), + "Expected Failed Resource Actions to reference etcd") + + for _, node := range nodes { + if strings.Contains(failedSection, node.Name) { + framework.Logf("Coordinated failure confirmed: node %s found in Failed Resource Actions", node.Name) + } else { + framework.Logf("Node %s NOT found in Failed Resource Actions", node.Name) + } + } + for _, node := range nodes { + o.Expect(failedSection).To(o.ContainSubstring(node.Name), + fmt.Sprintf("Expected Failed Resource Actions to show failure on %s for coordinated recovery", node.Name)) + } + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after coordinated recovery test", longRecoveryTimeout) + }) + + // This test verifies that the podman-etcd resource agent retries setting + // CRM attributes when they fail during the force-new-cluster recovery path. + // + // When the learner_node CIB attribute is deleted while a node is in standby, + // the returning node's start action polls for the attribute but finds it missing. + // Without the retry fix, the node gets stuck in the LEARNER=true stage because + // nobody re-sets the attribute. With the fix, the leader node's monitor detects + // that a learner member exists in etcd but the learner_node attribute is missing, + // and retries setting it, allowing the returning node to proceed. + // + // Test flow: + // 1. Put a node in standby (triggers force-new-cluster on the peer) + // 2. Wait for the standby node to appear as a learner in etcd member list + // 3. Delete the learner_node CRM attribute + // 4. Unstandby the node + // 5. Verify both nodes recover to voting etcd members + g.It("should retry setting learner_node attribute after deletion during force-new-cluster recovery", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] // Stays active, runs solo during standby + standbyNode := nodes[1] // Will be put in standby + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Put the standby node in standby mode. + g.By(fmt.Sprintf("Putting %s in standby", standbyNode.Name)) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", + fmt.Sprintf("sudo pcs node standby %s", standbyNode.Name)) + o.Expect(err).ShouldNot(o.HaveOccurred(), + fmt.Sprintf("Expected pcs node standby to succeed, output: %s", output)) + framework.Logf("PCS node standby output: %s", output) + + // Wait for force-new-cluster recovery to complete. + g.By(fmt.Sprintf("Waiting for %s to appear as learner in etcd member list", standbyNode.Name)) + o.Eventually(func() error { + members, err := utils.GetMembers(etcdClientFactory) + if err != nil { + return fmt.Errorf("failed to get etcd members: %v", err) + } + _, isLearner, err := utils.GetMemberState(&standbyNode, members) + if err != nil { + return fmt.Errorf("standby node not in member list yet: %v", err) + } + if !isLearner { + return fmt.Errorf("standby node %s is not a learner yet", standbyNode.Name) + } + framework.Logf("Standby node %s confirmed as learner in etcd member list", standbyNode.Name) + return nil + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Standby node should appear as learner in etcd member list") + + g.By("Logging pcs status after standby") + if pcsOutput, pcsErr := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", "sudo pcs status"); pcsErr == nil { + framework.Logf("PCS status after standby:\n%s", pcsOutput) + } + + // Verify learner_node attribute is set before we delete it + g.By("Verifying learner_node CRM attribute is set") + attrOutput, err := utils.QueryCRMAttribute(oc, execNode.Name, crmAttributeName) + o.Expect(err).ShouldNot(o.HaveOccurred(), + "Expected learner_node attribute to exist after force-new-cluster recovery") + framework.Logf("learner_node attribute value: %s", attrOutput) + + // Delete the learner_node attribute to simulate attribute update failure. + g.By("Deleting learner_node CRM attribute to simulate attribute update failure") + utils.DeleteCRMAttribute(oc, execNode.Name, crmAttributeName) + framework.Logf("learner_node attribute deleted") + + // Unstandby the node. With the retry fix, the leader node's monitor detects + // the missing attribute and re-sets it, allowing the returning node to proceed. + g.By(fmt.Sprintf("Unstandby %s to trigger etcd rejoin", standbyNode.Name)) + output, err = exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", + fmt.Sprintf("sudo pcs node unstandby %s", standbyNode.Name)) + o.Expect(err).ShouldNot(o.HaveOccurred(), + fmt.Sprintf("Expected pcs node unstandby to succeed, output: %s", output)) + framework.Logf("PCS node unstandby output: %s", output) + + // Wait for both nodes to become voting etcd members. + g.By("Waiting for both nodes to become voting etcd members") + o.Eventually(func() error { + members, err := utils.GetMembers(etcdClientFactory) + if err != nil { + return fmt.Errorf("failed to get etcd members: %v", err) + } + if len(members) != 2 { + return fmt.Errorf("expected 2 members, found %d", len(members)) + } + for i := range nodes { + isStarted, isLearner, err := utils.GetMemberState(&nodes[i], members) + if err != nil { + return fmt.Errorf("member %s not found: %v", nodes[i].Name, err) + } + if !isStarted { + return fmt.Errorf("member %s is not started", nodes[i].Name) + } + if isLearner { + return fmt.Errorf("member %s is still a learner", nodes[i].Name) + } + } + framework.Logf("Both etcd members are now voting members") + return nil + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Both nodes should become voting etcd members") + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after attribute retry test", longRecoveryTimeout) + }) +}) diff --git a/test/extended/two_node/utils/common.go b/test/extended/two_node/utils/common.go index f22cc90c0209..b1d16cfa56de 100644 --- a/test/extended/two_node/utils/common.go +++ b/test/extended/two_node/utils/common.go @@ -499,6 +499,79 @@ func RemoveConstraint(oc *exutil.CLI, nodeName string, resourceName string) erro return nil } +// DisablePacemakerResource disables a pacemaker resource on all nodes (stops it globally). +// +// err := DisablePacemakerResource(oc, "master-0", "etcd-clone") +func DisablePacemakerResource(oc *exutil.CLI, nodeName string, resourceName string) error { + cmd := fmt.Sprintf("sudo pcs resource disable %s", resourceName) + + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", cmd) + + if err != nil { + return fmt.Errorf("failed to disable resource %s: %v, output: %s", resourceName, err, output) + } + + return nil +} + +// EnablePacemakerResource enables a pacemaker resource on all nodes (allows it to start). +// +// err := EnablePacemakerResource(oc, "master-0", "etcd-clone") +func EnablePacemakerResource(oc *exutil.CLI, nodeName string, resourceName string) error { + cmd := fmt.Sprintf("sudo pcs resource enable %s", resourceName) + + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", cmd) + + if err != nil { + return fmt.Errorf("failed to enable resource %s: %v, output: %s", resourceName, err, output) + } + + return nil +} + +// SetCRMAttribute sets a CRM cluster attribute to the given value. +// +// err := SetCRMAttribute(oc, "master-0", "learner_node", "master-1") +func SetCRMAttribute(oc *exutil.CLI, nodeName string, attrName string, value string) error { + cmd := fmt.Sprintf("sudo crm_attribute --name %s --update %s", attrName, value) + + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", cmd) + + if err != nil { + return fmt.Errorf("failed to set CRM attribute %s=%s: %v, output: %s", attrName, value, err, output) + } + + return nil +} + +// QueryCRMAttribute queries a CRM cluster attribute and returns the raw output. +// Returns an error if the query command fails (e.g., attribute does not exist). +// +// output, err := QueryCRMAttribute(oc, "master-0", "learner_node") +func QueryCRMAttribute(oc *exutil.CLI, nodeName string, attrName string) (string, error) { + cmd := fmt.Sprintf("sudo crm_attribute --query --name %s", attrName) + + return exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", cmd) +} + +// DeleteCRMAttribute deletes a CRM cluster attribute (best-effort, logs warnings on failure). +// +// DeleteCRMAttribute(oc, "master-0", "learner_node") +func DeleteCRMAttribute(oc *exutil.CLI, nodeName string, attrName string) { + cmd := fmt.Sprintf("sudo crm_attribute --name %s --delete 2>/dev/null; true", attrName) + + _, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", cmd) + + if err != nil { + framework.Logf("Warning: failed to delete CRM attribute %s: %v", attrName, err) + } +} + // IsResourceStopped checks if a pacemaker resource is in stopped state. // // stopped, err := IsResourceStopped(oc, "master-0", "kubelet-clone") From 00cf165111b49cbbc1e753e2d664c07eed62411b Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Fri, 13 Mar 2026 15:36:35 +0100 Subject: [PATCH 2/7] Rename suite from two-node-regression to tnf-resilience Co-Authored-By: Claude Opus 4.6 --- pkg/testsuites/standard_suites.go | 8 ++++---- test/extended/two_node/tnf_resilience.go | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/testsuites/standard_suites.go b/pkg/testsuites/standard_suites.go index fefbafd15352..8f0a53ac3061 100644 --- a/pkg/testsuites/standard_suites.go +++ b/pkg/testsuites/standard_suites.go @@ -426,21 +426,21 @@ var staticSuites = []ginkgo.TestSuite{ This test suite runs tests to validate two-node. `), Qualifiers: []string{ - `(name.contains("[Suite:openshift/two-node") || name.contains("[OCPFeatureGate:DualReplica]") || name.contains("[OCPFeatureGate:HighlyAvailableArbiter]")) && !name.contains("[Suite:openshift/two-node-regression]")`, + `(name.contains("[Suite:openshift/two-node") || name.contains("[OCPFeatureGate:DualReplica]") || name.contains("[OCPFeatureGate:HighlyAvailableArbiter]")) && !name.contains("[Suite:openshift/tnf-resilience]")`, }, TestTimeout: 60 * time.Minute, Parallelism: 1, // Tests must run serially as they involve node reboots and fencing ClusterStabilityDuringTest: ginkgo.Disruptive, }, { - Name: "openshift/two-node-regression", + Name: "openshift/tnf-resilience", Description: templates.LongDesc(` - This test suite runs regression tests for two-node clusters with fencing topology. + This test suite runs resilience tests for two-node clusters with fencing topology. These tests validate resource agent behavior under disruptive conditions such as etcd restarts, container kills, and force-new-cluster recovery. `), Qualifiers: []string{ - `name.contains("[Suite:openshift/two-node-regression]")`, + `name.contains("[Suite:openshift/tnf-resilience]")`, }, TestTimeout: 60 * time.Minute, Parallelism: 1, diff --git a/test/extended/two_node/tnf_resilience.go b/test/extended/two_node/tnf_resilience.go index 3b1be36168b7..b0505c106a99 100644 --- a/test/extended/two_node/tnf_resilience.go +++ b/test/extended/two_node/tnf_resilience.go @@ -296,11 +296,11 @@ func verifyFinalClusterHealth(oc *exutil.CLI, execNodeName string, nodes []corev o.HaveOccurred(), "Essential operators should be available") } -var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node-regression][Serial][Disruptive] Two Node with Fencing etcd regression", func() { +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/tnf-resilience][Serial][Disruptive] Two Node with Fencing etcd resilience", func() { defer g.GinkgoRecover() var ( - oc = exutil.NewCLIWithoutNamespace("two-node-regression").AsAdmin() + oc = exutil.NewCLIWithoutNamespace("tnf-resilience").AsAdmin() etcdClientFactory *helpers.EtcdClientFactoryImpl setupCompleted bool ) From 778acd84b404c323e58224e88abd1cfc0db7f020 Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Mon, 16 Mar 2026 10:12:11 +0100 Subject: [PATCH 3/7] Add [Skipped:SingleReplicaTopology] label to the Describe block to automatically skip these tests on SNO environments. --- test/extended/two_node/tnf_resilience.go | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/extended/two_node/tnf_resilience.go b/test/extended/two_node/tnf_resilience.go index b0505c106a99..c50b2c3c69c4 100644 --- a/test/extended/two_node/tnf_resilience.go +++ b/test/extended/two_node/tnf_resilience.go @@ -296,7 +296,7 @@ func verifyFinalClusterHealth(oc *exutil.CLI, execNodeName string, nodes []corev o.HaveOccurred(), "Essential operators should be available") } -var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/tnf-resilience][Serial][Disruptive] Two Node with Fencing etcd resilience", func() { +var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/tnf-resilience][Serial][Disruptive][Skipped:SingleReplicaTopology] Two Node with Fencing etcd resilience", func() { defer g.GinkgoRecover() var ( @@ -543,7 +543,7 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual _, err = exutil.DebugNodeRetryWithOptionsAndChroot( oc, targetNode.Name, "openshift-etcd", "bash", "-c", "podman kill etcd 2>/dev/null") - o.Expect(err).To(o.BeNil(), "Expected to kill etcd container without command errors") + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to kill etcd container without command errors") // Wait for the cluster to self-heal. g.By("Waiting for etcd cluster to self-heal after container kill") @@ -573,16 +573,10 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual o.Expect(failedSection).To(o.ContainSubstring("etcd"), "Expected Failed Resource Actions to reference etcd") - for _, node := range nodes { - if strings.Contains(failedSection, node.Name) { - framework.Logf("Coordinated failure confirmed: node %s found in Failed Resource Actions", node.Name) - } else { - framework.Logf("Node %s NOT found in Failed Resource Actions", node.Name) - } - } for _, node := range nodes { o.Expect(failedSection).To(o.ContainSubstring(node.Name), fmt.Sprintf("Expected Failed Resource Actions to show failure on %s for coordinated recovery", node.Name)) + framework.Logf("Coordinated failure confirmed: node %s found in Failed Resource Actions", node.Name) } verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, From c4b91d711e295f1a7d6c66d657ce17725b64f9e0 Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Mon, 16 Mar 2026 10:22:03 +0100 Subject: [PATCH 4/7] Solve formatting problems. --- test/extended/two_node/tnf_resilience.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/extended/two_node/tnf_resilience.go b/test/extended/two_node/tnf_resilience.go index c50b2c3c69c4..08a620c17528 100644 --- a/test/extended/two_node/tnf_resilience.go +++ b/test/extended/two_node/tnf_resilience.go @@ -20,7 +20,7 @@ import ( const ( etcdResourceRecoveryTimeout = 5 * time.Minute // Time for etcd-clone to restart and stabilize - longRecoveryTimeout = 10 * time.Minute // Time for container kill or standby/unstandby recovery + longRecoveryTimeout = 10 * time.Minute // Time for container kill or standby/unstandby recovery crmAttributeName = "learner_node" // The CRM attribute under test pcsWaitTimeout = 120 // Seconds for pcs --wait flag @@ -38,7 +38,6 @@ const ( // delayStopLogPattern is the pacemaker log message emitted when the alphabetically second // node delays its stop to prevent simultaneous etcd member removal and WAL corruption. delayStopLogPattern = "delaying stop for" - ) // learnerCleanupResult holds the parsed output from the disable/enable cycle script. From b6a80a68de0d6c9a34201d152b9e8fd64786663e Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Mon, 16 Mar 2026 12:06:15 +0100 Subject: [PATCH 5/7] Address review feedback for TNF resilience tests - Handle GetNodes error in AfterEach to prevent nil panic - Scope verifyEtcdCloneStartedOnAllNodes to etcd-clone block only - Add per-node pacemaker log baselines to prevent stale log matches - Fail test on log retrieval errors instead of silently skipping - Poll for learner_node attribute with Eventually for async monitor Co-Authored-By: Claude Opus 4.6 --- test/extended/two_node/tnf_resilience.go | 95 ++++++++++++++++++------ 1 file changed, 73 insertions(+), 22 deletions(-) diff --git a/test/extended/two_node/tnf_resilience.go b/test/extended/two_node/tnf_resilience.go index 08a620c17528..a6b0a15760ee 100644 --- a/test/extended/two_node/tnf_resilience.go +++ b/test/extended/two_node/tnf_resilience.go @@ -187,10 +187,23 @@ func verifyEtcdCloneStartedOnAllNodes(oc *exutil.CLI, execNodeName string, nodes if etcdIdx == -1 { return fmt.Errorf("etcd-clone not found in pcs status:\n%s", statusOutput) } - etcdSection := statusOutput[etcdIdx:] + // Scope parsing to the etcd-clone resource block only, stopping at the next + // resource header or blank line to avoid matching unrelated "Started" lines. + etcdLines := strings.Split(statusOutput[etcdIdx:], "\n") + var etcdSection strings.Builder + etcdSection.WriteString(etcdLines[0]) + for _, line := range etcdLines[1:] { + trimmed := strings.TrimSpace(line) + if trimmed == "" || (strings.HasSuffix(trimmed, ":") && !strings.Contains(line, "Started")) { + break + } + etcdSection.WriteString("\n") + etcdSection.WriteString(line) + } + sectionStr := etcdSection.String() for _, node := range nodes { found := false - for _, line := range strings.Split(etcdSection, "\n") { + for _, line := range strings.Split(sectionStr, "\n") { if strings.Contains(line, "Started") && strings.Contains(line, node.Name) { found = true break @@ -205,12 +218,37 @@ func verifyEtcdCloneStartedOnAllNodes(oc *exutil.CLI, execNodeName string, nodes } // getPacemakerLogGrep runs a grep against /var/log/pacemaker/pacemaker.log on the given node -// and returns the matching lines. Returns empty string if no matches found. -func getPacemakerLogGrep(oc *exutil.CLI, nodeName, pattern string) (string, error) { - cmd := fmt.Sprintf(`grep "%s" /var/log/pacemaker/pacemaker.log | tail -5`, pattern) +// and returns matching lines. If baselineLineCount is non-empty, only lines after that line +// number are searched (using tail +N). Returns empty string if no matches found. +func getPacemakerLogGrep(oc *exutil.CLI, nodeName, pattern, baselineLineCount string) (string, error) { + var cmd string + if baselineLineCount != "" { + // Use tail to skip lines that existed before the test, then grep for pattern + cmd = fmt.Sprintf(`tail -n +%s /var/log/pacemaker/pacemaker.log | grep -F -- %q | tail -5`, + baselineLineCount, pattern) + } else { + cmd = fmt.Sprintf(`grep -F -- %q /var/log/pacemaker/pacemaker.log | tail -5`, pattern) + } return exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeName, "default", "bash", "-c", cmd) } +// getPacemakerLogBaselines captures the current line count of the pacemaker log on each node. +// Returns a map of nodeName -> lineCount string. Used to scope log assertions to only lines +// emitted after the baseline, preventing stale log lines from prior tests causing false positives. +func getPacemakerLogBaselines(oc *exutil.CLI, nodes []corev1.Node) map[string]string { + baselines := make(map[string]string, len(nodes)) + for _, node := range nodes { + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, node.Name, "default", "bash", "-c", "wc -l < /var/log/pacemaker/pacemaker.log") + if err != nil { + framework.Logf("Warning: could not get pacemaker log line count from %s: %v", node.Name, err) + continue + } + baselines[node.Name] = strings.TrimSpace(output) + } + return baselines +} + // extractFailedActionsSection extracts everything after "Failed Resource Actions:" from pcs status output. // In pacemaker, this section lists historical failures that haven't been cleared with `pcs resource cleanup`. func extractFailedActionsSection(pcsOutput string) string { @@ -248,10 +286,11 @@ func runSimpleDisableEnableCycle(oc *exutil.CLI, nodeName string) string { } // expectPacemakerLogFound verifies that at least one node's pacemaker log contains the given pattern. -func expectPacemakerLogFound(oc *exutil.CLI, nodes []corev1.Node, pattern, description string) { +// If baselines is non-nil, only log lines after each node's baseline line count are considered. +func expectPacemakerLogFound(oc *exutil.CLI, nodes []corev1.Node, pattern, description string, baselines map[string]string) { var found bool for _, node := range nodes { - logOutput, logErr := getPacemakerLogGrep(oc, node.Name, pattern) + logOutput, logErr := getPacemakerLogGrep(oc, node.Name, pattern, baselines[node.Name]) if logErr != nil { framework.Logf("Warning: failed to grep pacemaker log on %s: %v", node.Name, logErr) continue @@ -319,9 +358,9 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual return } - nodeList, _ := utils.GetNodes(oc, utils.AllNodes) - if len(nodeList.Items) == 0 { - framework.Logf("Warning: Could not retrieve nodes during cleanup") + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + if err != nil || len(nodeList.Items) == 0 { + framework.Logf("Warning: Could not retrieve nodes during cleanup: %v", err) return } cleanupNode := nodeList.Items[0] @@ -435,6 +474,10 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( o.Succeed(), "Both nodes should be Ready before test") + // Capture per-node baseline log line counts before the disruptive action so + // log assertions only consider lines emitted during this test. + logBaselines := getPacemakerLogBaselines(oc, nodes) + g.By("Running etcd-clone disable/enable cycle to trigger active resource count logic") runSimpleDisableEnableCycle(oc, execNode.Name) @@ -451,15 +494,13 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") g.By("Checking pacemaker logs for correct active resource count logic") - expectPacemakerLogFound(oc, nodes, activeCountLogPattern, "Active count log entries") + expectPacemakerLogFound(oc, nodes, activeCountLogPattern, "Active count log entries", logBaselines) g.By("Verifying no 'Unexpected active resource count' errors in pacemaker logs") for _, node := range nodes { - errorOutput, logErr := getPacemakerLogGrep(oc, node.Name, unexpectedCountError) - if logErr != nil { - framework.Logf("Warning: failed to grep pacemaker log on %s: %v", node.Name, logErr) - continue - } + errorOutput, logErr := getPacemakerLogGrep(oc, node.Name, unexpectedCountError, logBaselines[node.Name]) + o.Expect(logErr).ShouldNot(o.HaveOccurred(), + fmt.Sprintf("Expected to read pacemaker log on %s", node.Name)) o.Expect(strings.TrimSpace(errorOutput)).To(o.BeEmpty(), fmt.Sprintf("Expected no 'Unexpected active resource count' errors on %s", node.Name)) } @@ -488,6 +529,10 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( o.Succeed(), "Both nodes should be Ready before test") + // Capture per-node baseline log line counts before the disruptive action so + // log assertions only consider lines emitted during this test. + logBaselines := getPacemakerLogBaselines(oc, nodes) + g.By("Running etcd-clone disable/enable cycle to trigger simultaneous stop logic") runSimpleDisableEnableCycle(oc, execNode.Name) @@ -504,10 +549,10 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") g.By("Checking pacemaker logs for stopping resource count detection") - expectPacemakerLogFound(oc, nodes, stoppingResourcesLogPattern, "Stopping resources log entries") + expectPacemakerLogFound(oc, nodes, stoppingResourcesLogPattern, "Stopping resources log entries", logBaselines) g.By("Verifying delay intervention was applied to prevent simultaneous member removal") - expectPacemakerLogFound(oc, nodes, delayStopLogPattern, "Delay intervention log") + expectPacemakerLogFound(oc, nodes, delayStopLogPattern, "Delay intervention log", logBaselines) verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, "after simultaneous stop test", etcdResourceRecoveryTimeout) @@ -647,11 +692,17 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual framework.Logf("PCS status after standby:\n%s", pcsOutput) } - // Verify learner_node attribute is set before we delete it + // Verify learner_node attribute is set before we delete it. + // The attribute is set by the leader's monitor action which runs asynchronously + // after detecting the learner, so poll until it appears. g.By("Verifying learner_node CRM attribute is set") - attrOutput, err := utils.QueryCRMAttribute(oc, execNode.Name, crmAttributeName) - o.Expect(err).ShouldNot(o.HaveOccurred(), - "Expected learner_node attribute to exist after force-new-cluster recovery") + var attrOutput string + o.Eventually(func() error { + var queryErr error + attrOutput, queryErr = utils.QueryCRMAttribute(oc, execNode.Name, crmAttributeName) + return queryErr + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Expected learner_node attribute to exist after force-new-cluster recovery") framework.Logf("learner_node attribute value: %s", attrOutput) // Delete the learner_node attribute to simulate attribute update failure. From 462c91e79adf084e95525acec2771bfaf7c050ed Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Mon, 16 Mar 2026 12:38:23 +0100 Subject: [PATCH 6/7] Make podman kill idempotent in container kill test The podman kill command can fail if etcd is already stopped or restarting. Append '; true' to tolerate non-zero exit codes since the real assertion is whether the cluster recovers, not whether the kill command succeeded. Co-Authored-By: Claude Opus 4.6 --- test/extended/two_node/tnf_resilience.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/extended/two_node/tnf_resilience.go b/test/extended/two_node/tnf_resilience.go index a6b0a15760ee..92c42383f8a0 100644 --- a/test/extended/two_node/tnf_resilience.go +++ b/test/extended/two_node/tnf_resilience.go @@ -584,10 +584,10 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual // Kill etcd container on the target node. g.By(fmt.Sprintf("Killing etcd container on %s", targetNode.Name)) - _, err = exutil.DebugNodeRetryWithOptionsAndChroot( + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( oc, targetNode.Name, "openshift-etcd", - "bash", "-c", "podman kill etcd 2>/dev/null") - o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to kill etcd container without command errors") + "bash", "-c", "podman kill etcd 2>/dev/null; true") + framework.Logf("Podman kill output: %s, err: %v", output, err) // Wait for the cluster to self-heal. g.By("Waiting for etcd cluster to self-heal after container kill") From 78407b6789a6b880835a9bc033d94f1dac65a855 Mon Sep 17 00:00:00 2001 From: Luca Consalvi Date: Wed, 18 Mar 2026 16:37:48 +0100 Subject: [PATCH 7/7] Merge resilience tests into tnf_recovery.go under the existing two-node suite Move all 5 resilience tests from the separate tnf_resilience.go file into the existing Describe block in tnf_recovery.go, eliminating the need for a dedicated openshift/tnf-resilience suite. The tests now run as part of the openshift/two-node suite alongside the existing recovery tests. Co-Authored-By: Claude Opus 4.6 --- pkg/testsuites/standard_suites.go | 16 +- test/extended/two_node/tnf_recovery.go | 711 +++++++++++++++++++++ test/extended/two_node/tnf_resilience.go | 753 ----------------------- 3 files changed, 712 insertions(+), 768 deletions(-) delete mode 100644 test/extended/two_node/tnf_resilience.go diff --git a/pkg/testsuites/standard_suites.go b/pkg/testsuites/standard_suites.go index 8f0a53ac3061..f7d51800f167 100644 --- a/pkg/testsuites/standard_suites.go +++ b/pkg/testsuites/standard_suites.go @@ -426,26 +426,12 @@ var staticSuites = []ginkgo.TestSuite{ This test suite runs tests to validate two-node. `), Qualifiers: []string{ - `(name.contains("[Suite:openshift/two-node") || name.contains("[OCPFeatureGate:DualReplica]") || name.contains("[OCPFeatureGate:HighlyAvailableArbiter]")) && !name.contains("[Suite:openshift/tnf-resilience]")`, + `name.contains("[Suite:openshift/two-node") || name.contains("[OCPFeatureGate:DualReplica]") || name.contains("[OCPFeatureGate:HighlyAvailableArbiter]")`, }, TestTimeout: 60 * time.Minute, Parallelism: 1, // Tests must run serially as they involve node reboots and fencing ClusterStabilityDuringTest: ginkgo.Disruptive, }, - { - Name: "openshift/tnf-resilience", - Description: templates.LongDesc(` - This test suite runs resilience tests for two-node clusters with fencing topology. - These tests validate resource agent behavior under disruptive conditions - such as etcd restarts, container kills, and force-new-cluster recovery. - `), - Qualifiers: []string{ - `name.contains("[Suite:openshift/tnf-resilience]")`, - }, - TestTimeout: 60 * time.Minute, - Parallelism: 1, - ClusterStabilityDuringTest: ginkgo.Disruptive, - }, { Name: "openshift/auth/external-oidc", Description: templates.LongDesc(` diff --git a/test/extended/two_node/tnf_recovery.go b/test/extended/two_node/tnf_recovery.go index e8b2157fe5c1..5d213af8983c 100644 --- a/test/extended/two_node/tnf_recovery.go +++ b/test/extended/two_node/tnf_recovery.go @@ -5,6 +5,7 @@ import ( "fmt" "math/rand" "os" + "strings" "time" g "github.com/onsi/ginkgo/v2" @@ -17,6 +18,8 @@ import ( "github.com/openshift/origin/test/extended/two_node/utils/services" exutil "github.com/openshift/origin/test/extended/util" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + nodeutil "k8s.io/kubernetes/pkg/util/node" "k8s.io/kubernetes/test/e2e/framework" ) @@ -33,6 +36,26 @@ const ( vmGracefulShutdownTimeout = 10 * time.Minute // Graceful VM shutdown is typically slow membersHealthyAfterDoubleReboot = 30 * time.Minute // Includes full VM reboot and etcd member healthy progressLogInterval = time.Minute // Target interval for progress logging + + etcdResourceRecoveryTimeout = 5 * time.Minute // Time for etcd-clone to restart and stabilize + longRecoveryTimeout = 10 * time.Minute // Time for container kill or standby/unstandby recovery + + crmAttributeName = "learner_node" // The CRM attribute under test + pcsWaitTimeout = 120 // Seconds for pcs --wait flag + etcdCloneResource = "etcd-clone" // Pacemaker clone resource name + + // activeCountLogPattern is the pacemaker log message emitted when get_truly_active_resources_count() + // is called during the start action. + activeCountLogPattern = "active etcd resources" + // unexpectedCountError is the error message that should NOT appear after a disable/enable cycle. + unexpectedCountError = "Unexpected active resource count" + + // stoppingResourcesLogPattern is the pacemaker log message emitted by leave_etcd_member_list() + // when it counts how many etcd resources are stopping concurrently. + stoppingResourcesLogPattern = "stopping etcd resources" + // delayStopLogPattern is the pacemaker log message emitted when the alphabetically second + // node delays its stop to prevent simultaneous etcd member removal and WAL corruption. + delayStopLogPattern = "delaying stop for" ) // computeLogInterval calculates poll attempts between progress logs based on poll interval. @@ -47,6 +70,300 @@ func computeLogInterval(pollInterval time.Duration) int { return n } +// learnerCleanupResult holds the parsed output from the disable/enable cycle script. +type learnerCleanupResult struct { + // StopQueryRC is the return code of crm_attribute --query after the stop operation. + // RC=6 with "No such device or address" means the attribute was successfully cleared. + StopQueryRC string + StopQueryResult string + // StartQueryRC is the return code of crm_attribute --query after the start operation. + StartQueryRC string + StartQueryResult string + // RawOutput is the full script output for diagnostics. + RawOutput string +} + +// isAttributeCleared returns true if the crm_attribute query indicates the attribute was deleted. +// When the attribute doesn't exist, crm_attribute returns RC=6 and prints "No such device or address". +func isAttributeCleared(rc, result string) bool { + return rc == "6" || strings.Contains(result, "No such device or address") +} + +// pcsDisableScript returns a bash snippet that disables a resource and exits on failure. +// On failure it re-enables the resource as a safety net before exiting. +func pcsDisableScript(resource string, timeout int) string { + return fmt.Sprintf(`sudo pcs resource disable %[1]s --wait=%[2]d + DISABLE_RC=$? + if [ $DISABLE_RC -ne 0 ]; then + echo "DISABLE_FAILED" + sudo pcs resource enable %[1]s --wait=%[2]d 2>/dev/null || true + exit 1 + fi`, resource, timeout) +} + +// pcsEnableScript returns a bash snippet that enables a resource and exits on failure. +func pcsEnableScript(resource string, timeout int) string { + return fmt.Sprintf(`sudo pcs resource enable %s --wait=%d + ENABLE_RC=$? + if [ $ENABLE_RC -ne 0 ]; then + echo "ENABLE_FAILED" + exit 1 + fi`, resource, timeout) +} + +// queryCRMAttributeScript returns a bash snippet that queries an attribute and echoes +// the result with the given label prefix (e.g. "STOP" → "STOP_RC=...", "STOP_RESULT=..."). +func queryCRMAttributeScript(attr, label string) string { + return fmt.Sprintf(`%[1]s_RESULT=$(sudo crm_attribute --query --name %[2]s 2>&1); %[1]s_RC=$? + echo "%[1]s_RC=${%[1]s_RC}" + echo "%[1]s_RESULT=${%[1]s_RESULT}"`, label, attr) +} + +// injectCRMAttributeScript returns a bash snippet that sets a CRM attribute to the given value. +func injectCRMAttributeScript(attr, value string) string { + return fmt.Sprintf(`sudo crm_attribute --name %s --update %s`, attr, value) +} + +// runDisableEnableCycle executes the full disable/enable cycle as a single compound command. +// +// This must run as one bash invocation because disabling etcd-clone stops etcd, which brings +// down the API server — no new debug containers can be created until etcd is re-enabled. +// The debug pod is created while the API is still up; the bash process then runs locally on +// the node and does not need the API for subsequent commands. +// +// The initial inject is also included in the compound command because the resource agent's +// monitor action calls reconcile_member_state() which clears learner_node every few seconds. +// A separate inject would be race-conditioned by the monitor. +// +// The script performs: +// 1. Inject stale learner_node attribute +// 2. Disable etcd-clone (waits for stop to complete) +// 3. Query learner_node attribute (should be cleared by the resource agent's stop action) +// 4. Re-inject the stale learner_node attribute +// 5. Enable etcd-clone (waits for start to complete) +// 6. Query learner_node attribute (should be cleared by the resource agent's start action) +func runDisableEnableCycle(oc *exutil.CLI, nodeName string) (learnerCleanupResult, error) { + script := strings.Join([]string{ + injectCRMAttributeScript(crmAttributeName, nodeName), + pcsDisableScript(etcdCloneResource, pcsWaitTimeout), + queryCRMAttributeScript(crmAttributeName, "STOP"), + injectCRMAttributeScript(crmAttributeName, nodeName), + pcsEnableScript(etcdCloneResource, pcsWaitTimeout), + queryCRMAttributeScript(crmAttributeName, "START"), + }, "\n") + + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", script) + framework.Logf("Disable/enable cycle output:\n%s", output) + + // err may be non-nil if the debug container cleanup fails while etcd is down. + // The actual test results are captured in stdout. + if err != nil { + framework.Logf("Disable/enable cycle returned error (may be expected due to API disruption): %v", err) + } + + return learnerCleanupResult{ + StopQueryRC: extractValue(output, "STOP_RC="), + StopQueryResult: extractValue(output, "STOP_RESULT="), + StartQueryRC: extractValue(output, "START_RC="), + StartQueryResult: extractValue(output, "START_RESULT="), + RawOutput: output, + }, err +} + +// extractValue finds a line starting with the given prefix and returns the value after it. +func extractValue(output, prefix string) string { + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, prefix) { + return strings.TrimPrefix(line, prefix) + } + } + return "" +} + +// waitForAllNodesReady checks that the expected number of nodes exist and all are Ready. +func waitForAllNodesReady(oc *exutil.CLI, expectedCount int) error { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + if err != nil { + return fmt.Errorf("failed to retrieve nodes: %v", err) + } + if len(nodeList.Items) != expectedCount { + return fmt.Errorf("expected %d nodes, found %d", expectedCount, len(nodeList.Items)) + } + for _, node := range nodeList.Items { + nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get( + context.Background(), node.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get node %s: %v", node.Name, err) + } + if !nodeutil.IsNodeReady(nodeObj) { + return fmt.Errorf("node %s is not Ready", node.Name) + } + } + return nil +} + +// verifyEtcdCloneStartedOnAllNodes checks that pcs status shows etcd-clone Started on all given nodes. +// Clone resources use the format "Started: [ node1 node2 ]", so we extract the etcd-clone section +// and look for each node name on a "Started" line within that section. +func verifyEtcdCloneStartedOnAllNodes(oc *exutil.CLI, execNodeName string, nodes []corev1.Node) error { + statusOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNodeName, "default", "bash", "-c", "sudo pcs status") + if err != nil { + return fmt.Errorf("failed to get pcs status: %v", err) + } + etcdIdx := strings.Index(statusOutput, "etcd-clone") + if etcdIdx == -1 { + return fmt.Errorf("etcd-clone not found in pcs status:\n%s", statusOutput) + } + // Scope parsing to the etcd-clone resource block only, stopping at the next + // resource header or blank line to avoid matching unrelated "Started" lines. + etcdLines := strings.Split(statusOutput[etcdIdx:], "\n") + var etcdSection strings.Builder + etcdSection.WriteString(etcdLines[0]) + for _, line := range etcdLines[1:] { + trimmed := strings.TrimSpace(line) + if trimmed == "" || (strings.HasSuffix(trimmed, ":") && !strings.Contains(line, "Started")) { + break + } + etcdSection.WriteString("\n") + etcdSection.WriteString(line) + } + sectionStr := etcdSection.String() + for _, node := range nodes { + found := false + for _, line := range strings.Split(sectionStr, "\n") { + if strings.Contains(line, "Started") && strings.Contains(line, node.Name) { + found = true + break + } + } + if !found { + return fmt.Errorf("etcd-clone not Started on %s, status:\n%s", node.Name, statusOutput) + } + } + framework.Logf("Final pcs status:\n%s", statusOutput) + return nil +} + +// getPacemakerLogGrep runs a grep against /var/log/pacemaker/pacemaker.log on the given node +// and returns matching lines. If baselineLineCount is non-empty, only lines after that line +// number are searched (using tail +N). Returns empty string if no matches found. +func getPacemakerLogGrep(oc *exutil.CLI, nodeName, pattern, baselineLineCount string) (string, error) { + var cmd string + if baselineLineCount != "" { + // Use tail to skip lines that existed before the test, then grep for pattern + cmd = fmt.Sprintf(`tail -n +%s /var/log/pacemaker/pacemaker.log | grep -F -- %q | tail -5`, + baselineLineCount, pattern) + } else { + cmd = fmt.Sprintf(`grep -F -- %q /var/log/pacemaker/pacemaker.log | tail -5`, pattern) + } + return exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeName, "default", "bash", "-c", cmd) +} + +// getPacemakerLogBaselines captures the current line count of the pacemaker log on each node. +// Returns a map of nodeName -> lineCount string. Used to scope log assertions to only lines +// emitted after the baseline, preventing stale log lines from prior tests causing false positives. +func getPacemakerLogBaselines(oc *exutil.CLI, nodes []corev1.Node) map[string]string { + baselines := make(map[string]string, len(nodes)) + for _, node := range nodes { + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, node.Name, "default", "bash", "-c", "wc -l < /var/log/pacemaker/pacemaker.log") + if err != nil { + framework.Logf("Warning: could not get pacemaker log line count from %s: %v", node.Name, err) + continue + } + baselines[node.Name] = strings.TrimSpace(output) + } + return baselines +} + +// extractFailedActionsSection extracts everything after "Failed Resource Actions:" from pcs status output. +// In pacemaker, this section lists historical failures that haven't been cleared with `pcs resource cleanup`. +func extractFailedActionsSection(pcsOutput string) string { + for _, marker := range []string{"Failed Resource Actions:", "Failed Resource Actions"} { + idx := strings.Index(pcsOutput, marker) + if idx != -1 { + return pcsOutput[idx:] + } + } + return "" +} + +// runSimpleDisableEnableCycle disables and re-enables etcd-clone as a single compound command. +// Returns the combined output. The error may be non-nil due to API disruption while etcd is down. +func runSimpleDisableEnableCycle(oc *exutil.CLI, nodeName string) string { + script := strings.Join([]string{ + pcsDisableScript(etcdCloneResource, pcsWaitTimeout), + pcsEnableScript(etcdCloneResource, pcsWaitTimeout), + }, "\n") + + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, nodeName, "default", "bash", "-c", script) + framework.Logf("Disable/enable cycle output:\n%s", output) + + if err != nil { + framework.Logf("Disable/enable cycle returned error (may be expected due to API disruption): %v", err) + } + + o.Expect(output).NotTo(o.ContainSubstring("DISABLE_FAILED"), + "pcs resource disable should succeed") + o.Expect(output).NotTo(o.ContainSubstring("ENABLE_FAILED"), + "pcs resource enable should succeed") + + return output +} + +// expectPacemakerLogFound verifies that at least one node's pacemaker log contains the given pattern. +// If baselines is non-nil, only log lines after each node's baseline line count are considered. +func expectPacemakerLogFound(oc *exutil.CLI, nodes []corev1.Node, pattern, description string, baselines map[string]string) { + var found bool + for _, node := range nodes { + logOutput, logErr := getPacemakerLogGrep(oc, node.Name, pattern, baselines[node.Name]) + if logErr != nil { + framework.Logf("Warning: failed to grep pacemaker log on %s: %v", node.Name, logErr) + continue + } + if strings.TrimSpace(logOutput) != "" { + framework.Logf("%s on %s:\n%s", description, node.Name, logOutput) + found = true + } + } + o.Expect(found).To(o.BeTrue(), + fmt.Sprintf("Expected at least one node's pacemaker log to contain %s", description)) +} + +// verifyFinalClusterHealth runs the common end-of-test health checks: etcd cluster status, +// etcd-clone started on both nodes, all nodes ready, and essential operators available. +func verifyFinalClusterHealth(oc *exutil.CLI, execNodeName string, nodes []corev1.Node, + etcdClientFactory *helpers.EtcdClientFactoryImpl, label string, timeout time.Duration) { + + g.By("Verifying etcd cluster health") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, label, etcdClientFactory) + }, timeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should be healthy") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNodeName, nodes) + }, timeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes") + + g.By("Verifying both nodes are Ready") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, timeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready") + + g.By("Verifying essential operators are available") + o.Eventually(func() error { + return utils.ValidateEssentialOperatorsAvailable(oc) + }, timeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Essential operators should be available") +} + type hypervisorExtendedConfig struct { HypervisorConfig core.SSHConfig HypervisorKnownHostsPath string @@ -80,6 +397,52 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)] }) + g.AfterEach(func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + if err != nil || len(nodeList.Items) == 0 { + framework.Logf("Warning: Could not retrieve nodes during cleanup: %v", err) + return + } + cleanupNode := nodeList.Items[0] + + g.By("Cleanup: Ensuring all nodes are unstandby") + for _, node := range nodeList.Items { + if _, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, cleanupNode.Name, "default", "bash", "-c", + fmt.Sprintf("sudo pcs node unstandby %s 2>/dev/null; true", node.Name)); err != nil { + framework.Logf("Warning: Failed to unstandby %s: %v", node.Name, err) + } + } + + g.By("Cleanup: Ensuring etcd-clone is enabled") + if err := utils.EnablePacemakerResource(oc, cleanupNode.Name, etcdCloneResource); err != nil { + framework.Logf("Warning: Failed to enable etcd-clone during cleanup: %v", err) + } + + g.By("Cleanup: Clearing any stale learner_node CRM attribute") + utils.DeleteCRMAttribute(oc, cleanupNode.Name, crmAttributeName) + + g.By("Cleanup: Running pcs resource cleanup to clear failed actions") + if output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, cleanupNode.Name, "default", "bash", "-c", "sudo pcs resource cleanup"); err != nil { + framework.Logf("Warning: Failed to run pcs resource cleanup during AfterEach: %v", err) + } else { + framework.Logf("PCS resource cleanup output: %s", output) + } + + g.By("Cleanup: Waiting for both nodes to become Ready") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes must be Ready after cleanup") + + g.By("Cleanup: Validating etcd cluster health") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "AfterEach cleanup", etcdClientFactory) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Etcd cluster must be healthy after cleanup") + }) + g.It("should recover from graceful node shutdown with etcd member re-addition", func() { // Note: In graceful shutdown, the targetNode is deliberately shut down while // the peerNode remains running and becomes the etcd leader. @@ -407,6 +770,354 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual &targetNode, true, false, // targetNode expected started == true, learner == false 6*time.Minute, 45*time.Second) }) + + // This test verifies that the resource agent's stop and start actions both clear + // a stale learner_node CRM attribute. A stale attribute would prevent a node from + // completing its etcd rejoin because the start action polls this attribute. + g.It("should clean up stale learner_node attribute during etcd-clone stop and start operations", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Run inject + disable/enable cycle as a single compound command. + // The inject must be part of the compound command because the resource agent's + // monitor action calls reconcile_member_state() which clears learner_node + // every few seconds — a separate inject would be race-conditioned. + g.By("Running inject + disable/enable cycle to verify learner_node cleanup on stop and start") + result, _ := runDisableEnableCycle(oc, execNode.Name) + + // Verify the disable/enable completed successfully + o.Expect(result.RawOutput).NotTo(o.ContainSubstring("DISABLE_FAILED"), + "pcs resource disable should succeed") + o.Expect(result.RawOutput).NotTo(o.ContainSubstring("ENABLE_FAILED"), + "pcs resource enable should succeed") + + // Verify: attribute was cleared by the resource agent's stop action + g.By("Verifying learner_node attribute was cleared after etcd-clone stop") + o.Expect(result.StopQueryRC).NotTo(o.BeEmpty(), + fmt.Sprintf("Expected STOP_RC in script output, raw output:\n%s", result.RawOutput)) + o.Expect(isAttributeCleared(result.StopQueryRC, result.StopQueryResult)).To(o.BeTrue(), + fmt.Sprintf("Expected learner_node to be cleared after stop (RC=%s, result=%s)", + result.StopQueryRC, result.StopQueryResult)) + framework.Logf("STOP path verified: learner_node was cleared by the resource agent stop action") + + g.By("Verifying learner_node attribute was cleared after etcd-clone start") + o.Expect(result.StartQueryRC).NotTo(o.BeEmpty(), + fmt.Sprintf("Expected START_RC in script output, raw output:\n%s", result.RawOutput)) + o.Expect(isAttributeCleared(result.StartQueryRC, result.StartQueryResult)).To(o.BeTrue(), + fmt.Sprintf("Expected learner_node to be cleared after start (RC=%s, result=%s)", + result.StartQueryRC, result.StartQueryResult)) + framework.Logf("START path verified: learner_node was cleared by the resource agent start action") + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after learner cleanup test", etcdResourceRecoveryTimeout) + }) + + // This test verifies that get_truly_active_resources_count() in the podman-etcd resource agent + // correctly differentiates truly active resources from those being stopped. + // + // A disable/enable cycle triggers this code path because both instances restart cleanly + // without force-new-cluster being pre-set, entering the branch that calls the function + // and logs the active resource count. + g.It("should exclude stopping resources from active count during etcd-clone disable/enable cycle", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Capture per-node baseline log line counts before the disruptive action so + // log assertions only consider lines emitted during this test. + logBaselines := getPacemakerLogBaselines(oc, nodes) + + g.By("Running etcd-clone disable/enable cycle to trigger active resource count logic") + runSimpleDisableEnableCycle(oc, execNode.Name) + + g.By("Waiting for etcd cluster to recover after disable/enable cycle") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "after disable/enable cycle", etcdClientFactory) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should recover after disable/enable cycle") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") + + g.By("Checking pacemaker logs for correct active resource count logic") + expectPacemakerLogFound(oc, nodes, activeCountLogPattern, "Active count log entries", logBaselines) + + g.By("Verifying no 'Unexpected active resource count' errors in pacemaker logs") + for _, node := range nodes { + errorOutput, logErr := getPacemakerLogGrep(oc, node.Name, unexpectedCountError, logBaselines[node.Name]) + o.Expect(logErr).ShouldNot(o.HaveOccurred(), + fmt.Sprintf("Expected to read pacemaker log on %s", node.Name)) + o.Expect(strings.TrimSpace(errorOutput)).To(o.BeEmpty(), + fmt.Sprintf("Expected no 'Unexpected active resource count' errors on %s", node.Name)) + } + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after active count test", etcdResourceRecoveryTimeout) + }) + + // This test verifies that podman-etcd prevents simultaneous etcd member removal + // when both nodes receive a graceful shutdown request. + // + // When etcd-clone is disabled, both nodes stop concurrently. The leave_etcd_member_list() + // function detects this by counting the stopping resources. The alphabetically second node + // is delayed by DELAY_SECOND_NODE_LEAVE_SEC (10s) to prevent WAL corruption. + g.It("should delay the second node stop to prevent simultaneous etcd member removal", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Capture per-node baseline log line counts before the disruptive action so + // log assertions only consider lines emitted during this test. + logBaselines := getPacemakerLogBaselines(oc, nodes) + + g.By("Running etcd-clone disable/enable cycle to trigger simultaneous stop logic") + runSimpleDisableEnableCycle(oc, execNode.Name) + + g.By("Waiting for etcd cluster to recover after disable/enable cycle") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "after disable/enable cycle", etcdClientFactory) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should recover after disable/enable cycle") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") + + g.By("Checking pacemaker logs for stopping resource count detection") + expectPacemakerLogFound(oc, nodes, stoppingResourcesLogPattern, "Stopping resources log entries", logBaselines) + + g.By("Verifying delay intervention was applied to prevent simultaneous member removal") + expectPacemakerLogFound(oc, nodes, delayStopLogPattern, "Delay intervention log", logBaselines) + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after simultaneous stop test", etcdResourceRecoveryTimeout) + }) + + // This test verifies that an abrupt termination of the etcd container triggers a + // coordinated "Error occurred" monitor state on both nodes before the cluster + // self-heals automatically. + // + // When a local etcd container is killed, the podman-etcd resource agent must + // coordinate recovery with the peer node. The surviving node sets force_new_cluster + // and the killed node's etcd restarts and joins as a learner. During this process, + // both nodes briefly enter a coordinated failed state visible in pcs status as + // "Failed Resource Actions". + g.It("should coordinate recovery with peer when local etcd container is killed", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + targetNode := nodes[1] // Kill etcd on the second node + execNode := nodes[0] // Use first node for pcs status checks after recovery + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Kill etcd container on the target node. + g.By(fmt.Sprintf("Killing etcd container on %s", targetNode.Name)) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, targetNode.Name, "openshift-etcd", + "bash", "-c", "podman kill etcd 2>/dev/null; true") + framework.Logf("Podman kill output: %s, err: %v", output, err) + + // Wait for the cluster to self-heal. + g.By("Waiting for etcd cluster to self-heal after container kill") + o.Eventually(func() error { + return utils.LogEtcdClusterStatus(oc, "after container kill", etcdClientFactory) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd cluster should self-heal after container kill") + + g.By("Verifying pcs status shows etcd-clone Started on both nodes") + o.Eventually(func() error { + return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") + + // Verify that the coordinated failure was observed. + g.By("Checking pcs status for coordinated 'Failed Resource Actions' on both nodes") + pcsOutput, statusErr := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", "sudo pcs status") + o.Expect(statusErr).ShouldNot(o.HaveOccurred(), "Expected to get pcs status without error") + framework.Logf("PCS status after recovery:\n%s", pcsOutput) + + failedSection := extractFailedActionsSection(pcsOutput) + o.Expect(failedSection).NotTo(o.BeEmpty(), + "Expected pcs status to contain 'Failed Resource Actions' section after container kill") + framework.Logf("Failed Resource Actions section:\n%s", failedSection) + + o.Expect(failedSection).To(o.ContainSubstring("etcd"), + "Expected Failed Resource Actions to reference etcd") + + for _, node := range nodes { + o.Expect(failedSection).To(o.ContainSubstring(node.Name), + fmt.Sprintf("Expected Failed Resource Actions to show failure on %s for coordinated recovery", node.Name)) + framework.Logf("Coordinated failure confirmed: node %s found in Failed Resource Actions", node.Name) + } + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after coordinated recovery test", longRecoveryTimeout) + }) + + // This test verifies that the podman-etcd resource agent retries setting + // CRM attributes when they fail during the force-new-cluster recovery path. + // + // When the learner_node CIB attribute is deleted while a node is in standby, + // the returning node's start action polls for the attribute but finds it missing. + // Without the retry fix, the node gets stuck in the LEARNER=true stage because + // nobody re-sets the attribute. With the fix, the leader node's monitor detects + // that a learner member exists in etcd but the learner_node attribute is missing, + // and retries setting it, allowing the returning node to proceed. + // + // Test flow: + // 1. Put a node in standby (triggers force-new-cluster on the peer) + // 2. Wait for the standby node to appear as a learner in etcd member list + // 3. Delete the learner_node CRM attribute + // 4. Unstandby the node + // 5. Verify both nodes recover to voting etcd members + g.It("should retry setting learner_node attribute after deletion during force-new-cluster recovery", func() { + nodeList, err := utils.GetNodes(oc, utils.AllNodes) + o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") + o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") + + nodes := nodeList.Items + execNode := nodes[0] // Stays active, runs solo during standby + standbyNode := nodes[1] // Will be put in standby + + g.By("Verifying both nodes are healthy before test") + o.Eventually(func() error { + return waitForAllNodesReady(oc, 2) + }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( + o.Succeed(), "Both nodes should be Ready before test") + + // Put the standby node in standby mode. + g.By(fmt.Sprintf("Putting %s in standby", standbyNode.Name)) + output, err := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", + fmt.Sprintf("sudo pcs node standby %s", standbyNode.Name)) + o.Expect(err).ShouldNot(o.HaveOccurred(), + fmt.Sprintf("Expected pcs node standby to succeed, output: %s", output)) + framework.Logf("PCS node standby output: %s", output) + + // Wait for force-new-cluster recovery to complete. + g.By(fmt.Sprintf("Waiting for %s to appear as learner in etcd member list", standbyNode.Name)) + o.Eventually(func() error { + members, err := utils.GetMembers(etcdClientFactory) + if err != nil { + return fmt.Errorf("failed to get etcd members: %v", err) + } + _, isLearner, err := utils.GetMemberState(&standbyNode, members) + if err != nil { + return fmt.Errorf("standby node not in member list yet: %v", err) + } + if !isLearner { + return fmt.Errorf("standby node %s is not a learner yet", standbyNode.Name) + } + framework.Logf("Standby node %s confirmed as learner in etcd member list", standbyNode.Name) + return nil + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Standby node should appear as learner in etcd member list") + + g.By("Logging pcs status after standby") + if pcsOutput, pcsErr := exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", "sudo pcs status"); pcsErr == nil { + framework.Logf("PCS status after standby:\n%s", pcsOutput) + } + + // Verify learner_node attribute is set before we delete it. + // The attribute is set by the leader's monitor action which runs asynchronously + // after detecting the learner, so poll until it appears. + g.By("Verifying learner_node CRM attribute is set") + var attrOutput string + o.Eventually(func() error { + var queryErr error + attrOutput, queryErr = utils.QueryCRMAttribute(oc, execNode.Name, crmAttributeName) + return queryErr + }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Expected learner_node attribute to exist after force-new-cluster recovery") + framework.Logf("learner_node attribute value: %s", attrOutput) + + // Delete the learner_node attribute to simulate attribute update failure. + g.By("Deleting learner_node CRM attribute to simulate attribute update failure") + utils.DeleteCRMAttribute(oc, execNode.Name, crmAttributeName) + framework.Logf("learner_node attribute deleted") + + // Unstandby the node. With the retry fix, the leader node's monitor detects + // the missing attribute and re-sets it, allowing the returning node to proceed. + g.By(fmt.Sprintf("Unstandby %s to trigger etcd rejoin", standbyNode.Name)) + output, err = exutil.DebugNodeRetryWithOptionsAndChroot( + oc, execNode.Name, "default", "bash", "-c", + fmt.Sprintf("sudo pcs node unstandby %s", standbyNode.Name)) + o.Expect(err).ShouldNot(o.HaveOccurred(), + fmt.Sprintf("Expected pcs node unstandby to succeed, output: %s", output)) + framework.Logf("PCS node unstandby output: %s", output) + + // Wait for both nodes to become voting etcd members. + g.By("Waiting for both nodes to become voting etcd members") + o.Eventually(func() error { + members, err := utils.GetMembers(etcdClientFactory) + if err != nil { + return fmt.Errorf("failed to get etcd members: %v", err) + } + if len(members) != 2 { + return fmt.Errorf("expected 2 members, found %d", len(members)) + } + for i := range nodes { + isStarted, isLearner, err := utils.GetMemberState(&nodes[i], members) + if err != nil { + return fmt.Errorf("member %s not found: %v", nodes[i].Name, err) + } + if !isStarted { + return fmt.Errorf("member %s is not started", nodes[i].Name) + } + if isLearner { + return fmt.Errorf("member %s is still a learner", nodes[i].Name) + } + } + framework.Logf("Both etcd members are now voting members") + return nil + }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( + o.HaveOccurred(), "Both nodes should become voting etcd members") + + verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, + "after attribute retry test", longRecoveryTimeout) + }) }) func validateEtcdRecoveryState( diff --git a/test/extended/two_node/tnf_resilience.go b/test/extended/two_node/tnf_resilience.go deleted file mode 100644 index 92c42383f8a0..000000000000 --- a/test/extended/two_node/tnf_resilience.go +++ /dev/null @@ -1,753 +0,0 @@ -package two_node - -import ( - "context" - "fmt" - "strings" - "time" - - g "github.com/onsi/ginkgo/v2" - o "github.com/onsi/gomega" - v1 "github.com/openshift/api/config/v1" - "github.com/openshift/origin/test/extended/etcd/helpers" - "github.com/openshift/origin/test/extended/two_node/utils" - exutil "github.com/openshift/origin/test/extended/util" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - nodeutil "k8s.io/kubernetes/pkg/util/node" - "k8s.io/kubernetes/test/e2e/framework" -) - -const ( - etcdResourceRecoveryTimeout = 5 * time.Minute // Time for etcd-clone to restart and stabilize - longRecoveryTimeout = 10 * time.Minute // Time for container kill or standby/unstandby recovery - - crmAttributeName = "learner_node" // The CRM attribute under test - pcsWaitTimeout = 120 // Seconds for pcs --wait flag - etcdCloneResource = "etcd-clone" // Pacemaker clone resource name - - // activeCountLogPattern is the pacemaker log message emitted when get_truly_active_resources_count() - // is called during the start action. - activeCountLogPattern = "active etcd resources" - // unexpectedCountError is the error message that should NOT appear after a disable/enable cycle. - unexpectedCountError = "Unexpected active resource count" - - // stoppingResourcesLogPattern is the pacemaker log message emitted by leave_etcd_member_list() - // when it counts how many etcd resources are stopping concurrently. - stoppingResourcesLogPattern = "stopping etcd resources" - // delayStopLogPattern is the pacemaker log message emitted when the alphabetically second - // node delays its stop to prevent simultaneous etcd member removal and WAL corruption. - delayStopLogPattern = "delaying stop for" -) - -// learnerCleanupResult holds the parsed output from the disable/enable cycle script. -type learnerCleanupResult struct { - // StopQueryRC is the return code of crm_attribute --query after the stop operation. - // RC=6 with "No such device or address" means the attribute was successfully cleared. - StopQueryRC string - StopQueryResult string - // StartQueryRC is the return code of crm_attribute --query after the start operation. - StartQueryRC string - StartQueryResult string - // RawOutput is the full script output for diagnostics. - RawOutput string -} - -// isAttributeCleared returns true if the crm_attribute query indicates the attribute was deleted. -// When the attribute doesn't exist, crm_attribute returns RC=6 and prints "No such device or address". -func isAttributeCleared(rc, result string) bool { - return rc == "6" || strings.Contains(result, "No such device or address") -} - -// pcsDisableScript returns a bash snippet that disables a resource and exits on failure. -// On failure it re-enables the resource as a safety net before exiting. -func pcsDisableScript(resource string, timeout int) string { - return fmt.Sprintf(`sudo pcs resource disable %[1]s --wait=%[2]d - DISABLE_RC=$? - if [ $DISABLE_RC -ne 0 ]; then - echo "DISABLE_FAILED" - sudo pcs resource enable %[1]s --wait=%[2]d 2>/dev/null || true - exit 1 - fi`, resource, timeout) -} - -// pcsEnableScript returns a bash snippet that enables a resource and exits on failure. -func pcsEnableScript(resource string, timeout int) string { - return fmt.Sprintf(`sudo pcs resource enable %s --wait=%d - ENABLE_RC=$? - if [ $ENABLE_RC -ne 0 ]; then - echo "ENABLE_FAILED" - exit 1 - fi`, resource, timeout) -} - -// queryCRMAttributeScript returns a bash snippet that queries an attribute and echoes -// the result with the given label prefix (e.g. "STOP" → "STOP_RC=...", "STOP_RESULT=..."). -func queryCRMAttributeScript(attr, label string) string { - return fmt.Sprintf(`%[1]s_RESULT=$(sudo crm_attribute --query --name %[2]s 2>&1); %[1]s_RC=$? - echo "%[1]s_RC=${%[1]s_RC}" - echo "%[1]s_RESULT=${%[1]s_RESULT}"`, label, attr) -} - -// injectCRMAttributeScript returns a bash snippet that sets a CRM attribute to the given value. -func injectCRMAttributeScript(attr, value string) string { - return fmt.Sprintf(`sudo crm_attribute --name %s --update %s`, attr, value) -} - -// runDisableEnableCycle executes the full disable/enable cycle as a single compound command. -// -// This must run as one bash invocation because disabling etcd-clone stops etcd, which brings -// down the API server — no new debug containers can be created until etcd is re-enabled. -// The debug pod is created while the API is still up; the bash process then runs locally on -// the node and does not need the API for subsequent commands. -// -// The initial inject is also included in the compound command because the resource agent's -// monitor action calls reconcile_member_state() which clears learner_node every few seconds. -// A separate inject would be race-conditioned by the monitor. -// -// The script performs: -// 1. Inject stale learner_node attribute -// 2. Disable etcd-clone (waits for stop to complete) -// 3. Query learner_node attribute (should be cleared by the resource agent's stop action) -// 4. Re-inject the stale learner_node attribute -// 5. Enable etcd-clone (waits for start to complete) -// 6. Query learner_node attribute (should be cleared by the resource agent's start action) -func runDisableEnableCycle(oc *exutil.CLI, nodeName string) (learnerCleanupResult, error) { - script := strings.Join([]string{ - injectCRMAttributeScript(crmAttributeName, nodeName), - pcsDisableScript(etcdCloneResource, pcsWaitTimeout), - queryCRMAttributeScript(crmAttributeName, "STOP"), - injectCRMAttributeScript(crmAttributeName, nodeName), - pcsEnableScript(etcdCloneResource, pcsWaitTimeout), - queryCRMAttributeScript(crmAttributeName, "START"), - }, "\n") - - output, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, nodeName, "default", "bash", "-c", script) - framework.Logf("Disable/enable cycle output:\n%s", output) - - // err may be non-nil if the debug container cleanup fails while etcd is down. - // The actual test results are captured in stdout. - if err != nil { - framework.Logf("Disable/enable cycle returned error (may be expected due to API disruption): %v", err) - } - - return learnerCleanupResult{ - StopQueryRC: extractValue(output, "STOP_RC="), - StopQueryResult: extractValue(output, "STOP_RESULT="), - StartQueryRC: extractValue(output, "START_RC="), - StartQueryResult: extractValue(output, "START_RESULT="), - RawOutput: output, - }, err -} - -// extractValue finds a line starting with the given prefix and returns the value after it. -func extractValue(output, prefix string) string { - for _, line := range strings.Split(output, "\n") { - line = strings.TrimSpace(line) - if strings.HasPrefix(line, prefix) { - return strings.TrimPrefix(line, prefix) - } - } - return "" -} - -// waitForAllNodesReady checks that the expected number of nodes exist and all are Ready. -func waitForAllNodesReady(oc *exutil.CLI, expectedCount int) error { - nodeList, err := utils.GetNodes(oc, utils.AllNodes) - if err != nil { - return fmt.Errorf("failed to retrieve nodes: %v", err) - } - if len(nodeList.Items) != expectedCount { - return fmt.Errorf("expected %d nodes, found %d", expectedCount, len(nodeList.Items)) - } - for _, node := range nodeList.Items { - nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get( - context.Background(), node.Name, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("failed to get node %s: %v", node.Name, err) - } - if !nodeutil.IsNodeReady(nodeObj) { - return fmt.Errorf("node %s is not Ready", node.Name) - } - } - return nil -} - -// verifyEtcdCloneStartedOnAllNodes checks that pcs status shows etcd-clone Started on all given nodes. -// Clone resources use the format "Started: [ node1 node2 ]", so we extract the etcd-clone section -// and look for each node name on a "Started" line within that section. -func verifyEtcdCloneStartedOnAllNodes(oc *exutil.CLI, execNodeName string, nodes []corev1.Node) error { - statusOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, execNodeName, "default", "bash", "-c", "sudo pcs status") - if err != nil { - return fmt.Errorf("failed to get pcs status: %v", err) - } - etcdIdx := strings.Index(statusOutput, "etcd-clone") - if etcdIdx == -1 { - return fmt.Errorf("etcd-clone not found in pcs status:\n%s", statusOutput) - } - // Scope parsing to the etcd-clone resource block only, stopping at the next - // resource header or blank line to avoid matching unrelated "Started" lines. - etcdLines := strings.Split(statusOutput[etcdIdx:], "\n") - var etcdSection strings.Builder - etcdSection.WriteString(etcdLines[0]) - for _, line := range etcdLines[1:] { - trimmed := strings.TrimSpace(line) - if trimmed == "" || (strings.HasSuffix(trimmed, ":") && !strings.Contains(line, "Started")) { - break - } - etcdSection.WriteString("\n") - etcdSection.WriteString(line) - } - sectionStr := etcdSection.String() - for _, node := range nodes { - found := false - for _, line := range strings.Split(sectionStr, "\n") { - if strings.Contains(line, "Started") && strings.Contains(line, node.Name) { - found = true - break - } - } - if !found { - return fmt.Errorf("etcd-clone not Started on %s, status:\n%s", node.Name, statusOutput) - } - } - framework.Logf("Final pcs status:\n%s", statusOutput) - return nil -} - -// getPacemakerLogGrep runs a grep against /var/log/pacemaker/pacemaker.log on the given node -// and returns matching lines. If baselineLineCount is non-empty, only lines after that line -// number are searched (using tail +N). Returns empty string if no matches found. -func getPacemakerLogGrep(oc *exutil.CLI, nodeName, pattern, baselineLineCount string) (string, error) { - var cmd string - if baselineLineCount != "" { - // Use tail to skip lines that existed before the test, then grep for pattern - cmd = fmt.Sprintf(`tail -n +%s /var/log/pacemaker/pacemaker.log | grep -F -- %q | tail -5`, - baselineLineCount, pattern) - } else { - cmd = fmt.Sprintf(`grep -F -- %q /var/log/pacemaker/pacemaker.log | tail -5`, pattern) - } - return exutil.DebugNodeRetryWithOptionsAndChroot(oc, nodeName, "default", "bash", "-c", cmd) -} - -// getPacemakerLogBaselines captures the current line count of the pacemaker log on each node. -// Returns a map of nodeName -> lineCount string. Used to scope log assertions to only lines -// emitted after the baseline, preventing stale log lines from prior tests causing false positives. -func getPacemakerLogBaselines(oc *exutil.CLI, nodes []corev1.Node) map[string]string { - baselines := make(map[string]string, len(nodes)) - for _, node := range nodes { - output, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, node.Name, "default", "bash", "-c", "wc -l < /var/log/pacemaker/pacemaker.log") - if err != nil { - framework.Logf("Warning: could not get pacemaker log line count from %s: %v", node.Name, err) - continue - } - baselines[node.Name] = strings.TrimSpace(output) - } - return baselines -} - -// extractFailedActionsSection extracts everything after "Failed Resource Actions:" from pcs status output. -// In pacemaker, this section lists historical failures that haven't been cleared with `pcs resource cleanup`. -func extractFailedActionsSection(pcsOutput string) string { - for _, marker := range []string{"Failed Resource Actions:", "Failed Resource Actions"} { - idx := strings.Index(pcsOutput, marker) - if idx != -1 { - return pcsOutput[idx:] - } - } - return "" -} - -// runSimpleDisableEnableCycle disables and re-enables etcd-clone as a single compound command. -// Returns the combined output. The error may be non-nil due to API disruption while etcd is down. -func runSimpleDisableEnableCycle(oc *exutil.CLI, nodeName string) string { - script := strings.Join([]string{ - pcsDisableScript(etcdCloneResource, pcsWaitTimeout), - pcsEnableScript(etcdCloneResource, pcsWaitTimeout), - }, "\n") - - output, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, nodeName, "default", "bash", "-c", script) - framework.Logf("Disable/enable cycle output:\n%s", output) - - if err != nil { - framework.Logf("Disable/enable cycle returned error (may be expected due to API disruption): %v", err) - } - - o.Expect(output).NotTo(o.ContainSubstring("DISABLE_FAILED"), - "pcs resource disable should succeed") - o.Expect(output).NotTo(o.ContainSubstring("ENABLE_FAILED"), - "pcs resource enable should succeed") - - return output -} - -// expectPacemakerLogFound verifies that at least one node's pacemaker log contains the given pattern. -// If baselines is non-nil, only log lines after each node's baseline line count are considered. -func expectPacemakerLogFound(oc *exutil.CLI, nodes []corev1.Node, pattern, description string, baselines map[string]string) { - var found bool - for _, node := range nodes { - logOutput, logErr := getPacemakerLogGrep(oc, node.Name, pattern, baselines[node.Name]) - if logErr != nil { - framework.Logf("Warning: failed to grep pacemaker log on %s: %v", node.Name, logErr) - continue - } - if strings.TrimSpace(logOutput) != "" { - framework.Logf("%s on %s:\n%s", description, node.Name, logOutput) - found = true - } - } - o.Expect(found).To(o.BeTrue(), - fmt.Sprintf("Expected at least one node's pacemaker log to contain %s", description)) -} - -// verifyFinalClusterHealth runs the common end-of-test health checks: etcd cluster status, -// etcd-clone started on both nodes, all nodes ready, and essential operators available. -func verifyFinalClusterHealth(oc *exutil.CLI, execNodeName string, nodes []corev1.Node, - etcdClientFactory *helpers.EtcdClientFactoryImpl, label string, timeout time.Duration) { - - g.By("Verifying etcd cluster health") - o.Eventually(func() error { - return utils.LogEtcdClusterStatus(oc, label, etcdClientFactory) - }, timeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd cluster should be healthy") - - g.By("Verifying pcs status shows etcd-clone Started on both nodes") - o.Eventually(func() error { - return verifyEtcdCloneStartedOnAllNodes(oc, execNodeName, nodes) - }, timeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd-clone should be Started on both nodes") - - g.By("Verifying both nodes are Ready") - o.Eventually(func() error { - return waitForAllNodesReady(oc, 2) - }, timeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Both nodes should be Ready") - - g.By("Verifying essential operators are available") - o.Eventually(func() error { - return utils.ValidateEssentialOperatorsAvailable(oc) - }, timeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "Essential operators should be available") -} - -var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/tnf-resilience][Serial][Disruptive][Skipped:SingleReplicaTopology] Two Node with Fencing etcd resilience", func() { - defer g.GinkgoRecover() - - var ( - oc = exutil.NewCLIWithoutNamespace("tnf-resilience").AsAdmin() - etcdClientFactory *helpers.EtcdClientFactoryImpl - setupCompleted bool - ) - - g.BeforeEach(func() { - utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode) - - etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient()) - - utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory) - setupCompleted = true - }) - - g.AfterEach(func() { - if !setupCompleted { - framework.Logf("Test was skipped before setup completed, skipping AfterEach cleanup") - return - } - - nodeList, err := utils.GetNodes(oc, utils.AllNodes) - if err != nil || len(nodeList.Items) == 0 { - framework.Logf("Warning: Could not retrieve nodes during cleanup: %v", err) - return - } - cleanupNode := nodeList.Items[0] - - g.By("Cleanup: Ensuring all nodes are unstandby") - for _, node := range nodeList.Items { - if _, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, cleanupNode.Name, "default", "bash", "-c", - fmt.Sprintf("sudo pcs node unstandby %s 2>/dev/null; true", node.Name)); err != nil { - framework.Logf("Warning: Failed to unstandby %s: %v", node.Name, err) - } - } - - g.By("Cleanup: Ensuring etcd-clone is enabled") - if err := utils.EnablePacemakerResource(oc, cleanupNode.Name, etcdCloneResource); err != nil { - framework.Logf("Warning: Failed to enable etcd-clone during cleanup: %v", err) - } - - g.By("Cleanup: Clearing any stale learner_node CRM attribute") - utils.DeleteCRMAttribute(oc, cleanupNode.Name, crmAttributeName) - - g.By("Cleanup: Running pcs resource cleanup to clear failed actions") - if output, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, cleanupNode.Name, "default", "bash", "-c", "sudo pcs resource cleanup"); err != nil { - framework.Logf("Warning: Failed to run pcs resource cleanup during AfterEach: %v", err) - } else { - framework.Logf("PCS resource cleanup output: %s", output) - } - - g.By("Cleanup: Waiting for both nodes to become Ready") - o.Eventually(func() error { - return waitForAllNodesReady(oc, 2) - }, longRecoveryTimeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Both nodes must be Ready after cleanup") - - g.By("Cleanup: Validating etcd cluster health") - o.Eventually(func() error { - return utils.LogEtcdClusterStatus(oc, "AfterEach cleanup", etcdClientFactory) - }, longRecoveryTimeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Etcd cluster must be healthy after cleanup") - }) - - // This test verifies that the resource agent's stop and start actions both clear - // a stale learner_node CRM attribute. A stale attribute would prevent a node from - // completing its etcd rejoin because the start action polls this attribute. - g.It("should clean up stale learner_node attribute during etcd-clone stop and start operations", func() { - nodeList, err := utils.GetNodes(oc, utils.AllNodes) - o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") - o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") - - nodes := nodeList.Items - execNode := nodes[0] - - g.By("Verifying both nodes are healthy before test") - o.Eventually(func() error { - return waitForAllNodesReady(oc, 2) - }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Both nodes should be Ready before test") - - // Run inject + disable/enable cycle as a single compound command. - // The inject must be part of the compound command because the resource agent's - // monitor action calls reconcile_member_state() which clears learner_node - // every few seconds — a separate inject would be race-conditioned. - g.By("Running inject + disable/enable cycle to verify learner_node cleanup on stop and start") - result, _ := runDisableEnableCycle(oc, execNode.Name) - - // Verify the disable/enable completed successfully - o.Expect(result.RawOutput).NotTo(o.ContainSubstring("DISABLE_FAILED"), - "pcs resource disable should succeed") - o.Expect(result.RawOutput).NotTo(o.ContainSubstring("ENABLE_FAILED"), - "pcs resource enable should succeed") - - // Verify: attribute was cleared by the resource agent's stop action - g.By("Verifying learner_node attribute was cleared after etcd-clone stop") - o.Expect(result.StopQueryRC).NotTo(o.BeEmpty(), - fmt.Sprintf("Expected STOP_RC in script output, raw output:\n%s", result.RawOutput)) - o.Expect(isAttributeCleared(result.StopQueryRC, result.StopQueryResult)).To(o.BeTrue(), - fmt.Sprintf("Expected learner_node to be cleared after stop (RC=%s, result=%s)", - result.StopQueryRC, result.StopQueryResult)) - framework.Logf("STOP path verified: learner_node was cleared by the resource agent stop action") - - g.By("Verifying learner_node attribute was cleared after etcd-clone start") - o.Expect(result.StartQueryRC).NotTo(o.BeEmpty(), - fmt.Sprintf("Expected START_RC in script output, raw output:\n%s", result.RawOutput)) - o.Expect(isAttributeCleared(result.StartQueryRC, result.StartQueryResult)).To(o.BeTrue(), - fmt.Sprintf("Expected learner_node to be cleared after start (RC=%s, result=%s)", - result.StartQueryRC, result.StartQueryResult)) - framework.Logf("START path verified: learner_node was cleared by the resource agent start action") - - verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, - "after learner cleanup test", etcdResourceRecoveryTimeout) - }) - - // This test verifies that get_truly_active_resources_count() in the podman-etcd resource agent - // correctly differentiates truly active resources from those being stopped. - // - // A disable/enable cycle triggers this code path because both instances restart cleanly - // without force-new-cluster being pre-set, entering the branch that calls the function - // and logs the active resource count. - g.It("should exclude stopping resources from active count during etcd-clone disable/enable cycle", func() { - nodeList, err := utils.GetNodes(oc, utils.AllNodes) - o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") - o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") - - nodes := nodeList.Items - execNode := nodes[0] - - g.By("Verifying both nodes are healthy before test") - o.Eventually(func() error { - return waitForAllNodesReady(oc, 2) - }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Both nodes should be Ready before test") - - // Capture per-node baseline log line counts before the disruptive action so - // log assertions only consider lines emitted during this test. - logBaselines := getPacemakerLogBaselines(oc, nodes) - - g.By("Running etcd-clone disable/enable cycle to trigger active resource count logic") - runSimpleDisableEnableCycle(oc, execNode.Name) - - g.By("Waiting for etcd cluster to recover after disable/enable cycle") - o.Eventually(func() error { - return utils.LogEtcdClusterStatus(oc, "after disable/enable cycle", etcdClientFactory) - }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd cluster should recover after disable/enable cycle") - - g.By("Verifying pcs status shows etcd-clone Started on both nodes") - o.Eventually(func() error { - return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) - }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") - - g.By("Checking pacemaker logs for correct active resource count logic") - expectPacemakerLogFound(oc, nodes, activeCountLogPattern, "Active count log entries", logBaselines) - - g.By("Verifying no 'Unexpected active resource count' errors in pacemaker logs") - for _, node := range nodes { - errorOutput, logErr := getPacemakerLogGrep(oc, node.Name, unexpectedCountError, logBaselines[node.Name]) - o.Expect(logErr).ShouldNot(o.HaveOccurred(), - fmt.Sprintf("Expected to read pacemaker log on %s", node.Name)) - o.Expect(strings.TrimSpace(errorOutput)).To(o.BeEmpty(), - fmt.Sprintf("Expected no 'Unexpected active resource count' errors on %s", node.Name)) - } - - verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, - "after active count test", etcdResourceRecoveryTimeout) - }) - - // This test verifies that podman-etcd prevents simultaneous etcd member removal - // when both nodes receive a graceful shutdown request. - // - // When etcd-clone is disabled, both nodes stop concurrently. The leave_etcd_member_list() - // function detects this by counting the stopping resources. The alphabetically second node - // is delayed by DELAY_SECOND_NODE_LEAVE_SEC (10s) to prevent WAL corruption. - g.It("should delay the second node stop to prevent simultaneous etcd member removal", func() { - nodeList, err := utils.GetNodes(oc, utils.AllNodes) - o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") - o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") - - nodes := nodeList.Items - execNode := nodes[0] - - g.By("Verifying both nodes are healthy before test") - o.Eventually(func() error { - return waitForAllNodesReady(oc, 2) - }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Both nodes should be Ready before test") - - // Capture per-node baseline log line counts before the disruptive action so - // log assertions only consider lines emitted during this test. - logBaselines := getPacemakerLogBaselines(oc, nodes) - - g.By("Running etcd-clone disable/enable cycle to trigger simultaneous stop logic") - runSimpleDisableEnableCycle(oc, execNode.Name) - - g.By("Waiting for etcd cluster to recover after disable/enable cycle") - o.Eventually(func() error { - return utils.LogEtcdClusterStatus(oc, "after disable/enable cycle", etcdClientFactory) - }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd cluster should recover after disable/enable cycle") - - g.By("Verifying pcs status shows etcd-clone Started on both nodes") - o.Eventually(func() error { - return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) - }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") - - g.By("Checking pacemaker logs for stopping resource count detection") - expectPacemakerLogFound(oc, nodes, stoppingResourcesLogPattern, "Stopping resources log entries", logBaselines) - - g.By("Verifying delay intervention was applied to prevent simultaneous member removal") - expectPacemakerLogFound(oc, nodes, delayStopLogPattern, "Delay intervention log", logBaselines) - - verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, - "after simultaneous stop test", etcdResourceRecoveryTimeout) - }) - - // This test verifies that an abrupt termination of the etcd container triggers a - // coordinated "Error occurred" monitor state on both nodes before the cluster - // self-heals automatically. - // - // When a local etcd container is killed, the podman-etcd resource agent must - // coordinate recovery with the peer node. The surviving node sets force_new_cluster - // and the killed node's etcd restarts and joins as a learner. During this process, - // both nodes briefly enter a coordinated failed state visible in pcs status as - // "Failed Resource Actions". - g.It("should coordinate recovery with peer when local etcd container is killed", func() { - nodeList, err := utils.GetNodes(oc, utils.AllNodes) - o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") - o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") - - nodes := nodeList.Items - targetNode := nodes[1] // Kill etcd on the second node - execNode := nodes[0] // Use first node for pcs status checks after recovery - - g.By("Verifying both nodes are healthy before test") - o.Eventually(func() error { - return waitForAllNodesReady(oc, 2) - }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Both nodes should be Ready before test") - - // Kill etcd container on the target node. - g.By(fmt.Sprintf("Killing etcd container on %s", targetNode.Name)) - output, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, targetNode.Name, "openshift-etcd", - "bash", "-c", "podman kill etcd 2>/dev/null; true") - framework.Logf("Podman kill output: %s, err: %v", output, err) - - // Wait for the cluster to self-heal. - g.By("Waiting for etcd cluster to self-heal after container kill") - o.Eventually(func() error { - return utils.LogEtcdClusterStatus(oc, "after container kill", etcdClientFactory) - }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd cluster should self-heal after container kill") - - g.By("Verifying pcs status shows etcd-clone Started on both nodes") - o.Eventually(func() error { - return verifyEtcdCloneStartedOnAllNodes(oc, execNode.Name, nodes) - }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "etcd-clone should be Started on both nodes after recovery") - - // Verify that the coordinated failure was observed. - g.By("Checking pcs status for coordinated 'Failed Resource Actions' on both nodes") - pcsOutput, statusErr := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, execNode.Name, "default", "bash", "-c", "sudo pcs status") - o.Expect(statusErr).ShouldNot(o.HaveOccurred(), "Expected to get pcs status without error") - framework.Logf("PCS status after recovery:\n%s", pcsOutput) - - failedSection := extractFailedActionsSection(pcsOutput) - o.Expect(failedSection).NotTo(o.BeEmpty(), - "Expected pcs status to contain 'Failed Resource Actions' section after container kill") - framework.Logf("Failed Resource Actions section:\n%s", failedSection) - - o.Expect(failedSection).To(o.ContainSubstring("etcd"), - "Expected Failed Resource Actions to reference etcd") - - for _, node := range nodes { - o.Expect(failedSection).To(o.ContainSubstring(node.Name), - fmt.Sprintf("Expected Failed Resource Actions to show failure on %s for coordinated recovery", node.Name)) - framework.Logf("Coordinated failure confirmed: node %s found in Failed Resource Actions", node.Name) - } - - verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, - "after coordinated recovery test", longRecoveryTimeout) - }) - - // This test verifies that the podman-etcd resource agent retries setting - // CRM attributes when they fail during the force-new-cluster recovery path. - // - // When the learner_node CIB attribute is deleted while a node is in standby, - // the returning node's start action polls for the attribute but finds it missing. - // Without the retry fix, the node gets stuck in the LEARNER=true stage because - // nobody re-sets the attribute. With the fix, the leader node's monitor detects - // that a learner member exists in etcd but the learner_node attribute is missing, - // and retries setting it, allowing the returning node to proceed. - // - // Test flow: - // 1. Put a node in standby (triggers force-new-cluster on the peer) - // 2. Wait for the standby node to appear as a learner in etcd member list - // 3. Delete the learner_node CRM attribute - // 4. Unstandby the node - // 5. Verify both nodes recover to voting etcd members - g.It("should retry setting learner_node attribute after deletion during force-new-cluster recovery", func() { - nodeList, err := utils.GetNodes(oc, utils.AllNodes) - o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error") - o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected exactly 2 nodes for two-node cluster") - - nodes := nodeList.Items - execNode := nodes[0] // Stays active, runs solo during standby - standbyNode := nodes[1] // Will be put in standby - - g.By("Verifying both nodes are healthy before test") - o.Eventually(func() error { - return waitForAllNodesReady(oc, 2) - }, nodeIsHealthyTimeout, utils.FiveSecondPollInterval).Should( - o.Succeed(), "Both nodes should be Ready before test") - - // Put the standby node in standby mode. - g.By(fmt.Sprintf("Putting %s in standby", standbyNode.Name)) - output, err := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, execNode.Name, "default", "bash", "-c", - fmt.Sprintf("sudo pcs node standby %s", standbyNode.Name)) - o.Expect(err).ShouldNot(o.HaveOccurred(), - fmt.Sprintf("Expected pcs node standby to succeed, output: %s", output)) - framework.Logf("PCS node standby output: %s", output) - - // Wait for force-new-cluster recovery to complete. - g.By(fmt.Sprintf("Waiting for %s to appear as learner in etcd member list", standbyNode.Name)) - o.Eventually(func() error { - members, err := utils.GetMembers(etcdClientFactory) - if err != nil { - return fmt.Errorf("failed to get etcd members: %v", err) - } - _, isLearner, err := utils.GetMemberState(&standbyNode, members) - if err != nil { - return fmt.Errorf("standby node not in member list yet: %v", err) - } - if !isLearner { - return fmt.Errorf("standby node %s is not a learner yet", standbyNode.Name) - } - framework.Logf("Standby node %s confirmed as learner in etcd member list", standbyNode.Name) - return nil - }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "Standby node should appear as learner in etcd member list") - - g.By("Logging pcs status after standby") - if pcsOutput, pcsErr := exutil.DebugNodeRetryWithOptionsAndChroot( - oc, execNode.Name, "default", "bash", "-c", "sudo pcs status"); pcsErr == nil { - framework.Logf("PCS status after standby:\n%s", pcsOutput) - } - - // Verify learner_node attribute is set before we delete it. - // The attribute is set by the leader's monitor action which runs asynchronously - // after detecting the learner, so poll until it appears. - g.By("Verifying learner_node CRM attribute is set") - var attrOutput string - o.Eventually(func() error { - var queryErr error - attrOutput, queryErr = utils.QueryCRMAttribute(oc, execNode.Name, crmAttributeName) - return queryErr - }, etcdResourceRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "Expected learner_node attribute to exist after force-new-cluster recovery") - framework.Logf("learner_node attribute value: %s", attrOutput) - - // Delete the learner_node attribute to simulate attribute update failure. - g.By("Deleting learner_node CRM attribute to simulate attribute update failure") - utils.DeleteCRMAttribute(oc, execNode.Name, crmAttributeName) - framework.Logf("learner_node attribute deleted") - - // Unstandby the node. With the retry fix, the leader node's monitor detects - // the missing attribute and re-sets it, allowing the returning node to proceed. - g.By(fmt.Sprintf("Unstandby %s to trigger etcd rejoin", standbyNode.Name)) - output, err = exutil.DebugNodeRetryWithOptionsAndChroot( - oc, execNode.Name, "default", "bash", "-c", - fmt.Sprintf("sudo pcs node unstandby %s", standbyNode.Name)) - o.Expect(err).ShouldNot(o.HaveOccurred(), - fmt.Sprintf("Expected pcs node unstandby to succeed, output: %s", output)) - framework.Logf("PCS node unstandby output: %s", output) - - // Wait for both nodes to become voting etcd members. - g.By("Waiting for both nodes to become voting etcd members") - o.Eventually(func() error { - members, err := utils.GetMembers(etcdClientFactory) - if err != nil { - return fmt.Errorf("failed to get etcd members: %v", err) - } - if len(members) != 2 { - return fmt.Errorf("expected 2 members, found %d", len(members)) - } - for i := range nodes { - isStarted, isLearner, err := utils.GetMemberState(&nodes[i], members) - if err != nil { - return fmt.Errorf("member %s not found: %v", nodes[i].Name, err) - } - if !isStarted { - return fmt.Errorf("member %s is not started", nodes[i].Name) - } - if isLearner { - return fmt.Errorf("member %s is still a learner", nodes[i].Name) - } - } - framework.Logf("Both etcd members are now voting members") - return nil - }, longRecoveryTimeout, utils.FiveSecondPollInterval).ShouldNot( - o.HaveOccurred(), "Both nodes should become voting etcd members") - - verifyFinalClusterHealth(oc, execNode.Name, nodes, etcdClientFactory, - "after attribute retry test", longRecoveryTimeout) - }) -})