|
| 1 | +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +#![cfg(feature = "e2e")] |
| 5 | + |
| 6 | +//! E2E tests for gateway resume from existing state. |
| 7 | +//! |
| 8 | +//! These tests verify that `openshell gateway start` resumes from existing |
| 9 | +//! Docker volume state (after stop or container removal) and that the SSH |
| 10 | +//! handshake secret persists across container restarts. |
| 11 | +//! |
| 12 | +//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one. |
| 13 | +
|
| 14 | +use std::process::{Command, Stdio}; |
| 15 | +use std::time::Duration; |
| 16 | + |
| 17 | +use openshell_e2e::harness::binary::openshell_cmd; |
| 18 | +use openshell_e2e::harness::output::strip_ansi; |
| 19 | +use tokio::time::sleep; |
| 20 | + |
/// Default gateway name used by the e2e cluster.
const GATEWAY_NAME: &str = "openshell";

/// Docker container name for the default gateway.
fn container_name() -> String {
    ["openshell-cluster-", GATEWAY_NAME].concat()
}
| 28 | + |
| 29 | +/// Run `openshell <args>` and return (combined output, exit code). |
| 30 | +async fn run_cli(args: &[&str]) -> (String, i32) { |
| 31 | + let mut cmd = openshell_cmd(); |
| 32 | + cmd.args(args) |
| 33 | + .stdout(Stdio::piped()) |
| 34 | + .stderr(Stdio::piped()); |
| 35 | + |
| 36 | + let output = cmd.output().await.expect("spawn openshell"); |
| 37 | + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); |
| 38 | + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); |
| 39 | + let combined = format!("{stdout}{stderr}"); |
| 40 | + let code = output.status.code().unwrap_or(-1); |
| 41 | + (combined, code) |
| 42 | +} |
| 43 | + |
/// Run `docker <args>` synchronously and return (stdout, exit code).
///
/// Stderr is captured (so it does not leak into test output) but is not
/// returned; -1 stands in for the exit code when the process was killed by
/// a signal.
fn docker_cmd(args: &[&str]) -> (String, i32) {
    let mut cmd = Command::new("docker");
    cmd.args(args).stdout(Stdio::piped()).stderr(Stdio::piped());

    let result = cmd.output().expect("spawn docker");
    let stdout = String::from_utf8_lossy(&result.stdout).into_owned();

    (stdout, result.status.code().unwrap_or(-1))
}
| 56 | + |
| 57 | +/// Wait for the gateway to become healthy by polling `openshell status`. |
| 58 | +async fn wait_for_healthy(timeout: Duration) { |
| 59 | + let start = std::time::Instant::now(); |
| 60 | + loop { |
| 61 | + let (output, code) = run_cli(&["status"]).await; |
| 62 | + let clean = strip_ansi(&output).to_lowercase(); |
| 63 | + if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("✓")) { |
| 64 | + return; |
| 65 | + } |
| 66 | + if start.elapsed() > timeout { |
| 67 | + panic!( |
| 68 | + "gateway did not become healthy within {}s. Last output:\n{}", |
| 69 | + timeout.as_secs(), |
| 70 | + strip_ansi(&output) |
| 71 | + ); |
| 72 | + } |
| 73 | + sleep(Duration::from_secs(3)).await; |
| 74 | + } |
| 75 | +} |
| 76 | + |
| 77 | +/// Read the SSH handshake secret from the K8s secret inside the cluster. |
| 78 | +fn read_ssh_handshake_secret() -> Option<String> { |
| 79 | + let cname = container_name(); |
| 80 | + let (output, code) = docker_cmd(&[ |
| 81 | + "exec", |
| 82 | + &cname, |
| 83 | + "sh", |
| 84 | + "-c", |
| 85 | + "KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null", |
| 86 | + ]); |
| 87 | + if code == 0 && !output.trim().is_empty() { |
| 88 | + Some(output.trim().to_string()) |
| 89 | + } else { |
| 90 | + None |
| 91 | + } |
| 92 | +} |
| 93 | + |
| 94 | +// ------------------------------------------------------------------- |
| 95 | +// Test: `gateway start` on an already-running gateway succeeds |
| 96 | +// ------------------------------------------------------------------- |
| 97 | + |
| 98 | +/// When the gateway is already running, `openshell gateway start` should |
| 99 | +/// return immediately with exit code 0 and indicate it's already running. |
| 100 | +#[tokio::test] |
| 101 | +async fn gateway_start_on_running_gateway_succeeds() { |
| 102 | + // Precondition: gateway is running (e2e cluster is up). |
| 103 | + wait_for_healthy(Duration::from_secs(30)).await; |
| 104 | + |
| 105 | + let (output, code) = run_cli(&["gateway", "start"]).await; |
| 106 | + let clean = strip_ansi(&output); |
| 107 | + |
| 108 | + assert_eq!( |
| 109 | + code, 0, |
| 110 | + "gateway start on running gateway should exit 0:\n{clean}" |
| 111 | + ); |
| 112 | + assert!( |
| 113 | + clean.to_lowercase().contains("already running"), |
| 114 | + "output should indicate gateway is already running:\n{clean}" |
| 115 | + ); |
| 116 | +} |
| 117 | + |
| 118 | +// ------------------------------------------------------------------- |
| 119 | +// Test: gateway stop → start resumes, sandbox survives |
| 120 | +// ------------------------------------------------------------------- |
| 121 | + |
| 122 | +/// After `gateway stop` then `gateway start`, the gateway should resume |
| 123 | +/// from existing state. A sandbox created before the stop should still |
| 124 | +/// appear in the sandbox list after restart. |
| 125 | +#[tokio::test] |
| 126 | +async fn gateway_stop_start_resumes_with_sandbox() { |
| 127 | + // Precondition: gateway is healthy. |
| 128 | + wait_for_healthy(Duration::from_secs(30)).await; |
| 129 | + |
| 130 | + // Create a sandbox that we'll check for after restart. |
| 131 | + let (create_output, create_code) = |
| 132 | + run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await; |
| 133 | + let clean_create = strip_ansi(&create_output); |
| 134 | + assert_eq!( |
| 135 | + create_code, 0, |
| 136 | + "sandbox create should succeed:\n{clean_create}" |
| 137 | + ); |
| 138 | + |
| 139 | + // Extract sandbox name from output. |
| 140 | + let sandbox_name = clean_create |
| 141 | + .lines() |
| 142 | + .find_map(|line| { |
| 143 | + if let Some((_, rest)) = line.split_once("Created sandbox:") { |
| 144 | + rest.split_whitespace().next().map(ToOwned::to_owned) |
| 145 | + } else if let Some((_, rest)) = line.split_once("Name:") { |
| 146 | + rest.split_whitespace().next().map(ToOwned::to_owned) |
| 147 | + } else { |
| 148 | + None |
| 149 | + } |
| 150 | + }) |
| 151 | + .expect("should extract sandbox name from create output"); |
| 152 | + |
| 153 | + // Stop the gateway. |
| 154 | + let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await; |
| 155 | + assert_eq!( |
| 156 | + stop_code, 0, |
| 157 | + "gateway stop should succeed:\n{}", |
| 158 | + strip_ansi(&stop_output) |
| 159 | + ); |
| 160 | + |
| 161 | + // Wait a moment for the container to fully stop. |
| 162 | + sleep(Duration::from_secs(3)).await; |
| 163 | + |
| 164 | + // Verify container is stopped. |
| 165 | + let (inspect_out, _) = docker_cmd(&[ |
| 166 | + "inspect", |
| 167 | + "-f", |
| 168 | + "{{.State.Running}}", |
| 169 | + &container_name(), |
| 170 | + ]); |
| 171 | + assert_eq!( |
| 172 | + inspect_out.trim(), |
| 173 | + "false", |
| 174 | + "container should be stopped after gateway stop" |
| 175 | + ); |
| 176 | + |
| 177 | + // Start the gateway again — should resume from existing state. |
| 178 | + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; |
| 179 | + let clean_start = strip_ansi(&start_output); |
| 180 | + assert_eq!( |
| 181 | + start_code, 0, |
| 182 | + "gateway start after stop should succeed:\n{clean_start}" |
| 183 | + ); |
| 184 | + |
| 185 | + // Wait for the gateway to become healthy again. |
| 186 | + wait_for_healthy(Duration::from_secs(180)).await; |
| 187 | + |
| 188 | + // Verify the sandbox still exists. |
| 189 | + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; |
| 190 | + let clean_list = strip_ansi(&list_output); |
| 191 | + assert_eq!( |
| 192 | + list_code, 0, |
| 193 | + "sandbox list should succeed after resume:\n{clean_list}" |
| 194 | + ); |
| 195 | + assert!( |
| 196 | + clean_list.contains(&sandbox_name), |
| 197 | + "sandbox '{sandbox_name}' should survive gateway stop/start.\nList output:\n{clean_list}" |
| 198 | + ); |
| 199 | + |
| 200 | + // Cleanup: delete the test sandbox. |
| 201 | + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; |
| 202 | +} |
| 203 | + |
| 204 | +// ------------------------------------------------------------------- |
| 205 | +// Test: container removed → gateway start resumes |
| 206 | +// ------------------------------------------------------------------- |
| 207 | + |
| 208 | +/// After the Docker container is force-removed (simulating Docker restart), |
| 209 | +/// `openshell gateway start` should resume from the existing volume. |
| 210 | +#[tokio::test] |
| 211 | +async fn gateway_start_resumes_after_container_removal() { |
| 212 | + // Precondition: gateway is healthy. |
| 213 | + wait_for_healthy(Duration::from_secs(30)).await; |
| 214 | + |
| 215 | + // Create a sandbox to verify state persistence. |
| 216 | + let (create_output, create_code) = |
| 217 | + run_cli(&["sandbox", "create", "--", "echo", "container-rm-test"]).await; |
| 218 | + let clean_create = strip_ansi(&create_output); |
| 219 | + assert_eq!( |
| 220 | + create_code, 0, |
| 221 | + "sandbox create should succeed:\n{clean_create}" |
| 222 | + ); |
| 223 | + |
| 224 | + let sandbox_name = clean_create |
| 225 | + .lines() |
| 226 | + .find_map(|line| { |
| 227 | + if let Some((_, rest)) = line.split_once("Created sandbox:") { |
| 228 | + rest.split_whitespace().next().map(ToOwned::to_owned) |
| 229 | + } else if let Some((_, rest)) = line.split_once("Name:") { |
| 230 | + rest.split_whitespace().next().map(ToOwned::to_owned) |
| 231 | + } else { |
| 232 | + None |
| 233 | + } |
| 234 | + }) |
| 235 | + .expect("should extract sandbox name from create output"); |
| 236 | + |
| 237 | + // Force-remove the container (simulates Docker restart / OOM kill). |
| 238 | + let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]); |
| 239 | + assert_eq!(rm_code, 0, "docker rm -f should succeed"); |
| 240 | + |
| 241 | + // Verify the volume still exists. |
| 242 | + let (vol_out, vol_code) = docker_cmd(&[ |
| 243 | + "volume", |
| 244 | + "inspect", |
| 245 | + &format!("openshell-cluster-{GATEWAY_NAME}"), |
| 246 | + ]); |
| 247 | + assert_eq!( |
| 248 | + vol_code, 0, |
| 249 | + "volume should still exist after container removal:\n{vol_out}" |
| 250 | + ); |
| 251 | + |
| 252 | + // Start the gateway — should resume from the volume. |
| 253 | + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; |
| 254 | + let clean_start = strip_ansi(&start_output); |
| 255 | + assert_eq!( |
| 256 | + start_code, 0, |
| 257 | + "gateway start after container removal should succeed:\n{clean_start}" |
| 258 | + ); |
| 259 | + |
| 260 | + // Wait for healthy. |
| 261 | + wait_for_healthy(Duration::from_secs(180)).await; |
| 262 | + |
| 263 | + // Verify sandbox survived. |
| 264 | + let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await; |
| 265 | + let clean_list = strip_ansi(&list_output); |
| 266 | + assert_eq!( |
| 267 | + list_code, 0, |
| 268 | + "sandbox list should succeed after resume:\n{clean_list}" |
| 269 | + ); |
| 270 | + assert!( |
| 271 | + clean_list.contains(&sandbox_name), |
| 272 | + "sandbox '{sandbox_name}' should survive container removal + resume.\nList output:\n{clean_list}" |
| 273 | + ); |
| 274 | + |
| 275 | + // Cleanup. |
| 276 | + let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await; |
| 277 | +} |
| 278 | + |
| 279 | +// ------------------------------------------------------------------- |
| 280 | +// Test: SSH handshake secret persists across container restart |
| 281 | +// ------------------------------------------------------------------- |
| 282 | + |
| 283 | +/// The SSH handshake K8s secret should persist across gateway stop/start |
| 284 | +/// cycles — the same base64-encoded value should be returned before and |
| 285 | +/// after the restart. |
| 286 | +#[tokio::test] |
| 287 | +async fn ssh_handshake_secret_persists_across_restart() { |
| 288 | + // Precondition: gateway is healthy. |
| 289 | + wait_for_healthy(Duration::from_secs(30)).await; |
| 290 | + |
| 291 | + // Read the SSH handshake secret before restart. |
| 292 | + let secret_before = read_ssh_handshake_secret() |
| 293 | + .expect("SSH handshake secret should exist before restart"); |
| 294 | + assert!( |
| 295 | + !secret_before.is_empty(), |
| 296 | + "SSH handshake secret should not be empty" |
| 297 | + ); |
| 298 | + |
| 299 | + // Stop the gateway. |
| 300 | + let (_, stop_code) = run_cli(&["gateway", "stop"]).await; |
| 301 | + assert_eq!(stop_code, 0, "gateway stop should succeed"); |
| 302 | + |
| 303 | + sleep(Duration::from_secs(3)).await; |
| 304 | + |
| 305 | + // Start the gateway. |
| 306 | + let (start_output, start_code) = run_cli(&["gateway", "start"]).await; |
| 307 | + assert_eq!( |
| 308 | + start_code, 0, |
| 309 | + "gateway start should succeed:\n{}", |
| 310 | + strip_ansi(&start_output) |
| 311 | + ); |
| 312 | + |
| 313 | + // Wait for healthy. |
| 314 | + wait_for_healthy(Duration::from_secs(180)).await; |
| 315 | + |
| 316 | + // Read the secret after restart. |
| 317 | + let secret_after = read_ssh_handshake_secret() |
| 318 | + .expect("SSH handshake secret should exist after restart"); |
| 319 | + |
| 320 | + assert_eq!( |
| 321 | + secret_before, secret_after, |
| 322 | + "SSH handshake secret should be identical before and after restart" |
| 323 | + ); |
| 324 | +} |
0 commit comments