Skip to content

Commit b917835

Browse files
committed
add e2e tests
1 parent 9b0af9c commit b917835

1 file changed

Lines changed: 324 additions & 0 deletions

File tree

e2e/rust/tests/gateway_resume.rs

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,324 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#![cfg(feature = "e2e")]
5+
6+
//! E2E tests for gateway resume from existing state.
7+
//!
8+
//! These tests verify that `openshell gateway start` resumes from existing
9+
//! Docker volume state (after stop or container removal) and that the SSH
10+
//! handshake secret persists across container restarts.
11+
//!
12+
//! **Requires a running gateway** — the `e2e:rust` mise task bootstraps one.
13+
14+
use std::process::{Command, Stdio};
15+
use std::time::Duration;
16+
17+
use openshell_e2e::harness::binary::openshell_cmd;
18+
use openshell_e2e::harness::output::strip_ansi;
19+
use tokio::time::sleep;
20+
21+
/// Default gateway name used by the e2e cluster.
22+
const GATEWAY_NAME: &str = "openshell";
23+
24+
/// Docker container name for the default gateway.
25+
fn container_name() -> String {
26+
format!("openshell-cluster-{GATEWAY_NAME}")
27+
}
28+
29+
/// Run `openshell <args>` and return (combined output, exit code).
30+
async fn run_cli(args: &[&str]) -> (String, i32) {
31+
let mut cmd = openshell_cmd();
32+
cmd.args(args)
33+
.stdout(Stdio::piped())
34+
.stderr(Stdio::piped());
35+
36+
let output = cmd.output().await.expect("spawn openshell");
37+
let stdout = String::from_utf8_lossy(&output.stdout).to_string();
38+
let stderr = String::from_utf8_lossy(&output.stderr).to_string();
39+
let combined = format!("{stdout}{stderr}");
40+
let code = output.status.code().unwrap_or(-1);
41+
(combined, code)
42+
}
43+
44+
/// Execute `docker <args>` and block until it exits.
///
/// Returns the captured stdout (decoded lossily as UTF-8) and the exit code,
/// with `-1` standing in when the exit status carries no code. Stderr is
/// captured but not returned.
///
/// # Panics
///
/// Panics with "spawn docker" if running the command fails.
fn docker_cmd(args: &[&str]) -> (String, i32) {
    let mut docker = Command::new("docker");
    docker
        .args(args)
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());

    let finished = docker.output().expect("spawn docker");

    let exit_code = match finished.status.code() {
        Some(value) => value,
        None => -1,
    };
    let captured = String::from_utf8_lossy(&finished.stdout).to_string();
    (captured, exit_code)
}
56+
57+
/// Wait for the gateway to become healthy by polling `openshell status`.
58+
async fn wait_for_healthy(timeout: Duration) {
59+
let start = std::time::Instant::now();
60+
loop {
61+
let (output, code) = run_cli(&["status"]).await;
62+
let clean = strip_ansi(&output).to_lowercase();
63+
if code == 0 && (clean.contains("healthy") || clean.contains("running") || clean.contains("✓")) {
64+
return;
65+
}
66+
if start.elapsed() > timeout {
67+
panic!(
68+
"gateway did not become healthy within {}s. Last output:\n{}",
69+
timeout.as_secs(),
70+
strip_ansi(&output)
71+
);
72+
}
73+
sleep(Duration::from_secs(3)).await;
74+
}
75+
}
76+
77+
/// Read the SSH handshake secret from the K8s secret inside the cluster.
78+
fn read_ssh_handshake_secret() -> Option<String> {
79+
let cname = container_name();
80+
let (output, code) = docker_cmd(&[
81+
"exec",
82+
&cname,
83+
"sh",
84+
"-c",
85+
"KUBECONFIG=/etc/rancher/k3s/k3s.yaml kubectl -n openshell get secret openshell-ssh-handshake -o jsonpath='{.data.secret}' 2>/dev/null",
86+
]);
87+
if code == 0 && !output.trim().is_empty() {
88+
Some(output.trim().to_string())
89+
} else {
90+
None
91+
}
92+
}
93+
94+
// -------------------------------------------------------------------
95+
// Test: `gateway start` on an already-running gateway succeeds
96+
// -------------------------------------------------------------------
97+
98+
/// When the gateway is already running, `openshell gateway start` should
99+
/// return immediately with exit code 0 and indicate it's already running.
100+
#[tokio::test]
101+
async fn gateway_start_on_running_gateway_succeeds() {
102+
// Precondition: gateway is running (e2e cluster is up).
103+
wait_for_healthy(Duration::from_secs(30)).await;
104+
105+
let (output, code) = run_cli(&["gateway", "start"]).await;
106+
let clean = strip_ansi(&output);
107+
108+
assert_eq!(
109+
code, 0,
110+
"gateway start on running gateway should exit 0:\n{clean}"
111+
);
112+
assert!(
113+
clean.to_lowercase().contains("already running"),
114+
"output should indicate gateway is already running:\n{clean}"
115+
);
116+
}
117+
118+
// -------------------------------------------------------------------
119+
// Test: gateway stop → start resumes, sandbox survives
120+
// -------------------------------------------------------------------
121+
122+
/// After `gateway stop` then `gateway start`, the gateway should resume
123+
/// from existing state. A sandbox created before the stop should still
124+
/// appear in the sandbox list after restart.
125+
#[tokio::test]
126+
async fn gateway_stop_start_resumes_with_sandbox() {
127+
// Precondition: gateway is healthy.
128+
wait_for_healthy(Duration::from_secs(30)).await;
129+
130+
// Create a sandbox that we'll check for after restart.
131+
let (create_output, create_code) =
132+
run_cli(&["sandbox", "create", "--", "echo", "resume-test"]).await;
133+
let clean_create = strip_ansi(&create_output);
134+
assert_eq!(
135+
create_code, 0,
136+
"sandbox create should succeed:\n{clean_create}"
137+
);
138+
139+
// Extract sandbox name from output.
140+
let sandbox_name = clean_create
141+
.lines()
142+
.find_map(|line| {
143+
if let Some((_, rest)) = line.split_once("Created sandbox:") {
144+
rest.split_whitespace().next().map(ToOwned::to_owned)
145+
} else if let Some((_, rest)) = line.split_once("Name:") {
146+
rest.split_whitespace().next().map(ToOwned::to_owned)
147+
} else {
148+
None
149+
}
150+
})
151+
.expect("should extract sandbox name from create output");
152+
153+
// Stop the gateway.
154+
let (stop_output, stop_code) = run_cli(&["gateway", "stop"]).await;
155+
assert_eq!(
156+
stop_code, 0,
157+
"gateway stop should succeed:\n{}",
158+
strip_ansi(&stop_output)
159+
);
160+
161+
// Wait a moment for the container to fully stop.
162+
sleep(Duration::from_secs(3)).await;
163+
164+
// Verify container is stopped.
165+
let (inspect_out, _) = docker_cmd(&[
166+
"inspect",
167+
"-f",
168+
"{{.State.Running}}",
169+
&container_name(),
170+
]);
171+
assert_eq!(
172+
inspect_out.trim(),
173+
"false",
174+
"container should be stopped after gateway stop"
175+
);
176+
177+
// Start the gateway again — should resume from existing state.
178+
let (start_output, start_code) = run_cli(&["gateway", "start"]).await;
179+
let clean_start = strip_ansi(&start_output);
180+
assert_eq!(
181+
start_code, 0,
182+
"gateway start after stop should succeed:\n{clean_start}"
183+
);
184+
185+
// Wait for the gateway to become healthy again.
186+
wait_for_healthy(Duration::from_secs(180)).await;
187+
188+
// Verify the sandbox still exists.
189+
let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await;
190+
let clean_list = strip_ansi(&list_output);
191+
assert_eq!(
192+
list_code, 0,
193+
"sandbox list should succeed after resume:\n{clean_list}"
194+
);
195+
assert!(
196+
clean_list.contains(&sandbox_name),
197+
"sandbox '{sandbox_name}' should survive gateway stop/start.\nList output:\n{clean_list}"
198+
);
199+
200+
// Cleanup: delete the test sandbox.
201+
let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await;
202+
}
203+
204+
// -------------------------------------------------------------------
205+
// Test: container removed → gateway start resumes
206+
// -------------------------------------------------------------------
207+
208+
/// After the Docker container is force-removed (simulating Docker restart),
209+
/// `openshell gateway start` should resume from the existing volume.
210+
#[tokio::test]
211+
async fn gateway_start_resumes_after_container_removal() {
212+
// Precondition: gateway is healthy.
213+
wait_for_healthy(Duration::from_secs(30)).await;
214+
215+
// Create a sandbox to verify state persistence.
216+
let (create_output, create_code) =
217+
run_cli(&["sandbox", "create", "--", "echo", "container-rm-test"]).await;
218+
let clean_create = strip_ansi(&create_output);
219+
assert_eq!(
220+
create_code, 0,
221+
"sandbox create should succeed:\n{clean_create}"
222+
);
223+
224+
let sandbox_name = clean_create
225+
.lines()
226+
.find_map(|line| {
227+
if let Some((_, rest)) = line.split_once("Created sandbox:") {
228+
rest.split_whitespace().next().map(ToOwned::to_owned)
229+
} else if let Some((_, rest)) = line.split_once("Name:") {
230+
rest.split_whitespace().next().map(ToOwned::to_owned)
231+
} else {
232+
None
233+
}
234+
})
235+
.expect("should extract sandbox name from create output");
236+
237+
// Force-remove the container (simulates Docker restart / OOM kill).
238+
let (_, rm_code) = docker_cmd(&["rm", "-f", &container_name()]);
239+
assert_eq!(rm_code, 0, "docker rm -f should succeed");
240+
241+
// Verify the volume still exists.
242+
let (vol_out, vol_code) = docker_cmd(&[
243+
"volume",
244+
"inspect",
245+
&format!("openshell-cluster-{GATEWAY_NAME}"),
246+
]);
247+
assert_eq!(
248+
vol_code, 0,
249+
"volume should still exist after container removal:\n{vol_out}"
250+
);
251+
252+
// Start the gateway — should resume from the volume.
253+
let (start_output, start_code) = run_cli(&["gateway", "start"]).await;
254+
let clean_start = strip_ansi(&start_output);
255+
assert_eq!(
256+
start_code, 0,
257+
"gateway start after container removal should succeed:\n{clean_start}"
258+
);
259+
260+
// Wait for healthy.
261+
wait_for_healthy(Duration::from_secs(180)).await;
262+
263+
// Verify sandbox survived.
264+
let (list_output, list_code) = run_cli(&["sandbox", "list", "--names"]).await;
265+
let clean_list = strip_ansi(&list_output);
266+
assert_eq!(
267+
list_code, 0,
268+
"sandbox list should succeed after resume:\n{clean_list}"
269+
);
270+
assert!(
271+
clean_list.contains(&sandbox_name),
272+
"sandbox '{sandbox_name}' should survive container removal + resume.\nList output:\n{clean_list}"
273+
);
274+
275+
// Cleanup.
276+
let _ = run_cli(&["sandbox", "delete", &sandbox_name]).await;
277+
}
278+
279+
// -------------------------------------------------------------------
280+
// Test: SSH handshake secret persists across container restart
281+
// -------------------------------------------------------------------
282+
283+
/// The SSH handshake K8s secret should persist across gateway stop/start
284+
/// cycles — the same base64-encoded value should be returned before and
285+
/// after the restart.
286+
#[tokio::test]
287+
async fn ssh_handshake_secret_persists_across_restart() {
288+
// Precondition: gateway is healthy.
289+
wait_for_healthy(Duration::from_secs(30)).await;
290+
291+
// Read the SSH handshake secret before restart.
292+
let secret_before = read_ssh_handshake_secret()
293+
.expect("SSH handshake secret should exist before restart");
294+
assert!(
295+
!secret_before.is_empty(),
296+
"SSH handshake secret should not be empty"
297+
);
298+
299+
// Stop the gateway.
300+
let (_, stop_code) = run_cli(&["gateway", "stop"]).await;
301+
assert_eq!(stop_code, 0, "gateway stop should succeed");
302+
303+
sleep(Duration::from_secs(3)).await;
304+
305+
// Start the gateway.
306+
let (start_output, start_code) = run_cli(&["gateway", "start"]).await;
307+
assert_eq!(
308+
start_code, 0,
309+
"gateway start should succeed:\n{}",
310+
strip_ansi(&start_output)
311+
);
312+
313+
// Wait for healthy.
314+
wait_for_healthy(Duration::from_secs(180)).await;
315+
316+
// Read the secret after restart.
317+
let secret_after = read_ssh_handshake_secret()
318+
.expect("SSH handshake secret should exist after restart");
319+
320+
assert_eq!(
321+
secret_before, secret_after,
322+
"SSH handshake secret should be identical before and after restart"
323+
);
324+
}

0 commit comments

Comments
 (0)