Skip to content

Commit 37c9ae7

Browse files
authored
fix(bootstrap): detect missing sandbox supervisor binary during gateway health check (#281)
1 parent 864286e commit 37c9ae7

15 files changed

Lines changed: 200 additions & 211 deletions

File tree

.agents/skills/debug-openshell-cluster/SKILL.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Use **only** `openshell` CLI commands (`openshell status`, `openshell doctor log
2626
- k3s API server readiness (`/readyz`)
2727
- `openshell` statefulset ready in `openshell` namespace
2828
- TLS secrets `openshell-server-tls` and `openshell-client-tls` exist in `openshell` namespace
29+
- Sandbox supervisor binary exists at `/opt/openshell/bin/openshell-sandbox` (emits `HEALTHCHECK_MISSING_SUPERVISOR` marker if absent)
2930

3031
For local deploys, metadata endpoint selection now depends on Docker connectivity:
3132

@@ -311,6 +312,8 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w
311312
| `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component |
312313
| Stale NotReady nodes from previous deploys | Volume reused across container recreations | The deploy flow now auto-cleans stale nodes; if it still fails, manually delete NotReady nodes (see Step 2) or choose "Recreate" when prompted |
313314
| gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` |
315+
| Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the `supervisor-builder` stage. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker |
316+
| `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy <name> && openshell gateway start` |
314317

315318
## Full Diagnostic Dump
316319

@@ -359,6 +362,9 @@ openshell doctor exec -- kubectl -n kube-system logs -l job-name=helm-install-op
359362
echo "=== Registry Configuration ==="
360363
openshell doctor exec -- cat /etc/rancher/k3s/registries.yaml
361364

365+
echo "=== Supervisor Binary ==="
366+
openshell doctor exec -- ls -la /opt/openshell/bin/openshell-sandbox
367+
362368
echo "=== DNS Configuration ==="
363369
openshell doctor exec -- cat /etc/rancher/k3s/resolv.conf
364370
```

crates/openshell-bootstrap/src/constants.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
// SPDX-License-Identifier: Apache-2.0
33

4-
pub const NETWORK_NAME: &str = "openshell-cluster";
5-
64
/// Path to the kubeconfig inside the k3s container.
75
/// Used by in-container kubectl operations (node cleanup, PKI reconciliation, etc.).
86
pub const KUBECONFIG_PATH: &str = "/etc/rancher/k3s/k3s.yaml";

crates/openshell-bootstrap/src/docker.rs

Lines changed: 3 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,18 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
use crate::RemoteOptions;
5-
use crate::constants::{NETWORK_NAME, container_name, volume_name};
5+
use crate::constants::{container_name, volume_name};
66
use crate::image::{
77
self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, DEFAULT_REGISTRY_USERNAME, parse_image_ref,
88
};
99
use bollard::API_DEFAULT_VERSION;
1010
use bollard::Docker;
1111
use bollard::errors::Error as BollardError;
1212
use bollard::models::{
13-
ContainerCreateBody, DeviceRequest, HostConfig, NetworkCreateRequest, NetworkDisconnectRequest,
14-
PortBinding, VolumeCreateRequest,
13+
ContainerCreateBody, DeviceRequest, HostConfig, PortBinding, VolumeCreateRequest,
1514
};
1615
use bollard::query_parameters::{
17-
CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions,
16+
CreateContainerOptions, CreateImageOptions, InspectContainerOptions,
1817
ListContainersOptionsBuilder, RemoveContainerOptions, RemoveImageOptions, RemoveVolumeOptions,
1918
StartContainerOptions,
2019
};
@@ -186,53 +185,6 @@ pub async fn find_gateway_container(docker: &Docker, port: Option<u16>) -> Resul
186185
}
187186
}
188187

189-
pub async fn ensure_network(docker: &Docker) -> Result<()> {
190-
// Always remove and recreate the network to guarantee a clean state.
191-
// Stale Docker networks (e.g., from a previous interrupted destroy or
192-
// Docker Desktop restart) can leave broken routing that causes the
193-
// container to fail with "no default routes found".
194-
force_remove_network(docker).await?;
195-
196-
// Docker may return a 409 conflict if the previous network teardown has
197-
// not fully completed in the daemon. Retry a few times with back-off,
198-
// re-attempting the removal before each create.
199-
let mut last_err = None;
200-
for attempt in 0u64..5 {
201-
if attempt > 0 {
202-
tokio::time::sleep(std::time::Duration::from_millis(500 * attempt)).await;
203-
// Re-attempt removal in case the previous teardown has now settled.
204-
force_remove_network(docker).await?;
205-
}
206-
match docker
207-
.create_network(NetworkCreateRequest {
208-
name: NETWORK_NAME.to_string(),
209-
driver: Some("bridge".to_string()),
210-
attachable: Some(true),
211-
..Default::default()
212-
})
213-
.await
214-
{
215-
Ok(_) => return Ok(()),
216-
Err(err) if is_conflict(&err) => {
217-
tracing::debug!(
218-
"Network create conflict (attempt {}/5), retrying: {}",
219-
attempt + 1,
220-
err,
221-
);
222-
last_err = Some(err);
223-
}
224-
Err(err) => {
225-
return Err(err)
226-
.into_diagnostic()
227-
.wrap_err("failed to create Docker network");
228-
}
229-
}
230-
}
231-
Err(last_err.expect("at least one retry attempt"))
232-
.into_diagnostic()
233-
.wrap_err("failed to create Docker network after retries (network still in use)")
234-
}
235-
236188
pub async fn ensure_volume(docker: &Docker, name: &str) -> Result<()> {
237189
match docker.inspect_volume(name).await {
238190
Ok(_) => return Ok(()),
@@ -376,7 +328,6 @@ pub async fn ensure_container(
376328
privileged: Some(true),
377329
port_bindings: Some(port_bindings),
378330
binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]),
379-
network_mode: Some(NETWORK_NAME.to_string()),
380331
// Add host.docker.internal mapping for DNS resolution
381332
// This allows the entrypoint script to configure CoreDNS to use the host gateway
382333
extra_hosts: Some(vec!["host.docker.internal:host-gateway".to_string()]),
@@ -678,20 +629,6 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
678629
.ok()
679630
.and_then(|info| info.image);
680631

681-
// Explicitly disconnect the container from the cluster network before
682-
// removing it. This ensures Docker tears down the network endpoint
683-
// synchronously so port bindings are released immediately and the
684-
// subsequent network cleanup sees zero connected containers.
685-
let _ = docker
686-
.disconnect_network(
687-
NETWORK_NAME,
688-
NetworkDisconnectRequest {
689-
container: container_name.clone(),
690-
force: Some(true),
691-
},
692-
)
693-
.await;
694-
695632
let _ = stop_container(docker, &container_name).await;
696633

697634
let remove_container = docker
@@ -763,50 +700,9 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<()
763700
return Err(err).into_diagnostic();
764701
}
765702

766-
// Force-remove the network during a full destroy. First disconnect any
767-
// stale endpoints that Docker may still report (race between container
768-
// removal and network bookkeeping), then remove the network itself.
769-
force_remove_network(docker).await?;
770703
Ok(())
771704
}
772705

773-
/// Forcefully remove the gateway network, disconnecting any remaining
774-
/// containers first. This ensures that stale Docker network endpoints
775-
/// cannot prevent port bindings from being released.
776-
async fn force_remove_network(docker: &Docker) -> Result<()> {
777-
let network = match docker
778-
.inspect_network(NETWORK_NAME, None::<InspectNetworkOptions>)
779-
.await
780-
{
781-
Ok(info) => info,
782-
Err(err) if is_not_found(&err) => return Ok(()),
783-
Err(err) => return Err(err).into_diagnostic(),
784-
};
785-
786-
// Disconnect any containers still attached to the network.
787-
if let Some(containers) = network.containers {
788-
for (id, _) in containers {
789-
let _ = docker
790-
.disconnect_network(
791-
NETWORK_NAME,
792-
NetworkDisconnectRequest {
793-
container: id,
794-
force: Some(true),
795-
},
796-
)
797-
.await;
798-
}
799-
}
800-
801-
match docker.remove_network(NETWORK_NAME).await {
802-
Ok(()) => Ok(()),
803-
Err(err) if is_not_found(&err) => Ok(()),
804-
Err(err) => Err(err)
805-
.into_diagnostic()
806-
.wrap_err("failed to remove Docker network"),
807-
}
808-
}
809-
810706
fn is_not_found(err: &BollardError) -> bool {
811707
matches!(
812708
err,

crates/openshell-bootstrap/src/errors.rs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,12 @@ const FAILURE_PATTERNS: &[FailurePattern] = &[
139139
match_mode: MatchMode::Any,
140140
diagnose: diagnose_node_pressure,
141141
},
142+
// Missing sandbox supervisor binary
143+
FailurePattern {
144+
matchers: &["HEALTHCHECK_MISSING_SUPERVISOR"],
145+
match_mode: MatchMode::Any,
146+
diagnose: diagnose_missing_supervisor,
147+
},
142148
// TLS/certificate issues
143149
FailurePattern {
144150
matchers: &[
@@ -342,6 +348,33 @@ fn diagnose_node_pressure(gateway_name: &str) -> GatewayFailureDiagnosis {
342348
}
343349
}
344350

351+
fn diagnose_missing_supervisor(gateway_name: &str) -> GatewayFailureDiagnosis {
352+
GatewayFailureDiagnosis {
353+
summary: "Sandbox supervisor binary missing from cluster image".to_string(),
354+
explanation: "The sandbox supervisor binary (/opt/openshell/bin/openshell-sandbox) \
355+
was not found in the gateway container. This binary is side-loaded into every \
356+
sandbox pod via a hostPath volume mount. Without it, all sandbox pods will \
357+
crash immediately with \"no such file or directory\". This typically means the \
358+
cluster image was built or published without the supervisor-builder stage."
359+
.to_string(),
360+
recovery_steps: vec![
361+
RecoveryStep::with_command(
362+
"Rebuild the cluster image with the supervisor binary included",
363+
"mise run docker:build:cluster",
364+
),
365+
RecoveryStep::with_command(
366+
"Destroy and recreate the gateway with the updated image",
367+
format!("openshell gateway destroy {gateway_name} && openshell gateway start"),
368+
),
369+
RecoveryStep::new(
370+
"Or set OPENSHELL_CLUSTER_IMAGE to a cluster image version that includes \
371+
the supervisor binary",
372+
),
373+
],
374+
retryable: false,
375+
}
376+
}
377+
345378
fn diagnose_certificate_issue(gateway_name: &str) -> GatewayFailureDiagnosis {
346379
GatewayFailureDiagnosis {
347380
summary: "TLS certificate issue".to_string(),

crates/openshell-bootstrap/src/lib.rs

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use crate::constants::{
3030
};
3131
use crate::docker::{
3232
check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container,
33-
ensure_image, ensure_network, ensure_volume, start_container, stop_container,
33+
ensure_image, ensure_volume, start_container, stop_container,
3434
};
3535
use crate::metadata::{
3636
create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host,
@@ -107,6 +107,10 @@ pub struct DeployOptions {
107107
/// created with GPU device requests (`--gpus all`) and the NVIDIA
108108
/// k8s-device-plugin is deployed inside the k3s cluster.
109109
pub gpu: bool,
110+
/// When true, destroy any existing gateway resources before deploying.
111+
/// When false, an existing gateway is left as-is and deployment is
112+
/// skipped (the caller is responsible for prompting the user first).
113+
pub recreate: bool,
110114
}
111115

112116
impl DeployOptions {
@@ -121,6 +125,7 @@ impl DeployOptions {
121125
disable_gateway_auth: false,
122126
registry_token: None,
123127
gpu: false,
128+
recreate: false,
124129
}
125130
}
126131

@@ -172,6 +177,13 @@ impl DeployOptions {
172177
self.gpu = gpu;
173178
self
174179
}
180+
181+
/// Set whether to destroy and recreate existing gateway resources.
182+
#[must_use]
183+
pub fn with_recreate(mut self, recreate: bool) -> Self {
184+
self.recreate = recreate;
185+
self
186+
}
175187
}
176188

177189
#[derive(Debug, Clone)]
@@ -232,6 +244,7 @@ where
232244
let disable_gateway_auth = options.disable_gateway_auth;
233245
let registry_token = options.registry_token;
234246
let gpu = options.gpu;
247+
let recreate = options.recreate;
235248

236249
// Wrap on_log in Arc<Mutex<>> so we can share it with pull_remote_image
237250
// which needs a 'static callback for the bollard streaming pull.
@@ -256,6 +269,22 @@ where
256269
),
257270
};
258271

272+
// If an existing gateway is found, either tear it down (when recreate is
273+
// requested) or bail out so the caller can prompt the user / reuse it.
274+
if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
275+
if recreate {
276+
log("[status] Removing existing gateway".to_string());
277+
destroy_gateway_resources(&target_docker, &name).await?;
278+
} else {
279+
return Err(miette::miette!(
280+
"Gateway '{name}' already exists (container_running={}).\n\
281+
Use --recreate to destroy and redeploy, or destroy it first with:\n\n \
282+
openshell gateway destroy {name}",
283+
existing.container_running,
284+
));
285+
}
286+
}
287+
259288
// Ensure the image is available on the target Docker daemon
260289
if remote_opts.is_some() {
261290
log("[status] Downloading gateway".to_string());
@@ -280,7 +309,6 @@ where
280309

281310
// All subsequent operations use the target Docker (remote or local)
282311
log("[status] Initializing environment".to_string());
283-
ensure_network(&target_docker).await?;
284312
ensure_volume(&target_docker, &volume_name(&name)).await?;
285313

286314
// Compute extra TLS SANs for remote deployments so the gateway and k3s

crates/openshell-bootstrap/src/runtime.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ const DNS_FAILURE_MARKERS: &[&str] = &["DNS_PROBE_FAILED", "HEALTHCHECK_DNS_FAIL
2424
/// new scheduling, so the cluster will never become healthy on its own.
2525
const NODE_PRESSURE_MARKER: &str = "HEALTHCHECK_NODE_PRESSURE";
2626

27+
/// Log marker emitted by the health-check script when the sandbox supervisor
28+
/// binary (`/opt/openshell/bin/openshell-sandbox`) is missing from the node
29+
/// filesystem. Without this binary, every sandbox pod will crash immediately
30+
/// with "no such file or directory". This is a permanent error that requires
31+
/// rebuilding or updating the cluster image.
32+
const MISSING_SUPERVISOR_MARKER: &str = "HEALTHCHECK_MISSING_SUPERVISOR";
33+
2734
/// Number of consecutive polling iterations that must observe DNS failure
2835
/// markers before we treat the failure as persistent and abort. A small
2936
/// grace period avoids false positives on transient hiccups during startup.
@@ -116,6 +123,29 @@ where
116123
}
117124
}
118125

126+
// -- Missing supervisor binary detection ----------------------------
127+
// The health-check script verifies that /opt/openshell/bin/openshell-sandbox
128+
// exists on the node filesystem. If missing, every sandbox pod will crash.
129+
// This is a permanent error — fail immediately with actionable guidance.
130+
if recent_logs
131+
.iter()
132+
.any(|line| line.contains(MISSING_SUPERVISOR_MARKER))
133+
{
134+
result = Some(Err(miette::miette!(
135+
"The sandbox supervisor binary is missing from the cluster image.\n\
136+
The file /opt/openshell/bin/openshell-sandbox was not found in the gateway \
137+
container. Without it, sandbox pods cannot start.\n\n\
138+
This usually means the cluster image was built or published without the \
139+
supervisor-builder stage.\n\n\
140+
To fix:\n \
141+
1. Rebuild the cluster image: mise run docker:build:cluster\n \
142+
2. Or update to a cluster image that includes the supervisor binary\n \
143+
3. Then recreate the gateway: openshell gateway destroy && openshell gateway start\n\n{}",
144+
format_recent_logs(&recent_logs)
145+
)));
146+
break;
147+
}
148+
119149
let inspect = docker
120150
.inspect_container(&container_name, None::<InspectContainerOptions>)
121151
.await

crates/openshell-cli/src/bootstrap.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,9 @@ pub async fn run_bootstrap(
144144
);
145145
eprintln!();
146146

147-
let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name);
147+
// Auto-bootstrap always recreates if stale Docker resources are found
148+
// (e.g. metadata was deleted but container/volume still exist).
149+
let mut options = openshell_bootstrap::DeployOptions::new(&gateway_name).with_recreate(true);
148150
if let Some(dest) = remote {
149151
let mut remote_opts = openshell_bootstrap::RemoteOptions::new(dest);
150152
if let Some(key) = ssh_key {

crates/openshell-cli/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -740,8 +740,8 @@ enum GatewayCommands {
740740

741741
/// Destroy and recreate the gateway from scratch if one already exists.
742742
///
743-
/// Without this flag, an interactive prompt asks what to do; in
744-
/// non-interactive mode the existing gateway is reused silently.
743+
/// Without this flag, an interactive prompt asks whether to recreate;
744+
/// in non-interactive mode the existing gateway is reused silently.
745745
#[arg(long)]
746746
recreate: bool,
747747

0 commit comments

Comments (0)