diff --git a/.agents/skills/test-backend/SKILL.md b/.agents/skills/test-backend/SKILL.md new file mode 100644 index 0000000..2696c80 --- /dev/null +++ b/.agents/skills/test-backend/SKILL.md @@ -0,0 +1,70 @@ +--- +name: test-backend +description: Launch and test the k3d or k3s backend lifecycle (init, up, kubectl, down, purge). Use when you want to run a full integration test of a stack backend. +user_invocable: true +metadata: + author: obol-team + version: "1.0.0" + domain: testing + triggers: test backend, test k3d, test k3s, integration test, flow test, backend test + role: tester + scope: validation + output-format: report +--- + +# Test Backend Skill + +Runs a full lifecycle integration test for the obol stack backend (k3d or k3s). + +## Arguments + +The skill accepts an optional argument specifying which backend to test: + +- `k3s` - Test the k3s (bare-metal) backend only +- `k3d` - Test the k3d (Docker-based) backend only +- `all` - Test both backends sequentially (default) +- No argument defaults to `all` + +Examples: +- `/test-backend k3s` +- `/test-backend k3d` +- `/test-backend all` +- `/test-backend` (same as `all`) + +## Workflow + +### 1. Pre-flight + +- Build the obol binary: `go build -o .workspace/bin/obol ./cmd/obol` from the project root +- Verify the binary was created successfully +- Set `OBOL_DEVELOPMENT=true` and add `.workspace/bin` to PATH + +### 2. Run Test Script + +Based on the argument, run the appropriate test script(s) located alongside this skill: + +- **k3s**: Run `.agents/skills/test-backend/scripts/test-k3s.sh` +- **k3d**: Run `.agents/skills/test-backend/scripts/test-k3d.sh` +- **all**: Run k3s first, then k3d (k3s requires sudo so test it first while credentials are fresh) + +Execute the script via Bash tool from the project root directory. The scripts require: +- **k3s**: Linux, sudo access, k3s binary in `.workspace/bin/` +- **k3d**: Docker running, k3d binary in `.workspace/bin/` + +### 3. Report Results + +After each script completes, report: +- Total pass/fail counts (shown in the RESULTS line) +- Any specific test failures with their names +- Overall verdict: all green or needs attention + +If a test script fails (non-zero exit), read the output to identify which test(s) failed and summarize. + +## Important Notes + +- The k3s backend requires **sudo access** - the user may need to enter their password +- The k3d backend requires **Docker to be running** +- Each test script performs its own cleanup (purge) before and after +- Tests are sequential and ordered: init -> up -> verify -> down -> restart -> purge +- Typical runtime: ~2-4 minutes per backend +- If the environment has issues (Docker not starting, k3s not installing), report the problem clearly rather than retrying endlessly diff --git a/.agents/skills/test-backend/scripts/test-k3d.sh b/.agents/skills/test-backend/scripts/test-k3d.sh new file mode 100755 index 0000000..9657254 --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3d.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3d Backend Integration Test +# Requires: Docker running, k3d binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." 
&& pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! "$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3d_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +# Pre-flight: verify Docker is running +if ! docker info >/dev/null 2>&1; then + log "ERROR: Docker is not running. Start Docker and try again." + exit 1 +fi + +log "=========================================" +log "K3d Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init (default = k3d) --- +log "" +log "--- TEST 1: stack init (default = k3d) ---" +check "stack init" $OBOL stack init +check "k3d.yaml exists" test -f .workspace/config/k3d.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3d" test "$BACKEND" = "k3d" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init + +# --- TEST 3: stack init --force --- +log "" +log "--- TEST 3: stack init --force ---" +$OBOL stack init --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml + +# Wait for nodes to be ready (k3d can take a moment) +log " Waiting for nodes to be ready..." 
+DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d is functional (nodes ready)" k3d_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack down --- +log "" +log "--- TEST 6: stack down ---" +check "stack down" $OBOL stack down +check "config preserved after down" test -f .workspace/config/.stack-id + +# Verify cluster stopped (kubectl should fail) +sleep 2 +check_fail "kubectl unreachable after down" $OBOL kubectl get nodes --no-headers + +# --- TEST 7: stack down already stopped --- +log "" +log "--- TEST 7: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 8: stack up (restart after down) --- +log "" +log "--- TEST 8: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up + +# Wait for nodes to be ready after restart +log " Waiting for nodes to be ready..." +DEADLINE=$((SECONDS + 120)) +while [ $SECONDS -lt $DEADLINE ]; do + if k3d_is_functional; then break; fi + sleep 3 +done +check "k3d functional after restart" k3d_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 9: stack purge --- +log "" +log "--- TEST 9: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id + +# --- TEST 10: full cycle + purge --force --- +log "" +log "--- TEST 10: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3d RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/.agents/skills/test-backend/scripts/test-k3s.sh b/.agents/skills/test-backend/scripts/test-k3s.sh new file mode 100755 index 0000000..03e3bca --- /dev/null +++ b/.agents/skills/test-backend/scripts/test-k3s.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +set -euo pipefail + +# K3s Backend Integration Test +# Requires: Linux, sudo access, k3s binary, OBOL_DEVELOPMENT=true + +PROJECT_ROOT="$(cd "$(dirname "$0")/../../../.." && pwd)" +OBOL="${PROJECT_ROOT}/.workspace/bin/obol" +export OBOL_DEVELOPMENT=true +export PATH="${PROJECT_ROOT}/.workspace/bin:$PATH" + +cd "$PROJECT_ROOT" + +PASS=0 +FAIL=0 + +log() { echo "$(date +%H:%M:%S) $*"; } +pass() { log " PASS: $*"; PASS=$((PASS + 1)); } +fail() { log " FAIL: $*"; FAIL=$((FAIL + 1)); } + +check() { + local desc="$1"; shift + if "$@"; then pass "$desc"; else fail "$desc"; fi +} + +check_fail() { + local desc="$1"; shift + if ! 
"$@" 2>/dev/null; then pass "$desc"; else fail "$desc (should have failed)"; fi +} + +k3s_is_functional() { + $OBOL kubectl get nodes --no-headers 2>/dev/null | grep -q "Ready" +} + +log "=========================================" +log "K3s Backend Integration Test" +log "=========================================" + +# --- Cleanup --- +log "--- Cleanup: purging any existing stack ---" +$OBOL stack purge --force 2>/dev/null || true + +# --- TEST 1: stack init --backend k3s --- +log "" +log "--- TEST 1: stack init --backend k3s ---" +check "stack init --backend k3s" $OBOL stack init --backend k3s +check "k3s-config.yaml exists" test -f .workspace/config/k3s-config.yaml +check ".stack-id exists" test -f .workspace/config/.stack-id +check ".stack-backend exists" test -f .workspace/config/.stack-backend +check "defaults/ directory exists" test -d .workspace/config/defaults +BACKEND=$(cat .workspace/config/.stack-backend) +check "backend is k3s" test "$BACKEND" = "k3s" +STACK_ID=$(cat .workspace/config/.stack-id) +log " Stack ID: $STACK_ID" + +# --- TEST 2: stack init again (should fail without --force) --- +log "" +log "--- TEST 2: stack init again (should fail without --force) ---" +check_fail "init without --force correctly rejected" $OBOL stack init --backend k3s + +# --- TEST 3: stack init --force (should preserve stack ID) --- +log "" +log "--- TEST 3: stack init --force (should preserve stack ID) ---" +$OBOL stack init --backend k3s --force +NEW_ID=$(cat .workspace/config/.stack-id) +check "stack ID preserved on --force ($STACK_ID)" test "$STACK_ID" = "$NEW_ID" + +# --- TEST 4: stack up --- +log "" +log "--- TEST 4: stack up ---" +check "stack up" $OBOL stack up +check "PID file exists" test -f .workspace/config/.k3s.pid +check "kubeconfig.yaml exists" test -f .workspace/config/kubeconfig.yaml +check "k3s is functional (nodes ready)" k3s_is_functional + +# --- TEST 5: kubectl passthrough --- +log "" +log "--- TEST 5: kubectl passthrough ---" +NODES=$($OBOL kubectl get nodes --no-headers 2>/dev/null | wc -l) +check "kubectl sees nodes ($NODES)" test "$NODES" -ge 1 + +NS=$($OBOL kubectl get namespaces --no-headers 2>/dev/null | wc -l) +check "kubectl sees namespaces ($NS)" test "$NS" -ge 1 + +# --- TEST 6: stack up idempotent (already running) --- +log "" +log "--- TEST 6: stack up idempotent ---" +OLD_PID=$(cat .workspace/config/.k3s.pid) +check "stack up while running" $OBOL stack up +NEW_PID=$(cat .workspace/config/.k3s.pid) +check "PID unchanged (idempotent) ($OLD_PID = $NEW_PID)" test "$OLD_PID" = "$NEW_PID" + +# --- TEST 7: stack down --- +log "" +log "--- TEST 7: stack down ---" +check "stack down" $OBOL stack down +check "PID file cleaned up" test ! -f .workspace/config/.k3s.pid +check "config preserved after down" test -f .workspace/config/.stack-id +log " Waiting for API server to become unreachable..." +API_DOWN=false +for i in $(seq 1 15); do + if ! 
$OBOL kubectl get nodes --no-headers 2>/dev/null; then + API_DOWN=true + break + fi + sleep 2 +done +check "kubectl unreachable after down" test "$API_DOWN" = "true" + +# --- TEST 8: stack down again (already stopped) --- +log "" +log "--- TEST 8: stack down already stopped ---" +check "stack down (already stopped)" $OBOL stack down + +# --- TEST 9: stack up (restart after down) --- +log "" +log "--- TEST 9: stack up (restart) ---" +check "stack up (restart)" $OBOL stack up +check "PID file exists after restart" test -f .workspace/config/.k3s.pid +check "k3s functional after restart" k3s_is_functional + +READY=$($OBOL kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || true) +check "node ready after restart ($READY)" test "$READY" -ge 1 + +# --- TEST 10: stack purge (without --force) --- +log "" +log "--- TEST 10: stack purge ---" +check "stack purge" $OBOL stack purge +sleep 2 +check "config removed" test ! -f .workspace/config/.stack-id +check "k3s pid file removed" test ! -f .workspace/config/.k3s.pid + +# --- TEST 11: full cycle + purge --force --- +log "" +log "--- TEST 11: full cycle + purge --force ---" +check "init for purge test" $OBOL stack init --backend k3s +check "up for purge test" $OBOL stack up +check "purge --force" $OBOL stack purge --force +sleep 2 +check "config removed after purge --force" test ! -f .workspace/config/.stack-id + +log "" +log "=========================================" +log "K3s RESULTS: $PASS passed, $FAIL failed" +log "=========================================" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/CLAUDE.md b/CLAUDE.md index 8aa79e8..e2fa9b4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -20,6 +20,19 @@ The Obol Stack is a local Kubernetes-based framework for running blockchain netw 5. **Two-stage templating**: CLI flags → Go templates → Helmfile → Kubernetes resources 6. **Development mode**: Local `.workspace/` directory with `go run` wrapper for rapid development +### Routing and Gateway API + +Obol Stack uses Traefik with the Kubernetes Gateway API for HTTP routing. + +- Controller: Traefik Helm chart (`traefik` namespace) +- GatewayClass: `traefik` +- Gateway: `traefik-gateway` in `traefik` namespace +- HTTPRoute patterns: + - `/` → `obol-frontend` + - `/rpc` → `erpc` + - `/ethereum-/execution` and `/ethereum-/beacon` + - `/aztec-` and `/helios-` + ## Bootstrap Installer: obolup.sh ### Purpose diff --git a/README.md b/README.md index 0f24b0d..e525dca 100644 --- a/README.md +++ b/README.md @@ -394,6 +394,35 @@ obol stack purge -f > [!WARNING] > The `purge` command permanently deletes all cluster data and configuration. The `-f` flag is required to remove persistent volume claims (PVCs) owned by root. Use with caution. +### Dashboard Authentication (Better Auth) + +The dashboard UI is protected behind login when configured. RPC endpoints under `/rpc/*` remain unauthenticated (the x402 payment flow is handled separately). 
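+A minimal sketch for a local run: export the variables listed below in the same shell before running `obol stack up` (every value here is a placeholder; substitute your own domain, secret, and Google OAuth credentials):
+
+```bash
+# Placeholder values only; see the list of required variables below.
+export STACK_PUBLIC_DOMAIN="obol.stack"              # or your Cloudflare tunnel hostname
+export BETTER_AUTH_SECRET="$(openssl rand -hex 32)"  # any secret of 32+ characters
+export OBOL_GOOGLE_CLIENT_ID="your-google-client-id"
+export OBOL_GOOGLE_CLIENT_SECRET="your-google-client-secret"
+obol stack up
+```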
+ +**Required environment variables (set before `obol stack up`):** + +- `STACK_PUBLIC_DOMAIN` (defaults to `obol.stack`; set to your Cloudflare tunnel hostname for internet exposure) +- `BETTER_AUTH_SECRET` (min 32 chars) +- `OBOL_GOOGLE_CLIENT_ID` +- `OBOL_GOOGLE_CLIENT_SECRET` + +**Google OAuth redirect URI:** + +Register this in Google Cloud Console: + +```text +https:///api/auth/callback/google +``` + +**Nodecore token refresh (for eRPC upstream header injection):** + +Create/update the Secret `erpc/nodecore-oauth-refresh` with: + +- `client_id` +- `client_secret` +- `refresh_token` + +The in-cluster CronJob refreshes a short-lived Google `id_token` and writes it into `erpc/obol-oauth-token`, which eRPC uses to inject `X-Nodecore-Token` on upstream requests. + ### Working with Kubernetes The `obol` CLI includes convenient wrappers for common Kubernetes tools. These automatically use the correct cluster configuration: diff --git a/cmd/obol/bootstrap.go b/cmd/obol/bootstrap.go index f2d3eb2..60683d3 100644 --- a/cmd/obol/bootstrap.go +++ b/cmd/obol/bootstrap.go @@ -27,7 +27,7 @@ func bootstrapCommand(cfg *config.Config) *cli.Command { // Step 1: Initialize stack fmt.Println("Initializing stack configuration...") - if err := stack.Init(cfg, false); err != nil { + if err := stack.Init(cfg, false, ""); err != nil { // Check if it's an "already exists" error - that's okay if !strings.Contains(err.Error(), "already exists") { return fmt.Errorf("bootstrap init failed: %w", err) diff --git a/cmd/obol/main.go b/cmd/obol/main.go index cde6626..871eb07 100644 --- a/cmd/obol/main.go +++ b/cmd/obol/main.go @@ -12,6 +12,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/app" "github.com/ObolNetwork/obol-stack/internal/config" "github.com/ObolNetwork/obol-stack/internal/stack" + "github.com/ObolNetwork/obol-stack/internal/tunnel" "github.com/ObolNetwork/obol-stack/internal/version" "github.com/urfave/cli/v2" ) @@ -57,6 +58,11 @@ COMMANDS: app sync Deploy application to cluster app delete Remove application and cluster resources + Tunnel Management: + tunnel status Show tunnel status and public URL + tunnel restart Restart tunnel to get a new URL + tunnel logs View cloudflared logs + Kubernetes Tools (with auto-configured KUBECONFIG): kubectl Run kubectl with stack kubeconfig (passthrough) helm Run helm with stack kubeconfig (passthrough) @@ -96,9 +102,14 @@ GLOBAL OPTIONS: Aliases: []string{"f"}, Usage: "Force overwrite existing configuration", }, + &cli.StringFlag{ + Name: "backend", + Usage: "Cluster backend: k3d (Docker-based) or k3s (bare-metal)", + EnvVars: []string{"OBOL_BACKEND"}, + }, }, Action: func(c *cli.Context) error { - return stack.Init(cfg, c.Bool("force")) + return stack.Init(cfg, c.Bool("force"), c.String("backend")) }, }, { @@ -157,6 +168,43 @@ GLOBAL OPTIONS: }, }, // ============================================================ + // Tunnel Management Commands + // ============================================================ + { + Name: "tunnel", + Usage: "Manage Cloudflare tunnel for public access", + Subcommands: []*cli.Command{ + { + Name: "status", + Usage: "Show tunnel status and public URL", + Action: func(c *cli.Context) error { + return tunnel.Status(cfg) + }, + }, + { + Name: "restart", + Usage: "Restart the tunnel to get a new URL", + Action: func(c *cli.Context) error { + return tunnel.Restart(cfg) + }, + }, + { + Name: "logs", + Usage: "View cloudflared logs", + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "follow", + Aliases: []string{"f"}, + Usage: "Follow log 
output", + }, + }, + Action: func(c *cli.Context) error { + return tunnel.Logs(cfg, c.Bool("follow")) + }, + }, + }, + }, + // ============================================================ // Kubernetes Tool Passthroughs (with auto-configured KUBECONFIG) // ============================================================ { diff --git a/internal/embed/embed.go b/internal/embed/embed.go index 2c189eb..7a0d723 100644 --- a/internal/embed/embed.go +++ b/internal/embed/embed.go @@ -15,6 +15,9 @@ import ( //go:embed k3d-config.yaml var K3dConfig string +//go:embed k3s-config.yaml +var K3sConfig string + //go:embed all:infrastructure var infrastructureFS embed.FS diff --git a/internal/embed/infrastructure/base/templates/llm.yaml b/internal/embed/infrastructure/base/templates/llm.yaml new file mode 100644 index 0000000..5633866 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/llm.yaml @@ -0,0 +1,262 @@ +--- +# LLM foundation services (OKR-1) +# +# This deploys: +# - Ollama (as the upstream LLM runtime) +# - llms.py (LLMSpy) as an OpenAI-compatible gateway / router over providers +# +# Design notes: +# - We default to Ollama Cloud (`glm-4.7:cloud`) to avoid requiring local GPU/VRAM. +# - We persist Ollama's identity keypair at `/root/.ollama/id_ed25519` so the +# Ollama Cloud "connect" binding survives pod restarts/upgrades. +# - Model cache is kept on `emptyDir` (ephemeral) per product decision. +apiVersion: v1 +kind: Namespace +metadata: + name: llm + +--- +# Persist Ollama identity (Ollama Cloud connect uses the public key derived from this keypair). +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: ollama-home + namespace: llm +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 256Mi + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ollama + namespace: llm + labels: + app: ollama +spec: + replicas: 1 + # Ollama uses a ReadWriteOnce PVC; avoid surging a second pod during updates. + strategy: + type: Recreate + selector: + matchLabels: + app: ollama + template: + metadata: + labels: + app: ollama + spec: + containers: + - name: ollama + image: ollama/ollama:latest + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 11434 + protocol: TCP + env: + # Store model blobs (including any cloud model stubs/cache) in an ephemeral volume. + - name: OLLAMA_MODELS + value: /models + # Explicitly bind the HTTP API to all interfaces in-cluster. + - name: OLLAMA_HOST + value: 0.0.0.0:11434 + volumeMounts: + # Persist identity + config (e.g. ~/.ollama/id_ed25519) for Ollama Cloud connect. 
+ - name: ollama-home + mountPath: /root/.ollama + - name: ollama-models + mountPath: /models + readinessProbe: + httpGet: + path: /api/version + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + livenessProbe: + httpGet: + path: /api/version + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 2 + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 2000m + memory: 4Gi + volumes: + - name: ollama-home + persistentVolumeClaim: + claimName: ollama-home + - name: ollama-models + emptyDir: {} + +--- +apiVersion: v1 +kind: Service +metadata: + name: ollama + namespace: llm + labels: + app: ollama +spec: + type: ClusterIP + selector: + app: ollama + ports: + - name: http + port: 11434 + targetPort: http + protocol: TCP + +--- +# llms.py configuration for Obol Stack: +# - Only enable the Ollama provider +# - Default model is `glm-4.7:cloud` (cloud-first) +apiVersion: v1 +kind: ConfigMap +metadata: + name: llmspy-config + namespace: llm +data: + llms.json: | + { + "defaults": { + "headers": { + "Content-Type": "application/json" + }, + "text": { + "model": "glm-4.7:cloud", + "messages": [ + { "role": "user", "content": "" } + ] + } + }, + "providers": { + "ollama": { + "enabled": true, + "type": "OllamaProvider", + "base_url": "http://ollama.llm.svc.cluster.local:11434", + "models": {}, + "all_models": true + } + } + } + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llmspy + namespace: llm + labels: + app: llmspy +spec: + replicas: 1 + selector: + matchLabels: + app: llmspy + template: + metadata: + labels: + app: llmspy + spec: + initContainers: + # Seed ~/.llms/llms.json from the ConfigMap. llms.py also writes runtime + # state (e.g. analytics) under ~/.llms, so we keep the directory writable. + - name: seed-config + image: busybox:1.36.1 + imagePullPolicy: IfNotPresent + command: + - sh + - -c + - | + set -eu + mkdir -p /data + cp /config/llms.json /data/llms.json + volumeMounts: + - name: llmspy-config + mountPath: /config + readOnly: true + - name: llmspy-home + mountPath: /data + containers: + - name: llmspy + # Official LLMSpy container image (published by upstream). + # Pin a specific version for reproducibility. + image: ghcr.io/servicestack/llms:v2.0.30 + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8000 + protocol: TCP + command: + - llms + args: + - --config + - /home/llms/.llms/llms.json + - --serve + - "8000" + env: + # Avoid surprises if the image changes its default HOME. 
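+          # With HOME=/home/llms, the llmspy-home volume mounted at /home/llms/.llms
+          # holds the llms.json seeded by the init container, matching the --config path above.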
+ - name: HOME + value: /home/llms + volumeMounts: + - name: llmspy-home + mountPath: /home/llms/.llms + readinessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 2 + livenessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 2 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 1000m + memory: 1Gi + volumes: + - name: llmspy-config + configMap: + name: llmspy-config + items: + - key: llms.json + path: llms.json + - name: llmspy-home + emptyDir: {} + +--- +apiVersion: v1 +kind: Service +metadata: + name: llmspy + namespace: llm + labels: + app: llmspy +spec: + type: ClusterIP + selector: + app: llmspy + ports: + - name: http + port: 8000 + targetPort: http + protocol: TCP diff --git a/internal/embed/infrastructure/base/templates/local-path.yaml b/internal/embed/infrastructure/base/templates/local-path.yaml index 77713e9..2547c50 100644 --- a/internal/embed/infrastructure/base/templates/local-path.yaml +++ b/internal/embed/infrastructure/base/templates/local-path.yaml @@ -11,7 +11,7 @@ data: "nodePathMap":[ { "node":"DEFAULT_PATH_FOR_NON_LISTED_NODES", - "paths":["/data"] + "paths":["{{ .Values.dataDir }}"] } ] } diff --git a/internal/embed/infrastructure/base/templates/oauth-token.yaml b/internal/embed/infrastructure/base/templates/oauth-token.yaml new file mode 100644 index 0000000..d5baf56 --- /dev/null +++ b/internal/embed/infrastructure/base/templates/oauth-token.yaml @@ -0,0 +1,176 @@ +--- +# Nodecore OAuth token plumbing for eRPC upstream auth (issue #124) +apiVersion: v1 +kind: Namespace +metadata: + name: erpc + +--- +apiVersion: v1 +kind: Secret +metadata: + name: obol-oauth-token + namespace: erpc +type: Opaque +stringData: + # Google `id_token` (JWT). CronJob refreshes and writes into this Secret. + token: "" + +--- +apiVersion: v1 +kind: Secret +metadata: + name: nodecore-oauth-refresh + namespace: erpc +type: Opaque +stringData: + # Google OAuth client credentials + refresh token. + # This is intentionally stored separately from the ID token written to `obol-oauth-token`. + client_id: "" + client_secret: "" + refresh_token: "" + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nodecore-token-writer + namespace: erpc +rules: + - apiGroups: [""] + resources: ["secrets"] + resourceNames: ["obol-oauth-token"] + verbs: ["get", "update", "patch"] + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nodecore-token-refresher + namespace: erpc + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nodecore-token-writer + namespace: erpc +subjects: + - kind: ServiceAccount + name: nodecore-token-refresher + namespace: erpc +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: nodecore-token-writer + +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: nodecore-token-refresher + namespace: erpc +spec: + # Refresh every 45 minutes to stay ahead of typical 1h ID token expiry. 
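+  # "0,45 * * * *" fires at :00 and :45 of every hour, so the id_token in the Secret
+  # is never more than ~45 minutes old when eRPC reads it.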
+ schedule: "0,45 * * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + template: + spec: + serviceAccountName: nodecore-token-refresher + restartPolicy: OnFailure + containers: + - name: refresh + image: python:3.12-alpine + imagePullPolicy: IfNotPresent + env: + - name: GOOGLE_CLIENT_ID + valueFrom: + secretKeyRef: + name: nodecore-oauth-refresh + key: client_id + - name: GOOGLE_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: nodecore-oauth-refresh + key: client_secret + - name: GOOGLE_REFRESH_TOKEN + valueFrom: + secretKeyRef: + name: nodecore-oauth-refresh + key: refresh_token + command: + - python + - -c + - | + import base64 + import json + import os + import ssl + import urllib.parse + import urllib.request + + client_id = os.environ.get("GOOGLE_CLIENT_ID") + client_secret = os.environ.get("GOOGLE_CLIENT_SECRET") + refresh_token = os.environ.get("GOOGLE_REFRESH_TOKEN") + + if not client_id or not client_secret or not refresh_token: + raise SystemExit("Missing GOOGLE_CLIENT_ID/GOOGLE_CLIENT_SECRET/GOOGLE_REFRESH_TOKEN in Secret erpc/nodecore-oauth-refresh") + + token_url = "https://oauth2.googleapis.com/token" + body = urllib.parse.urlencode({ + "client_id": client_id, + "client_secret": client_secret, + "refresh_token": refresh_token, + "grant_type": "refresh_token", + }).encode("utf-8") + + req = urllib.request.Request( + token_url, + data=body, + method="POST", + headers={"Content-Type": "application/x-www-form-urlencoded"}, + ) + + with urllib.request.urlopen(req, timeout=20) as resp: + payload = json.loads(resp.read().decode("utf-8")) + + id_token = payload.get("id_token") + if not id_token: + raise SystemExit(f"Google token endpoint response missing id_token: {payload}") + + token_b64 = base64.b64encode(id_token.encode("utf-8")).decode("utf-8") + + namespace = "erpc" + secret_name = "obol-oauth-token" + api_server = "https://kubernetes.default.svc" + + sa_token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" + sa_ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + + with open(sa_token_path, "r", encoding="utf-8") as f: + sa_token = f.read().strip() + + patch = json.dumps({"data": {"token": token_b64}}).encode("utf-8") + patch_url = f"{api_server}/api/v1/namespaces/{namespace}/secrets/{secret_name}" + + ctx = ssl.create_default_context(cafile=sa_ca_path) + patch_req = urllib.request.Request( + patch_url, + data=patch, + method="PATCH", + headers={ + "Authorization": f"Bearer {sa_token}", + "Content-Type": "application/merge-patch+json", + "Accept": "application/json", + }, + ) + + with urllib.request.urlopen(patch_req, timeout=20, context=ctx) as resp: + if resp.status < 200 or resp.status >= 300: + raise SystemExit(f"Failed to patch Secret {namespace}/{secret_name}: HTTP {resp.status} {resp.read().decode('utf-8')}") + + print("Updated Secret erpc/obol-oauth-token") diff --git a/internal/embed/infrastructure/base/templates/obol-agent.yaml b/internal/embed/infrastructure/base/templates/obol-agent.yaml index f73dda7..7220dbf 100644 --- a/internal/embed/infrastructure/base/templates/obol-agent.yaml +++ b/internal/embed/infrastructure/base/templates/obol-agent.yaml @@ -139,6 +139,24 @@ spec: - name: PUBLIC_MODE value: "false" + # OKR-1: Default LLM backend via llms.py + Ollama Cloud + # + # The Obol Stack agent is provider-agnostic: + # - `llms.py` (LLMSpy) exposes an OpenAI-compatible API at /v1 + # - LLMSpy forwards to Ollama (in-cluster), which can run `*:cloud` models + # + # 
Important: Ollama Cloud requires a one-time "connect" of the pod identity + # (public key derived from /root/.ollama/id_ed25519). We persist that key + # in the `llm/ollama-home` PVC so upgrades/restarts don't require re-connect. + - name: LLM_BACKEND + value: "llmspy" + - name: LLM_MODEL + value: "glm-4.7:cloud" + - name: OPENAI_API_BASE + value: "http://llmspy.llm.svc.cluster.local:8000/v1" + - name: OPENAI_API_KEY + value: "ollama" + # Health checks ensure the pod is ready to receive traffic livenessProbe: httpGet: @@ -179,4 +197,4 @@ spec: protocol: TCP name: http selector: - app: obol-agent # Routes traffic to pods with this label \ No newline at end of file + app: obol-agent # Routes traffic to pods with this label diff --git a/internal/embed/infrastructure/cloudflared/Chart.yaml b/internal/embed/infrastructure/cloudflared/Chart.yaml new file mode 100644 index 0000000..894505e --- /dev/null +++ b/internal/embed/infrastructure/cloudflared/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: cloudflared +description: Cloudflare Tunnel for public access +type: application +version: 0.1.0 +appVersion: "2024.12.2" diff --git a/internal/embed/infrastructure/cloudflared/templates/deployment.yaml b/internal/embed/infrastructure/cloudflared/templates/deployment.yaml new file mode 100644 index 0000000..212556d --- /dev/null +++ b/internal/embed/infrastructure/cloudflared/templates/deployment.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cloudflared + labels: + app.kubernetes.io/name: cloudflared + app.kubernetes.io/part-of: obol-stack +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: cloudflared + template: + metadata: + labels: + app.kubernetes.io/name: cloudflared + spec: + containers: + - name: cloudflared + image: cloudflare/cloudflared:2024.12.2 + args: + - tunnel + - --no-autoupdate + - --metrics + - 0.0.0.0:2000 + - --url + - http://traefik.traefik.svc.cluster.local:80 + ports: + - name: metrics + containerPort: 2000 + livenessProbe: + httpGet: + path: /ready + port: metrics + initialDelaySeconds: 10 + periodSeconds: 10 + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + restartPolicy: Always diff --git a/internal/embed/infrastructure/helmfile.yaml b/internal/embed/infrastructure/helmfile.yaml deleted file mode 100644 index 9f49d09..0000000 --- a/internal/embed/infrastructure/helmfile.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Helmfile for Obol Stack default infrastructure -# Orchestrates core infrastructure components deployed with every stack - -repositories: - - name: ingress-nginx - url: https://kubernetes.github.io/ingress-nginx - - name: obol - url: https://obolnetwork.github.io/helm-charts/ - - name: ethereum - url: https://ethpandaops.github.io/ethereum-helm-charts - -# Single source of truth: change this to switch networks -values: - - network: mainnet - -releases: - # Local storage provisioner (raw manifests wrapped as chart) - - name: base - namespace: kube-system - chart: ./base - values: - - dataDir: /data - - network: "{{ .Values.network }}" - - # Nginx ingress controller (upstream chart) - - name: ingress-nginx - namespace: ingress-nginx - chart: ingress-nginx/ingress-nginx - version: 4.13.3 - values: - - controller: - replicaCount: 1 - service: - type: LoadBalancer - externalTrafficPolicy: Local - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 100m - memory: 128Mi - tolerations: [] - admissionWebhooks: - enabled: false - - # eRPC - - name: erpc - namespace: erpc 
- chart: ethereum/erpc - needs: - - kube-system/base - - ingress-nginx/ingress-nginx - values: - - ./values/erpc.yaml.gotmpl - - # Obol Stack frontend - - name: obol-frontend - namespace: obol-frontend - chart: obol/obol-app - version: 0.1.0 - needs: - - ingress-nginx/ingress-nginx - - erpc/erpc - values: - - ./values/obol-frontend.yaml.gotmpl diff --git a/internal/embed/infrastructure/helmfile.yaml.gotmpl b/internal/embed/infrastructure/helmfile.yaml.gotmpl new file mode 100644 index 0000000..1fd2e7e --- /dev/null +++ b/internal/embed/infrastructure/helmfile.yaml.gotmpl @@ -0,0 +1,219 @@ +# Helmfile for Obol Stack default infrastructure +# Orchestrates core infrastructure components deployed with every stack +# Uses Traefik with Gateway API for routing (replaces nginx-ingress) +{{ $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} +{{- $dataDir := env "STACK_DATA_DIR" | default "/data" -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} +{{- $gatewayApiVersion := "v1.4.1" }} + +repositories: + - name: traefik + url: https://traefik.github.io/charts + - name: prometheus-community + url: https://prometheus-community.github.io/helm-charts + - name: obol + url: https://obolnetwork.github.io/helm-charts/ + - name: ethereum + url: https://ethpandaops.github.io/ethereum-helm-charts + - name: bedag + url: https://bedag.github.io/helm-charts/ + - name: stakater + url: https://stakater.github.io/stakater-charts + +releases: + # Local storage provisioner (raw manifests wrapped as chart) + - name: base + namespace: kube-system + chart: ./base + values: + - dataDir: '{{ $dataDir }}' + - network: "{{ $network }}" + + # Monitoring stack (Prometheus operator + Prometheus) + - name: monitoring + namespace: monitoring + createNamespace: true + chart: prometheus-community/kube-prometheus-stack + version: 79.5.0 + timeout: 600 + values: + - ./values/monitoring.yaml.gotmpl + + # Gateway API CRDs (applied from upstream release) + - name: gateway-api-crds + namespace: gateway-system + createNamespace: true + chart: bedag/raw + values: + - resources: [] + hooks: + - events: ["presync"] + showlogs: true + command: kubectl + args: + - apply + - -f + - https://github.com/kubernetes-sigs/gateway-api/releases/download/{{ $gatewayApiVersion }}/standard-install.yaml + + # Traefik ingress controller with Gateway API support + - name: traefik + namespace: traefik + createNamespace: true + chart: traefik/traefik + version: 38.0.2 + needs: + - gateway-system/gateway-api-crds + values: + # Gateway API provider configuration + - providers: + kubernetesGateway: + enabled: true + namespaces: [] # Watch all namespaces + kubernetesCRD: + enabled: true + kubernetesIngress: + enabled: false # Disable legacy Ingress support + # GatewayClass configuration + - gatewayClass: + enabled: true + name: traefik + # Gateway configuration (main entry point) + - gateway: + enabled: true + name: traefik-gateway + namespace: traefik + listeners: + web: + port: 8000 + protocol: HTTP + namespacePolicy: + from: All + # Ports configuration + - ports: + web: + port: 8000 + expose: + default: true + exposedPort: 80 + protocol: TCP + websecure: + port: 8443 + expose: + default: true + exposedPort: 443 + protocol: TCP + tls: + enabled: false # TLS termination disabled for local dev + # Service configuration + - service: + type: LoadBalancer + externalTrafficPolicy: Local + # Resource limits + - resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + # Disable dashboard by default + - 
ingressRoute: + dashboard: + enabled: false + + # Cloudflare Tunnel (quick tunnel mode for public access) + - name: cloudflared + namespace: traefik + chart: ./cloudflared + needs: + - traefik/traefik + + # Stakater Reloader (restarts workloads on Secret/ConfigMap change) + - name: reloader + namespace: reloader + createNamespace: true + chart: stakater/reloader + version: 2.2.7 + + # eRPC + - name: erpc + namespace: erpc + createNamespace: true + chart: ethereum/erpc + needs: + - kube-system/base + - traefik/traefik + values: + - ./values/erpc.yaml.gotmpl + + # eRPC HTTPRoute + - name: erpc-httproute + namespace: erpc + chart: bedag/raw + needs: + - traefik/traefik + - erpc/erpc + values: + - resources: + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: erpc + namespace: erpc + spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - "{{ $publicDomain }}" + rules: + - matches: + - path: + type: PathPrefix + value: /rpc + backendRefs: + - name: erpc + port: 4000 + + # Obol Stack frontend + - name: obol-frontend + namespace: obol-frontend + createNamespace: true + chart: obol/obol-app + version: 0.1.0 + needs: + - traefik/traefik + - erpc/erpc + values: + - ./values/obol-frontend.yaml.gotmpl + + # Obol Frontend HTTPRoute + - name: obol-frontend-httproute + namespace: obol-frontend + chart: bedag/raw + needs: + - traefik/traefik + - obol-frontend/obol-frontend + values: + - resources: + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: obol-frontend + namespace: obol-frontend + spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - "{{ $publicDomain }}" + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - name: obol-frontend-obol-app + port: 3000 diff --git a/internal/embed/infrastructure/values/erpc.yaml.gotmpl b/internal/embed/infrastructure/values/erpc.yaml.gotmpl index fdedc69..78274e9 100644 --- a/internal/embed/infrastructure/values/erpc.yaml.gotmpl +++ b/internal/embed/infrastructure/values/erpc.yaml.gotmpl @@ -1,4 +1,5 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} +{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} {{- $chainId := 1 -}} {{/* Default: mainnet */}} {{- if eq $network "hoodi" -}} {{- $chainId = 560048 -}} @@ -48,6 +49,14 @@ config: |- projects: - id: rpc + upstreams: + - id: nodecore + endpoint: https://rpc.nodecore.io + evm: + chainId: {{ $chainId }} + jsonRpc: + headers: + X-Nodecore-Token: "${OBOL_OAUTH_TOKEN}" networks: - architecture: evm evm: @@ -78,23 +87,28 @@ config: |- allowCredentials: true maxAge: 3600 -# Secret env variables +# Secret env variables (chart expects flat string map, e.g. 
KEY: "value") +# The OBOL_OAUTH_TOKEN is injected from a Kubernetes secret via extraEnv instead secretEnv: {} +# Inject the OAuth token from the Kubernetes secret +extraEnv: + - name: OBOL_OAUTH_TOKEN + valueFrom: + secretKeyRef: + name: obol-oauth-token + key: token + optional: true + # Extra args for the erpc container extraArgs: [] # Command replacement for the erpc container customCommand: [] +# Disable legacy Ingress - using Gateway API HTTPRoute instead ingress: - enabled: true - className: nginx - hosts: - - host: obol.stack - paths: - - path: /rpc - pathType: Prefix + enabled: false service: type: ClusterIP @@ -106,7 +120,8 @@ affinity: {} imagePullSecrets: [] # Annotations for the Deployment -annotations: {} +annotations: + secret.reloader.stakater.com/reload: "obol-oauth-token" # Liveness probe livenessProbe: @@ -131,7 +146,8 @@ nodeSelector: {} podLabels: {} # Pod annotations -podAnnotations: {} +podAnnotations: + secret.reloader.stakater.com/reload: "obol-oauth-token" # Pod management policy podManagementPolicy: OrderedReady diff --git a/internal/embed/infrastructure/values/monitoring.yaml.gotmpl b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl new file mode 100644 index 0000000..a7a6095 --- /dev/null +++ b/internal/embed/infrastructure/values/monitoring.yaml.gotmpl @@ -0,0 +1,37 @@ +prometheus: + enabled: true + prometheusSpec: + serviceMonitorSelectorNilUsesHelmValues: false + serviceMonitorSelector: + matchLabels: + release: monitoring + serviceMonitorNamespaceSelector: {} + podMonitorSelectorNilUsesHelmValues: false + podMonitorSelector: + matchLabels: + release: monitoring + podMonitorNamespaceSelector: {} + retention: 6h + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 1Gi + +prometheusOperator: + admissionWebhooks: + enabled: false # Disable webhook pre-install hooks (avoids timeout on fresh k3s) + +grafana: + enabled: false # Enable when we want UI access + +alertmanager: + enabled: false # Disable to keep the local stack lean + +kubeStateMetrics: + enabled: true + +nodeExporter: + enabled: true diff --git a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl index 3301156..66f068b 100644 --- a/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl +++ b/internal/embed/infrastructure/values/obol-frontend.yaml.gotmpl @@ -1,29 +1,59 @@ -{{- $network := .Values.network -}} +{{- $network := env "STACK_NETWORK" | default "mainnet" -}} +{{- $publicDomain := env "STACK_PUBLIC_DOMAIN" | default "obol.stack" -}} replicaCount: 1 +serviceAccount: + name: obol-frontend + image: environment: - name: NEXT_PUBLIC_HELIOS_CLIENT_URL value: "http://helios-{{ $network }}.helios.svc.cluster.local:8545" - name: NEXT_PUBLIC_ERPC_URL - value: "http://erpc.default.svc.cluster.local:4000/rpc" + value: "https://{{ $publicDomain }}/rpc" - name: NEXT_PUBLIC_AZTEC_SEQUENCER_URL value: "http://l2-sequencer-node-mainnet-node.aztec.svc.cluster.local:8080" + - name: BETTER_AUTH_SECRET + value: '{{ env "BETTER_AUTH_SECRET" }}' + - name: BETTER_AUTH_URL + value: "https://{{ $publicDomain }}" + - name: OBOL_GOOGLE_CLIENT_ID + value: '{{ env "OBOL_GOOGLE_CLIENT_ID" }}' + - name: OBOL_GOOGLE_CLIENT_SECRET + value: '{{ env "OBOL_GOOGLE_CLIENT_SECRET" }}' + - name: OBOL_AUTH_DB_PATH + value: "/data/auth.sqlite" + + # Obol Agent (ADK) in-cluster URL for CopilotKit runtime + - name: ADK_AGENT_URL + value: "http://obol-agent.agent.svc.cluster.local:8000/" + - name: 
NEXT_PUBLIC_ADK_AGENT_URL + value: "http://obol-agent.agent.svc.cluster.local:8000/" + + # Ollama in-cluster URL (used by dashboard to surface Ollama Cloud connect URL) + - name: OLLAMA_URL + value: "http://ollama.llm.svc.cluster.local:11434" repository: obolnetwork/obol-stack-front-end pullPolicy: Always - tag: "v0.1.1" + tag: "latest" service: type: ClusterIP port: 3000 +podSecurityContext: + fsGroup: 1001 + +volumes: + - name: auth-db + emptyDir: {} + +volumeMounts: + - name: auth-db + mountPath: /data + +# Disable legacy Ingress - using Gateway API HTTPRoute instead ingress: - enabled: true - className: "nginx" - hosts: - - host: obol.stack - paths: - - path: / - pathType: Prefix + enabled: false diff --git a/internal/embed/k3d-config.yaml b/internal/embed/k3d-config.yaml index 563d697..9a97c5d 100644 --- a/internal/embed/k3d-config.yaml +++ b/internal/embed/k3d-config.yaml @@ -35,7 +35,7 @@ options: - arg: --kube-apiserver-arg=feature-gates=KubeletInUserNamespace=true nodeFilters: - server:* - # Disable Traefik to use nginx instead + # Disable bundled Traefik (we install Traefik via Helm) - arg: --disable=traefik nodeFilters: - server:* diff --git a/internal/embed/k3s-config.yaml b/internal/embed/k3s-config.yaml new file mode 100644 index 0000000..1c75e5a --- /dev/null +++ b/internal/embed/k3s-config.yaml @@ -0,0 +1,24 @@ +# k3s server configuration for Obol Stack +# Generated by: obol stack init --backend k3s + +# Disable components we manage ourselves (matching k3d config) +disable: + - traefik + - local-storage + +# Data directory for k3s internal state +data-dir: {{DATA_DIR}}/k3s + +# Bind to all interfaces for local access +bind-address: 0.0.0.0 +https-listen-port: 6443 + +# TLS SANs for local access +tls-san: + - "127.0.0.1" + - "localhost" + - "obol.stack" + +# Node labels +node-label: + - "obol.cluster-id={{STACK_ID}}" diff --git a/internal/embed/networks/aztec/templates/ingress.yaml b/internal/embed/networks/aztec/templates/ingress.yaml index 1e8ddd3..821537d 100644 --- a/internal/embed/networks/aztec/templates/ingress.yaml +++ b/internal/embed/networks/aztec/templates/ingress.yaml @@ -1,23 +1,32 @@ {{- if eq .Release.Name "aztec-ingress" }} -apiVersion: networking.k8s.io/v1 -kind: Ingress +# HTTPRoute for Aztec sequencer node RPC +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute metadata: name: aztec namespace: {{ .Release.Namespace }} - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" spec: - ingressClassName: nginx + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack rules: - - host: obol.stack - http: - paths: - - path: /{{ .Release.Namespace }}(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: l2-sequencer-node-{{ .Values.id }}-node - port: - number: 8080 + - matches: + - path: + type: Exact + value: /{{ .Release.Namespace }} + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: l2-sequencer-node-{{ .Values.id }}-node + port: 8080 {{- end }} diff --git a/internal/embed/networks/ethereum/templates/ingress.yaml b/internal/embed/networks/ethereum/templates/ingress.yaml index 75a39a6..76c745e 100644 --- a/internal/embed/networks/ethereum/templates/ingress.yaml +++ b/internal/embed/networks/ethereum/templates/ingress.yaml @@ -1,30 +1,63 @@ {{- if eq .Release.Name "ethereum-ingress" }} 
-apiVersion: networking.k8s.io/v1 -kind: Ingress +# HTTPRoute for Ethereum execution client RPC +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute metadata: - name: ethereum + name: ethereum-execution namespace: {{ .Release.Namespace }} - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" spec: - ingressClassName: nginx + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack rules: - - host: obol.stack - http: - paths: - - path: /{{ .Release.Namespace }}/execution(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: ethereum-execution - port: - number: 8545 - - path: /{{ .Release.Namespace }}/beacon(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: ethereum-beacon - port: - number: 5052 + - matches: + - path: + type: Exact + value: /{{ .Release.Namespace }}/execution + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/execution/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: ethereum-execution + port: 8545 +--- +# HTTPRoute for Ethereum beacon client RPC +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: ethereum-beacon + namespace: {{ .Release.Namespace }} +spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack + rules: + - matches: + - path: + type: Exact + value: /{{ .Release.Namespace }}/beacon + - path: + type: PathPrefix + value: /{{ .Release.Namespace }}/beacon/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: ethereum-beacon + port: 5052 {{- end }} diff --git a/internal/embed/networks/helios/helmfile.yaml.gotmpl b/internal/embed/networks/helios/helmfile.yaml.gotmpl index 2be4293..c0a5d96 100644 --- a/internal/embed/networks/helios/helmfile.yaml.gotmpl +++ b/internal/embed/networks/helios/helmfile.yaml.gotmpl @@ -28,17 +28,45 @@ releases: size: 10Gi storageClass: local-path + # Disable legacy Ingress - using Gateway API HTTPRoute instead - ingress: - enabled: true - className: nginx - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" - hosts: - - host: obol.stack - paths: - - path: /helios-{{ .Values.id }}(/|$)(.*) - pathType: ImplementationSpecific + enabled: false + + # HTTPRoute for Helios RPC endpoint + - name: helios-httproute + namespace: helios-{{ .Values.id }} + chart: bedag/raw + values: + - resources: + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: helios + namespace: helios-{{ .Values.id }} + spec: + parentRefs: + - name: traefik-gateway + namespace: traefik + sectionName: web + hostnames: + - obol.stack + rules: + - matches: + - path: + type: Exact + value: /helios-{{ .Values.id }} + - path: + type: PathPrefix + value: /helios-{{ .Values.id }}/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + backendRefs: + - name: helios-{{ .Values.network }} + port: 8545 # Metadata ConfigMap for frontend discovery - name: helios-metadata diff --git a/internal/stack/backend.go b/internal/stack/backend.go new file mode 100644 index 0000000..f26014d --- /dev/null +++ b/internal/stack/backend.go @@ -0,0 +1,77 @@ +package stack + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + 
"github.com/ObolNetwork/obol-stack/internal/config" +) + +const ( + // BackendK3d is the k3d backend (Docker-based, default) + BackendK3d = "k3d" + // BackendK3s is the standalone k3s backend (bare-metal) + BackendK3s = "k3s" + + stackBackendFile = ".stack-backend" +) + +// Backend abstracts the Kubernetes cluster runtime (k3d, k3s) +type Backend interface { + // Name returns the backend identifier (e.g., "k3d", "k3s") + Name() string + + // Init generates backend-specific cluster configuration files + Init(cfg *config.Config, stackID string) error + + // Up creates or starts the cluster and returns kubeconfig contents + Up(cfg *config.Config, stackID string) (kubeconfigData []byte, err error) + + // IsRunning returns true if the cluster is currently running + IsRunning(cfg *config.Config, stackID string) (bool, error) + + // Down stops the cluster without destroying configuration or data + Down(cfg *config.Config, stackID string) error + + // Destroy removes the cluster entirely (containers/processes) + Destroy(cfg *config.Config, stackID string) error + + // DataDir returns the storage path for the local-path-provisioner. + // For k3d this is "/data" (Docker volume mount point). + // For k3s this is the absolute host path to cfg.DataDir. + DataDir(cfg *config.Config) string + + // Prerequisites checks that required software/permissions are available + Prerequisites(cfg *config.Config) error +} + +// NewBackend creates a Backend by name +func NewBackend(name string) (Backend, error) { + switch name { + case BackendK3d: + return &K3dBackend{}, nil + case BackendK3s: + return &K3sBackend{}, nil + default: + return nil, fmt.Errorf("unknown backend: %s (supported: k3d, k3s)", name) + } +} + +// LoadBackend reads the persisted backend choice from .stack-backend file. +// Falls back to k3d if no file exists (backward compatibility). +func LoadBackend(cfg *config.Config) (Backend, error) { + path := filepath.Join(cfg.ConfigDir, stackBackendFile) + data, err := os.ReadFile(path) + if err != nil { + return &K3dBackend{}, nil + } + return NewBackend(strings.TrimSpace(string(data))) +} + +// SaveBackend persists the backend choice +func SaveBackend(cfg *config.Config, name string) error { + path := filepath.Join(cfg.ConfigDir, stackBackendFile) + return os.WriteFile(path, []byte(name), 0644) +} diff --git a/internal/stack/backend_k3d.go b/internal/stack/backend_k3d.go new file mode 100644 index 0000000..8fdd3de --- /dev/null +++ b/internal/stack/backend_k3d.go @@ -0,0 +1,164 @@ +package stack + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" +) + +const ( + k3dConfigFile = "k3d.yaml" +) + +// K3dBackend manages clusters via k3d (k3s inside Docker containers) +type K3dBackend struct{} + +func (b *K3dBackend) Name() string { return BackendK3d } + +func (b *K3dBackend) Prerequisites(cfg *config.Config) error { + // Check Docker is running + cmd := exec.Command("docker", "info") + cmd.Stdout = nil + cmd.Stderr = nil + if err := cmd.Run(); err != nil { + return fmt.Errorf("Docker is not running. 
k3d backend requires Docker.\nStart Docker and try again") + } + + // Check k3d binary exists + k3dPath := filepath.Join(cfg.BinDir, "k3d") + if _, err := os.Stat(k3dPath); os.IsNotExist(err) { + return fmt.Errorf("k3d not found at %s\nRun obolup.sh to install dependencies", k3dPath) + } + return nil +} + +func (b *K3dBackend) Init(cfg *config.Config, stackID string) error { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + + absConfigDir, err := filepath.Abs(cfg.ConfigDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for config directory: %w", err) + } + + // Template k3d config with actual values + k3dConfig := embed.K3dConfig + k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID) + k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir) + k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir) + + k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) + if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil { + return fmt.Errorf("failed to write k3d config: %w", err) + } + + fmt.Printf("K3d config saved to: %s\n", k3dConfigPath) + return nil +} + +func (b *K3dBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers") + output, err := listCmd.Output() + if err != nil { + return false, fmt.Errorf("k3d list command failed: %w", err) + } + return strings.Contains(string(output), stackName), nil +} + +func (b *K3dBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) + + running, err := b.IsRunning(cfg, stackID) + if err != nil { + return nil, err + } + + if running { + fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID) + startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName) + startCmd.Stdout = os.Stdout + startCmd.Stderr = os.Stderr + if err := startCmd.Run(); err != nil { + return nil, fmt.Errorf("failed to start existing cluster: %w", err) + } + } else { + // Create data directory if it doesn't exist + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + if err := os.MkdirAll(absDataDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + fmt.Println("Creating k3d cluster...") + createCmd := exec.Command( + filepath.Join(cfg.BinDir, "k3d"), + "cluster", "create", stackName, + "--config", k3dConfigPath, + "--kubeconfig-update-default=false", + ) + createCmd.Stdout = os.Stdout + createCmd.Stderr = os.Stderr + if err := createCmd.Run(); err != nil { + return nil, fmt.Errorf("failed to create cluster: %w", err) + } + } + + // Export kubeconfig + kubeconfigCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName) + kubeconfigData, err := kubeconfigCmd.Output() + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig: %w", err) + } + + return kubeconfigData, nil +} + +func (b *K3dBackend) Down(cfg *config.Config, stackID string) error { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + + fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID) 
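+	// Prefer a graceful "k3d cluster stop"; if that fails or hangs, fall back to deleting
+	// the cluster containers below. Stack configuration in the config directory is untouched.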
+ + stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) + stopCmd.Stdout = os.Stdout + stopCmd.Stderr = os.Stderr + if err := stopCmd.Run(); err != nil { + fmt.Println("Graceful stop timed out or failed, forcing cluster deletion") + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + return fmt.Errorf("failed to stop cluster: %w", err) + } + } + + return nil +} + +func (b *K3dBackend) Destroy(cfg *config.Config, stackID string) error { + stackName := fmt.Sprintf("obol-stack-%s", stackID) + + fmt.Printf("Deleting cluster containers: %s\n", stackName) + deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) + deleteCmd.Stdout = os.Stdout + deleteCmd.Stderr = os.Stderr + if err := deleteCmd.Run(); err != nil { + fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) + } + + return nil +} + +func (b *K3dBackend) DataDir(cfg *config.Config) string { + return "/data" +} diff --git a/internal/stack/backend_k3s.go b/internal/stack/backend_k3s.go new file mode 100644 index 0000000..3325b13 --- /dev/null +++ b/internal/stack/backend_k3s.go @@ -0,0 +1,330 @@ +package stack + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "time" + + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/embed" +) + +const ( + k3sConfigFile = "k3s-config.yaml" + k3sPidFile = ".k3s.pid" + k3sLogFile = "k3s.log" +) + +// K3sBackend manages a standalone k3s cluster (bare-metal) +type K3sBackend struct{} + +func (b *K3sBackend) Name() string { return BackendK3s } + +func (b *K3sBackend) Prerequisites(cfg *config.Config) error { + if runtime.GOOS != "linux" { + return fmt.Errorf("k3s backend is only supported on Linux") + } + + // Check sudo access (allow interactive password prompt) + cmd := exec.Command("sudo", "-v") + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("k3s backend requires root/sudo access") + } + + // Check k3s binary exists + k3sPath := filepath.Join(cfg.BinDir, "k3s") + if _, err := os.Stat(k3sPath); os.IsNotExist(err) { + return fmt.Errorf("k3s not found at %s\nRun obolup.sh to install dependencies", k3sPath) + } + + return nil +} + +func (b *K3sBackend) Init(cfg *config.Config, stackID string) error { + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + + // Template k3s config with actual values + k3sConfig := embed.K3sConfig + k3sConfig = strings.ReplaceAll(k3sConfig, "{{STACK_ID}}", stackID) + k3sConfig = strings.ReplaceAll(k3sConfig, "{{DATA_DIR}}", absDataDir) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if err := os.WriteFile(k3sConfigPath, []byte(k3sConfig), 0644); err != nil { + return fmt.Errorf("failed to write k3s config: %w", err) + } + + fmt.Printf("K3s config saved to: %s\n", k3sConfigPath) + return nil +} + +func (b *K3sBackend) IsRunning(cfg *config.Config, stackID string) (bool, error) { + pid, err := b.readPid(cfg) + if err != nil { + return false, nil + } + + return b.isProcessAlive(pid), nil +} + +func (b *K3sBackend) Up(cfg *config.Config, stackID string) ([]byte, error) { + running, _ := b.IsRunning(cfg, stackID) + if running { + fmt.Println("k3s is already 
running") + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + data, err := os.ReadFile(kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("k3s is running but kubeconfig not found: %w", err) + } + return data, nil + } + + // Clean up stale PID file if it exists (QA R6) + b.cleanStalePid(cfg) + + k3sConfigPath := filepath.Join(cfg.ConfigDir, k3sConfigFile) + if _, err := os.Stat(k3sConfigPath); os.IsNotExist(err) { + return nil, fmt.Errorf("k3s config not found at %s\nRun 'obol stack init --backend k3s' first", k3sConfigPath) + } + + // Create data directory + absDataDir, err := filepath.Abs(cfg.DataDir) + if err != nil { + return nil, fmt.Errorf("failed to get absolute path for data directory: %w", err) + } + if err := os.MkdirAll(absDataDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create data directory: %w", err) + } + + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) + k3sBinary := filepath.Join(cfg.BinDir, "k3s") + logPath := filepath.Join(cfg.ConfigDir, k3sLogFile) + + // Remove stale kubeconfig so we wait for k3s to write a fresh one + os.Remove(kubeconfigPath) + + // Open log file for k3s output + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + return nil, fmt.Errorf("failed to create k3s log file: %w", err) + } + + fmt.Println("Starting k3s server...") + + // Start k3s server as background process via sudo + cmd := exec.Command("sudo", + k3sBinary, "server", + "--config", k3sConfigPath, + "--write-kubeconfig", kubeconfigPath, + "--write-kubeconfig-mode", "0600", + ) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to start k3s: %w", err) + } + + // Save PID before releasing the process handle + pid := cmd.Process.Pid + + // Write PID file + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(pid)), 0600); err != nil { + logFile.Close() + return nil, fmt.Errorf("failed to write k3s PID file: %w", err) + } + + // Detach the process + cmd.Process.Release() + logFile.Close() + + fmt.Printf("k3s started (pid: %d)\n", pid) + fmt.Printf("Logs: %s\n", logPath) + + // Wait for kubeconfig to be written by k3s + fmt.Println("Waiting for kubeconfig...") + deadline := time.Now().Add(2 * time.Minute) + for time.Now().Before(deadline) { + if info, err := os.Stat(kubeconfigPath); err == nil && info.Size() > 0 { + // Fix ownership: k3s writes kubeconfig as root via sudo + exec.Command("sudo", "chown", fmt.Sprintf("%d:%d", os.Getuid(), os.Getgid()), kubeconfigPath).Run() + + data, err := os.ReadFile(kubeconfigPath) + if err == nil && len(data) > 0 { + fmt.Println("Kubeconfig ready, waiting for API server...") + + // Wait for the API server to actually respond + apiDeadline := time.Now().Add(90 * time.Second) + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + for time.Now().Before(apiDeadline) { + probe := exec.Command(kubectlPath, "--kubeconfig", kubeconfigPath, + "get", "nodes", "--no-headers") + if out, err := probe.Output(); err == nil && len(out) > 0 { + fmt.Println("API server ready") + return data, nil + } + time.Sleep(3 * time.Second) + } + + // Return kubeconfig even if API isn't fully ready yet + fmt.Println("Warning: API server not fully ready, proceeding anyway") + return data, nil + } + } + time.Sleep(2 * time.Second) + } + + return nil, fmt.Errorf("k3s did not write kubeconfig within timeout\nCheck logs: %s", logPath) +} + +func (b 
*K3sBackend) Down(cfg *config.Config, stackID string) error { + pid, err := b.readPid(cfg) + if err != nil { + fmt.Println("k3s PID file not found, may not be running") + return nil + } + + if !b.isProcessAlive(pid) { + fmt.Println("k3s process not running, cleaning up PID file") + b.removePidFile(cfg) + return nil + } + + fmt.Printf("Stopping k3s (pid: %d)...\n", pid) + + // Send SIGTERM to the sudo/k3s process only (not the process group). + // Using negative PID (process group kill) is unsafe here because the saved PID + // is the sudo wrapper, whose process group can include unrelated system processes + // like systemd-logind — killing those crashes the desktop session. + // sudo forwards SIGTERM to k3s, which handles its own child process cleanup. + pidStr := strconv.Itoa(pid) + stopCmd := exec.Command("sudo", "kill", "-TERM", pidStr) + stopCmd.Stdout = os.Stdout + stopCmd.Stderr = os.Stderr + if err := stopCmd.Run(); err != nil { + fmt.Printf("SIGTERM failed, sending SIGKILL: %v\n", err) + exec.Command("sudo", "kill", "-9", pidStr).Run() + } + + // Wait for process to exit (up to 30 seconds) + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + if !b.isProcessAlive(pid) { + break + } + time.Sleep(1 * time.Second) + } + + // Clean up orphaned k3s child processes (containerd-shim, etc.) + // Use k3s-killall.sh if available, otherwise kill containerd shims directly. + killallPath := "/usr/local/bin/k3s-killall.sh" + if _, err := os.Stat(killallPath); err == nil { + fmt.Println("Running k3s cleanup...") + cleanCmd := exec.Command("sudo", killallPath) + cleanCmd.Stdout = os.Stdout + cleanCmd.Stderr = os.Stderr + cleanCmd.Run() + } else { + // k3s-killall.sh not installed (binary-only install via obolup). + // Kill orphaned containerd-shim processes that use the k3s socket. 
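+		// pkill -f matches the full command line, so only containerd-shim
+		// processes whose arguments mention k3s are signalled here.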
+ fmt.Println("Cleaning up k3s child processes...") + exec.Command("sudo", "pkill", "-TERM", "-f", "containerd-shim.*k3s").Run() + time.Sleep(2 * time.Second) + // Force-kill any that survived SIGTERM + exec.Command("sudo", "pkill", "-KILL", "-f", "containerd-shim.*k3s").Run() + } + + b.removePidFile(cfg) + fmt.Println("k3s stopped") + return nil +} + +func (b *K3sBackend) Destroy(cfg *config.Config, stackID string) error { + // Stop if running + b.Down(cfg, stackID) + + // Clean up k3s state directories (default + custom data-dir) + absDataDir, _ := filepath.Abs(cfg.DataDir) + cleanDirs := []string{ + "/var/lib/rancher/k3s", + "/etc/rancher/k3s", + filepath.Join(absDataDir, "k3s"), + } + for _, dir := range cleanDirs { + if _, err := os.Stat(dir); err == nil { + fmt.Printf("Cleaning up: %s\n", dir) + exec.Command("sudo", "rm", "-rf", dir).Run() + } + } + + // Run uninstall script if available + uninstallPath := "/usr/local/bin/k3s-uninstall.sh" + if _, err := os.Stat(uninstallPath); err == nil { + fmt.Println("Running k3s uninstall...") + uninstallCmd := exec.Command("sudo", uninstallPath) + uninstallCmd.Stdout = os.Stdout + uninstallCmd.Stderr = os.Stderr + uninstallCmd.Run() + } + + return nil +} + +func (b *K3sBackend) DataDir(cfg *config.Config) string { + absDataDir, _ := filepath.Abs(cfg.DataDir) + return absDataDir +} + +// readPid reads the k3s PID from the PID file +func (b *K3sBackend) readPid(cfg *config.Config) (int, error) { + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + data, err := os.ReadFile(pidPath) + if err != nil { + return 0, err + } + pid, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil { + return 0, fmt.Errorf("invalid PID in %s: %w", pidPath, err) + } + if pid <= 0 { + return 0, fmt.Errorf("invalid PID in %s: %d", pidPath, pid) + } + return pid, nil +} + +// cleanStalePid removes the PID file if the process is no longer running +func (b *K3sBackend) cleanStalePid(cfg *config.Config) { + pid, err := b.readPid(cfg) + if err != nil { + return + } + if !b.isProcessAlive(pid) { + fmt.Printf("Cleaning up stale PID file (pid %d no longer running)\n", pid) + b.removePidFile(cfg) + } +} + +// isProcessAlive checks if a root-owned process is still running. +// Uses sudo kill -0 since the k3s process runs as root and direct +// signal(0) from an unprivileged user returns EPERM. 
+func (b *K3sBackend) isProcessAlive(pid int) bool { + return exec.Command("sudo", "kill", "-0", strconv.Itoa(pid)).Run() == nil +} + +// removePidFile removes the k3s PID file +func (b *K3sBackend) removePidFile(cfg *config.Config) { + pidPath := filepath.Join(cfg.ConfigDir, k3sPidFile) + os.Remove(pidPath) +} diff --git a/internal/stack/backend_k3s_test.go b/internal/stack/backend_k3s_test.go new file mode 100644 index 0000000..e7a09ba --- /dev/null +++ b/internal/stack/backend_k3s_test.go @@ -0,0 +1,97 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +func TestK3sReadPid(t *testing.T) { + tests := []struct { + name string + content string + wantPid int + wantErr bool + errContains string + }{ + {name: "valid pid", content: "12345", wantPid: 12345}, + {name: "with trailing newline", content: "12345\n", wantPid: 12345}, + {name: "with whitespace", content: " 12345 ", wantPid: 12345}, + {name: "pid 1", content: "1", wantPid: 1}, + {name: "large pid", content: "4194304", wantPid: 4194304}, + {name: "not a number", content: "not-a-number", wantErr: true, errContains: "invalid PID"}, + {name: "empty content", content: "", wantErr: true, errContains: "invalid PID"}, + {name: "float", content: "123.45", wantErr: true, errContains: "invalid PID"}, + {name: "negative", content: "-1", wantErr: true, errContains: "invalid PID"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte(tt.content), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + pid, err := b.readPid(cfg) + if tt.wantErr { + if err == nil { + t.Fatalf("readPid() = %d, nil error; want error containing %q", pid, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("readPid() error = %q, want containing %q", err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("readPid() unexpected error: %v", err) + } + if pid != tt.wantPid { + t.Errorf("readPid() = %d, want %d", pid, tt.wantPid) + } + }) + } + + t.Run("missing file", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + _, err := b.readPid(cfg) + if err == nil { + t.Fatal("readPid() with no file should return error") + } + }) +} + +func TestK3sRemovePidFile(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + pidPath := filepath.Join(tmpDir, k3sPidFile) + if err := os.WriteFile(pidPath, []byte("12345"), 0600); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + b := &K3sBackend{} + b.removePidFile(cfg) + + if _, err := os.Stat(pidPath); !os.IsNotExist(err) { + t.Error("PID file should have been removed") + } +} + +func TestK3sRemovePidFileNoop(t *testing.T) { + // Removing a non-existent PID file should not panic or error + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + b := &K3sBackend{} + b.removePidFile(cfg) // should not panic +} diff --git a/internal/stack/backend_test.go b/internal/stack/backend_test.go new file mode 100644 index 0000000..e59836c --- /dev/null +++ b/internal/stack/backend_test.go @@ -0,0 +1,321 @@ +package stack + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +// Compile-time interface compliance checks +var ( + _ 
Backend = (*K3dBackend)(nil) + _ Backend = (*K3sBackend)(nil) +) + +func TestNewBackend(t *testing.T) { + tests := []struct { + name string + input string + wantName string + wantErr bool + errContains string + }{ + {name: "k3d backend", input: "k3d", wantName: "k3d"}, + {name: "k3s backend", input: "k3s", wantName: "k3s"}, + {name: "unknown backend", input: "docker", wantErr: true, errContains: "unknown backend"}, + {name: "empty string", input: "", wantErr: true, errContains: "unknown backend"}, + {name: "case sensitive", input: "K3D", wantErr: true, errContains: "unknown backend"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + backend, err := NewBackend(tt.input) + if tt.wantErr { + if err == nil { + t.Fatalf("NewBackend(%q) = nil error, want error containing %q", tt.input, tt.errContains) + } + if !strings.Contains(err.Error(), tt.errContains) { + t.Errorf("NewBackend(%q) error = %q, want containing %q", tt.input, err.Error(), tt.errContains) + } + return + } + if err != nil { + t.Fatalf("NewBackend(%q) unexpected error: %v", tt.input, err) + } + if backend.Name() != tt.wantName { + t.Errorf("NewBackend(%q).Name() = %q, want %q", tt.input, backend.Name(), tt.wantName) + } + }) + } +} + +func TestK3dBackendName(t *testing.T) { + b := &K3dBackend{} + if got := b.Name(); got != BackendK3d { + t.Errorf("K3dBackend.Name() = %q, want %q", got, BackendK3d) + } +} + +func TestK3sBackendName(t *testing.T) { + b := &K3sBackend{} + if got := b.Name(); got != BackendK3s { + t.Errorf("K3sBackend.Name() = %q, want %q", got, BackendK3s) + } +} + +func TestK3dBackendDataDir(t *testing.T) { + // k3d DataDir must always return "/data" regardless of cfg.DataDir, + // because k3d mounts the host data dir to /data inside the container. + tests := []struct { + name string + dataDir string + }{ + {name: "absolute path", dataDir: "/home/user/.local/share/obol"}, + {name: "relative path", dataDir: ".workspace/data"}, + {name: "empty string", dataDir: ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := &K3dBackend{} + cfg := &config.Config{DataDir: tt.dataDir} + if got := b.DataDir(cfg); got != "/data" { + t.Errorf("K3dBackend.DataDir() = %q, want %q (must always be /data for Docker mount)", got, "/data") + } + }) + } +} + +func TestK3sBackendDataDir(t *testing.T) { + // k3s DataDir must return an absolute version of cfg.DataDir, + // because k3s runs directly on the host. 
+ b := &K3sBackend{} + + t.Run("absolute path passthrough", func(t *testing.T) { + cfg := &config.Config{DataDir: "/home/user/.local/share/obol"} + got := b.DataDir(cfg) + if got != "/home/user/.local/share/obol" { + t.Errorf("K3sBackend.DataDir() = %q, want %q", got, "/home/user/.local/share/obol") + } + }) + + t.Run("relative path resolved to absolute", func(t *testing.T) { + cfg := &config.Config{DataDir: "relative/path"} + got := b.DataDir(cfg) + if !filepath.IsAbs(got) { + t.Errorf("K3sBackend.DataDir() = %q, want absolute path", got) + } + if !strings.HasSuffix(got, "relative/path") { + t.Errorf("K3sBackend.DataDir() = %q, want suffix %q", got, "relative/path") + } + }) +} + +func TestSaveAndLoadBackend(t *testing.T) { + tests := []struct { + name string + backend string + wantName string + }{ + {name: "save k3s load k3s", backend: "k3s", wantName: "k3s"}, + {name: "save k3d load k3d", backend: "k3d", wantName: "k3d"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + if err := SaveBackend(cfg, tt.backend); err != nil { + t.Fatalf("SaveBackend() error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != tt.wantName { + t.Errorf("LoadBackend().Name() = %q, want %q", backend.Name(), tt.wantName) + } + }) + } +} + +func TestLoadBackendFallsBackToK3d(t *testing.T) { + // When no .stack-backend file exists, LoadBackend must return k3d + // for backward compatibility with existing stacks. + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3d { + t.Errorf("LoadBackend() with no file = %q, want %q (backward compat)", backend.Name(), BackendK3d) + } +} + +func TestLoadBackendWithWhitespace(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + // Write file with trailing newline and whitespace + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("k3s\n "), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + backend, err := LoadBackend(cfg) + if err != nil { + t.Fatalf("LoadBackend() error: %v", err) + } + if backend.Name() != BackendK3s { + t.Errorf("LoadBackend() = %q, want %q", backend.Name(), BackendK3s) + } +} + +func TestLoadBackendInvalidName(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackBackendFile) + if err := os.WriteFile(path, []byte("docker-swarm"), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + _, err := LoadBackend(cfg) + if err == nil { + t.Fatal("LoadBackend() with invalid backend name should return error") + } + if !strings.Contains(err.Error(), "unknown backend") { + t.Errorf("LoadBackend() error = %q, want containing %q", err.Error(), "unknown backend") + } +} + +func TestK3dBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3dBackend{} + if err := b.Init(cfg, "test-stack"); err != nil { + t.Fatalf("K3dBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3dConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // 
Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + if strings.Contains(content, "{{CONFIG_DIR}}") { + t.Error("Config still contains {{CONFIG_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "test-stack") { + t.Error("Config does not contain stack ID 'test-stack'") + } + + // Verify paths are absolute + if !strings.Contains(content, tmpDir) { + t.Errorf("Config does not contain absolute data dir path %q", tmpDir) + } +} + +func TestK3sBackendInit(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ + ConfigDir: tmpDir, + DataDir: filepath.Join(tmpDir, "data"), + } + + b := &K3sBackend{} + if err := b.Init(cfg, "my-cluster"); err != nil { + t.Fatalf("K3sBackend.Init() error: %v", err) + } + + // Verify config file was written + configPath := filepath.Join(tmpDir, k3sConfigFile) + data, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("Failed to read generated config: %v", err) + } + + content := string(data) + + // Verify placeholders were replaced + if strings.Contains(content, "{{STACK_ID}}") { + t.Error("Config still contains {{STACK_ID}} placeholder") + } + if strings.Contains(content, "{{DATA_DIR}}") { + t.Error("Config still contains {{DATA_DIR}} placeholder") + } + + // Verify actual values are present + if !strings.Contains(content, "my-cluster") { + t.Error("Config does not contain stack ID 'my-cluster'") + } + + // Verify data-dir uses absolute path + absDataDir, _ := filepath.Abs(filepath.Join(tmpDir, "data")) + expectedDataDir := absDataDir + "/k3s" + if !strings.Contains(content, expectedDataDir) { + t.Errorf("Config does not contain absolute data-dir %q", expectedDataDir) + } +} + +func TestGetStackID(t *testing.T) { + tests := []struct { + name string + content string + want string + }{ + {name: "simple id", content: "happy-panda", want: "happy-panda"}, + {name: "with trailing newline", content: "happy-panda\n", want: "happy-panda"}, + {name: "with whitespace", content: " happy-panda \n", want: "happy-panda"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + path := filepath.Join(tmpDir, stackIDFile) + if err := os.WriteFile(path, []byte(tt.content), 0644); err != nil { + t.Fatalf("WriteFile error: %v", err) + } + + got := getStackID(cfg) + if got != tt.want { + t.Errorf("getStackID() = %q, want %q", got, tt.want) + } + }) + } + + t.Run("missing file returns empty", func(t *testing.T) { + tmpDir := t.TempDir() + cfg := &config.Config{ConfigDir: tmpDir} + + got := getStackID(cfg) + if got != "" { + t.Errorf("getStackID() with no file = %q, want empty string", got) + } + }) +} diff --git a/internal/stack/integration_test.go b/internal/stack/integration_test.go new file mode 100644 index 0000000..66088bc --- /dev/null +++ b/internal/stack/integration_test.go @@ -0,0 +1,255 @@ +//go:build integration + +package stack_test + +import ( + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" +) + +// Integration tests for the k3s backend user flows. +// Requires: sudo access, k3s binary, OBOL_DEVELOPMENT=true. 
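+// The tests shell out to the compiled obol binary and walk the lifecycle in
+// order: init -> up -> kubectl -> down -> restart -> purge.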
+// +// Run with: +// go test -tags integration -timeout 15m -v ./internal/stack/ + +func TestK3sUserFlows(t *testing.T) { + if os.Getenv("OBOL_DEVELOPMENT") != "true" { + t.Skip("OBOL_DEVELOPMENT not set, skipping integration test") + } + + projectRoot := findProjectRoot(t) + obol := filepath.Join(projectRoot, ".workspace", "bin", "obol") + if _, err := os.Stat(obol); os.IsNotExist(err) { + t.Fatalf("obol binary not found at %s — build it first", obol) + } + + configDir := filepath.Join(projectRoot, ".workspace", "config") + binDir := filepath.Join(projectRoot, ".workspace", "bin") + + // Helper to run obol commands + run := func(t *testing.T, args ...string) (string, error) { + t.Helper() + cmd := exec.Command(obol, args...) + cmd.Env = append(os.Environ(), + "OBOL_DEVELOPMENT=true", + "PATH="+binDir+":"+os.Getenv("PATH"), + ) + cmd.Dir = projectRoot + out, err := cmd.CombinedOutput() + return string(out), err + } + + // Cleanup before tests + run(t, "stack", "purge", "--force") + + // Cleanup after all tests + t.Cleanup(func() { + run(t, "stack", "purge", "--force") + }) + + t.Run("init", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("stack init failed: %v\n%s", err, out) + } + + // Verify config files created + for _, f := range []string{"k3s-config.yaml", ".stack-id", ".stack-backend"} { + if _, err := os.Stat(filepath.Join(configDir, f)); os.IsNotExist(err) { + t.Errorf("expected %s to exist after init", f) + } + } + + // Verify defaults directory + if _, err := os.Stat(filepath.Join(configDir, "defaults")); os.IsNotExist(err) { + t.Error("expected defaults/ directory after init") + } + + // Verify backend is k3s + data, _ := os.ReadFile(filepath.Join(configDir, ".stack-backend")) + if got := strings.TrimSpace(string(data)); got != "k3s" { + t.Errorf("backend = %q, want k3s", got) + } + }) + + t.Run("init_rejects_without_force", func(t *testing.T) { + _, err := run(t, "stack", "init", "--backend", "k3s") + if err == nil { + t.Error("init without --force should fail when config exists") + } + }) + + t.Run("init_force_preserves_stack_id", func(t *testing.T) { + idBefore, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + out, err := run(t, "stack", "init", "--backend", "k3s", "--force") + if err != nil { + t.Fatalf("stack init --force failed: %v\n%s", err, out) + } + idAfter, _ := os.ReadFile(filepath.Join(configDir, ".stack-id")) + if string(idBefore) != string(idAfter) { + t.Errorf("stack ID changed: %q → %q", string(idBefore), string(idAfter)) + } + }) + + t.Run("up", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up failed: %v\n%s", err, out) + } + + // Verify PID file and kubeconfig exist + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after stack up") + } + if _, err := os.Stat(filepath.Join(configDir, "kubeconfig.yaml")); os.IsNotExist(err) { + t.Error("kubeconfig not found after stack up") + } + }) + + t.Run("kubectl_passthrough", func(t *testing.T) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err != nil { + t.Fatalf("kubectl passthrough failed: %v\n%s", err, out) + } + lines := strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get nodes returned no nodes") + } + + out, err = run(t, "kubectl", "get", "namespaces", "--no-headers") + if err != nil { + t.Fatalf("kubectl get namespaces failed: %v\n%s", err, out) + } + lines = 
strings.Split(strings.TrimSpace(out), "\n") + if len(lines) < 1 { + t.Error("kubectl get namespaces returned no namespaces") + } + }) + + t.Run("up_idempotent", func(t *testing.T) { + pidBefore, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (idempotent) failed: %v\n%s", err, out) + } + + pidAfter, _ := os.ReadFile(filepath.Join(configDir, ".k3s.pid")) + if string(pidBefore) != string(pidAfter) { + t.Errorf("PID changed on idempotent up: %q → %q", string(pidBefore), string(pidAfter)) + } + }) + + t.Run("down", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down failed: %v\n%s", err, out) + } + + // PID file should be cleaned up + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after down") + } + + // Config should be preserved + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); os.IsNotExist(err) { + t.Error("stack ID should be preserved after down") + } + }) + + t.Run("down_already_stopped", func(t *testing.T) { + out, err := run(t, "stack", "down") + if err != nil { + t.Fatalf("stack down (already stopped) failed: %v\n%s", err, out) + } + }) + + t.Run("up_restart_after_down", func(t *testing.T) { + out, err := run(t, "stack", "up") + if err != nil { + t.Fatalf("stack up (restart) failed: %v\n%s", err, out) + } + + // Verify PID file exists + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); os.IsNotExist(err) { + t.Error("PID file not found after restart") + } + + // Wait for node to be ready + deadline := time.Now().Add(60 * time.Second) + for time.Now().Before(deadline) { + out, err := run(t, "kubectl", "get", "nodes", "--no-headers") + if err == nil && strings.Contains(out, "Ready") { + break + } + time.Sleep(3 * time.Second) + } + + out, _ = run(t, "kubectl", "get", "nodes", "--no-headers") + if !strings.Contains(out, "Ready") { + t.Error("node not ready after restart") + } + }) + + t.Run("purge", func(t *testing.T) { + out, err := run(t, "stack", "purge") + if err != nil { + t.Fatalf("stack purge failed: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("stack ID should be removed after purge") + } + if _, err := os.Stat(filepath.Join(configDir, ".k3s.pid")); !os.IsNotExist(err) { + t.Error("PID file should be removed after purge") + } + }) + + t.Run("full_cycle_purge_force", func(t *testing.T) { + out, err := run(t, "stack", "init", "--backend", "k3s") + if err != nil { + t.Fatalf("init: %v\n%s", err, out) + } + + out, err = run(t, "stack", "up") + if err != nil { + t.Fatalf("up: %v\n%s", err, out) + } + + out, err = run(t, "stack", "purge", "--force") + if err != nil { + t.Fatalf("purge --force: %v\n%s", err, out) + } + + time.Sleep(2 * time.Second) + + if _, err := os.Stat(filepath.Join(configDir, ".stack-id")); !os.IsNotExist(err) { + t.Error("config should be removed after purge --force") + } + }) +} + +func findProjectRoot(t *testing.T) string { + t.Helper() + dir, err := os.Getwd() + if err != nil { + t.Fatalf("failed to get working directory: %v", err) + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + t.Fatal("could not find project root (no go.mod)") + } + dir = parent + } +} diff --git a/internal/stack/stack.go b/internal/stack/stack.go index 
c8366f6..8e2442b 100644 --- a/internal/stack/stack.go +++ b/internal/stack/stack.go @@ -13,21 +13,30 @@ import ( ) const ( - k3dConfigFile = "k3d.yaml" kubeconfigFile = "kubeconfig.yaml" stackIDFile = ".stack-id" ) // Init initializes the stack configuration -func Init(cfg *config.Config, force bool) error { - // Create flat stack config directory - k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) - - // Check if config already exists - if _, err := os.Stat(k3dConfigPath); err == nil { - if !force { - return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", k3dConfigPath) - } +func Init(cfg *config.Config, force bool, backendName string) error { + // Check if any stack config already exists + stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) + backendFilePath := filepath.Join(cfg.ConfigDir, stackBackendFile) + + hasExistingConfig := false + if _, err := os.Stat(stackIDPath); err == nil { + hasExistingConfig = true + } + if _, err := os.Stat(backendFilePath); err == nil { + hasExistingConfig = true + } + // Also check legacy k3d.yaml for backward compatibility + if _, err := os.Stat(filepath.Join(cfg.ConfigDir, k3dConfigFile)); err == nil { + hasExistingConfig = true + } + + if hasExistingConfig && !force { + return fmt.Errorf("stack configuration already exists at %s\nUse --force to overwrite", cfg.ConfigDir) } if err := os.MkdirAll(cfg.ConfigDir, 0755); err != nil { @@ -35,46 +44,37 @@ func Init(cfg *config.Config, force bool) error { } // Check if stack ID already exists (preserve on --force) - stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) var stackID string if existingID, err := os.ReadFile(stackIDPath); err == nil { - stackID = string(existingID) + stackID = strings.TrimSpace(string(existingID)) fmt.Printf("Preserving existing stack ID: %s (use purge to reset)\n", stackID) } else { - // Generate unique stack ID only if one doesn't exist stackID = petname.Generate(2, "-") } - fmt.Println("Initializing cluster configuration") - fmt.Printf("Cluster ID: %s\n", stackID) - - absDataDir, err := filepath.Abs(cfg.DataDir) - if err != nil { - return fmt.Errorf("failed to get absolute path for data directory: %w", err) + // Default to k3d if no backend specified + if backendName == "" { + backendName = BackendK3d } - absConfigDir, err := filepath.Abs(cfg.ConfigDir) + backend, err := NewBackend(backendName) if err != nil { - return fmt.Errorf("failed to get absolute path for config directory: %w", err) - } - - // Check if overwriting config - if _, err := os.Stat(k3dConfigPath); err == nil { - fmt.Printf("Overwriting existing stack configuration: %s\n", k3dConfigPath) + return err } - // Replace placeholder in k3d config with actual stack ID - k3dConfig := embed.K3dConfig - k3dConfig = strings.ReplaceAll(k3dConfig, "{{STACK_ID}}", stackID) - k3dConfig = strings.ReplaceAll(k3dConfig, "{{DATA_DIR}}", absDataDir) - k3dConfig = strings.ReplaceAll(k3dConfig, "{{CONFIG_DIR}}", absConfigDir) + fmt.Println("Initializing cluster configuration") + fmt.Printf("Cluster ID: %s\n", stackID) + fmt.Printf("Backend: %s\n", backend.Name()) - // Write k3d config with stack ID to destination - if err := os.WriteFile(k3dConfigPath, []byte(k3dConfig), 0644); err != nil { - return fmt.Errorf("failed to write k3d config: %w", err) + // Check prerequisites + if err := backend.Prerequisites(cfg); err != nil { + return fmt.Errorf("prerequisites check failed: %w", err) } - fmt.Printf("K3d config saved to: %s\n", k3dConfigPath) + // Generate backend-specific config 
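+	// (the k3d backend writes k3d.yaml, the k3s backend writes k3s-config.yaml)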
+ if err := backend.Init(cfg, stackID); err != nil { + return err + } // Copy embedded defaults (helmfile + charts for infrastructure) defaultsDir := filepath.Join(cfg.ConfigDir, "defaults") @@ -83,100 +83,50 @@ func Init(cfg *config.Config, force bool) error { } fmt.Printf("Defaults copied to: %s\n", defaultsDir) - // Store stack ID for later use (stackIDPath already declared above) + // Store stack ID if err := os.WriteFile(stackIDPath, []byte(stackID), 0644); err != nil { return fmt.Errorf("failed to write stack ID: %w", err) } - fmt.Printf("Initialized stack configuration: %s\n", k3dConfigPath) + // Save backend choice + if err := SaveBackend(cfg, backendName); err != nil { + return fmt.Errorf("failed to save backend choice: %w", err) + } + + fmt.Printf("Initialized stack configuration\n") fmt.Printf("Stack ID: %s\n", stackID) return nil } -// Up starts the k3d cluster +// Up starts the cluster using the configured backend func Up(cfg *config.Config) error { - k3dConfigPath := filepath.Join(cfg.ConfigDir, k3dConfigFile) - kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) - - // Check if config exists - if _, err := os.Stat(k3dConfigPath); os.IsNotExist(err) { - return fmt.Errorf("stack config not found, run 'obol stack init' first") - } - - // Get stack ID and full stack name stackID := getStackID(cfg) if stackID == "" { return fmt.Errorf("stack ID not found, run 'obol stack init' first") } - stackName := getStackName(cfg) - - // Check if cluster already exists using cluster list - listCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "list", "--no-headers") - listCmdOutput, err := listCmd.Output() + backend, err := LoadBackend(cfg) if err != nil { - return fmt.Errorf("k3d list command failed: %w", err) + return fmt.Errorf("failed to load backend: %w", err) } - if stackExists(string(listCmdOutput), stackName) { - // Cluster exists - check if it's stopped or running - fmt.Printf("Stack already exists, attempting to start: %s (id: %s)\n", stackName, stackID) - startCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "start", stackName) - startCmd.Stdout = os.Stdout - startCmd.Stderr = os.Stderr - if err := startCmd.Run(); err != nil { - return fmt.Errorf("failed to start existing cluster: %w", err) - } - - if err := syncDefaults(cfg, kubeconfigPath); err != nil { - return err - } - - fmt.Println("Stack restarted successfully") - fmt.Printf("Stack ID: %s\n", stackID) - return nil - } - - fmt.Printf("Starting stack: %s (id: %s)\n", stackName, stackID) - - // Get absolute path to data directory for k3d volume mount - absDataDir, err := filepath.Abs(cfg.DataDir) - if err != nil { - return fmt.Errorf("failed to get absolute path for data directory: %w", err) - } - - // Create data directory if it doesn't exist - if err := os.MkdirAll(absDataDir, 0755); err != nil { - return fmt.Errorf("failed to create data directory: %w", err) - } - - // Create cluster using k3d config with custom name - fmt.Println("Creating k3d cluster...") - createCmd := exec.Command( - filepath.Join(cfg.BinDir, "k3d"), - "cluster", "create", stackName, - "--config", k3dConfigPath, - "--kubeconfig-update-default=false", - ) - createCmd.Stdout = os.Stdout - createCmd.Stderr = os.Stderr + kubeconfigPath := filepath.Join(cfg.ConfigDir, kubeconfigFile) - if err := createCmd.Run(); err != nil { - return fmt.Errorf("failed to create cluster: %w", err) - } + fmt.Printf("Starting stack (id: %s, backend: %s)\n", stackID, backend.Name()) - // Export kubeconfig - kubeconfigCmd := 
exec.Command(filepath.Join(cfg.BinDir, "k3d"), "kubeconfig", "get", stackName) - kubeconfigData, err := kubeconfigCmd.Output() + kubeconfigData, err := backend.Up(cfg, stackID) if err != nil { - return fmt.Errorf("failed to get kubeconfig: %w", err) + return err } + // Write kubeconfig (backend may have already written it, but ensure consistency) if err := os.WriteFile(kubeconfigPath, kubeconfigData, 0600); err != nil { return fmt.Errorf("failed to write kubeconfig: %w", err) } - if err := syncDefaults(cfg, kubeconfigPath); err != nil { + // Sync defaults with backend-aware dataDir + dataDir := backend.DataDir(cfg) + if err := syncDefaults(cfg, kubeconfigPath, dataDir); err != nil { return err } @@ -187,85 +137,50 @@ func Up(cfg *config.Config) error { return nil } -// Down stops the k3d cluster +// Down stops the cluster func Down(cfg *config.Config) error { stackID := getStackID(cfg) if stackID == "" { return fmt.Errorf("stack ID not found, stack may not be initialized") } - stackName := getStackName(cfg) - - fmt.Printf("Stopping stack gracefully: %s (id: %s)\n", stackName, stackID) - - // First attempt graceful stop (allows processes to shutdown gracefully) - stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) - stopCmd.Stdout = os.Stdout - stopCmd.Stderr = os.Stderr - - if err := stopCmd.Run(); err != nil { - fmt.Println("Graceful stop timed out or failed, forcing cluster deletion") - // Fallback to delete if stop fails - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - return fmt.Errorf("failed to stop cluster: %w", err) - } + + backend, err := LoadBackend(cfg) + if err != nil { + return fmt.Errorf("failed to load backend: %w", err) } - fmt.Println("Stack stopped successfully") - return nil + return backend.Down(cfg, stackID) } // Purge deletes the cluster config and optionally data func Purge(cfg *config.Config, force bool) error { - // Delete cluster containers - stackName := getStackName(cfg) - if stackName != "" { + stackID := getStackID(cfg) + + backend, err := LoadBackend(cfg) + if err != nil { + return fmt.Errorf("failed to load backend: %w", err) + } + + // Destroy cluster if we have a stack ID + if stackID != "" { if force { - // Force delete without graceful shutdown - fmt.Printf("Force deleting cluster containers: %s\n", stackName) - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) - } - fmt.Println("Cluster containers force deleted") + fmt.Printf("Force destroying cluster (id: %s)\n", stackID) } else { - // Graceful shutdown first to ensure data is written properly - fmt.Printf("Gracefully stopping cluster before deletion: %s\n", stackName) - stopCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "stop", stackName) - stopCmd.Stdout = os.Stdout - stopCmd.Stderr = os.Stderr - if err := stopCmd.Run(); err != nil { - fmt.Println("Graceful stop timed out or failed, proceeding with deletion anyway") - } else { - fmt.Println("Cluster stopped gracefully") - } - - // Now delete the stopped cluster - fmt.Println("Deleting cluster containers") - deleteCmd := exec.Command(filepath.Join(cfg.BinDir, "k3d"), "cluster", "delete", stackName) - deleteCmd.Stdout = os.Stdout - 
deleteCmd.Stderr = os.Stderr - if err := deleteCmd.Run(); err != nil { - fmt.Printf("Failed to delete cluster (may already be deleted): %v\n", err) - } - fmt.Println("Cluster containers deleted") + fmt.Printf("Destroying cluster (id: %s)\n", stackID) + } + if err := backend.Destroy(cfg, stackID); err != nil { + fmt.Printf("Failed to destroy cluster (may already be deleted): %v\n", err) } } // Remove stack config directory - stackConfigDir := filepath.Join(cfg.ConfigDir) - if err := os.RemoveAll(stackConfigDir); err != nil { + if err := os.RemoveAll(cfg.ConfigDir); err != nil { return fmt.Errorf("failed to remove stack config: %w", err) } fmt.Println("Removed cluster config directory") // Remove data directory only if force flag is set if force { - // Use sudo to remove data directory since it may contain root-owned files fmt.Println("Removing data directory...") rmCmd := exec.Command("sudo", "rm", "-rf", cfg.DataDir) rmCmd.Stdout = os.Stdout @@ -284,12 +199,6 @@ func Purge(cfg *config.Config, force bool) error { return nil } -// stackExists checks if stack name exists in k3d cluster list output -func stackExists(output, name string) bool { - // Check if the stack name appears in the output - return strings.Contains(output, name) -} - // getStackID reads the stored stack ID func getStackID(cfg *config.Config) string { stackIDPath := filepath.Join(cfg.ConfigDir, stackIDFile) @@ -300,15 +209,6 @@ func getStackID(cfg *config.Config) string { return strings.TrimSpace(string(data)) } -// getStackName returns the full stack name (obol-stack-{stackid}) -func getStackName(cfg *config.Config) string { - stackID := getStackID(cfg) - if stackID == "" { - return "" - } - return fmt.Sprintf("obol-stack-%s", stackID) -} - // GetStackID reads the stored stack ID (exported for use in main) func GetStackID(cfg *config.Config) string { return getStackID(cfg) @@ -316,23 +216,25 @@ func GetStackID(cfg *config.Config) string { // syncDefaults deploys the default infrastructure using helmfile // If deployment fails, the cluster is automatically stopped via Down() -func syncDefaults(cfg *config.Config, kubeconfigPath string) error { +func syncDefaults(cfg *config.Config, kubeconfigPath string, dataDir string) error { fmt.Println("Deploying default infrastructure with helmfile") - // Sync defaults using helmfile (handles Helm hooks properly) defaultsHelmfilePath := filepath.Join(cfg.ConfigDir, "defaults") helmfileCmd := exec.Command( filepath.Join(cfg.BinDir, "helmfile"), - "--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml"), + "--file", filepath.Join(defaultsHelmfilePath, "helmfile.yaml.gotmpl"), "--kubeconfig", kubeconfigPath, "sync", ) + helmfileCmd.Env = append(os.Environ(), + fmt.Sprintf("KUBECONFIG=%s", kubeconfigPath), + fmt.Sprintf("STACK_DATA_DIR=%s", dataDir), + ) helmfileCmd.Stdout = os.Stdout helmfileCmd.Stderr = os.Stderr if err := helmfileCmd.Run(); err != nil { fmt.Println("Failed to apply defaults helmfile, stopping cluster") - // Attempt to stop the cluster to clean up if downErr := Down(cfg); downErr != nil { fmt.Printf("Failed to stop cluster during cleanup: %v\n", downErr) } diff --git a/internal/tunnel/tunnel.go b/internal/tunnel/tunnel.go new file mode 100644 index 0000000..355e9ea --- /dev/null +++ b/internal/tunnel/tunnel.go @@ -0,0 +1,177 @@ +package tunnel + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + "time" + + "github.com/ObolNetwork/obol-stack/internal/config" +) + +const ( + tunnelNamespace = "traefik" + tunnelLabelSelector = 
"app.kubernetes.io/name=cloudflared" +) + +// Status displays the current tunnel status and URL +func Status(cfg *config.Config) error { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + // Check if kubeconfig exists + if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { + return fmt.Errorf("stack not running, use 'obol stack up' first") + } + + // Check pod status first + podStatus, err := getPodStatus(kubectlPath, kubeconfigPath) + if err != nil { + printStatusBox("quick", "not deployed", "", time.Now()) + fmt.Println("\nTroubleshooting:") + fmt.Println(" - Start the stack: obol stack up") + return nil + } + + // Try to get tunnel URL from logs + url, err := GetTunnelURL(cfg) + if err != nil { + printStatusBox("quick", podStatus, "(not available)", time.Now()) + fmt.Println("\nTroubleshooting:") + fmt.Println(" - Check logs: obol tunnel logs") + fmt.Println(" - Restart tunnel: obol tunnel restart") + return nil + } + + printStatusBox("quick", "active", url, time.Now()) + fmt.Printf("\nTest with: curl %s/\n", url) + + return nil +} + +// GetTunnelURL parses cloudflared logs to extract the quick tunnel URL +func GetTunnelURL(cfg *config.Config) (string, error) { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + cmd := exec.Command(kubectlPath, + "--kubeconfig", kubeconfigPath, + "logs", "-n", tunnelNamespace, + "-l", tunnelLabelSelector, + "--tail=100", + ) + + output, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("failed to get tunnel logs: %w", err) + } + + // Parse URL from logs (quick tunnel uses cfargotunnel.com) + re := regexp.MustCompile(`https://[a-z0-9-]+\.cfargotunnel\.com`) + matches := re.FindString(string(output)) + if matches == "" { + // Also try trycloudflare.com as fallback + re = regexp.MustCompile(`https://[a-z0-9-]+\.trycloudflare\.com`) + matches = re.FindString(string(output)) + } + if matches == "" { + return "", fmt.Errorf("tunnel URL not found in logs") + } + + return matches, nil +} + +// Restart restarts the cloudflared deployment to get a new tunnel URL +func Restart(cfg *config.Config) error { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + // Check if kubeconfig exists + if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { + return fmt.Errorf("stack not running, use 'obol stack up' first") + } + + fmt.Println("Restarting cloudflared tunnel...") + + cmd := exec.Command(kubectlPath, + "--kubeconfig", kubeconfigPath, + "rollout", "restart", "deployment/cloudflared", + "-n", tunnelNamespace, + ) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to restart tunnel: %w", err) + } + + fmt.Println("\nTunnel restarting...") + fmt.Println("Run 'obol tunnel status' to see the new URL once ready (may take 10-30 seconds).") + + return nil +} + +// Logs displays cloudflared logs +func Logs(cfg *config.Config, follow bool) error { + kubectlPath := filepath.Join(cfg.BinDir, "kubectl") + kubeconfigPath := filepath.Join(cfg.ConfigDir, "kubeconfig.yaml") + + // Check if kubeconfig exists + if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { + return fmt.Errorf("stack not running, use 'obol stack up' first") + } + + args := []string{ + "--kubeconfig", kubeconfigPath, + "logs", "-n", tunnelNamespace, + "-l", tunnelLabelSelector, + } + + if follow { + args = 
append(args, "-f") + } + + cmd := exec.Command(kubectlPath, args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Stdin = os.Stdin + + return cmd.Run() +} + +// getPodStatus returns the status of the cloudflared pod +func getPodStatus(kubectlPath, kubeconfigPath string) (string, error) { + cmd := exec.Command(kubectlPath, + "--kubeconfig", kubeconfigPath, + "get", "pods", "-n", tunnelNamespace, + "-l", tunnelLabelSelector, + "-o", "jsonpath={.items[0].status.phase}", + ) + + output, err := cmd.Output() + if err != nil { + return "", err + } + + status := strings.TrimSpace(string(output)) + if status == "" { + return "", fmt.Errorf("no pods found") + } + + return strings.ToLower(status), nil +} + +// printStatusBox prints a formatted status box +func printStatusBox(mode, status, url string, lastUpdated time.Time) { + fmt.Println() + fmt.Println("Cloudflare Tunnel Status") + fmt.Println(strings.Repeat("─", 50)) + fmt.Printf("Mode: %s\n", mode) + fmt.Printf("Status: %s\n", status) + fmt.Printf("URL: %s\n", url) + fmt.Printf("Last Updated: %s\n", lastUpdated.Format(time.RFC3339)) + fmt.Println(strings.Repeat("─", 50)) +} diff --git a/notes.md b/notes.md index 025b7ef..6550e6a 100644 --- a/notes.md +++ b/notes.md @@ -6,7 +6,7 @@ - obol agent - skeleton out the cmd - this should have a dummy manifest which templates a config map secret - - obol agent init, gets the secret from google account + - OKR-1: default LLM flow is llms.py -> Ollama Cloud (no API key copy/paste) - frontend (default) - erpc, helios (default) diff --git a/plans/okr1-llmspy-integration.md b/plans/okr1-llmspy-integration.md new file mode 100644 index 0000000..a6f1fc7 --- /dev/null +++ b/plans/okr1-llmspy-integration.md @@ -0,0 +1,267 @@ +# OKR-1 Integration Plan: LLMSpy (`llms.py`) for Keyless, Multi-Provider LLM Access + +Date: 2026-02-03 + +## Goal (Objective 1) +Make Obol Stack the easiest way to spin up and use an on-chain AI agent. + +**Key Results** +1. Median time from install to first successful agent query ≤ **10 minutes** +2. Agent setup requires ≤ **5 user actions** (**no manual API key copy/paste in default flow**) +3. **100 Monthly Active Returning Users (MAUs)** interacting with the agent at least once per month +4. ≥ **60% of new Stack installs** complete agent setup successfully + +## Scope of this integration +Integrate **LLMSpy (`llms.py`)** as an **in-cluster OpenAI-compatible LLM gateway** that can route requests to: +- **Local LLMs** (default path to satisfy “no API key”) +- **Remote providers** (optional, later; keys or OAuth-derived tokens) + +This enables Obol Agent (ADK/FastAPI) to become **provider-agnostic**, while keeping the Dashboard UX simple. + +## Non-goals (for this iteration) +- Building a hosted “Obol-managed” LLM key/service (would change threat model/cost structure) +- Exposing LLMSpy publicly by default (we keep it internal unless explicitly enabled) +- Replacing ADK/AG-UI or refactoring the agent’s tool system +- Adding x402 payment to LLM calls (future candidate; not required for LLMSpy integration) + +--- + +## Current state (baseline) +### User experience bottleneck +- `obol agent init` currently requires a **manually created Google AI Studio API key** (copy/paste) before the agent works. +- Dashboard agent sidebar shows “Initialize your Obol Agent by running `obol agent init`…” when the agent is unavailable. 
+ +### System architecture (today) +``` +Browser + -> Dashboard (Next.js, Better Auth) + -> POST /api/copilotkit (server route) + -> HttpAgent -> obol-agent (FastAPI / Google ADK) + -> Gemini via GOOGLE_API_KEY (direct) +``` + +--- + +## Proposed target architecture (with LLMSpy + Ollama; cloud-first) + +### Runtime request flow (agent query) +``` +Browser (signed-in) + -> Dashboard (Next.js) + -> /api/copilotkit (server; auth-gated) + -> obol-agent (FastAPI/ADK, AG-UI) + -> LiteLLM client (OpenAI-compatible) + -> LLMSpy (llms.py) [cluster-internal service] + -> Provider A: Local (Ollama) [no keys, default] + -> Provider B+: Remote (optional; keys/OAuth later) +``` + +### Deployment topology (Kubernetes) +Namespaces: +- `agent` + - `obol-agent` Deployment (existing) +- `llm` (new) + - **`llmspy`** (`llms.py`) Deployment + ClusterIP Service + - **`ollama`** Deployment + ClusterIP Service (default provider) + - Optional model warmup Job (`ollama pull `) + +Storage: +- Ollama runtime + model cache uses `emptyDir` (ephemeral). +- **Ollama Cloud auth key**: + - Minimum viable: also `emptyDir` (user reconnects after pod restart). + - Recommended: mount a small PVC or Secret-backed volume for `/root/.ollama/id_ed25519` so reconnect isn’t needed after upgrades/restarts. + +--- + +## UX: “≤5 actions” and “≤10 minutes” target + +### Default flow (no API keys) +**Default provider:** Ollama (in-cluster) via LLMSpy, using **Ollama Cloud models** (e.g. `glm-4.7:cloud`). + +Target action count: +1. Install Obol Stack CLI (existing flow) +2. `obol stack init` (if required by current UX) +3. `obol stack up` +4. Open Dashboard URL and sign in +5. Send first message in agent sidebar + +Notes: +- Remove the **mandatory** `obol agent init` step from the default path. +- Replace the “paste an API key” step with an **Ollama Cloud connect** step: + - If Ollama isn’t signed in, show a “Connect Ollama Cloud” action in the dashboard. + - Clicking it surfaces the `https://ollama.com/connect?...` URL returned by the Ollama API and guides the user through login. + +### Time-to-first-query tactics +- Default to a **cloud model** to avoid GPU/VRAM constraints: + - `glm-4.7:cloud` is explicitly supported as a cloud model in Ollama. +- Add a lightweight warmup/prefetch mechanism: + - Post-install Job: `ollama pull glm-4.7:cloud` (downloads the stub/metadata so first chat is faster) + - Readiness gate: “ready” once Ollama is connected and the model is pullable +- Ensure agent readiness checks are reliable and fast: + - Keep `/api/copilotkit/health` public (already required) + - Add `llmspy` and `ollama` readiness checks and surface status in the UI + +--- + +## Configuration model + +### LLMSpy +LLMSpy is configured by `~/.llms/llms.json` (in-container: `/home/llms/.llms/llms.json`). 
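+
+As a concrete sketch of what this file could look like (and how the CLI might ship it, mirroring the way the k3d/k3s configs are embedded today), the snippet below encodes the key config points listed further down. The constant name and exact JSON keys are illustrative assumptions, not a verified `llms.py` schema.
+
+```go
+package embed
+
+// DefaultLLMsJSON sketches the default llms.json that the llmspy ConfigMap
+// could carry. Keys follow the "Key config points" below; treat them as
+// assumptions rather than the final schema.
+const DefaultLLMsJSON = `{
+  "defaults": {
+    "text": { "model": "glm-4.7:cloud" }
+  },
+  "providers": {
+    "ollama": {
+      "type": "OllamaProvider",
+      "base_url": "http://ollama.llm.svc.cluster.local:11434",
+      "all_models": true
+    }
+  }
+}`
+```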
+ +We will manage this in-cluster using: +- ConfigMap for `llms.json` +- Volume mount to `/home/llms/.llms` (likely `emptyDir`; no secrets required for Ollama) + +Runtime: +- Prefer the upstream-published container image for reproducibility: + - `ghcr.io/servicestack/llms:v2.0.30` (pinned) + +Key config points (concrete based on llms.py docs): +- Only one enabled provider: `ollama` +- `providers.ollama.type = "OllamaProvider"` +- `providers.ollama.base_url = "http://ollama.llm.svc.cluster.local:11434"` +- `providers.ollama.all_models = true` (or restrict to `glm-4.7:cloud`) +- `defaults.text.model = "glm-4.7:cloud"` + +### Obol Agent +Make the agent model/backend configurable: +- `LLM_BACKEND`: + - `gemini` (existing path, requires `GOOGLE_API_KEY`) + - `llmspy` (new default path) +- `LLM_MODEL` (default to the cloud model) +- `OPENAI_API_BASE` set to `http://llmspy.llm.svc.cluster.local:/v1` +- `OPENAI_API_KEY` set to a dummy value (LiteLLM/OpenAI provider compatibility) + +NOTE: With `llmspy` as backend, the agent sends OpenAI-style requests to LLMSpy and LLMSpy forwards to Ollama. + +## Default model choice +Use `glm-4.7:cloud` by default to maximize quality and avoid local GPU requirements. + +This keeps the “no manual API key copy/paste” OKR achievable because Ollama supports a browser-based connect flow (user signs in; Ollama authenticates subsequent cloud requests). + +## OpenClaw tie-in (validation + reuse) +We can validate “tool-calling robustness” of the chosen Ollama model in two ways: + +1) **Direct OpenClaw + Ollama** (matches Ollama’s built-in `openclaw` integration) + - OpenClaw already supports an Ollama provider using the OpenAI-compatible `/v1` API. + - Ollama’s own code includes an integration that edits `~/.openclaw/openclaw.json` to point at Ollama and set `agents.defaults.model.primary`. + +2) **OpenClaw + LLMSpy (preferred for consistency)** + - Configure OpenClaw’s “OpenAI” provider baseUrl to LLMSpy (`http://llmspy.llm.svc.cluster.local:/v1`) + - This ensures OpenClaw and Obol Agent exercise the same gateway path. + +We should treat OpenClaw as: +- A **validation harness** for model/tool behavior (pre-flight testing + regression checks) +- Potential future **multi-channel UX** (WhatsApp/Telegram/etc) once dashboard MVP is stable + +### Obol Stack CLI changes (user-facing) +Reframe `obol agent init` into a provider configuration command: +- Default: **no command needed** +- Optional: `obol agent configure --provider <...>` or `obol agent set-llm --provider <...>` + - Writes K8s secrets/configmaps and triggers rollout restart of `obol-agent` and/or `llmspy` + +--- + +## Security & exposure +- Dashboard remains protected by Better Auth (Google now; GitHub later). +- `/rpc/*` remains public/unprotected (x402 responsibility). +- `/api/copilotkit/health` remains public for monitoring. +- **LLMSpy and Ollama remain cluster-internal by default**: + - No HTTPRoute for them + - ClusterIP only + - (Optional later) expose behind dashboard auth for debugging + +Threat model considerations: +- Ensure LLMSpy cannot be used as an open relay from the internet. +- Ensure remote provider keys (if configured) never get logged or surfaced in UI. 
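+
+To make the gateway wiring above concrete, here is a minimal, hypothetical readiness probe against the OpenAI-compatible API (assuming LLMSpy serves the standard `GET /v1/models` listing; the helper name, the dummy key, and the still-unspecified service port are placeholders, not existing code). A check like this could back the `llmspy` readiness status surfaced in the dashboard and the agent's `/health` diagnostics.
+
+```go
+package main
+
+import (
+	"fmt"
+	"net/http"
+	"os"
+	"time"
+)
+
+// probeLLMGateway asks the OpenAI-compatible gateway for its model list and
+// treats any 2xx response as "reachable". Illustrative only.
+func probeLLMGateway(baseURL, apiKey string) error {
+	req, err := http.NewRequest(http.MethodGet, baseURL+"/models", nil)
+	if err != nil {
+		return err
+	}
+	// The in-cluster Ollama path needs no real key, but OpenAI-style clients
+	// send the Authorization header anyway, so the probe does the same.
+	req.Header.Set("Authorization", "Bearer "+apiKey)
+
+	client := &http.Client{Timeout: 5 * time.Second}
+	resp, err := client.Do(req)
+	if err != nil {
+		return fmt.Errorf("llm gateway unreachable: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return fmt.Errorf("llm gateway returned status %d", resp.StatusCode)
+	}
+	return nil
+}
+
+func main() {
+	// e.g. OPENAI_API_BASE=http://llmspy.llm.svc.cluster.local:<port>/v1
+	// (<port> is intentionally left open, matching the plan above).
+	base := os.Getenv("OPENAI_API_BASE")
+	if err := probeLLMGateway(base, "dummy"); err != nil {
+		fmt.Fprintln(os.Stderr, "not ready:", err)
+		os.Exit(1)
+	}
+	fmt.Println("llm gateway ready")
+}
+```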
+
+---
+
+## Observability + OKR measurement plan
+
+### Metrics we can measure in-product (self-hosted)
+- `agent_query_success_total` / `agent_query_error_total`
+- `agent_query_latency_seconds` histogram
+- `agent_first_success_timestamp` (per install) – used for “time to first query”
+- `agent_provider_backend` label (gemini vs llmspy; local vs remote)
+
+### MAU / “install success rate” (cross-install aggregation)
+This requires centralized telemetry. Options:
+- Opt-in telemetry to an Obol endpoint (privacy-preserving, hashed install id)
+- Or a “bring your own analytics” integration (PostHog/Amplitude)
+
+Proposed approach for this OKR:
+- Add an **opt-in** telemetry flag at install time
+- Emit minimal events:
+  - `stack_install_completed`
+  - `agent_ready`
+  - `agent_first_query_success`
+  - `agent_returning_user_monthly` (count only)
+
+---
+
+## Implementation workstreams (by repo)
+
+### 1) `obol-stack` (installer + infra)
+- Add `llmspy` Deployment/Service manifest under `internal/embed/infrastructure/base/templates/`
+- Add `ollama` Deployment/Service (or allow an external Ollama endpoint)
+- Add “model warmup” Job (optional but recommended for ≤10 min)
+- Add values/env wiring to configure:
+  - LLMSpy port, config map, and secret mounts
+  - Obol Agent env vars (`LLM_BACKEND`, `LLM_MODEL`, `OPENAI_API_BASE`, etc.)
+- Update CLI:
+  - Make `obol agent init` optional or replace it with `obol agent configure`
+  - Provide a keyless default; ensure docs and errors reflect the new flow
+- Update README (agent quickstart + troubleshooting)
+
+### 2) `obol-agent` (runtime changes)
+- Read `LLM_MODEL` from env (remove the hard-coded model)
+- Add `LLM_BACKEND` switch:
+  - `gemini` (current)
+  - `llmspy` using ADK’s `LiteLlm` wrapper + OpenAI-compatible base URL
+- Add health diagnostics:
+  - Include provider status in `/health` (e.g., “llm backend reachable”)
+- Add unit/integration tests:
+  - Mock the LLMSpy OpenAI endpoint
+  - Verify tool calling works with the chosen default model
+
+### 3) `obol-stack-front-end` (onboarding UX)
+- Replace the “run `obol agent init`” message with:
+  - “Agent is initializing” / “Model downloading” (with helpful tips)
+  - A “Retry health check” action
+  - A link to agent setup docs for optional remote providers
+- Add an “Agent Setup” panel:
+  - Shows current backend (local/remote)
+  - Shows readiness status (agent/llmspy/ollama)
+
+### 4) `helm-charts` (if needed)
+- Only if we decide to migrate these new services into charts instead of raw manifests.
+- Otherwise, keep them in `base/templates/` for speed.
+
+---
+
+## Milestones
+
+### Milestone A — “Keyless Agent Works Locally”
+Acceptance:
+- Fresh install: no API keys required
+- Agent responds from the dashboard
+- Median time to first response ≤ 10 min in a test environment
+
+### Milestone B — “Provider Choice”
+Acceptance:
+- Optional remote providers via secrets/config (still no copy/paste required in the default path)
+- Failover behavior works (local first, remote fallback if configured)
+
+### Milestone C — “OKR Instrumentation”
+Acceptance:
+- Prometheus metrics available
+- Optional telemetry pipeline documented and implemented (if approved)
+
+---
+
+## Open questions (need product decisions)
+1. Do we persist `/root/.ollama/id_ed25519` so the Ollama Cloud connection survives pod restarts/upgrades?
+2. Do we want to expose a “Connect Ollama Cloud” UX in the dashboard (recommended) or require a CLI step?
+3. Telemetry: opt-in vs opt-out; where is the endpoint; privacy guarantees.
+4. Do we expose the LLMSpy UI behind auth for debugging, or keep it internal-only?
diff --git a/renovate.json b/renovate.json
index 6932b83..afab9bf 100644
--- a/renovate.json
+++ b/renovate.json
@@ -20,6 +20,19 @@
       "datasourceTemplate": "github-releases",
       "depNameTemplate": "ObolNetwork/obol-stack-front-end",
       "versioningTemplate": "semver"
+    },
+    {
+      "customType": "regex",
+      "description": "Update Gateway API release version",
+      "matchStrings": [
+        "gatewayApiVersion:\\s*[\"']?(?<currentValue>v[0-9]+\\.[0-9]+\\.[0-9]+)[\"']?"
+      ],
+      "fileMatch": [
+        "^internal/embed/infrastructure/helmfile\\.yaml$"
+      ],
+      "datasourceTemplate": "github-releases",
+      "depNameTemplate": "kubernetes-sigs/gateway-api",
+      "versioningTemplate": "semver"
     }
   ],
   "packageRules": [