diff --git a/Makefile b/Makefile index 0b21d62b..5b84b21b 100644 --- a/Makefile +++ b/Makefile @@ -22,27 +22,28 @@ DOCKER_BUILD_ARGS := \ --target $(DOCKER_TARGET) \ -t $(DOCKER_IMAGE) -# Test configuration -BUILD_DMR ?= 1 - # Phony targets grouped by category -.PHONY: build run clean test integration-tests build-cli install-cli +.PHONY: build build-cli build-dmr install-cli run clean test integration-tests .PHONY: validate validate-all lint help .PHONY: docker-build docker-build-multiplatform docker-run docker-run-impl .PHONY: docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang .PHONY: test-docker-ce-installation .PHONY: vllm-metal-build vllm-metal-install vllm-metal-dev vllm-metal-clean .PHONY: diffusers-build diffusers-install diffusers-dev diffusers-clean -# Default target +# Default target: build server, CLI plugin, and dmr convenience wrapper .DEFAULT_GOAL := build -# Build the Go application -build: +build: build-server build-cli build-dmr + +build-server: CGO_ENABLED=1 go build -ldflags="-s -w -X main.Version=$(shell git describe --tags --always --dirty --match 'v*')" -o $(APP_NAME) . build-cli: $(MAKE) -C cmd/cli +build-dmr: + go build -ldflags="-s -w" -o dmr ./cmd/dmr + install-cli: $(MAKE) -C cmd/cli install @@ -61,6 +62,7 @@ run: build # Clean build artifacts clean: rm -f $(APP_NAME) + rm -f dmr rm -f model-runner.sock # Run tests @@ -77,7 +79,7 @@ integration-tests: echo "$$INVALID_TESTS" | sed 's/func \([^(]*\).*/\1/'; \ exit 1; \ fi - @BUILD_DMR=$(BUILD_DMR) go test -v -race -count=1 -tags=integration -run "^TestIntegration" -timeout=5m ./cmd/cli/commands + go test -v -race -count=1 -tags=integration -run "^TestIntegration" -timeout=5m ./cmd/cli/commands @echo "Integration tests completed!" 
test-docker-ce-installation: @@ -308,7 +310,8 @@ diffusers-clean: help: @echo "Available targets:" - @echo " build - Build the Go application" + @echo " build - Build server, CLI plugin, and dmr wrapper (default)" + @echo " build-server - Build the model-runner server" @echo " build-cli - Build the CLI (docker-model plugin)" @echo " install-cli - Build and install the CLI as a Docker plugin" @echo " docs - Generate CLI documentation" diff --git a/README.md b/README.md index 56b42850..e472e0a9 100644 --- a/README.md +++ b/README.md @@ -96,60 +96,40 @@ Before building from source, ensure you have the following installed: ### Building the Complete Stack -#### Step 1: Clone and Build model-runner (Server/Daemon) +After cloning, a single `make` builds everything — the server, CLI plugin, and a `dmr` convenience wrapper: ```bash -# Clone the model-runner repository -git clone https://github.com/docker/model-runner.git -cd model-runner - -# Build the model-runner binary -make build - -# Or build with specific backend arguments -make run LLAMA_ARGS="--verbose --jinja -ngl 999 --ctx-size 2048" - -# Run tests to verify the build -make test +make ``` -The `model-runner` binary will be created in the current directory. This is the backend server that manages models. - -#### Step 2: Build model-cli (Client) +`dmr` starts the server on a free port, waits for it to be ready, runs your CLI command, then shuts the server down: ```bash -# From the root directory, navigate to the model-cli directory -cd cmd/cli - -# Build the CLI binary -make build - -# The binary will be named 'model-cli' -# Optionally, install it as a Docker CLI plugin -make install # This will link it to ~/.docker/cli-plugins/docker-model +./dmr run ai/smollm2 "Hello, how are you?" +./dmr ls +./dmr run qwen3:0.6B-Q4_0 tell me today's news ``` +These components can also be built, run, and tested separately using the Makefile. 
+ ### Testing the Complete Stack End-to-End > **Note:** We use port 13434 in these examples to avoid conflicts with Docker Desktop's built-in Model Runner, which typically runs on port 12434. -#### Option 1: Local Development (Recommended for Contributors) +#### Option 1: Manual two-terminal setup 1. **Start model-runner in one terminal:** ```bash -cd model-runner MODEL_RUNNER_PORT=13434 ./model-runner -# The server will start on port 13434 ``` 2. **Use model-cli in another terminal:** ```bash -cd cmd/cli -# List available models (connecting to port 13434) -MODEL_RUNNER_HOST=http://localhost:13434 ./model-cli list +# List available models +MODEL_RUNNER_HOST=http://localhost:13434 ./cmd/cli/model-cli list # Pull and run a model -MODEL_RUNNER_HOST=http://localhost:13434 ./model-cli run ai/smollm2 "Hello, how are you?" +MODEL_RUNNER_HOST=http://localhost:13434 ./cmd/cli/model-cli run ai/smollm2 "Hello, how are you?" ``` #### Option 2: Using Docker @@ -422,6 +402,115 @@ in the form of [a Helm chart and static YAML](charts/docker-model-runner/README. If you are interested in a specific Kubernetes use-case, please start a discussion on the issue tracker. +## dmrlet: Container Orchestrator for AI Inference + +dmrlet is a purpose-built container orchestrator for AI inference workloads. Unlike Kubernetes, it focuses exclusively on running stateless inference containers with zero configuration overhead. Multi-GPU mapping "just works" without YAML, device plugins, or node selectors.
+ +### Key Features + +| Feature | Kubernetes | dmrlet | +|---------|------------|--------| +| Multi-GPU setup | Device plugins + node selectors + resource limits YAML | `dmrlet serve llama3 --gpus all` | +| Config overhead | 50+ lines of YAML minimum | Zero YAML, CLI-only | +| Time to first inference | Minutes (pod scheduling, image pull) | Seconds (model already local) | +| Model management | External (mount PVCs, manage yourself) | Integrated with Docker Model Runner store | + +### Building dmrlet + +```bash +# Build the dmrlet binary +go build -o dmrlet ./cmd/dmrlet + +# Verify it works +./dmrlet --help +``` + +### Usage + +**Start the daemon:** +```bash +# Start in foreground +dmrlet daemon + +# With custom socket path +dmrlet daemon --socket /tmp/dmrlet.sock +``` + +**Serve a model:** +```bash +# Auto-detect backend and GPUs +dmrlet serve llama3.2 + +# Specify backend +dmrlet serve llama3.2 --backend vllm + +# Specify GPU allocation +dmrlet serve llama3.2 --gpus 0,1 +dmrlet serve llama3.2 --gpus all + +# Multiple replicas +dmrlet serve llama3.2 --replicas 2 + +# Backend-specific options +dmrlet serve llama3.2 --ctx-size 4096 # llama.cpp context size +dmrlet serve llama3.2 --gpu-memory 0.8 # vLLM GPU memory utilization +``` + +**List running models:** +```bash +dmrlet ps +# MODEL BACKEND REPLICAS GPUS ENDPOINTS STATUS +# llama3.2 llama.cpp 1 [0,1,2,3] localhost:30000 healthy +``` + +**View logs:** +```bash +dmrlet logs llama3.2 # Last 100 lines +dmrlet logs llama3.2 -f # Follow logs +``` + +**Scale replicas:** +```bash +dmrlet scale llama3.2 4 # Scale to 4 replicas +``` + +**Stop a model:** +```bash +dmrlet stop llama3.2 +dmrlet stop --all # Stop all models +``` + +**Check status:** +```bash +dmrlet status +# DAEMON: running +# SOCKET: /var/run/dmrlet.sock +# +# GPUs: +# GPU 0: NVIDIA A100 80GB 81920MB (in use: llama3.2) +# GPU 1: NVIDIA A100 80GB 81920MB (available) +# +# MODELS: 1 running +``` + +### Supported Backends + +- **llama.cpp** - Default backend
for GGUF models +- **vLLM** - High-throughput serving for safetensors models +- **SGLang** - Fast serving with RadixAttention + +### Architecture + +``` +dmrlet daemon + ├── GPU Manager - Auto-detect and allocate GPUs + ├── Container Manager - Docker-based container lifecycle + ├── Service Registry - Endpoint discovery with load balancing + ├── Health Monitor - Auto-restart unhealthy containers + ├── Auto-scaler - Scale based on QPS/latency/GPU utilization + └── Log Aggregator - Centralized log collection +``` + ## Community For general questions and discussion, please use [Docker Model Runner's Slack channel](https://dockercommunity.slack.com/archives/C09H9P5E57B). diff --git a/cmd/dmr/main.go b/cmd/dmr/main.go new file mode 100644 index 00000000..59cf34bb --- /dev/null +++ b/cmd/dmr/main.go @@ -0,0 +1,130 @@ +// dmr is a developer convenience wrapper that starts the model-runner server on +// a free port and runs a model-cli command against it in one step. +// +// Usage: dmr +// +// Example: dmr run qwen3:0.6B-Q4_0 tell me today's news +package main + +import ( + "errors" + "fmt" + "net" + "net/http" + "os" + "os/exec" + "os/signal" + "path/filepath" + "strconv" + "syscall" + "time" +) + +func freePort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, err + } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port, nil +} + +func waitForServer(url string, timeout time.Duration) error { + client := &http.Client{Timeout: time.Second} + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + resp, err := client.Get(url) + if err == nil { + resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return nil + } + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("server not ready after %s", timeout) +} + +func checkBinary(path, name, expectedLayout string) error { + if _, err := os.Stat(path); os.IsNotExist(err) { + return fmt.Errorf("missing %s binary at
%s\n\nExpected directory layout:\n%s\n\nPlease run 'make build' to build all binaries", name, path, expectedLayout) + } + return nil +} + +func main() { + self, err := os.Executable() + if err != nil { + fmt.Fprintf(os.Stderr, "dmr: %v\n", err) + os.Exit(1) + } + dir := filepath.Dir(self) + + serverBin := filepath.Join(dir, "model-runner") + cliBin := filepath.Join(dir, "cmd", "cli", "model-cli") + + expectedLayout := fmt.Sprintf(`%s/ +├── model-runner (server binary) +├── dmr (this wrapper) +└── cmd/ + └── cli/ + └── model-cli (CLI binary)`, dir) + + if err := checkBinary(serverBin, "model-runner", expectedLayout); err != nil { + fmt.Fprintf(os.Stderr, "dmr: %v\n", err) + os.Exit(1) + } + if err := checkBinary(cliBin, "model-cli", expectedLayout); err != nil { + fmt.Fprintf(os.Stderr, "dmr: %v\n", err) + os.Exit(1) + } + + port, err := freePort() + if err != nil { + fmt.Fprintf(os.Stderr, "dmr: failed to find free port: %v\n", err) + os.Exit(1) + } + portStr := strconv.Itoa(port) + serverURL := "http://localhost:" + portStr + + fmt.Fprintf(os.Stderr, "dmr: starting model-runner on port %d\n", port) + + server := exec.Command(serverBin) + server.Env = append(os.Environ(), "MODEL_RUNNER_PORT="+portStr) + server.Stderr = os.Stderr + server.Stdout = os.Stdout + + if err := server.Start(); err != nil { + fmt.Fprintf(os.Stderr, "dmr: failed to start model-runner: %v\n", err) + os.Exit(1) + } + defer server.Process.Kill() + + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigCh + server.Process.Kill() + }() + + if err := waitForServer(serverURL+"/", 30*time.Second); err != nil { + fmt.Fprintf(os.Stderr, "dmr: %v\n", err) + os.Exit(1) + } + + // #nosec G702 - Intentional: dmr is a CLI wrapper that forwards arguments to model-cli + cli := exec.Command(cliBin, os.Args[1:]...) 
+ cli.Env = append(os.Environ(), "MODEL_RUNNER_HOST="+serverURL) + cli.Stdin = os.Stdin + cli.Stdout = os.Stdout + cli.Stderr = os.Stderr + + if err := cli.Run(); err != nil { + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + os.Exit(exitErr.ExitCode()) + } + fmt.Fprintf(os.Stderr, "dmr: %v\n", err) + os.Exit(1) + } +}