Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,28 @@ DOCKER_BUILD_ARGS := \
--target $(DOCKER_TARGET) \
-t $(DOCKER_IMAGE)

# Test configuration
BUILD_DMR ?= 1

# Phony targets grouped by category
.PHONY: build run clean test integration-tests build-cli install-cli
.PHONY: build build-cli build-dmr install-cli run clean test integration-tests
.PHONY: validate validate-all lint help
.PHONY: docker-build docker-build-multiplatform docker-run docker-run-impl
.PHONY: docker-build-vllm docker-run-vllm docker-build-sglang docker-run-sglang
.PHONY: test-docker-ce-installation
.PHONY: vllm-metal-build vllm-metal-install vllm-metal-dev vllm-metal-clean
.PHONY: diffusers-build diffusers-install diffusers-dev diffusers-clean
# Default target
# Default target: build server, CLI plugin, and dmr convenience wrapper
.DEFAULT_GOAL := build

# Build the Go application
build:
build: build-server build-cli build-dmr

build-server:
CGO_ENABLED=1 go build -ldflags="-s -w -X main.Version=$(shell git describe --tags --always --dirty --match 'v*')" -o $(APP_NAME) .

build-cli:
$(MAKE) -C cmd/cli

build-dmr:
go build -ldflags="-s -w" -o dmr ./cmd/dmr

install-cli:
$(MAKE) -C cmd/cli install

Expand All @@ -61,6 +62,7 @@ run: build
# Clean build artifacts
clean:
rm -f $(APP_NAME)
rm -f dmr
rm -f model-runner.sock

# Run tests
Expand All @@ -77,7 +79,7 @@ integration-tests:
echo "$$INVALID_TESTS" | sed 's/func \([^(]*\).*/\1/'; \
exit 1; \
fi
@BUILD_DMR=$(BUILD_DMR) go test -v -race -count=1 -tags=integration -run "^TestIntegration" -timeout=5m ./cmd/cli/commands
go test -v -race -count=1 -tags=integration -run "^TestIntegration" -timeout=5m ./cmd/cli/commands
@echo "Integration tests completed!"

test-docker-ce-installation:
Expand Down Expand Up @@ -308,7 +310,8 @@ diffusers-clean:

help:
@echo "Available targets:"
@echo " build - Build the Go application"
@echo " build - Build server, CLI plugin, and dmr wrapper (default)"
@echo " build-server - Build the model-runner server"
@echo " build-cli - Build the CLI (docker-model plugin)"
@echo " install-cli - Build and install the CLI as a Docker plugin"
@echo " docs - Generate CLI documentation"
Expand Down
156 changes: 124 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,60 +96,40 @@ Before building from source, ensure you have the following installed:

### Building the Complete Stack

#### Step 1: Clone and Build model-runner (Server/Daemon)
After cloning, a single `make` builds everything — the server, CLI plugin, and a `dmr` convenience wrapper:

```bash
# Clone the model-runner repository
git clone https://github.com/docker/model-runner.git
cd model-runner

# Build the model-runner binary
make build

# Or build with specific backend arguments
make run LLAMA_ARGS="--verbose --jinja -ngl 999 --ctx-size 2048"

# Run tests to verify the build
make test
make
```

The `model-runner` binary will be created in the current directory. This is the backend server that manages models.

#### Step 2: Build model-cli (Client)
`dmr` starts the server on a free port, waits for it to be ready, runs your CLI command, then shuts the server down:

```bash
# From the root directory, navigate to the model-cli directory
cd cmd/cli

# Build the CLI binary
make build

# The binary will be named 'model-cli'
# Optionally, install it as a Docker CLI plugin
make install # This will link it to ~/.docker/cli-plugins/docker-model
./dmr run ai/smollm2 "Hello, how are you?"
./dmr ls
./dmr run qwen3:0.6B-Q4_0 tell me today's news
```
These components can also be built, run, and tested separately using the Makefile.
### Testing the Complete Stack End-to-End
> **Note:** We use port 13434 in these examples to avoid conflicts with Docker Desktop's built-in Model Runner, which typically runs on port 12434.

#### Option 1: Local Development (Recommended for Contributors)
#### Option 1: Manual two-terminal setup

1. **Start model-runner in one terminal:**
```bash
cd model-runner
MODEL_RUNNER_PORT=13434 ./model-runner
# The server will start on port 13434
```

2. **Use model-cli in another terminal:**
```bash
cd cmd/cli
# List available models (connecting to port 13434)
MODEL_RUNNER_HOST=http://localhost:13434 ./model-cli list
# List available models
MODEL_RUNNER_HOST=http://localhost:13434 ./cmd/cli/model-cli list
# Pull and run a model
MODEL_RUNNER_HOST=http://localhost:13434 ./model-cli run ai/smollm2 "Hello, how are you?"
MODEL_RUNNER_HOST=http://localhost:13434 ./cmd/cli/model-cli run ai/smollm2 "Hello, how are you?"
```

#### Option 2: Using Docker
Expand Down Expand Up @@ -422,6 +402,118 @@ in the form of [a Helm chart and static YAML](charts/docker-model-runner/README.
If you are interested in a specific Kubernetes use-case, please start a
discussion on the issue tracker.
## dmrlet: Container Orchestrator for AI Inference
dmrlet is a purpose-built container orchestrator for AI inference workloads. Unlike Kubernetes, it focuses exclusively on running stateless inference containers with zero configuration overhead. Multi-GPU mapping "just works" without YAML, device plugins, or node selectors.
### Key Features
| Feature | Kubernetes | dmrlet |
|---------|------------|--------|
| Multi-GPU setup | Device plugins + node selectors + resource limits YAML | `dmrlet serve llama3 --gpus all` |
| Config overhead | 50+ lines of YAML minimum | Zero YAML, CLI-only |
| Time to first inference | Minutes (pod scheduling, image pull) | Seconds (model already local) |
| Model management | External (mount PVCs, manage yourself) | Integrated with Docker Model Runner store |
### Building dmrlet
```bash
# Build the dmrlet binary
go build -o dmrlet ./cmd/dmrlet
# Verify it works
./dmrlet --help
```
### Usage
**Start the daemon:**
```bash
# Start in foreground
dmrlet daemon
# With custom socket path
dmrlet daemon --socket /tmp/dmrlet.sock
```
**Serve a model:**
```bash
# Auto-detect backend and GPUs
dmrlet serve llama3.2
# Specify backend
dmrlet serve llama3.2 --backend vllm
# Specify GPU allocation
dmrlet serve llama3.2 --gpus 0,1
dmrlet serve llama3.2 --gpus all
# Multiple replicas
dmrlet serve llama3.2 --replicas 2
# Backend-specific options
dmrlet serve llama3.2 --ctx-size 4096 # llama.cpp context size
dmrlet serve llama3.2 --gpu-memory 0.8 # vLLM GPU memory utilization
```
**List running models:**
```bash
dmrlet ps
# MODEL BACKEND REPLICAS GPUS ENDPOINTS STATUS
# llama3.2 llama.cpp 1 [0,1,2,3] localhost:30000 healthy
```
**View logs:**
```bash
dmrlet logs llama3.2 # Last 100 lines
dmrlet logs llama3.2 -f # Follow logs
```
**Scale replicas:**
```bash
dmrlet scale llama3.2 4 # Scale to 4 replicas
```
**Stop a model:**
```bash
dmrlet stop llama3.2
dmrlet stop --all # Stop all models
```
**Check status:**
```bash
dmrlet status
# DAEMON: running
# SOCKET: /var/run/dmrlet.sock
#
# GPUs:
# GPU 0: NVIDIA A100 80GB 81920MB (in use: llama3.2)
# GPU 1: NVIDIA A100 80GB 81920MB (available)
#
# MODELS: 1 running
```
### Supported Backends
- **llama.cpp** - Default backend for GGUF models
- **vLLM** - High-throughput serving for safetensors models
- **SGLang** - Fast serving with RadixAttention
### Architecture
```
dmrlet daemon
├── GPU Manager - Auto-detect and allocate GPUs
├── Container Manager - Docker-based container lifecycle
├── Service Registry - Endpoint discovery with load balancing
├── Health Monitor - Auto-restart unhealthy containers
├── Auto-scaler - Scale based on QPS/latency/GPU utilization
└── Log Aggregator - Centralized log collection
```
## Community
For general questions and discussion, please use [Docker Model Runner's Slack channel](https://dockercommunity.slack.com/archives/C09H9P5E57B).
Expand Down
130 changes: 130 additions & 0 deletions cmd/dmr/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
// dmr is a developer convenience wrapper that starts the model-runner server on
// a free port and runs a model-cli command against it in one step.
//
// Usage: dmr <cli-args...>
//
// Example: dmr run qwen3:0.6B-Q4_0 tell me today's news
package main

import (
	"errors"
	"fmt"
	"net"
	"net/http"
	"os"
	"os/exec"
	"os/signal"
	"path/filepath"
	"strconv"
	"sync"
	"syscall"
	"time"
)

// freePort asks the kernel for an unused TCP port by binding a
// listener to 127.0.0.1:0, then releases the listener and returns
// the port number that was assigned.
//
// Note: there is an inherent (and accepted) race — the port could be
// taken by another process between Close and our server binding it.
func freePort() (int, error) {
	listener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return 0, err
	}
	port := listener.Addr().(*net.TCPAddr).Port
	listener.Close()
	return port, nil
}

func waitForServer(url string, timeout time.Duration) error {
client := &http.Client{Timeout: time.Second}
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) {
resp, err := client.Get(url)
if err == nil {
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
return nil
}
}
time.Sleep(200 * time.Millisecond)
}
return fmt.Errorf("server not ready after %s", timeout)
}

func checkBinary(path, name, expectedLayout string) error {
if _, err := os.Stat(path); os.IsNotExist(err) {
return fmt.Errorf("missing %s binary at %s\n\nExpected directory layout:\n%s\n\nPlease run 'make build' to build all binaries", name, path, expectedLayout)
}
return nil
}

// fatal prints a dmr-prefixed error to stderr and exits with status 1.
func fatal(err error) {
	fmt.Fprintf(os.Stderr, "dmr: %v\n", err)
	os.Exit(1)
}

// main locates the sibling model-runner and model-cli binaries, starts
// the server on a free port, waits for it to become ready, forwards the
// command-line arguments to model-cli against that server, and finally
// shuts the server down, propagating the CLI's exit code.
func main() {
	self, err := os.Executable()
	if err != nil {
		fatal(err)
	}
	dir := filepath.Dir(self)

	serverBin := filepath.Join(dir, "model-runner")
	cliBin := filepath.Join(dir, "cmd", "cli", "model-cli")

	expectedLayout := fmt.Sprintf(`%s/
├── model-runner (server binary)
├── dmr (this wrapper)
└── cmd/
    └── cli/
        └── model-cli (CLI binary)`, dir)

	// No child processes exist yet, so fatal's os.Exit is safe here.
	if err := checkBinary(serverBin, "model-runner", expectedLayout); err != nil {
		fatal(err)
	}
	if err := checkBinary(cliBin, "model-cli", expectedLayout); err != nil {
		fatal(err)
	}

	port, err := freePort()
	if err != nil {
		fatal(fmt.Errorf("failed to find free port: %w", err))
	}
	portStr := strconv.Itoa(port)
	serverURL := "http://localhost:" + portStr

	fmt.Fprintf(os.Stderr, "dmr: starting model-runner on port %d\n", port)

	server := exec.Command(serverBin)
	server.Env = append(os.Environ(), "MODEL_RUNNER_PORT="+portStr)
	server.Stderr = os.Stderr
	server.Stdout = os.Stdout

	if err := server.Start(); err != nil {
		fatal(fmt.Errorf("failed to start model-runner: %w", err))
	}

	// stopServer kills AND reaps the server exactly once. It must be
	// called explicitly before every os.Exit below: os.Exit does not run
	// deferred functions, so the previous `defer server.Process.Kill()`
	// orphaned the server on all error/exit paths. The Wait() also
	// prevents leaving a zombie process behind.
	var stopOnce sync.Once
	stopServer := func() {
		stopOnce.Do(func() {
			_ = server.Process.Kill()
			_, _ = server.Process.Wait()
		})
	}
	// Covers the normal-return path (and panics).
	defer stopServer()

	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-sigCh
		// The CLI child receives the terminal signal itself and exits on
		// its own; we only need to take the server down.
		stopServer()
	}()

	if err := waitForServer(serverURL+"/", 30*time.Second); err != nil {
		stopServer()
		fatal(err)
	}

	// #nosec G204 -- intentional: dmr is a CLI wrapper that forwards its
	// arguments verbatim to model-cli. (G702 is not a valid gosec rule.)
	cli := exec.Command(cliBin, os.Args[1:]...)
	cli.Env = append(os.Environ(), "MODEL_RUNNER_HOST="+serverURL)
	cli.Stdin = os.Stdin
	cli.Stdout = os.Stdout
	cli.Stderr = os.Stderr

	cliErr := cli.Run()
	// Shut the server down before exiting — the os.Exit calls below
	// would otherwise skip the deferred cleanup.
	stopServer()
	if cliErr != nil {
		var exitErr *exec.ExitError
		if errors.As(cliErr, &exitErr) {
			os.Exit(exitErr.ExitCode())
		}
		fatal(cliErr)
	}
}
Loading