diff --git a/compute/engine.go b/compute/engine.go index 620caac..4b9ca04 100644 --- a/compute/engine.go +++ b/compute/engine.go @@ -12,6 +12,8 @@ import ( // FusedRMSNormer is an optional interface for engines that support GPU-accelerated // fused RMSNorm. Layers can type-assert to this to use the fused kernel. // Returns (output, scales) where scales contains per-row rsqrt values for backward pass. +// +// This API is not covered by the v1 stability guarantee. type FusedRMSNormer interface { FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float32], epsilon float32) (output, scales *tensor.TensorNumeric[float32], err error) } @@ -19,6 +21,8 @@ type FusedRMSNormer interface { // PoolResetter is an optional interface for engines that use arena-based // memory pools. Call ResetPool() at the start of each forward pass to // reclaim all per-pass intermediate allocations in O(1). +// +// This API is not covered by the v1 stability guarantee. type PoolResetter interface { ResetPool() } @@ -27,6 +31,8 @@ type PoolResetter interface { // model weights to device memory at load time. This eliminates per-operation // host-to-device copies during inference. Each tensor's storage is replaced // in-place from CPUStorage to device-resident storage. +// +// This API is not covered by the v1 stability guarantee. type WeightUploader interface { UploadWeights(tensors []*tensor.TensorNumeric[float32]) error } @@ -35,12 +41,16 @@ type WeightUploader interface { // C = A * B^T without explicitly transposing B. This avoids an extra // GPU allocation and kernel launch for the transpose operation. // A is [batch, m, k], B is [batch, n, k], result is [batch, m, n]. +// +// This API is not covered by the v1 stability guarantee. 
type TransposeBMatMuler[T tensor.Numeric] interface { MatMulTransposeB(ctx context.Context, a, b *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) } // StreamProvider is an optional interface for engines that expose their // underlying GPU stream for CUDA graph capture. +// +// This API is not covered by the v1 stability guarantee. type StreamProvider interface { // Stream returns the engine's GPU stream as an unsafe.Pointer (cudaStream_t). Stream() unsafe.Pointer @@ -49,6 +59,8 @@ type StreamProvider interface { // GPUStreamAccessor is an optional interface for engines that provide their // gpuapi.Stream for async memory operations (e.g., KV cache D2D copies // during CUDA graph capture). +// +// This API is not covered by the v1 stability guarantee. type GPUStreamAccessor interface { GPUStream() gpuapi.Stream } @@ -56,6 +68,8 @@ type GPUStreamAccessor interface { // GPUArgmaxer is an optional interface for engines that can compute argmax // entirely on GPU, returning just the index without copying logits to host. // This eliminates the ~1MB D2H copy per token for greedy decoding. +// +// This API is not covered by the v1 stability guarantee. type GPUArgmaxer interface { GPUArgmax(t *tensor.TensorNumeric[float32]) (int, error) } @@ -63,6 +77,8 @@ type GPUArgmaxer interface { // FP16ToF32Converter is an optional interface for engines that can convert // a tensor with Float16Storage to a regular float32 GPU tensor. This is used // at the end of the FP16 forward pass to produce F32 logits for sampling. +// +// This API is not covered by the v1 stability guarantee. type FP16ToF32Converter interface { ConvertFP16ToF32(t *tensor.TensorNumeric[float32]) (*tensor.TensorNumeric[float32], error) } @@ -72,6 +88,8 @@ type FP16ToF32Converter interface { // supports paged attention, callers can pass block pointers and indices // instead of contiguous KV tensors. // +// This API is not covered by the v1 stability guarantee. 
+// // Q: [batch*numQHeads, headDim] // blockPtrsK: device array of float* pointers to K blocks // blockPtrsV: device array of float* pointers to V blocks diff --git a/compute/engine_proxy.go b/compute/engine_proxy.go index 7720f9a..ef91be7 100644 --- a/compute/engine_proxy.go +++ b/compute/engine_proxy.go @@ -9,6 +9,8 @@ import ( ) // TraceRecorder is the interface used by EngineProxy to record traced operations. +// +// This API is not covered by the v1 stability guarantee. type TraceRecorder[T tensor.Numeric] interface { Record(opName string, inputs []*tensor.TensorNumeric[T], output *tensor.TensorNumeric[T], extra map[string]any) RecordMultiOutput(opName string, inputs []*tensor.TensorNumeric[T], outputs []*tensor.TensorNumeric[T], extra map[string]any) diff --git a/compute/flash_decode.go b/compute/flash_decode.go index edd1017..82d8b90 100644 --- a/compute/flash_decode.go +++ b/compute/flash_decode.go @@ -15,6 +15,8 @@ import ( // O: [batch * numQHeads, headDim] — output (caller-allocated). // // Supports GQA: numQHeads must be a multiple of numKVHeads. +// +// This API is not covered by the v1 stability guarantee. func FlashDecode( Q, K, V, O []float32, batch, numQHeads, numKVHeads, kvLen, headDim int, diff --git a/compute/fused_add_rmsnorm.go b/compute/fused_add_rmsnorm.go index 9460ce9..8acc323 100644 --- a/compute/fused_add_rmsnorm.go +++ b/compute/fused_add_rmsnorm.go @@ -7,6 +7,8 @@ import ( // FusedAddRMSNormProvider is implemented by engines that support fused // residual-add + RMS normalization in a single GPU kernel launch. // This eliminates one kernel launch per fusion point (2 per transformer layer). +// +// This API is not covered by the v1 stability guarantee. 
type FusedAddRMSNormProvider[T tensor.Numeric] interface { // GPUFusedAddRMSNorm computes: // sum = input + residual diff --git a/compute/fused_norm_add.go b/compute/fused_norm_add.go index f04099b..fbe7c00 100644 --- a/compute/fused_norm_add.go +++ b/compute/fused_norm_add.go @@ -8,6 +8,8 @@ import ( // RMSNorm + elementwise Add in a single GPU kernel launch. // output = rmsnorm(input, weight, eps) + residual. // This eliminates one kernel launch per fusion point. +// +// This API is not covered by the v1 stability guarantee. type FusedNormAddProvider[T tensor.Numeric] interface { // GPUFusedNormAdd computes: // normed = rmsnorm(input, weight, eps) diff --git a/compute/fused_qk_norm_rope.go b/compute/fused_qk_norm_rope.go index 7150cc4..15cdf8e 100644 --- a/compute/fused_qk_norm_rope.go +++ b/compute/fused_qk_norm_rope.go @@ -8,6 +8,8 @@ import ( // per-head QK RMSNorm + RoPE in a single GPU kernel launch. // This replaces 4 kernel launches (Q_norm + K_norm + Q_RoPE + K_RoPE) // with 1 per GQA layer during decode. +// +// This API is not covered by the v1 stability guarantee. type FusedQKNormRoPEProvider[T tensor.Numeric] interface { // GPUFusedQKNormRoPE applies per-head RMSNorm + RoPE to combined Q+K data. // input: [totalHeads, headDim] (Q heads then K heads, contiguous). diff --git a/compute/fused_rmsnorm.go b/compute/fused_rmsnorm.go index 5200951..317ad4b 100644 --- a/compute/fused_rmsnorm.go +++ b/compute/fused_rmsnorm.go @@ -11,6 +11,8 @@ import ( // Weight shape: [D]. // Returns (output, scales) where output has same shape as input and scales // has shape [..., 1] containing the per-row rsqrt(mean(x^2)+eps) values. +// +// This API is not covered by the v1 stability guarantee. 
func FusedRMSNorm(input, weight *tensor.TensorNumeric[float32], epsilon float32) (output, scales *tensor.TensorNumeric[float32], err error) { shape := input.Shape() D := shape[len(shape)-1] diff --git a/compute/fused_rope.go b/compute/fused_rope.go index a322e19..b398755 100644 --- a/compute/fused_rope.go +++ b/compute/fused_rope.go @@ -8,12 +8,17 @@ import ( ) // FusedRoPEProvider is implemented by engines that support fused GPU RoPE. +// +// This API is not covered by the v1 stability guarantee. type FusedRoPEProvider[T tensor.Numeric] interface { GPUFusedRoPE(input, cosAngles, sinAngles *tensor.TensorNumeric[T], rotaryDim int) (*tensor.TensorNumeric[T], error) } // FusedRoPE applies rotary position embeddings in a single pass. +// +// This API is not covered by the v1 stability guarantee. +// // Input shape: [batch, seq_len, head_dim] where head_dim is even. // cos/sin shape: [seq_len, half_dim] (precomputed angles). // rotaryDim: number of dimensions that receive rotation (<= head_dim, must be even). // For each position (b, s): diff --git a/compute/fused_scaled_softmax.go b/compute/fused_scaled_softmax.go index 1002b60..5fda568 100644 --- a/compute/fused_scaled_softmax.go +++ b/compute/fused_scaled_softmax.go @@ -7,6 +7,8 @@ import ( // FusedScaledSoftmaxProvider is implemented by engines that support fused GPU scaled softmax. // It computes output = softmax(input * scale) in a single kernel launch, // eliminating the MulScalar + Softmax chain (saves 1 kernel launch per call). +// +// This API is not covered by the v1 stability guarantee. type FusedScaledSoftmaxProvider[T tensor.Numeric] interface { GPUScaledSoftmax(input *tensor.TensorNumeric[T], scale float32, axis int) (*tensor.TensorNumeric[T], error) } diff --git a/compute/fused_silugate.go b/compute/fused_silugate.go index ceef64a..a553a5c 100644 --- a/compute/fused_silugate.go +++ b/compute/fused_silugate.go @@ -11,6 +11,8 @@ import ( // SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x)).
// gate and up must have the same shape. // This avoids materializing separate sigmoid, mul, and mul intermediate tensors. +// +// This API is not covered by the v1 stability guarantee. func FusedSiLUGate(gate, up *tensor.TensorNumeric[float32]) (*tensor.TensorNumeric[float32], error) { gShape := gate.Shape() uShape := up.Shape() diff --git a/compute/fused_swiglu.go b/compute/fused_swiglu.go index 261b2a9..f3a01e2 100644 --- a/compute/fused_swiglu.go +++ b/compute/fused_swiglu.go @@ -7,6 +7,8 @@ import ( // FusedSwiGLUProvider is implemented by engines that support fused GPU SwiGLU. // It computes output[i] = w1[i] * sigmoid(w1[i]) * w3[i] in a single kernel, // eliminating the Concat + Split + sigmoid + Mul + Mul chain. +// +// This API is not covered by the v1 stability guarantee. type FusedSwiGLUProvider[T tensor.Numeric] interface { GPUFusedSwiGLU(w1, w3 *tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error) } diff --git a/compute/testable_engine.go b/compute/testable_engine.go index 22c38a6..5bb1cbd 100644 --- a/compute/testable_engine.go +++ b/compute/testable_engine.go @@ -9,8 +9,10 @@ import ( "github.com/zerfoo/ztensor/tensor" ) -// TestableEngine extends CPUEngine with methods that allow controlled error injection +// TestableEngine extends CPUEngine with methods that allow controlled error injection. // This enables testing of previously unreachable error paths. +// +// This API is not covered by the v1 stability guarantee. type TestableEngine[T tensor.Numeric] struct { *CPUEngine[T] } @@ -23,6 +25,8 @@ func NewTestableEngine[T tensor.Numeric](ops numeric.Arithmetic[T]) *TestableEng } // FailableTensor wraps a tensor and can be configured to fail on specific operations. +// +// This API is not covered by the v1 stability guarantee. 
type FailableTensor[T tensor.Numeric] struct { *tensor.TensorNumeric[T] failOnSet bool @@ -140,6 +144,8 @@ func (e *TestableEngine[T]) TestableTranspose(_ context.Context, a *tensor.Tenso } // FailableZeroer can be configured to fail on Zero operations. +// +// This API is not covered by the v1 stability guarantee. type FailableZeroer[T tensor.Numeric] struct { engine *TestableEngine[T] failZero bool diff --git a/docs/adr/001-api-stability-v1.md b/docs/adr/001-api-stability-v1.md new file mode 100644 index 0000000..7e6a0bb --- /dev/null +++ b/docs/adr/001-api-stability-v1.md @@ -0,0 +1,218 @@ +# ADR-001: API Stability Contract for ztensor v1.0.0 + +**Status:** Accepted +**Date:** 2026-03-29 +**Authors:** David Ndungu + +## Context + +The `ztensor` module (`github.com/zerfoo/ztensor`) is the tensor, compute, and graph foundation for the Zerfoo ML framework. Downstream consumers — primarily `github.com/zerfoo/zerfoo` — depend heavily on its exported surface. Before tagging v1.0.0 we need a clear contract defining which APIs are covered by the Go compatibility promise (no breaking changes without a v2 major version bump) and which may evolve in minor releases. + +## Decision + +### Stable Surface (v1 compatibility guarantee) + +The following packages and their exported symbols are **stable**. Breaking changes to these APIs require a v2 major version. + +#### `compute` — Engine interface and CPU/GPU implementations + +| Symbol | Kind | Description | +|--------|------|-------------| +| `Engine[T]` | interface | Core computation engine — all methods | +| `CPUEngine[T]` | struct | CPU engine implementation | +| `NewCPUEngine[T]` | constructor | | +| `GPUEngine[T]` | struct | GPU (CUDA) engine implementation | +| `NewGPUEngine[T]` | constructor | | +| `EngineProxy[T]` | struct | Engine wrapper/proxy | +| `NewEngineProxy[T]` | constructor | | +| `DType` | type | Data type enum | +| `DTypeF32`, etc. 
| constants | DType values | +| `DefaultMaxAllocBytes` | constant | Default memory limit | +| `ErrMemoryLimitExceeded` | var | Sentinel error | + +The following `compute` symbols are exported but **not** covered by the v1 stability guarantee (they support specialised kernel paths, GPU internals, or testing infrastructure): + +| Symbol | Kind | Reason unstable | +|--------|------|-----------------| +| `FusedAddRMSNormProvider[T]` | interface | Fusion provider — kernel interface may evolve | +| `FusedNormAddProvider[T]` | interface | Fusion provider | +| `FusedQKNormRoPEProvider[T]` | interface | Fusion provider | +| `FusedRMSNormer` | interface | Fusion provider | +| `FusedRoPEProvider[T]` | interface | Fusion provider | +| `FusedScaledSoftmaxProvider[T]` | interface | Fusion provider | +| `FusedSwiGLUProvider[T]` | interface | Fusion provider | +| `FusedRMSNorm` | func | Standalone fused op | +| `FusedRoPE` | func | Standalone fused op | +| `FusedSiLUGate` | func | Standalone fused op | +| `FlashDecode` | func | Flash attention kernel entry point | +| `FlashDecodeSplitKV` | func | Flash attention kernel entry point | +| `GPUArgmaxer` | interface | GPU-specific capability | +| `GPUStreamAccessor` | interface | GPU-specific capability | +| `PagedGQAer` | interface | GPU-specific capability | +| `PoolResetter` | interface | GPU-specific capability | +| `StreamProvider` | interface | GPU-specific capability | +| `WeightUploader` | interface | GPU-specific capability | +| `FP16ToF32Converter` | interface | GPU-specific capability | +| `TransposeBMatMuler[T]` | interface | GPU-specific capability | +| `W4A16MatMuler[T]` | interface | Quantisation kernel interface | +| `W4A16Precision` | struct | Quantisation detail | +| `W4A16Info[T]` | func | Quantisation detail | +| `IsW4A16[T]` | func | Quantisation detail | +| `MatMulW4A16[T]` | func | Quantisation kernel | +| `TryW4A16MatMul[T]` | func | Quantisation kernel | +| `DequantW4ToFP16` | func | Quantisation kernel | +| 
`ComputeAmax[T]` | func | FP8 helper | +| `ScaleForFP8[T]` | func | FP8 helper | +| `QuantFormat[T]` | func | Quantisation helper | +| `HadamardMatrix[T]` | func | Specialised math | +| `TernaryGEMV` | func | Ternary kernel | +| `TernaryGEMVGPU` | func | Ternary GPU kernel | +| `HardwareProfile` | struct | Hardware profiling | +| `ProfileHardware` | func | Hardware profiling | +| `MemoryTracker` | struct | Memory tracking | +| `NewMemoryTracker` | constructor | Memory tracking | +| `TensorArena` | struct | Arena allocator | +| `TensorPool[T]` | struct | Pool allocator | +| `NewTensorPool[T]` | constructor | Pool allocator | +| `FailableTensor[T]` | struct | Testing utility | +| `NewFailableTensor[T]` | constructor | Testing utility | +| `FailableZeroer[T]` | struct | Testing utility | +| `NewFailableZeroer[T]` | constructor | Testing utility | +| `TestableEngine[T]` | struct | Testing utility | +| `NewTestableEngine[T]` | constructor | Testing utility | +| `TraceRecorder[T]` | interface | Tracing/debugging | +| `TracedOp` | struct | Tracing/debugging | +| `Tracer[T]` | struct | Tracing/debugging | +| `NewTracer[T]` | constructor | Tracing/debugging | + +#### `tensor` — Tensor types and storage + +| Symbol | Kind | Description | +|--------|------|-------------| +| `Numeric` | interface constraint | Core type constraint for all numeric types | +| `Float` | interface constraint | Floating-point subset of Numeric | +| `Addable` | interface constraint | Types supporting addition | +| `TensorNumeric[T]` | struct | Primary tensor type | +| `New[T]` | constructor | Create tensor from data | +| `NewFromBytes[T]` | constructor | Create tensor from bytes | +| `NewWithStorage[T]` | constructor | Create tensor with custom storage | +| `ToCPU[T]` | func | Transfer tensor to CPU | +| `ToGPU[T]` | func | Transfer tensor to GPU | +| `ToGPUDevice[T]` | func | Transfer tensor to specific GPU | +| `Tensor` | interface | Non-generic tensor interface | +| `NewFromType` | constructor | 
Create tensor from reflect.Type | +| `TensorBool` | struct | Boolean tensor | +| `NewBool` | constructor | | +| `TensorString` | struct | String tensor | +| `NewString` | constructor | | +| `Storage[T]` | interface | Storage backend interface | +| `CPUStorage[T]` | struct | CPU storage | +| `NewCPUStorage[T]` | constructor | | +| `GPUStorage[T]` | struct | GPU storage | +| `NewGPUStorage[T]`, `NewGPUStorageFromSlice[T]`, etc. | constructors | GPU storage constructors | +| `Equals[T]` | func | Tensor equality | +| `AssertClose[T]` | func | Testing helper | +| `AssertEquals[T]` | func | Testing helper | +| `BroadcastShapes` | func | Shape broadcasting | +| `BroadcastIndex` | func | Index broadcasting | +| `SameShape`, `ShapesEqual` | funcs | Shape comparison | +| `Product` | func | Shape product | +| `ConvertInt64ToInt`, `ConvertIntToInt64` | funcs | Index conversion | + +The following `tensor` symbols are exported but **not** covered by the v1 stability guarantee (quantisation storage types, mmap internals, GGML type enums): + +| Symbol | Kind | Reason unstable | +|--------|------|-----------------| +| `GGMLType` | type + constants | GGML format detail | +| `Q4Storage`, `Q4KStorage`, `Q5KStorage`, `Q5_0Storage`, `Q6KStorage`, `Q8Storage` | structs | Quantisation storage — format may evolve | +| `IQ2XXSStorage`, `IQ3SStorage`, `IQ4NLStorage` | structs | Quantisation storage | +| `AWQStorage`, `GPTQStorage`, `NF4Storage`, `NVFloat4Storage` | structs | Quantisation storage | +| `W8A8Storage` | struct | Quantisation storage | +| `Float16Storage`, `BFloat16Storage` | structs | Precision storage | +| `FP8E4M3Storage`, `FP8E5M2Storage` | structs | FP8 storage | +| `TernaryStorage` | struct | Ternary storage | +| `MmapStorage` | struct | Mmap-backed storage | +| `Dequantizer` | interface | Quantisation registry | +| `RegisterQuantType`, `GetQuantType`, `ListQuantTypes` | funcs | Quantisation registry | +| `DequantizeQ4K`, `DequantizeQ5K`, `DequantizeQ5_0`, `DequantizeQ6K`, 
`DequantizeIQ3S`, `DequantizeIQ4NL` | funcs | Dequantisation | +| `QuantizeQ4`, `QuantizeQ8`, `QuantizeAWQ`, `QuantizeGPTQ`, `QuantizeW8A8` | funcs | Quantisation | +| `GemmW8A8`, `GemmW8A8NT`, `GemmF32W8A8NT` | funcs | Quantised GEMM kernels | +| `IQ4NLTable` | var | Lookup table | +| `Mmap`, `MmapFile`, `Munmap` | funcs | Memory-mapped I/O | +| `MadviseSequential`, `MadviseRandom`, `MadviseWillNeed`, `MadviseDontNeed` | funcs | Madvise helpers | +| `Float32ToBytes`, `Int8ToBytes`, `Uint8ToBytes` | funcs | Byte conversion | +| `Q4GPUDataOffset`, `Q4GPUScaleOffset` | funcs | GPU quantisation layout | +| `MergeQ4Storage`, `MergeQ4KStorage`, `MergeQ6KStorage`, `MergeIQ4NLStorage` | funcs | Storage merging | + +#### `device` — Device abstraction + +| Symbol | Kind | Description | +|--------|------|-------------| +| `Device` | interface | Device abstraction — all methods | +| `Get` | func | Device lookup by ID | +| `Type` | type | Device type enum | +| `CPU` | constant | CPU device type | +| `Allocator` | interface | Memory allocator | +| `NewCPUAllocator` | constructor | | +| `NewCUDAAllocator` | constructor | | + +All symbols in `device` are stable. 
+ +#### `numeric` — Arithmetic operations + +| Symbol | Kind | Description | +|--------|------|-------------| +| `Arithmetic[T]` | interface | Core arithmetic interface | +| `Float32Ops` | struct | float32 arithmetic | +| `Float64Ops` | struct | float64 arithmetic | +| `Float16Ops` | struct | float16 arithmetic | +| `BFloat16Ops` | struct | bfloat16 arithmetic | +| `Float8Ops` | struct | float8 arithmetic | +| `Int8Ops` | struct | int8 arithmetic | +| `IntOps` | struct | int arithmetic | +| `Uint8Ops` | struct | uint8 arithmetic | +| `QuantizationConfig` | struct | Quantisation parameters | +| `ComputeQuantizationParams` | func | Compute quantisation scale/zero-point | +| `NewQuantizationConfig` | func | | +| `Pack4BitSlice`, `Unpack4BitSlice` | funcs | 4-bit packing | +| `Pack4BitWeights`, `Unpack4BitWeights` | funcs | 4-bit packing | + +The following `numeric` symbols are testing utilities and **not** covered by the v1 stability guarantee: + +| Symbol | Kind | Reason unstable | +|--------|------|-----------------| +| `TestArithmeticOp[T]` | func | Testing helper | +| `TestUnaryOp[T]` | func | Testing helper | +| `TestLeakyReLUOp[T]` | func | Testing helper | +| `TestSumOp[T]` | func | Testing helper | +| `ArithmeticTestCase[T]` | struct | Testing helper | +| `UnaryTestCase[T]` | struct | Testing helper | +| `LeakyReLUTestCase[T]` | struct | Testing helper | +| `SumTestCase[T]` | struct | Testing helper | +| `Float16TestData` | func | Testing helper | +| `Float8TestData` | func | Testing helper | + +### Explicitly Unstable Packages + +The following packages are **not** covered by the v1 stability guarantee. They may change in minor versions. 
+ +| Package | Reason | +|---------|--------| +| `internal/*` | Go internal convention — not importable outside module | +| `graph/` | Computation graph compilation pipeline is still evolving | +| `graph/kv/` | KV cache management — API actively changing | +| `batched/` | Batched multi-model inference — new, API not yet settled | +| `gguf/` | GGUF writer — low-level format utility | +| `log/` | Logging abstraction — may be replaced | +| `metrics/` | Metrics and correlation functions | +| `metrics/runtime/` | Runtime metrics collection | +| `types/` | Shared types (e.g. `BackwardMode`) — may be reorganised | + +Apart from `internal/*` (which cannot be imported outside the module), these packages are used by `github.com/zerfoo/zerfoo` and must remain exported, but their APIs may change in ztensor v1.x minor releases. + +## Consequences + +1. **v1.0.0 tag** — Once tagged, the four stable packages (`compute`, `tensor`, `device`, and `numeric`, limited to their stable symbols as listed above) follow Go module compatibility: no breaking changes until v2. +2. **Unstable packages** — Consumers of `graph`, `batched`, `gguf`, `log`, `metrics`, and `types` should pin to exact ztensor versions and expect potential breakage on minor upgrades. +3. **Documentation** — Unstable symbols in stable packages carry a doc comment: `// This API is not covered by the v1 stability guarantee.` +4. **Future promotion** — As unstable packages mature (especially `graph/`), they may be promoted to stable in a future minor release. Promotion is additive and never breaking.