diff --git a/compute/engine.go b/compute/engine.go
index 620caac..4b9ca04 100644
--- a/compute/engine.go
+++ b/compute/engine.go
@@ -12,6 +12,8 @@ import (
 // FusedRMSNormer is an optional interface for engines that support GPU-accelerated
 // fused RMSNorm. Layers can type-assert to this to use the fused kernel.
 // Returns (output, scales) where scales contains per-row rsqrt values for backward pass.
+//
+// This API is not covered by the v1 stability guarantee.
 type FusedRMSNormer interface {
 	FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float32], epsilon float32) (output, scales *tensor.TensorNumeric[float32], err error)
 }
@@ -19,6 +21,8 @@ type FusedRMSNormer interface {
 // PoolResetter is an optional interface for engines that use arena-based
 // memory pools. Call ResetPool() at the start of each forward pass to
 // reclaim all per-pass intermediate allocations in O(1).
+//
+// This API is not covered by the v1 stability guarantee.
 type PoolResetter interface {
 	ResetPool()
 }
@@ -27,6 +31,8 @@ type PoolResetter interface {
 // model weights to device memory at load time. This eliminates per-operation
 // host-to-device copies during inference. Each tensor's storage is replaced
 // in-place from CPUStorage to device-resident storage.
+//
+// This API is not covered by the v1 stability guarantee.
 type WeightUploader interface {
 	UploadWeights(tensors []*tensor.TensorNumeric[float32]) error
 }
@@ -35,12 +41,16 @@ type WeightUploader interface {
 // C = A * B^T without explicitly transposing B. This avoids an extra
 // GPU allocation and kernel launch for the transpose operation.
 // A is [batch, m, k], B is [batch, n, k], result is [batch, m, n].
+//
+// This API is not covered by the v1 stability guarantee.
 type TransposeBMatMuler[T tensor.Numeric] interface {
 	MatMulTransposeB(ctx context.Context, a, b *tensor.TensorNumeric[T], dst ...*tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
 }
 
 // StreamProvider is an optional interface for engines that expose their
 // underlying GPU stream for CUDA graph capture.
+//
+// This API is not covered by the v1 stability guarantee.
 type StreamProvider interface {
 	// Stream returns the engine's GPU stream as an unsafe.Pointer (cudaStream_t).
 	Stream() unsafe.Pointer
@@ -49,6 +59,8 @@ type StreamProvider interface {
 // GPUStreamAccessor is an optional interface for engines that provide their
 // gpuapi.Stream for async memory operations (e.g., KV cache D2D copies
 // during CUDA graph capture).
+//
+// This API is not covered by the v1 stability guarantee.
 type GPUStreamAccessor interface {
 	GPUStream() gpuapi.Stream
 }
@@ -56,6 +68,8 @@ type GPUStreamAccessor interface {
 // GPUArgmaxer is an optional interface for engines that can compute argmax
 // entirely on GPU, returning just the index without copying logits to host.
 // This eliminates the ~1MB D2H copy per token for greedy decoding.
+//
+// This API is not covered by the v1 stability guarantee.
 type GPUArgmaxer interface {
 	GPUArgmax(t *tensor.TensorNumeric[float32]) (int, error)
 }
@@ -63,6 +77,8 @@ type GPUArgmaxer interface {
 // FP16ToF32Converter is an optional interface for engines that can convert
 // a tensor with Float16Storage to a regular float32 GPU tensor. This is used
 // at the end of the FP16 forward pass to produce F32 logits for sampling.
+//
+// This API is not covered by the v1 stability guarantee.
 type FP16ToF32Converter interface {
 	ConvertFP16ToF32(t *tensor.TensorNumeric[float32]) (*tensor.TensorNumeric[float32], error)
 }
@@ -72,6 +88,8 @@ type FP16ToF32Converter interface {
 // supports paged attention, callers can pass block pointers and indices
 // instead of contiguous KV tensors.
 //
+// This API is not covered by the v1 stability guarantee.
+//
 // Q:            [batch*numQHeads, headDim]
 // blockPtrsK:   device array of float* pointers to K blocks
 // blockPtrsV:   device array of float* pointers to V blocks
diff --git a/compute/engine_proxy.go b/compute/engine_proxy.go
index 7720f9a..ef91be7 100644
--- a/compute/engine_proxy.go
+++ b/compute/engine_proxy.go
@@ -9,6 +9,8 @@ import (
 )
 
 // TraceRecorder is the interface used by EngineProxy to record traced operations.
+//
+// This API is not covered by the v1 stability guarantee.
 type TraceRecorder[T tensor.Numeric] interface {
 	Record(opName string, inputs []*tensor.TensorNumeric[T], output *tensor.TensorNumeric[T], extra map[string]any)
 	RecordMultiOutput(opName string, inputs []*tensor.TensorNumeric[T], outputs []*tensor.TensorNumeric[T], extra map[string]any)
diff --git a/compute/flash_decode.go b/compute/flash_decode.go
index edd1017..82d8b90 100644
--- a/compute/flash_decode.go
+++ b/compute/flash_decode.go
@@ -15,6 +15,8 @@ import (
 // O:  [batch * numQHeads, headDim]         — output (caller-allocated).
 //
 // Supports GQA: numQHeads must be a multiple of numKVHeads.
+//
+// This API is not covered by the v1 stability guarantee.
 func FlashDecode(
 	Q, K, V, O []float32,
 	batch, numQHeads, numKVHeads, kvLen, headDim int,
diff --git a/compute/fused_add_rmsnorm.go b/compute/fused_add_rmsnorm.go
index 9460ce9..8acc323 100644
--- a/compute/fused_add_rmsnorm.go
+++ b/compute/fused_add_rmsnorm.go
@@ -7,6 +7,8 @@ import (
 // FusedAddRMSNormProvider is implemented by engines that support fused
 // residual-add + RMS normalization in a single GPU kernel launch.
 // This eliminates one kernel launch per fusion point (2 per transformer layer).
+//
+// This API is not covered by the v1 stability guarantee.
 type FusedAddRMSNormProvider[T tensor.Numeric] interface {
 	// GPUFusedAddRMSNorm computes:
 	//   sum    = input + residual
diff --git a/compute/fused_norm_add.go b/compute/fused_norm_add.go
index f04099b..fbe7c00 100644
--- a/compute/fused_norm_add.go
+++ b/compute/fused_norm_add.go
@@ -8,6 +8,8 @@ import (
 // RMSNorm + elementwise Add in a single GPU kernel launch.
 // output = rmsnorm(input, weight, eps) + residual.
 // This eliminates one kernel launch per fusion point.
+//
+// This API is not covered by the v1 stability guarantee.
 type FusedNormAddProvider[T tensor.Numeric] interface {
 	// GPUFusedNormAdd computes:
 	//   normed = rmsnorm(input, weight, eps)
diff --git a/compute/fused_qk_norm_rope.go b/compute/fused_qk_norm_rope.go
index 7150cc4..15cdf8e 100644
--- a/compute/fused_qk_norm_rope.go
+++ b/compute/fused_qk_norm_rope.go
@@ -8,6 +8,8 @@ import (
 // per-head QK RMSNorm + RoPE in a single GPU kernel launch.
 // This replaces 4 kernel launches (Q_norm + K_norm + Q_RoPE + K_RoPE)
 // with 1 per GQA layer during decode.
+//
+// This API is not covered by the v1 stability guarantee.
 type FusedQKNormRoPEProvider[T tensor.Numeric] interface {
 	// GPUFusedQKNormRoPE applies per-head RMSNorm + RoPE to combined Q+K data.
 	// input: [totalHeads, headDim] (Q heads then K heads, contiguous).
diff --git a/compute/fused_rmsnorm.go b/compute/fused_rmsnorm.go
index 5200951..317ad4b 100644
--- a/compute/fused_rmsnorm.go
+++ b/compute/fused_rmsnorm.go
@@ -11,6 +11,8 @@ import (
 // Weight shape: [D].
 // Returns (output, scales) where output has same shape as input and scales
 // has shape [..., 1] containing the per-row rsqrt(mean(x^2)+eps) values.
+//
+// This API is not covered by the v1 stability guarantee.
 func FusedRMSNorm(input, weight *tensor.TensorNumeric[float32], epsilon float32) (output, scales *tensor.TensorNumeric[float32], err error) {
 	shape := input.Shape()
 	D := shape[len(shape)-1]
diff --git a/compute/fused_rope.go b/compute/fused_rope.go
index a322e19..b398755 100644
--- a/compute/fused_rope.go
+++ b/compute/fused_rope.go
@@ -8,12 +8,16 @@ import (
 )
 
 // FusedRoPEProvider is implemented by engines that support fused GPU RoPE.
+//
+// This API is not covered by the v1 stability guarantee.
 type FusedRoPEProvider[T tensor.Numeric] interface {
 	GPUFusedRoPE(input, cosAngles, sinAngles *tensor.TensorNumeric[T], rotaryDim int) (*tensor.TensorNumeric[T], error)
 }
 
 // FusedRoPE applies rotary position embeddings in a single pass.
 // Input shape: [batch, seq_len, head_dim] where head_dim is even.
+//
+// This API is not covered by the v1 stability guarantee.
 // cos/sin shape: [seq_len, half_dim] (precomputed angles).
 // rotaryDim: number of dimensions that receive rotation (<= head_dim, must be even).
 // For each position (b, s):
diff --git a/compute/fused_scaled_softmax.go b/compute/fused_scaled_softmax.go
index 1002b60..5fda568 100644
--- a/compute/fused_scaled_softmax.go
+++ b/compute/fused_scaled_softmax.go
@@ -7,6 +7,8 @@ import (
 // FusedScaledSoftmaxProvider is implemented by engines that support fused GPU scaled softmax.
 // It computes output = softmax(input * scale) in a single kernel launch,
 // eliminating the MulScalar + Softmax chain (saves 1 kernel launch per call).
+//
+// This API is not covered by the v1 stability guarantee.
 type FusedScaledSoftmaxProvider[T tensor.Numeric] interface {
 	GPUScaledSoftmax(input *tensor.TensorNumeric[T], scale float32, axis int) (*tensor.TensorNumeric[T], error)
 }
diff --git a/compute/fused_silugate.go b/compute/fused_silugate.go
index ceef64a..a553a5c 100644
--- a/compute/fused_silugate.go
+++ b/compute/fused_silugate.go
@@ -11,6 +11,8 @@ import (
 // SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x)).
 // gate and up must have the same shape.
 // This avoids materializing separate sigmoid, mul, and mul intermediate tensors.
+//
+// This API is not covered by the v1 stability guarantee.
 func FusedSiLUGate(gate, up *tensor.TensorNumeric[float32]) (*tensor.TensorNumeric[float32], error) {
 	gShape := gate.Shape()
 	uShape := up.Shape()
diff --git a/compute/fused_swiglu.go b/compute/fused_swiglu.go
index 261b2a9..f3a01e2 100644
--- a/compute/fused_swiglu.go
+++ b/compute/fused_swiglu.go
@@ -7,6 +7,8 @@ import (
 // FusedSwiGLUProvider is implemented by engines that support fused GPU SwiGLU.
 // It computes output[i] = w1[i] * sigmoid(w1[i]) * w3[i] in a single kernel,
 // eliminating the Concat + Split + sigmoid + Mul + Mul chain.
+//
+// This API is not covered by the v1 stability guarantee.
 type FusedSwiGLUProvider[T tensor.Numeric] interface {
 	GPUFusedSwiGLU(w1, w3 *tensor.TensorNumeric[T]) (*tensor.TensorNumeric[T], error)
 }
diff --git a/compute/testable_engine.go b/compute/testable_engine.go
index 22c38a6..5bb1cbd 100644
--- a/compute/testable_engine.go
+++ b/compute/testable_engine.go
@@ -9,8 +9,10 @@ import (
 	"github.com/zerfoo/ztensor/tensor"
 )
 
-// TestableEngine extends CPUEngine with methods that allow controlled error injection
+// TestableEngine extends CPUEngine with methods that allow controlled error injection.
 // This enables testing of previously unreachable error paths.
+//
+// This API is not covered by the v1 stability guarantee.
 type TestableEngine[T tensor.Numeric] struct {
 	*CPUEngine[T]
 }
@@ -23,6 +25,8 @@ func NewTestableEngine[T tensor.Numeric](ops numeric.Arithmetic[T]) *TestableEng
 }
 
 // FailableTensor wraps a tensor and can be configured to fail on specific operations.
+//
+// This API is not covered by the v1 stability guarantee.
 type FailableTensor[T tensor.Numeric] struct {
 	*tensor.TensorNumeric[T]
 	failOnSet    bool
@@ -140,6 +144,8 @@ func (e *TestableEngine[T]) TestableTranspose(_ context.Context, a *tensor.Tenso
 }
 
 // FailableZeroer can be configured to fail on Zero operations.
+//
+// This API is not covered by the v1 stability guarantee.
 type FailableZeroer[T tensor.Numeric] struct {
 	engine   *TestableEngine[T]
 	failZero bool
diff --git a/docs/adr/001-api-stability-v1.md b/docs/adr/001-api-stability-v1.md
new file mode 100644
index 0000000..7e6a0bb
--- /dev/null
+++ b/docs/adr/001-api-stability-v1.md
@@ -0,0 +1,218 @@
+# ADR-001: API Stability Contract for ztensor v1.0.0
+
+**Status:** Accepted\
+**Date:** 2026-03-29\
+**Authors:** David Ndungu
+
+## Context
+
+The `ztensor` module (`github.com/zerfoo/ztensor`) is the tensor, compute, and graph foundation for the Zerfoo ML framework. Downstream consumers — primarily `github.com/zerfoo/zerfoo` — depend heavily on its exported surface. Before tagging v1.0.0 we need a clear contract defining which APIs are covered by the Go compatibility promise (no breaking changes without a v2 major version bump) and which may evolve in minor releases.
+
+## Decision
+
+### Stable Surface (v1 compatibility guarantee)
+
+The following packages are **stable**, except for the symbols explicitly listed below as not covered by the v1 stability guarantee. Breaking changes to the stable APIs require a v2 major version.
+
+#### `compute` — Engine interface and CPU/GPU implementations
+
+| Symbol | Kind | Description |
+|--------|------|-------------|
+| `Engine[T]` | interface | Core computation engine — all methods |
+| `CPUEngine[T]` | struct | CPU engine implementation |
+| `NewCPUEngine[T]` | constructor | |
+| `GPUEngine[T]` | struct | GPU (CUDA) engine implementation |
+| `NewGPUEngine[T]` | constructor | |
+| `EngineProxy[T]` | struct | Engine wrapper/proxy |
+| `NewEngineProxy[T]` | constructor | |
+| `DType` | type | Data type enum |
+| `DTypeF32`, etc. | constants | DType values |
+| `DefaultMaxAllocBytes` | constant | Default memory limit |
+| `ErrMemoryLimitExceeded` | var | Sentinel error |
+
+The following `compute` symbols are exported but **not** covered by the v1 stability guarantee (they support specialised kernel paths, GPU internals, or testing infrastructure):
+
+| Symbol | Kind | Reason unstable |
+|--------|------|-----------------|
+| `FusedAddRMSNormProvider[T]` | interface | Fusion provider — kernel interface may evolve |
+| `FusedNormAddProvider[T]` | interface | Fusion provider |
+| `FusedQKNormRoPEProvider[T]` | interface | Fusion provider |
+| `FusedRMSNormer` | interface | Fusion provider |
+| `FusedRoPEProvider[T]` | interface | Fusion provider |
+| `FusedScaledSoftmaxProvider[T]` | interface | Fusion provider |
+| `FusedSwiGLUProvider[T]` | interface | Fusion provider |
+| `FusedRMSNorm` | func | Standalone fused op |
+| `FusedRoPE` | func | Standalone fused op |
+| `FusedSiLUGate` | func | Standalone fused op |
+| `FlashDecode` | func | Flash attention kernel entry point |
+| `FlashDecodeSplitKV` | func | Flash attention kernel entry point |
+| `GPUArgmaxer` | interface | GPU-specific capability |
+| `GPUStreamAccessor` | interface | GPU-specific capability |
+| `PagedGQAer` | interface | GPU-specific capability |
+| `PoolResetter` | interface | GPU-specific capability |
+| `StreamProvider` | interface | GPU-specific capability |
+| `WeightUploader` | interface | GPU-specific capability |
+| `FP16ToF32Converter` | interface | GPU-specific capability |
+| `TransposeBMatMuler[T]` | interface | GPU-specific capability |
+| `W4A16MatMuler[T]` | interface | Quantisation kernel interface |
+| `W4A16Precision` | struct | Quantisation detail |
+| `W4A16Info[T]` | func | Quantisation detail |
+| `IsW4A16[T]` | func | Quantisation detail |
+| `MatMulW4A16[T]` | func | Quantisation kernel |
+| `TryW4A16MatMul[T]` | func | Quantisation kernel |
+| `DequantW4ToFP16` | func | Quantisation kernel |
+| `ComputeAmax[T]` | func | FP8 helper |
+| `ScaleForFP8[T]` | func | FP8 helper |
+| `QuantFormat[T]` | func | Quantisation helper |
+| `HadamardMatrix[T]` | func | Specialised math |
+| `TernaryGEMV` | func | Ternary kernel |
+| `TernaryGEMVGPU` | func | Ternary GPU kernel |
+| `HardwareProfile` | struct | Hardware profiling |
+| `ProfileHardware` | func | Hardware profiling |
+| `MemoryTracker` | struct | Memory tracking |
+| `NewMemoryTracker` | constructor | Memory tracking |
+| `TensorArena` | struct | Arena allocator |
+| `TensorPool[T]` | struct | Pool allocator |
+| `NewTensorPool[T]` | constructor | Pool allocator |
+| `FailableTensor[T]` | struct | Testing utility |
+| `NewFailableTensor[T]` | constructor | Testing utility |
+| `FailableZeroer[T]` | struct | Testing utility |
+| `NewFailableZeroer[T]` | constructor | Testing utility |
+| `TestableEngine[T]` | struct | Testing utility |
+| `NewTestableEngine[T]` | constructor | Testing utility |
+| `TraceRecorder[T]` | interface | Tracing/debugging |
+| `TracedOp` | struct | Tracing/debugging |
+| `Tracer[T]` | struct | Tracing/debugging |
+| `NewTracer[T]` | constructor | Tracing/debugging |
+
+#### `tensor` — Tensor types and storage
+
+| Symbol | Kind | Description |
+|--------|------|-------------|
+| `Numeric` | interface constraint | Core type constraint for all numeric types |
+| `Float` | interface constraint | Floating-point subset of Numeric |
+| `Addable` | interface constraint | Types supporting addition |
+| `TensorNumeric[T]` | struct | Primary tensor type |
+| `New[T]` | constructor | Create tensor from data |
+| `NewFromBytes[T]` | constructor | Create tensor from bytes |
+| `NewWithStorage[T]` | constructor | Create tensor with custom storage |
+| `ToCPU[T]` | func | Transfer tensor to CPU |
+| `ToGPU[T]` | func | Transfer tensor to GPU |
+| `ToGPUDevice[T]` | func | Transfer tensor to specific GPU |
+| `Tensor` | interface | Non-generic tensor interface |
+| `NewFromType` | constructor | Create tensor from reflect.Type |
+| `TensorBool` | struct | Boolean tensor |
+| `NewBool` | constructor | |
+| `TensorString` | struct | String tensor |
+| `NewString` | constructor | |
+| `Storage[T]` | interface | Storage backend interface |
+| `CPUStorage[T]` | struct | CPU storage |
+| `NewCPUStorage[T]` | constructor | |
+| `GPUStorage[T]` | struct | GPU storage |
+| `NewGPUStorage[T]`, `NewGPUStorageFromSlice[T]`, etc. | constructors | GPU storage constructors |
+| `Equals[T]` | func | Tensor equality |
+| `AssertClose[T]` | func | Testing helper |
+| `AssertEquals[T]` | func | Testing helper |
+| `BroadcastShapes` | func | Shape broadcasting |
+| `BroadcastIndex` | func | Index broadcasting |
+| `SameShape`, `ShapesEqual` | funcs | Shape comparison |
+| `Product` | func | Shape product |
+| `ConvertInt64ToInt`, `ConvertIntToInt64` | funcs | Index conversion |
+
+The following `tensor` symbols are exported but **not** covered by the v1 stability guarantee (quantisation storage types, mmap internals, GGML type enums):
+
+| Symbol | Kind | Reason unstable |
+|--------|------|-----------------|
+| `GGMLType` | type + constants | GGML format detail |
+| `Q4Storage`, `Q4KStorage`, `Q5KStorage`, `Q5_0Storage`, `Q6KStorage`, `Q8Storage` | structs | Quantisation storage — format may evolve |
+| `IQ2XXSStorage`, `IQ3SStorage`, `IQ4NLStorage` | structs | Quantisation storage |
+| `AWQStorage`, `GPTQStorage`, `NF4Storage`, `NVFloat4Storage` | structs | Quantisation storage |
+| `W8A8Storage` | struct | Quantisation storage |
+| `Float16Storage`, `BFloat16Storage` | structs | Precision storage |
+| `FP8E4M3Storage`, `FP8E5M2Storage` | structs | FP8 storage |
+| `TernaryStorage` | struct | Ternary storage |
+| `MmapStorage` | struct | Mmap-backed storage |
+| `Dequantizer` | interface | Quantisation registry |
+| `RegisterQuantType`, `GetQuantType`, `ListQuantTypes` | funcs | Quantisation registry |
+| `DequantizeQ4K`, `DequantizeQ5K`, `DequantizeQ5_0`, `DequantizeQ6K`, `DequantizeIQ3S`, `DequantizeIQ4NL` | funcs | Dequantisation |
+| `QuantizeQ4`, `QuantizeQ8`, `QuantizeAWQ`, `QuantizeGPTQ`, `QuantizeW8A8` | funcs | Quantisation |
+| `GemmW8A8`, `GemmW8A8NT`, `GemmF32W8A8NT` | funcs | Quantised GEMM kernels |
+| `IQ4NLTable` | var | Lookup table |
+| `Mmap`, `MmapFile`, `Munmap` | funcs | Memory-mapped I/O |
+| `MadviseSequential`, `MadviseRandom`, `MadviseWillNeed`, `MadviseDontNeed` | funcs | Madvise helpers |
+| `Float32ToBytes`, `Int8ToBytes`, `Uint8ToBytes` | funcs | Byte conversion |
+| `Q4GPUDataOffset`, `Q4GPUScaleOffset` | funcs | GPU quantisation layout |
+| `MergeQ4Storage`, `MergeQ4KStorage`, `MergeQ6KStorage`, `MergeIQ4NLStorage` | funcs | Storage merging |
+
+#### `device` — Device abstraction
+
+| Symbol | Kind | Description |
+|--------|------|-------------|
+| `Device` | interface | Device abstraction — all methods |
+| `Get` | func | Device lookup by ID |
+| `Type` | type | Device type enum |
+| `CPU` | constant | CPU device type |
+| `Allocator` | interface | Memory allocator |
+| `NewCPUAllocator` | constructor | |
+| `NewCUDAAllocator` | constructor | |
+
+All symbols in `device` are stable.
+
+#### `numeric` — Arithmetic operations
+
+| Symbol | Kind | Description |
+|--------|------|-------------|
+| `Arithmetic[T]` | interface | Core arithmetic interface |
+| `Float32Ops` | struct | float32 arithmetic |
+| `Float64Ops` | struct | float64 arithmetic |
+| `Float16Ops` | struct | float16 arithmetic |
+| `BFloat16Ops` | struct | bfloat16 arithmetic |
+| `Float8Ops` | struct | float8 arithmetic |
+| `Int8Ops` | struct | int8 arithmetic |
+| `IntOps` | struct | int arithmetic |
+| `Uint8Ops` | struct | uint8 arithmetic |
+| `QuantizationConfig` | struct | Quantisation parameters |
+| `ComputeQuantizationParams` | func | Compute quantisation scale/zero-point |
+| `NewQuantizationConfig` | func | |
+| `Pack4BitSlice`, `Unpack4BitSlice` | funcs | 4-bit packing |
+| `Pack4BitWeights`, `Unpack4BitWeights` | funcs | 4-bit packing |
+
+The following `numeric` symbols are testing utilities and **not** covered by the v1 stability guarantee:
+
+| Symbol | Kind | Reason unstable |
+|--------|------|-----------------|
+| `TestArithmeticOp[T]` | func | Testing helper |
+| `TestUnaryOp[T]` | func | Testing helper |
+| `TestLeakyReLUOp[T]` | func | Testing helper |
+| `TestSumOp[T]` | func | Testing helper |
+| `ArithmeticTestCase[T]` | struct | Testing helper |
+| `UnaryTestCase[T]` | struct | Testing helper |
+| `LeakyReLUTestCase[T]` | struct | Testing helper |
+| `SumTestCase[T]` | struct | Testing helper |
+| `Float16TestData` | func | Testing helper |
+| `Float8TestData` | func | Testing helper |
+
+### Explicitly Unstable Packages
+
+The following packages are **not** covered by the v1 stability guarantee. They may change in minor versions.
+
+| Package | Reason |
+|---------|--------|
+| `internal/*` | Go internal convention — not importable outside module |
+| `graph/` | Computation graph compilation pipeline is still evolving |
+| `graph/kv/` | KV cache management — API actively changing |
+| `batched/` | Batched multi-model inference — new, API not yet settled |
+| `gguf/` | GGUF writer — low-level format utility |
+| `log/` | Logging abstraction — may be replaced |
+| `metrics/` | Metrics and correlation functions |
+| `metrics/runtime/` | Runtime metrics collection |
+| `types/` | Shared types (e.g. `BackwardMode`) — may be reorganised |
+
+Apart from `internal/*`, which cannot be imported from outside the module, these packages are used by `github.com/zerfoo/zerfoo` and must remain exported, but their APIs may change in ztensor v1.x minor releases.
+
+## Consequences
+
+1. **v1.0.0 tag** — Once tagged, the stable symbols of the four stable packages (`compute`, `tensor`, `device`, and `numeric`), as listed above, follow Go module compatibility: no breaking changes until v2.
+2. **Unstable packages** — Consumers of `graph`, `batched`, `gguf`, `log`, `metrics`, and `types` should pin to exact ztensor versions and expect potential breakage on minor upgrades.
+3. **Documentation** — Unstable symbols in stable packages carry a doc comment: `// This API is not covered by the v1 stability guarantee.`
+4. **Future promotion** — As unstable packages mature (especially `graph/`), they may be promoted to stable in a future minor release. Promotion is additive and never breaking.