From c8e6f32331ad37c231a8f57839c1209344fa8f2c Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Wed, 8 Apr 2026 11:11:02 +0000 Subject: [PATCH 01/11] squash for refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- .gitignore | 5 +- MINICPM_SALA_BUILD_AND_CHANGES.md | 244 +++++ MiniCPM_SALA_alignment_progress.md | 359 ++++++++ csrc/cache/kv_cache.cpp | 67 +- csrc/cache/kv_cache.hpp | 25 +- csrc/config/config_factory.cpp | 2 +- csrc/engine/infer_engine.cpp | 10 +- csrc/engine/rank_worker.cpp | 3 +- .../minicpm_sala/minicpm_sala_attention.cpp | 575 +++++++++--- .../minicpm_sala/minicpm_sala_attention.hpp | 142 +-- .../minicpm_sala_decoderLayer.cpp | 61 -- .../minicpm_sala_decoderLayer.hpp | 34 - .../minicpm_sala_decoder_layer.cpp | 83 ++ .../minicpm_sala_decoder_layer.hpp | 53 ++ .../minicpm_sala_for_causal_lm.cpp | 77 +- .../minicpm_sala_for_causal_lm.hpp | 35 +- csrc/models/minicpm_sala/minicpm_sala_mlp.cpp | 32 + csrc/models/minicpm_sala/minicpm_sala_mlp.hpp | 31 + .../minicpm_sala/minicpm_sala_model.cpp | 171 ++++ .../minicpm_sala/minicpm_sala_model.hpp | 66 ++ csrc/models/model_factory.cpp | 30 +- csrc/pybind11/engine/engine.hpp | 2 +- examples/collect_metrics_longtext_decode.py | 355 +++++++ examples/compare_inference_speed.py | 868 ++++++++++++++++++ examples/jiuge.py | 16 +- examples/metrics_16k_prefill.md | 152 +++ examples/metrics_longtext_mem.md | 378 ++++++++ examples/run_infinicore_ops_before_logits.sh | 18 + examples/run_longtext_metrics_cases.sh | 59 ++ include/infinicore_infer/cache.h | 5 + include/infinicore_infer/weights_loader.h | 5 + python/infinilm/auto_config.py | 2 + python/infinilm/infer_engine.py | 110 ++- python/infinilm/llm/llm.py | 10 +- python/infinilm/llm/static_scheduler.py | 22 +- python/infinilm/modeling_utils.py | 65 +- .../infinilm/server/chat_message_normalize.py | 76 ++ python/infinilm/server/inference_server.py | 34 +- xmake.lua | 3 +- 39 files changed, 3871 insertions(+), 414 deletions(-) create mode 100644 
MINICPM_SALA_BUILD_AND_CHANGES.md create mode 100644 MiniCPM_SALA_alignment_progress.md delete mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp delete mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_mlp.cpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_mlp.hpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_model.cpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_model.hpp create mode 100644 examples/collect_metrics_longtext_decode.py create mode 100644 examples/compare_inference_speed.py create mode 100644 examples/metrics_16k_prefill.md create mode 100644 examples/metrics_longtext_mem.md create mode 100755 examples/run_infinicore_ops_before_logits.sh create mode 100755 examples/run_longtext_metrics_cases.sh create mode 100644 python/infinilm/server/chat_message_normalize.py diff --git a/.gitignore b/.gitignore index b728e6ea..1d4781b7 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,7 @@ __pycache__/ *.http -*.nsys-rep +**/*.nsys-rep +**/*.jsonl +*.jsonl +**/*.mem diff --git a/MINICPM_SALA_BUILD_AND_CHANGES.md b/MINICPM_SALA_BUILD_AND_CHANGES.md new file mode 100644 index 00000000..1ec53fad --- /dev/null +++ b/MINICPM_SALA_BUILD_AND_CHANGES.md @@ -0,0 +1,244 @@ +# MiniCPM-SALA on InfiniLM: Build Guide and Change Summary + +This document describes the changes in **InfiniCore** and **InfiniLM** from their baseline commits to support MiniCPM-SALA with InfLLM-v2, the **prerequisites**, and a **step-by-step build and run guide**. With these changes, `InfiniLM/examples/jiuge.py` produces **reasonable MiniCPM-SALA generation output** when run with the correct environment. 
+ +**Baseline commits (for reference):** + +- **InfiniLM:** `main` +- **InfiniCore:** `5fc85c8b1e6728839993f1b743a525a066da585f` + +To see the exact diff from baseline: +`git diff 5fc85c8b1e6728839993f1b743a525a066da585f -- InfiniCore` and +`git diff main -- InfiniLM`. + +--- + +## 1. Changes in InfiniCore (from `5fc85c8b1e6728839993f1b743a525a066da585f`) + +InfiniCore was extended to **wire InfLLM-v2** (Stage-2 sparse attention) so that when built with `--infllmv2=y`, the C++ API calls `mha_varlen_fwd` and `mha_fwd_kvcache` from the infllmv2_cuda_impl .so. + +### 1.1 New or modified files (summary) + +| Area | Path | Purpose | +|------|------|--------| +| API (decl) | `include/infinicore/ops/infllmv2_api.hpp` | Declares `mha_varlen_fwd`, `mha_fwd_kvcache` (must be provided by infllmv2 .so at link/runtime). | +| API (decl) | `include/infinicore/ops/infllmv2_attention.hpp` | Public op header for infllmv2 attention. | +| Ops impl | `src/infinicore/ops/infllmv2_attention/infllmv2_attention.cc` | Implements `infllmv2_varlen` and `infllmv2_kvcache` by calling the above APIs when `ENABLE_INFLLMV2` and `ENABLE_ATEN` are set. | +| Pybind | `src/infinicore/pybind11/ops/infllmv2_attention.hpp` | Exposes infllmv2 ops to Python. | +| Pybind | `src/infinicore/pybind11/ops.hpp` | Includes infllmv2 op bindings. | +| Python | `python/infinicore/ops/infllmv2_attention.py` | Python wrapper for `infllmv2_varlen` / `infllmv2_kvcache`. | +| Python | `python/infinicore/__init__.py` | Exports `infllmv2_varlen`, `infllmv2_kvcache`. | +| Build | `xmake.lua` | New option `--infllmv2=y`; when set with `--aten=y`, defines `ENABLE_INFLLMV2` and links/rpath to the auto-detected .so. | +| Test | `test/infinicore/ops/test_infllmv2_attention.py` | Unit tests for infllmv2 varlen/kvcache (skipped if not built or no CUDA). | +| Example | `examples/infllmv2_sanity.py` | Sanity script for InfLLM-v2 (skips if .so absent or no CUDA). 
| + +### 1.2 Build option + +- **Option:** `infllmv2` (enable InfLLM-v2; xmake auto-detects `infllm_v2/*.so` under `InfiniCore/third_party/infllmv2_cuda_impl/build/...`). +- **Requires:** `aten=y` (InfiniCore must be built with PyTorch/ATen). +- **Effect:** Defines `ENABLE_INFLLMV2`, adds link and rpath to the auto-detected infllmv2 .so. At runtime, `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` / `mha_fwd_kvcache` from that .so (via `LD_LIBRARY_PATH` or `LD_PRELOAD`). + +--- + +## 2. Changes in InfiniLM (from `main`) + +InfiniLM was extended to support the **MiniCPM-SALA** model (embedding, layers, attention, MLP, LM head) and to use InfiniCore (including InfLLM-v2 when available) for inference. + +### 2.1 New or modified files (summary) + +| Area | Path | Purpose | +|------|------|--------| +| C++ model | `csrc/models/minicpm_sala/*.cpp`, `*.hpp` | MiniCPM-SALA model: `minicpm_sala_attention`, `minicpm_sala_decoder_layer`, `minicpm_sala_model`, `minicpm_sala_for_causal_lm`, `minicpm_sala_mlp`. Per-layer dense KV cache; lightning (GLA) and optional InfLLM-v2 (minicpm4) attention paths. | +| C++ factory | `csrc/models/model_factory.cpp` | Registers MiniCPM-SALA model type. | +| Config | `python/infinilm/auto_config.py` | MiniCPM-SALA config handling. | +| Weights | `python/infinilm/modeling_utils.py` | MiniCPM-SALA weight loading (MuP scaling, etc.). | +| Examples | `examples/jiuge.py` | Generic InferEngine generation script; docstring updated with env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) for MiniCPM-SALA. | +| Examples | `examples/minicpm_sala_logits_sanity.py` | HF vs InfiniLM logits sanity (prefill/decode1/decodeN); single-token decode for correct KV cache; one-prompt output comparison. | +| Examples | `examples/modeling_minicpm_sala.py` | HF-side MiniCPM-SALA modeling (reference). | +| Docs | `MiniCPM_SALA_alignment_progress.md` | Alignment and debugging notes. 
| + +### 2.2 Behaviour notes + +- **Attention:** Layer 0 (minicpm4) can use compiled InfLLM-v2 when InfiniCore is built with `--infllmv2=y` and the .so is preloaded; other layers use lightning (GLA) path. +- **Attention overhead optimizations:** In `minicpm_sala_attention.cpp`: (1) sequence lengths are read in one place when both `past_sequence_lengths` and `total_sequence_lengths` are present (`has_cache_meta`), avoiding duplicate logic; (2) Q/K/V use a single `contiguous()->view` chain after projections; (3) lightning path builds `q_bthd` via one `permute->contiguous` from `q_perm`; (4) sparse path uses `q_perm` directly (already contiguous) and only calls `contiguous()` on K/V when repeating heads. Semantics and logits are unchanged. +- **KV cache:** Decode must use **single-token input** per step; passing the full sequence each step would misalign the per-layer KV cache (see sanity script). +- **Engine / KV cache config:** MiniCPM-SALA uses per-layer dense KV cache in C++; the engine’s `cache_config` is used only for scheduling (e.g. `past_sequence_lengths` / `total_sequence_lengths`). **Static cache** is recommended (default in `jiuge.py` when not passing `--enable-paged-attn`). For static, `jiuge.py` sets `max_cache_len = max(initial_capacity, max_position_embeddings)` when `model_type == "minicpm_sala"` so long contexts are supported without re-alloc. + +--- + +## 3. Prerequisites + +### 3.1 System and toolchain + +- **OS:** Linux. +- **Python:** 3.12 recommended (match the infllmv2 .so and InfiniCore pybind ABI). +- **CUDA:** 11.6+ (e.g. 12.x); `nvcc` in `PATH` (e.g. via `CUDA_HOME=/usr/local/cuda` and `PATH=$CUDA_HOME/bin:$PATH`). +- **C++:** GCC (e.g. `CC=gcc CXX=g++`) for infllmv2_cuda_impl and InfiniCore. +- **xmake:** For building InfiniCore (install from https://xmake.io or use a project-provided path). 
+- **PyTorch:** Installed in the same Python env used to build infllmv2 and to run InfiniLM (InfiniCore with `aten=y` links against this PyTorch’s libs). + +### 3.2 Python environment + +Use a **single venv** (or env) that has: + +- `torch` +- `transformers` +- `triton` (e.g. 3.2.0; for MiniCPM-SALA HF path; if CUDA 12.8, a small patch may be needed for Triton’s `ptx_get_version` or use a Triton version that supports 12.8) +- `flash-linear-attention` (or HF deps for MiniCPM-SALA) +- Other InfiniLM/InfiniCore runtime deps + +Build **infllmv2_cuda_impl** and **InfiniCore** with this same Python (and thus same PyTorch ABI). + +### 3.3 Repo layout + +- **minicpm-sala-support** (repo root) contains: + - **InfiniCore/** — InfiniCore with InfLLM-v2 wiring. + - **InfiniLM/** — InfiniLM with MiniCPM-SALA. + - **InfiniCore/third_party/infllmv2_cuda_impl/** — InfLLM-v2 CUDA kernel implementation (provides `mha_varlen_fwd`, `mha_fwd_kvcache`). + +--- + +## 4. Build Guide + +### 4.1 Build InfLLM-v2 (infllmv2_cuda_impl) + +This produces the `.so` that provides `mha_varlen_fwd` and `mha_fwd_kvcache`. InfiniCore must be built with a PyTorch/ABI-compatible env (same Python/torch as here). + +1. **From repo root:** + ```bash + cd InfiniCore/third_party/infllmv2_cuda_impl + ``` +2. **Submodules:** + ```bash + git submodule update --init --recursive + ``` +3. **Env (recommended):** + ```bash + export CC=gcc CXX=g++ + export CUDA_HOME=/usr/local/cuda # or your CUDA path + export PATH=$CUDA_HOME/bin:$PATH + ``` +4. **Build/install** (use the Python that has torch and that you will use for InfiniLM): + ```bash + python setup.py install + ``` + Or: `pip install -e .` +5. **Locate the .so:** + Typically under `build/lib.linux-x86_64-cpython-312/infllm_v2/` (name like `C.cpython-312-x86_64-linux-gnu.so`). 
Set: + ```bash + INFLLMV2_SO_DIR="/path/to/repo/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" + ``` + +### 4.2 Build InfiniCore (with InfLLM-v2) + +InfiniCore must be built with **aten** and, for MiniCPM-SALA with InfLLM-v2, with **infllmv2=y** enabled (xmake auto-detects the .so). + +1. **Install Infini dependencies** (if not already): + Build and install Infini libs so they are under `$INFINI_ROOT` (default `~/.infini`). InfiniCore’s xmake expects `include/` and `lib/` there (e.g. `libinfinicore_cpp_api.so`, `libinfiniop.so`, etc.). + +2. **From repo root:** + ```bash + cd InfiniCore + ``` +3. **Configure** (use the same Python/torch as infllmv2): + ```bash + xmake config -y --root --nv-gpu=y --aten=y --infllmv2=y + ``` + Omit `--infllmv2=y` for a build without InfLLM-v2 (then no MiniCPM-SALA layer0 infllmv2 path). +4. **Build the Python extension:** + ```bash + xmake --root _infinicore + ``` +5. **Optional – install to ~/.infini:** + ```bash + xmake install + ``` + The Python loadable is also copied under `InfiniCore/python/infinicore/lib/` by the build. + +### 4.3 Run jiuge.py (MiniCPM-SALA) + +Use the **same venv** that has `torch`, `transformers`, etc., and set env so InfiniCore and the infllmv2 .so are found and symbols resolve. + +**Required:** + +- `PYTHONPATH`: InfiniLM and InfiniCore Python packages. +- `LD_LIBRARY_PATH`: Torch lib, Infini lib (`/root/.infini/lib` or your `INFINI_ROOT/lib`), and optionally `INFLLMV2_SO_DIR` (if not using `LD_PRELOAD`). +- If InfiniCore was built with InfLLM-v2: **`LD_PRELOAD`** the infllmv2 .so so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` (and `mha_fwd_kvcache`). 
+ +**Example (from repo root):** + +```bash +INFLLMV2_SO_DIR="$(pwd)/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" + +PYTHONPATH="$(pwd)/InfiniLM/python:$(pwd)/InfiniCore/python:$PYTHONPATH" \ +LD_LIBRARY_PATH="$(python -c 'import torch; print(torch.__path__[0])')/lib:/root/.infini/lib:${INFLLMV2_SO_DIR}:$LD_LIBRARY_PATH" \ +LD_PRELOAD="${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so" \ +python InfiniLM/examples/jiuge.py --nvidia --model_path /root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA +``` + +Use the **venv** Python explicitly if needed, e.g.: + +```bash +/path/to/venv/bin/python InfiniLM/examples/jiuge.py ... +``` + +For Triton (HF path) on CUDA 12.8 you may need: + +```bash +TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas +``` + +--- + +## 5. Verification + +- **InfiniCore InfLLM-v2 ops:** + `PYTHONPATH=InfiniCore/python:InfiniCore/test/infinicore LD_LIBRARY_PATH=${INFLLMV2_SO_DIR}:/root/.infini/lib LD_PRELOAD=${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so python InfiniCore/test/infinicore/ops/test_infllmv2_attention.py --nvidia` + +- **HF vs InfiniLM logits (one-prompt decode):** + Same env + `LD_PRELOAD` and (if needed) `TRITON_PTXAS_PATH`: + `python InfiniLM/examples/minicpm_sala_logits_sanity.py --model_path <model_path> --mode decodeN --decode_steps 64` + +- **Generation:** + `jiuge.py` with the same env should produce **reasonable MiniCPM-SALA output** (e.g. for prompt "How are you"). + +--- + +## 6. Related docs + +- **CURRENT_PROGRESS.md** — Local progress, InfLLM-v2 plan, and run commands. +- **InfiniLM/MiniCPM_SALA_alignment_progress.md** — Alignment and debugging details. +- **InfiniCore/third_party/infllmv2_cuda_impl/README.md** — InfLLM-v2 kernel design and install. +- **InfiniLM/examples/jiuge.py** — Docstring at top with env summary. + +--- + +## 7. TODO + +- **Remove temporary log and dump code** — Strip or gate debug logging, `INFINI_DEBUG_*`, and temporary dump paths (e.g. 
`/tmp/` tensor dumps, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`) from InfiniLM/InfiniCore once alignment and bring-up are stable. +- **Adapt inference_server.py** — Wire MiniCPM-SALA (and InfiniLM InferEngine) into the inference server (e.g. `inference_server.py` or equivalent in the workspace) so that the server can load and serve MiniCPM-SALA with the same env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) and run generation endpoints. + +### 7.1 Debug and sanity env and code (for future removal) + +When removing temporary log and dump code, use this as the reference for **env parsing** and **locations to erase or gate**. + +**Environment variables (debug / sanity):** + +| Env var | Parsing / behavior | Purpose | +|---------|---------------------|--------| +| `INFINI_DEBUG_LOG` | Set to a file path (e.g. `/tmp/minicpm_sala_sanity_debug.log`). When set, C++ and Python append JSON/text lines to this file. | Text log for alignment debugging. | +| `INFINI_DEBUG_ATTN_DUMP` | Presence = enable (e.g. `"1"` or any). When set, tensors are written to fixed `/tmp/` paths below. 
| + +**Where they are read:** + +- **InfiniLM C++:** `std::getenv("INFINI_DEBUG_LOG")`, `std::getenv("INFINI_DEBUG_ATTN_DUMP")` in: + - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_attention.cpp` (dump_tensor_f32, layer q/k/v/g_gamma and attn out dumps) + - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp` (log_tensor_stats_if_enabled, tensor_to_f32_and_dump, layer input/out dumps) + - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_model.cpp` (dump_tensor_to_bin_if_enabled, log_tensor_stats_if_enabled; embed and final hidden dumps) +- **InfiniLM Python (sanity script):** `os.environ["INFINI_DEBUG_LOG"]`, `os.environ["INFINI_DEBUG_ATTN_DUMP"]` set in `InfiniLM/examples/minicpm_sala_logits_sanity.py` before runs; `os.getenv("INFINI_DEBUG_*")` in `InfiniLM/examples/modeling_minicpm_sala.py` (HF-side hooks that write `/tmp/hf_*.pt` and log to `INFINI_DEBUG_LOG`). + +**Temporary paths to remove or stop writing:** + +- **C++ dumps (binary):** `/tmp/inf_embed_out.bin`, `/tmp/inf_final_hidden.bin`, `/tmp/inf_layer0_q.bin`, `/tmp/inf_layer0_k.bin`, `/tmp/inf_layer0_v.bin`, `/tmp/inf_layer0_g_gamma.bin`, `/tmp/inf_layer1_q.bin`, `/tmp/inf_layer1_k.bin`, `/tmp/inf_layer1_v.bin`, `/tmp/inf_layer1_g_gamma.bin`, `/tmp/inf_layer0_attn_input.bin`, `/tmp/inf_attn_out_layer0.bin`, `/tmp/inf_attn_out_layer1.bin`, `/tmp/inf_layer_out_.bin`. +- **Python (sanity) writes:** `DEBUG_LOG_PATH` (e.g. `/tmp/minicpm_sala_sanity_debug.log`); `/tmp/hf_embed_out.pt`, `/tmp/hf_final_hidden.pt`, `/tmp/hf_layer0_attn_input.pt`, `/tmp/hf_layer_out_.pt`, `/tmp/hf_layer0_q.pt`, `/tmp/hf_layer0_k.pt`, `/tmp/hf_layer0_v.pt`, `/tmp/hf_attn_out_layer0.pt`, `/tmp/hf_layer1_q.pt`, `/tmp/hf_layer1_k.pt`, `/tmp/hf_layer1_v.pt`, `/tmp/hf_attn_out_layer1.pt`. 
+- **Helpers to remove or gate:** `dump_tensor_f32`, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`, `tensor_to_f32_and_dump`; sanity script’s `_append_debug_log`, and all `torch.save(..., "/tmp/...")` / `np.fromfile("/tmp/...")` / `os.path.isfile("/tmp/...")` blocks that exist only for alignment comparison. diff --git a/MiniCPM_SALA_alignment_progress.md b/MiniCPM_SALA_alignment_progress.md new file mode 100644 index 00000000..538208c9 --- /dev/null +++ b/MiniCPM_SALA_alignment_progress.md @@ -0,0 +1,359 @@ +### MiniCPM‑SALA sanity alignment – current status + +### Scope + +- **Goal**: Align InfiniLM MiniCPM‑SALA logits with HF reference on the dense/GLA (non‑sparse) path, using the `examples/minicpm_sala_logits_sanity.py` script running inside the `minicpm-sala` container. + +--- + +### Instrumentation and plumbing + +- **Sanity script (`minicpm_sala_logits_sanity.py`)** + - **Backend lock**: All InfiniLM `InferEngine` paths now use `attention_backend="default"` so they hit the dense/GLA fallback. + - **Debug log target**: The script sets `INFINI_DEBUG_LOG=/home/zenghua/repos/.cursor/debug-9146ea.log` and `INFINI_DEBUG_ATTN_DUMP=1` so both Python and C++ write to the same NDJSON file. + - **HF per-layer hooks**: + - `_register_hf_layer_hooks` walks the model (`hf.transformer.layers`, `hf.model.layers`, or `hf.layers`) and registers forward hooks on the first 3 layers. + - For each layer \(i\), it logs: + - `min`, `max`, `mean`, `l2` of the layer output, as `hypothesisId="HF_L"`, `data.layer = i`. + - Hooks are installed for `run_prefill_only` and removed after the forward pass. + +- **InfiniLM attention (`minicpm_sala_attention.cpp`)** + - Existing **layer‑0** diagnostics: + - At entry to `forward_dense_`: `forward_dense_entry` logs env/config, including `INFINI_DEBUG_ATTN_DUMP`, `use_rope`, `use_qk_norm`, `use_output_gate`, `use_output_norm`, `is_sparse_layer`, and shapes. 
+ - For layer 0, logs stats for: + - Pre‑gate attention output (`attn_pre_gate`): full tensor min/max/mean, `l2`, shape and scaling. + - Post‑gate/norm (`attn_post_gate`), and post‑`o_proj` (`attn_post_oproj`). + - **Planned / partially implemented**: extended logging for `layer_idx_ < 2` (layers 0 and 1) with: + - `attn_pre_gate_l0` / `attn_pre_gate_l1`. + - `attn_post_gate_l0` / `attn_post_gate_l1`. + - `attn_post_oproj_l0` / `attn_post_oproj_l1`. + - Current runs still only show layer‑0 entries; the `_infinilm` binary in use has not yet picked up the `_l1` variants (see below). + +- **InfiniLM decoder layer (`minicpm_sala_decoder_layer.cpp/.hpp`)** + - **MuP residual scaling**: + - `residual_scale_ = scale_depth / sqrt(num_hidden_layers)` using `scale_depth` from `ModelConfig` (matches HF path). + - `forward` applies: + - `out1 = hidden_states + residual_scale_ * attn_out`. + - `out2 = out1 + residual_scale_ * mlp_out`. + - **Per-layer Inf output stats**: + - New member `size_t layer_idx_` stored from constructor. + - For `layer_idx_ < 3`, after computing `out2`, it: + - Copies to CPU, converts BF16/F16/F32 to float, computes `min`, `max`, `mean`, `l2` and shape. + - Logs as `hypothesisId="INF_L"`, with `data.layer = layer_idx_`. + +- **Weight scaling / MuP configuration (`modeling_utils.py`)** + - Loader reads `config.json` and applies MiniCPM‑style scaling: + - `scale_input = scale_emb`, `scale_depth`, `num_hidden_layers`, `dim_model_base`, `hidden_size`. + - For `model_type == "minicpm_sala"`: + - `scale_o` and `scale_down` are reset to 1.0 (residual scaling is done at C++ forward time). + - `scale_lm_head = dim_model_base / hidden_size` is baked into `lm_head.weight`. + - Embedding and norm weights are scaled as in the MiniCPM scripts. + +- **Rebuild and install (`rebuild.sh`, xmake)** + - `rebuild.sh`: + - `InfiniCore`: `python scripts/install.py --nv-gpu=y --ccl=y --aten=y`, then `xmake build _infinicore` and `xmake install _infinicore`. 
+ - `InfiniLM`: optional `xmake clean`, then `xmake build _infinilm` and `xmake install _infinilm`. + - Verified inside container: + - Shared libs in `/root/.infini/lib` are updated (e.g. `libinfiniop.so`, `libinfinicore_cpp_api.so` with current timestamps). + - Python sees `infinilm` from `/home/zenghua/repos/InfiniLM/python/infinilm`. + - The extension in use is `_infinilm` at: + - `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so`. + +--- + +### Sanity run behavior and current misalignment + +- **Command used (container, GPU 1)**: + ```bash + docker exec -e CUDA_VISIBLE_DEVICES=1 minicpm-sala bash -lc ' + source /app/docker/nvidia/env-set.sh + cd /home/zenghua/repos/InfiniLM + python3 examples/minicpm_sala_logits_sanity.py \ + --model_path /data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA \ + --mode prefill \ + --prompt "How are you" + ' + ``` +- **HF vs Inf logits (from `SANITY_ONELINE`)** + - `inf_norm ≈ 387.66` + - `hf_norm ≈ 1588.89` + - **ratio_inf_hf ≈ 0.244** + - `max_diff ≈ 12.77`, `mean_diff ≈ 4.64` + - Top‑1 token IDs differ (HF: 74, Inf: 59358). + +- **HF early layers (from `HF_L` logs)** + - Using the HF hooks in the sanity script: + - Layer 0: `l2 ≈ 59.49` + - Layer 1: `l2 ≈ 73.91` (first GLA layer) + - Layer 2: `l2 ≈ 87.38` + - Norms grow smoothly with depth; nothing obviously pathological on HF side. + +- **Inf attention layer‑0 vs HF** + - HF layer‑0 pre‑gate attention (`modeling_minicpm_sala.py:attn_pre_gate`): + - Shape `[1, 4, 4096]`, `min=-8.375`, `max=9.0`, `mean≈-0.1273`. + - Inf layer‑0: + - **Pre‑gate (`attn_pre_gate`)**: + - `l2 ≈ 105.50`, `min=-8.375`, `max=9.0`. + - Python’s comparison (`compare_attn`) reports `norm_ratio_inf_hf ≈ 0.4487`, i.e. Inf pre‑gate norm ≈ 0.45× HF’s. + - **Post‑gate/norm (`attn_post_gate`)**: + - `l2 ≈ 60.38`, very close to HF layer‑0 output `l2 ≈ 59.49`. + - **Post‑o_proj (`attn_post_oproj`)**: + - `l2 ≈ 98.66` (used as input to the decoder’s residual path). 
+ - Interpretation: + - By the end of the **layer‑0 attention block**, Inf and HF are roughly matched in scale at the decoder output (norms ≈ 60). + - The severe **0.244 logits norm ratio** is therefore not due to an immediate blow‑up/vanish at layer‑0 attention output; it accumulates later (likely starting at the first GLA layer and/or via MuP/residual/MLP scaling). + +--- + +### Binary / build state + +- **Extension module mapping** + - In container, importing `infinilm` shows: + - `infinilm.__file__` → `/home/zenghua/repos/InfiniLM/python/infinilm/__init__.py` + - `_infinilm` (top‑level) → `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so` + - That is the `.so` used by the sanity script. + +- **Why new attention logs for layer 1 don’t appear yet** + - `strings _infinilm.cpython-312-...so | grep 'attn_pre_gate_l1'` currently returns **no matches**: + - This confirms the loaded `_infinilm` was built **before** we added the `_l1` logging strings. + - We attempted a fresh `_infinilm` build and initially hit: + - C++ error in `MiniCPMSALADecoderLayer::forward`: `layer_idx_` not declared. + - That prevented `_infinilm` from rebuilding/overwriting the old `.so`, so your layer‑1 logging changes never reached runtime. + +- **Decoder fix applied to unblock rebuild** + - Added `size_t layer_idx_ = 0;` as a private member in `minicpm_sala_decoder_layer.hpp`. + - Set `layer_idx_ = layer_idx;` in the decoder layer constructor. + - After this fix, `_infinilm` can compile; `rebuild.sh` now proceeds past the decoder layer and updates the core libraries (and should be able to update `_infinilm` when the entire build/install completes successfully). + +--- + +### Open issues / next steps + +- **1. Get the new `_infinilm` into use** + - Ensure `rebuild.sh` completes the `_infinilm` build + install step successfully (no early termination due to missing libffi/openssl/ca‑certificates link checks). 
+ - Confirm via: + ```bash + strings /home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so \ + | grep -E 'attn_pre_gate_l1|attn_post_gate_l1|attn_post_oproj_l1' + ``` + If this prints the `_l1` labels, the new binary is in place. + +- **2. Re‑run sanity and capture layer‑1 attention logs** + - With the updated `_infinilm`, re‑run the prefill sanity script and inspect `debug-9146ea.log` for: + - `minicpm_sala_attention.cpp:attn_pre_gate_l1` + - `minicpm_sala_attention.cpp:attn_post_gate_l1` + - `minicpm_sala_attention.cpp:attn_post_oproj_l1` + - Compare their `l2` to HF layer‑1 (`HF_L` `l2 ≈ 73.9`). + - This will tell us whether the **first GLA layer** is where Inf starts to diverge in norm, or whether norms remain close through layer 1 and drift later. + +- **3. Use decoder `INF_L` logs to see per‑layer drift** + - Once `_infinilm` is rebuilt, `MiniCPMSALADecoderLayer`’s per‑layer `INF_L` logs for `layer_idx_ < 3` should appear in `debug-9146ea.log`. + - By comparing HF (`HF_L`) vs Inf (`INF_L`) for layers 0/1/2, we can see exactly where norm ratios deviate from ~1 and head toward ~0.244 at the logits. + - That will guide targeted fixes in: + - GLA gating / normalization (in `minicpm_sala_attention.cpp`), and/or + - MuP residual & MLP scaling (still matching HF in formula, but potentially interacting differently with the SALA configuration). + +--- + +### Summary + +- **Plumbing**: Shared log path and HF/Inf instrumentation are in place; per‑layer HF stats and layer‑0 Inf attention stats work and confirm that **layer‑0 attention output scale is roughly aligned**. +- **Mismatch**: Final logits norm is still **Inf/HF ≈ 0.244**, so the discrepancy is accumulating across layers, likely starting at or after the first GLA layer. +- **Blocking issue**: The `_infinilm` C++ extension in use predates the layer‑1 logging changes; an earlier C++ compile error prevented a fresh install. 
That decode‑layer bug has been fixed so we can now rebuild and get the new diagnostics into the runtime. +- **Next milestone**: Successfully rebuild `_infinilm`, confirm the `_l1` log strings are present, rerun sanity, and use the new layer‑1 and decoder `INF_L` stats to precisely locate where Inf’s norms start drifting away from HF. + +--- + +### Host follow-up (2026-03-14) + +- Ran `examples/minicpm_sala_logits_sanity.py --mode prefill --prompt "How are you"` directly on the host using the local venv and the same base env as the documented `jiuge.py` run. +- Extra host-only prep required for the HF reference path: + - installed `flash-linear-attention` to provide the `fla` module + - installed `triton==3.2.0` to avoid the Triton `STAGE` autotune import failure + - created `/home/zenghua/repos/.cursor/` because the script hardcodes `DEBUG_LOG_PATH` there +- Result on host: + - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` + - HF top-1 token id `74`, Inf top-1 token id `23917` +- Interpretation: + - The host environment now reproduces the alignment issue without Docker. + - The ratio is better than the older container snapshot (`~0.244`) but still far from aligned, so the poor generation quality remains consistent with a real logits mismatch. +- Full reproducibility details for this host run were appended to `CURRENT_PROGRESS.md`. + +--- + +### HF MiniCPM4 dense-fallback experiment (2026-03-14) + +- Goal: + - Test whether the remaining mismatch is coming from the HF `minicpm4` sparse-vs-dense code path by forcing `minicpm4` layers onto the standard dense attention implementation. +- HF model-file change: + - Patched both cached copies of `modeling_minicpm_sala.py` so `MiniCPMSALADecoderLayer` uses `MINICPM_ATTENTION_CLASSES[config._attn_implementation]` for `mixer_type == "minicpm4"` instead of `MiniCPMInfLLMv2Attention`. 
+ - Backups: + - `/root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210428` + - `/root/.cache/huggingface/modules/transformers_modules/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210619` +- Rerun result: + - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` + - HF top-1 token id `74`, Inf top-1 token id `23917` + - These numbers are unchanged from the earlier host run. +- Fresh per-layer log from `debug-9146ea.log`: + - HF decoder output `l2`: + - layer 0: `59.49` + - layer 1: `73.91` + - layer 2: `87.38` + - Inf decoder output `l2`: + - layer 0: `35.08` + - layer 1: `295.86` + - layer 2: `531.38` + - Inf layer-1 attention stats: + - pre-gate `l2 ~= 749.58` + - post-gate `l2 ~= 745.29` + - post-`o_proj` `l2 ~= 1112.6` +- Interpretation: + - For this short prefill case, forcing HF `minicpm4` to the dense fallback path does not move the mismatch at all. + - The strongest current evidence is that the large norm drift starts in the InfiniLM implementation at or immediately after the first `lightning-attn` layer, not in the HF `minicpm4` branch. + +--- + +### InfiniLM MiniCPM4 HF-math experiment (2026-03-14) + +- Goal: + - Make the InfiniLM `minicpm4` layer compute the same dense attention math as the HF reference path and see whether layer 0 aligns at the start of sanity. +- C++ change: + - In `csrc/models/minicpm_sala/minicpm_sala_attention.cpp`, replaced the `minicpm4` sparse/varlen/grouped fallback branch with an explicit HF-style dense path: + - repeat KV heads to `num_attention_heads` + - compute per-head dense causal attention + - keep the same sigmoid output gate and `o_proj` +- Rebuild: + - Rebuilt and reinstalled `_infinilm` successfully using the local `xmake` toolchain. +- Rerun result: + - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` + - HF top-1 token id `74`, Inf top-1 token id `23917` + - These numbers are unchanged. 
+- Fresh layer stats after the InfiniLM-side change: + - HF decoder output `l2`: `59.49 -> 73.91 -> 87.38` + - Inf decoder output `l2`: `35.08 -> 295.86 -> 531.38` + - Inf layer-0 attention: + - pre-gate `142.87` + - post-gate `80.43` + - post-`o_proj` `135.39` +- Interpretation: + - Even after making the InfiniLM `minicpm4` branch follow the HF dense attention structure, layer 0 does not move toward HF. + - This strongly suggests the remaining mismatch is not in the `minicpm4` attention branch itself; attention should shift to other decoder-path components and especially the first `lightning-attn` layer. + +--- + +### Temporary all-lightning experiment (2026-03-14) + +- Goal: + - Force both HF and InfiniLM to use lightning-style attention math for former `minicpm4` layers as a temporary precision-alignment probe, without changing checkpoint tensor shapes. +- Why not use `config.json` only: + - A direct `mixer_types -> all lightning-attn` config edit failed during HF weight load because former `minicpm4` layers have incompatible checkpoint shapes for the stock `LightningAttention` module (e.g. `256 x 4096` vs `4096 x 4096`). + - The original `mixer_types` config was restored. +- Temporary override implementation: + - Added env flag `MINICPM_SALA_FORCE_ALL_LIGHTNING=1`. 
+ - HF side: + - former `minicpm4` layers instantiate `MiniCPMAttention` under the flag + - `MiniCPMAttention.forward()` switches to lightning-style GLA computation under the flag, while keeping original q/k/v/o_proj/o_gate weights + - InfiniLM side: + - `minicpm_sala_attention.cpp` routes sparse layers through `gla_attention` under the same flag + - Sanity script: + - `examples/minicpm_sala_logits_sanity.py` now sets `MINICPM_SALA_FORCE_ALL_LIGHTNING=1` for this experiment +- Result: + - `SANITY_ONELINE ratio=0.4728 max_diff=12.1406 mean_diff=1.9942` + - HF top-1 token id `59375`, Inf top-1 token id `59358` +- Fresh per-layer stats under the override: + - HF decoder output `l2`: + - layer 0: `385.10` + - layer 1: `374.87` + - layer 2: `426.87` + - Inf decoder output `l2`: + - layer 0: `26.23` + - layer 1: `208.72` + - layer 2: `403.90` + - Inf layer-0 attention: + - pre-gate `105.50` + - post-gate `60.38` + - post-`o_proj` `98.66` + - Inf layer-1 attention: + - pre-gate `672.74` + - post-gate `459.67` + - post-`o_proj` `737.03` +- Interpretation: + - The override is definitely active on both sides, because HF logits/top-1 and HF early-layer norms changed substantially. + - However, the former `minicpm4` layers still do not align numerically with InfiniLM under lightning-style attention. + - This points to a mismatch in the lightning formulation itself (decay/slopes, layout, gating, norm/casting, or related details), not just in the original mixed `mixer_types` layout. + +--- + +### Layer-0 narrowing after matched temporary semantics (2026-03-14) + +- Change: + - Updated the temporary HF override so its former `minicpm4` path uses the same grouped causal-softmax math as `InfiniCore` `gla_attention`, instead of `simple_gla` with decay. 
+ - Added layer-0 sub-stage logging on both sides: + - HF: `inputs_embeds`, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` + - Inf: embedding output, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` +- Result: + - Layer-0 pre-gate attention still mismatches strongly: + - HF `attn_pre_gate l2 ~= 235.11` + - Inf `attn_pre_gate l2 ~= 105.50` + - `Inf/HF ~= 0.4487` + - But this is no longer the earliest divergence. +- New root-cause evidence: + - Embedding output already differs: + - HF `inputs_embeds l2 ~= 44.09` + - Inf embed output `l2 ~= 25.51` + - First decoder layer pre-norm output also differs: + - HF layer0 `input_layernorm l2 ~= 95.88` + - Inf layer0 `input_layernorm l2 ~= 70.94` +- Interpretation: + - The mismatch starts before layer-0 attention. + - Attention, gating, and `o_proj` are downstream amplifiers, but not the first source. + - The next priority should be MiniCPM-SALA embedding behavior in InfiniLM: + - verify `model.embed_tokens.weight` load/scaling, + - verify runtime embedding lookup output against HF for the same token ids, + - then re-check whether layer-0 attention comes into line automatically. + +--- + +### Multi-layer alignment after embed fix (2026-03-14) + +- Instrumentation added: + - InfiniLM dumps decoder layer outputs (out2) for layers 0–2 to `/tmp/inf_layer_out_{0,1,2}.bin` and final hidden (after norm) to `/tmp/inf_final_hidden.bin` when `INFINI_DEBUG_ATTN_DUMP=1`. + - HF hooks save layer outputs to `/tmp/hf_layer_out_{0,1,2}.pt` and final hidden to `/tmp/hf_final_hidden.pt`. + - Sanity script prints per-layer and final-hidden norm_ratio and max/mean diff. +- Result (prefill "How are you", int32 input_ids workaround): + - **Layer 0**: norm_ratio ≈ 1.0002, max_diff ≈ 0.0625 → aligned. + - **Layer 1**: norm_ratio ≈ 3.24, max_diff ≈ 28.4 → large divergence. + - **Layer 2**: norm_ratio ≈ 5.73 → further drift. +- Root cause for layer 1+: + - Config: layer 0 = `minicpm4` (sparse/dense), layer 1+ = `lightning-attn`. 
+ - HF `LightningAttention` uses **Simple GLA** (`chunk_simple_gla` / `fused_recurrent_simple_gla`): linear/recurrent attention with decay (g_gamma), not causal softmax.
+ - InfiniLM now routes lightning layers through **Simple GLA** (InfiniCore `simple_gla_*` ops), matching HF’s formulation (recurrent with decay).
+- Earlier next step to align after layer 0 (since implemented via the Simple GLA routing noted above):
+ - Implement Simple GLA (chunk or fused_recurrent) in InfiniCore and route lightning layers through it, matching HF’s `attn_fn` (decay, scale=1/sqrt(d), layout).
+
+---
+
+### MMLU-Pro validation mismatches vs logit work (2026-03-24)
+
+Paired lm-eval `--log_samples` runs (HF vs local chat / Infini server) often disagree for **heterogeneous** reasons. Treat them differently before spending time on logits:
+
+| Heuristic tag (export script) | Meaning | Use logits / greedy trace? |
+|------------------------------|---------|----------------------------|
+| `model_disagreement` | Both sides return a valid letter choice but disagree; text is on-topic. | **Yes** — same `input_ids` + `run_prefill_and_greedy_trace` localizes numerical / decode divergence. |
+| `parse_or_format` | One side `[invalid]` or regex extraction differs though the model may agree. | **No** (first fix template, stops, or metric extraction). |
+| `garbage` | Off-topic or corrupted completion (e.g. wrong language / spam). | **No** — serving hygiene, batching, or cache contamination. |
+
+**Repo tooling**
+
+- `InfiniLM/examples/eval_tasks/mmlu_pro_val/export_mismatch_subset.py` — join two `samples_*.jsonl` dirs on `doc_hash`, optional filters, heuristic tag, write `mismatch_subset.json` + `.md` (includes `arguments_a` / `arguments_b` for replay).
+- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_prompt.py` — rebuild `input_ids` from logged rows (rendered string vs JsonChat message list) like lm-eval.
+- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_logit_probe.py` — drive `minicpm_sala_logits_sanity.run_prefill_and_greedy_trace` on subset rows (in-process HF + `InferEngine` only; HTTP cannot return logits).
+- `InfiniLM/examples/minicpm_sala_logits_sanity.py` — `--mode greedy_trace` for ad-hoc prompts; shared `run_prefill_and_greedy_trace()` for subset probes.
+
+If greedy trace matches HF on a row but the API eval still differs, diff **chat template**, **stop sequences**, **max_tokens**, or server batching — not the GLA kernel alone.
+
+**HF vs `local-chat-completions` harness (practical parity)**
+
+- For the same `doc_hash`, the **rendered prompt string** from `--model hf` can match **byte-for-byte** the string produced by re-templating the JSON messages the API path logs (verified on a biology mismatch example).
+- Differences that still moved scores: **regex extraction** used the *first* `answer is (X)` in long CoT while the model’s final line said another letter; `_default_template_yaml` now uses `group_select: -1` (last match) and case-insensitive pattern.
+- **Server**: strip lm-eval’s per-message `type: text` wrapper to `{role, content}` before `apply_chat_template`, and set `continue_final_message=not add_generation_prompt` like lm-eval’s HF model class (`inference_server.py`, `llm.py`).
diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index 4c97edfa..a7220773 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -4,6 +4,7 @@ #include "../utils.hpp" #include "infinicore/ops.hpp" #include +#include namespace infinilm::cache { // ========================== @@ -45,7 +46,9 @@ StaticKVCache::StaticKVCache( infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info) + const engine::distributed::RankInfo &rank_info, + infinicore::Size gla_recurrent_num_heads, + infinicore::Size gla_recurrent_head_dim) : Cache(), k_dim_(k_dim), v_dim_(v_dim), @@ -54,7 +57,9 @@ StaticKVCache::StaticKVCache( rank_batch_size_(config.max_batch_size()), cache_len_(config.max_cache_len() == std::numeric_limits::max() || config.max_cache_len() == 0 ? max_positional_embedding : config.max_cache_len()), rank_num_layers_(num_layers), - dtype_(dtype) { + dtype_(dtype), + gla_recurrent_num_heads_(gla_recurrent_num_heads), + gla_recurrent_head_dim_(gla_recurrent_head_dim) { // Allocate K cache k_caches_ = infinicore::Tensor::empty( @@ -75,6 +80,17 @@ StaticKVCache::StaticKVCache( v_dim_}, dtype_, rank_info.device); + + if (gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0) { + gla_state_ = infinicore::Tensor::zeros( + {rank_num_layers_, + rank_batch_size_, + gla_recurrent_num_heads_, + gla_recurrent_head_dim_, + gla_recurrent_head_dim_}, + infinicore::DataType::F32, + rank_info.device); + } } infinicore::Tensor StaticKVCache::create_layer_kv_cache( @@ -125,12 +141,27 @@ StaticKVCache::update(size_t layer_idx, auto device = k_cache_layer->device(); #ifdef ENABLE_KV_CACHING - infinicore::op::kv_caching_( - k_cache_layer, - v_cache_layer, - k, - v, - past_sequence_lengths); + // Some debug builds have shown incremental decode (update_len=1) may diverge + // from full-sequence recompute when using the optimized kv_caching_ kernel. 
+ // Provide an env override to fall back to the simple (and slower) copy update. + const char *disable_kv_caching = std::getenv("INFINI_DISABLE_KV_CACHING"); + const bool force_copy_update = disable_kv_caching && disable_kv_caching[0] != '\0' && disable_kv_caching[0] != '0'; + if (force_copy_update) { + size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; + auto result_len = cache_pos + update_len; + ASSERT(result_len <= cache_len_); + auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}}); + auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}}); + k_cache_update->copy_from(k); + v_cache_update->copy_from(v); + } else { + infinicore::op::kv_caching_( + k_cache_layer, + v_cache_layer, + k, + v, + past_sequence_lengths); + } #else size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; auto result_len = cache_pos + update_len; @@ -146,6 +177,26 @@ StaticKVCache::update(size_t layer_idx, return {k_cache_layer, v_cache_layer}; } +std::tuple +StaticKVCache::get_layer_kv(size_t layer_idx) { + ASSERT(layer_idx < rank_num_layers_); + auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); + auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); + return {k_cache_layer, v_cache_layer}; +} + +bool +StaticKVCache::has_gla_recurrent_state() const { + return gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0 && static_cast(gla_state_); +} + +infinicore::Tensor +StaticKVCache::gla_recurrent_state_for_layer(size_t layer_idx) { + ASSERT(layer_idx < rank_num_layers_); + ASSERT(has_gla_recurrent_state()); + return gla_state_->narrow({{0, layer_idx, 1}})->squeeze(0); +} + // ========================== // PagedKVCacheConfig // ========================== diff --git a/csrc/cache/kv_cache.hpp b/csrc/cache/kv_cache.hpp index e6e640df..cbef0722 100644 --- a/csrc/cache/kv_cache.hpp +++ b/csrc/cache/kv_cache.hpp @@ 
-12,6 +12,7 @@ #include #include #include +#include #include #include @@ -43,7 +44,9 @@ class StaticKVCache final : public Cache { infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info); + const engine::distributed::RankInfo &rank_info, + infinicore::Size gla_recurrent_num_heads = 0, + infinicore::Size gla_recurrent_head_dim = 0); static infinicore::Tensor create_layer_kv_cache( const infinicore::Size k_dim, @@ -72,6 +75,20 @@ class StaticKVCache final : public Cache { const infinicore::Tensor &v, const infinicore::Tensor &past_sequence_lengths); + /** + * @brief Get KV cache tensors for a layer (views). + * + * @return (k_cache_layer, v_cache_layer) + * k_cache_layer: [batch, num_rank_k_heads, max_cache_len, k_dim] + * v_cache_layer: [batch, num_rank_v_heads, max_cache_len, v_dim] + */ + std::tuple + get_layer_kv(size_t layer_idx); + + /** Per-layer Simple GLA recurrent state for lightning decode: [batch, H, D, D] float32 (in-place for decode_step). 
*/ + bool has_gla_recurrent_state() const; + infinicore::Tensor gla_recurrent_state_for_layer(size_t layer_idx); + ~StaticKVCache() override = default; private: @@ -89,6 +106,12 @@ class StaticKVCache final : public Cache { // [num_layers, max_batch, num_rank_v_heads, max_cache_len, v_dim] infinicore::Tensor v_caches_; + + infinicore::Size gla_recurrent_num_heads_{0}; + infinicore::Size gla_recurrent_head_dim_{0}; + // [num_layers, max_batch, gla_recurrent_num_heads, D, D], F32; empty when heads==0 + infinicore::Tensor gla_state_; + }; class PagedKVCacheConfig final : public CacheConfig { diff --git a/csrc/config/config_factory.cpp b/csrc/config/config_factory.cpp index c822983e..aff8b986 100644 --- a/csrc/config/config_factory.cpp +++ b/csrc/config/config_factory.cpp @@ -16,7 +16,7 @@ std::shared_ptr ConfigFactory::createConfig(const if (it != config_map.end()) { it->second(model_config); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; const std::string &model_type = model_config->get("model_type"); if (std::find(classic_models.begin(), classic_models.end(), model_type) == classic_models.end()) { throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type); diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index f1afd84b..2a5c5ff4 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -121,7 +121,15 @@ InferEngine::Input::to_model_input(infinicore::Device device) const { auto to_device = [&](const std::optional &t) -> std::optional { - return t.has_value() ? t.value()->to(device) : t; + if (!t.has_value()) { + return t; + } + auto ten = t.value(); + // Avoid redundant copies when the tensor is already on the target device. 
+ if (ten->device() == device) { + return ten; + } + return ten->to(device); }; infinilm::InfinilmModel::Input input = { diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 1542c1e0..1ba89ca1 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -5,6 +5,7 @@ #include "../models/models_registry.hpp" #include "infinicore/ops.hpp" #include +#include #include #include @@ -261,7 +262,7 @@ void RankWorker::thread_loop() { rank_info_.device, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; if ((std::find(classic_models.begin(), classic_models.end(), model_type) != classic_models.end())) { model_ = InfinilmModelFactory::createModel( model_config_, diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index c1e20f76..c426ec1c 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -1,133 +1,496 @@ #include "minicpm_sala_attention.hpp" -#include "../../global_state/global_state.hpp" + +#include "infinicore/ops.hpp" +#include "infinicore/ops/infllmv2_attention.hpp" +#include "infinicore/ops/simple_gla_attention.hpp" +#include "infinicore/ops/simple_gla_prefill.hpp" +#include "infinicore/context/context.hpp" +#include "../debug_utils/tensor_utils.hpp" + +#include +#include +#include #include +#include namespace infinilm::models::minicpm_sala { -AttentionBase::AttentionBase(std::shared_ptr model_config, - size_t num_attention_heads, - size_t num_key_value_heads, - size_t layer_idx, - const infinicore::Device &device) - : layer_idx_(layer_idx), - hidden_size_(model_config->get("hidden_size")), - head_dim_(model_config->get("head_dim")) { - - const auto &dtype{model_config->get_dtype()}; - - 
use_bias_ = model_config->get_or("attention_bias", true); - use_output_bias_ = model_config->get_or("attention_output_bias", false); - - attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; - const engine::distributed::RankInfo &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info(); - int tp_rank = infinilm::global_state::get_tensor_model_parallel_rank(); - int tp_size = infinilm::global_state::get_tensor_model_parallel_world_size(); - - const size_t total_num_heads = num_attention_heads; - const size_t total_num_kv_heads = num_key_value_heads; - if ((total_num_kv_heads < static_cast(tp_size)) || (0 != (total_num_kv_heads % static_cast(tp_size)))) { - throw std::runtime_error("infinilm::models::minicpm_sala::AttentionBase: num_key_value_heads must be divisible by tp_size"); +namespace { +// Same as HF MiniCPM-SALA _build_slope_tensor (used for Simple GLA decay). +std::vector build_slope_tensor(size_t n) { + auto get_slopes_power_of_2 = [](size_t n) -> std::vector { + double log2n = std::log2(static_cast(n)); + double start = std::pow(2.0, -(std::pow(2.0, -(log2n - 3)))); + double ratio = start; + std::vector out; + out.reserve(n); + for (size_t i = 0; i < n; ++i) { + out.push_back(static_cast(start * std::pow(ratio, static_cast(i)))); + } + return out; + }; + if (n == 0) return {}; + double log2n = std::log2(static_cast(n)); + if (std::abs(log2n - std::floor(log2n)) < 1e-9) { + return get_slopes_power_of_2(n); } - - num_attention_heads_ = total_num_heads / static_cast(tp_size); - num_key_value_heads_ = total_num_kv_heads / static_cast(tp_size); - - auto quant_scheme = model_config->get_quant_scheme(); - auto quantization_method = model_config->get_quantization_method(); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::NONE: - INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, total_num_heads * head_dim_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size); - 
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, total_num_kv_heads * head_dim_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, total_num_kv_heads * head_dim_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - default: - throw std::runtime_error("infinilm::models::minicpm_sala::AttentionBase: unsupported quantization scheme"); - break; + size_t closest = static_cast(std::pow(2.0, std::floor(log2n))); + auto first = get_slopes_power_of_2(closest); + auto rest = build_slope_tensor(2 * closest); + for (size_t i = 0; i < n - closest; ++i) { + first.push_back(rest[i * 2]); } + return first; +} - rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device); +} // namespace - float scaling = 1.0f / std::sqrt(static_cast(head_dim_)); - attn_ = std::make_shared(num_attention_heads_, head_dim_, scaling, - num_key_value_heads_, layer_idx_, - kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_); +MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) + : model_config_(std::move(model_config)), + rank_info_(rank_info), + layer_idx_(layer_idx), + attention_backend_(attention_backend) { - auto kv_quant_scheme = infinilm::global_state::get_infinilm_config().model_config->get_kv_quant_scheme(); - switch (kv_quant_scheme) { - case (infinicore::quantization::KVQuantAlgo::NONE): { - break; - } - case (infinicore::quantization::KVQuantAlgo::INT8): { - INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, 
infinicore::DataType::F32, device, 0, 0, 1)); - break; + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + const auto dtype = model_config_->get_dtype(); + hidden_size_ = model_config_->get("hidden_size"); + if (mixer_type == "minicpm4") { + is_sparse_layer_ = true; + num_attention_heads_ = model_config_->get("num_attention_heads"); + num_key_value_heads_ = model_config_->get("num_key_value_heads"); + head_dim_ = model_config_->get("head_dim"); + + // InfLLM-v2 local-window masking (causal-local semantics) for minicpm4. + // Prefer `sparse_window_size`, but fall back to `window_size` if needed. + int sparse_window_size = model_config_->get_or("sparse_window_size", -1); + if (sparse_window_size <= 0) { + // Some HF configs store this under `sparse_config.window_size`. + auto sparse_cfg = model_config_->get_or("sparse_config", nlohmann::json{}); + if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { + sparse_window_size = sparse_cfg["window_size"].get(); + } else { + sparse_window_size = model_config_->get_or("window_size", -1); + } + } + if (sparse_window_size > 0) { + infllmv2_window_left_ = sparse_window_size; + infllmv2_window_right_ = 0; + use_local_window_ = true; + } + } else { + // Lightning layers have their own head config. 
+ num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); + num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); + head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); } - default: { - throw std::runtime_error("infinilm::layers::attention: unsupported kv_quant_scheme"); - break; + scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); + + // StaticKVCache is allocated as a compact slab per cache type: + // - minicpm4-cache stores only layers where mixer_types[i] == "minicpm4" + // - lightning-cache stores only layers where mixer_types[i] != "minicpm4" + // + // Compute this attention instance's local cache index (0-based) from its + // absolute layer_idx_. + { + bool this_is_minicpm4_cache = (mixer_type == "minicpm4"); + std::vector mixer_types; + try { + mixer_types = model_config_->get>("mixer_types"); + } catch (...) { + mixer_types.assign(model_config_->get("num_hidden_layers"), "minicpm4"); + } + // Be defensive if mixer_types size mismatches. + if (mixer_types.size() != model_config_->get("num_hidden_layers")) { + mixer_types.resize(model_config_->get("num_hidden_layers"), "minicpm4"); + } + size_t count = 0; + for (size_t i = 0; i <= layer_idx_ && i < mixer_types.size(); ++i) { + const bool is_minicpm4_layer = (mixer_types[i] == "minicpm4"); + if (is_minicpm4_layer == this_is_minicpm4_cache) { + ++count; + } + } + // layer_idx_ is always a valid layer, so count should be >= 1. + cache_layer_idx_ = count > 0 ? (count - 1) : 0; } + + // HyPE: RoPE in lightning layers, NoPE in sparse (minicpm4) layers. + // We treat all non-minicpm4 as "linear" (lightning-attn) for M1 dense fallback. + use_rope_ = (mixer_type != "minicpm4") && model_config_->get_or("lightning_use_rope", true); + + // MiniCPM-SALA uses QK-norm and output gates by default. 
+ use_qk_norm_ = model_config_->get_or("qk_norm", true) && (mixer_type != "minicpm4"); + use_output_gate_ = model_config_->get_or("use_output_gate", true); + + // Projections + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); + + if (mixer_type == "minicpm4") { + // Sparse layers use o_gate (sigmoid gate on attention output) + INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); + } else { + // Lightning layers use q/k norm + output norm and z-projection gate + if (use_qk_norm_) { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + } + use_output_norm_ = true; + // Checkpoint uses o_norm over hidden_size (shape [hidden_size]). + INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, dtype, device); } + // Simple GLA decay for lightning path: g_gamma = _build_slope_tensor * -1. 
+ std::vector slopes = build_slope_tensor(num_attention_heads_); + auto g_cpu = infinicore::Tensor::empty( + {num_attention_heads_}, infinicore::DataType::F32, infinicore::Device::cpu()); + float *ptr = reinterpret_cast(g_cpu->data()); + for (size_t h = 0; h < num_attention_heads_; ++h) + ptr[h] = -slopes[h]; + g_gamma_ = g_cpu->to(device); } -InfLLMv2Attention::InfLLMv2Attention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device) - : AttentionBase(model_config, - model_config->get("num_attention_heads"), - model_config->get("num_key_value_heads"), - layer_idx, device) { - use_output_gate_ = model_config->get_or("use_output_gate", false); - const auto &dtype{model_config->get_dtype()}; - size_t num_attention_heads = model_config->get("num_attention_heads"); - if (use_output_gate_) { - INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, num_attention_heads * head_dim_, - model_config->get_quantization_method(), use_bias_, dtype, device); - } +void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptr &rotary_emb) { + rotary_emb_ = rotary_emb; +} + +void MiniCPMSALAAttention::reset_cache() { + // KV state is maintained by the shared engine cache (StaticKVCache). 
} -infinicore::Tensor InfLLMv2Attention::forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const { - spdlog::error("InfLLMv2Attention is not implemented"); - return hidden_states; + +infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const { + (void)input_offsets; + (void)block_tables; + (void)slot_mapping; + return forward_dense_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, cu_seqlens); } -LightningAttention::LightningAttention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device) - : AttentionBase(model_config, - model_config->get("num_attention_heads"), - model_config->get("lightning_nkv"), - layer_idx, device) { - - qk_norm_ = model_config->get_or("qk_norm", false); - use_output_norm_ = model_config->get_or("use_output_norm", false); - use_output_gate_ = model_config->get_or("use_output_gate", false); - const auto &dtype{model_config->get_dtype()}; - double rms_norm_eps = model_config->get("rms_norm_eps"); - size_t num_attention_heads = model_config->get("num_attention_heads"); - - if (qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, rms_norm_eps, dtype, device); +infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional cu_seqlens) const { + // Input: [B, S, H] + auto shape = hidden_states->shape(); + const size_t batch_size = shape[0]; + const size_t seq_len = 
shape[1]; + + auto hs_mut = hidden_states; + auto q = q_proj_->forward(hs_mut); + auto k = k_proj_->forward(hs_mut); + auto v = v_proj_->forward(hs_mut); + // View requires contiguous layout; only call contiguous when needed (proj output often already contiguous). + auto q_reshaped = q->contiguous()->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + auto k_reshaped = k->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + auto v_reshaped = v->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + + if (use_qk_norm_) { + // RMSNorm op only supports 2D/3D; normalize over head_dim with a 3D view. + auto q3 = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_}); + auto k3 = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_}); + q3 = q_norm_->forward(q3); + k3 = k_norm_->forward(k3); + q_reshaped = q3->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + k_reshaped = k3->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + } + + // RoPE only for lightning layers (HyPE) + if (use_rope_) { + if (!rotary_emb_) { + throw std::runtime_error("MiniCPMSALAAttention: rotary_emb is not set but use_rope=true"); + } + // position_ids can be [B,S] or [S]; follow LlamaAttention behavior. 
+ auto pos_shape = position_ids->shape(); + infinicore::Tensor pos_ids_for_rope = position_ids; + if (pos_shape.size() == 2) { + auto pos_narrowed = position_ids->narrow({{0, 0, 1}}); + pos_ids_for_rope = pos_narrowed->contiguous()->view({pos_shape[1]}); + } else if (pos_shape.size() == 1) { + pos_ids_for_rope = position_ids->contiguous(); + } else { + throw std::runtime_error("MiniCPMSALAAttention: Unexpected position_ids shape"); + } + + rotary_emb_->forward(q_reshaped, pos_ids_for_rope, true); + rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); + } + + // Compute dense attention (GQA): reshape as LlamaAttention does + size_t total_seq_len = seq_len; + size_t cache_pos = 0; + const bool has_cache_meta = past_sequence_lengths.has_value() && total_sequence_lengths.has_value(); + if (has_cache_meta) { + // Single device-to-host sync: read both scalars (engine could pass these as scalars later). + auto past_cpu = past_sequence_lengths.value()->to(infinicore::Device::cpu()); + auto total_cpu = total_sequence_lengths.value()->to(infinicore::Device::cpu()); + cache_pos = reinterpret_cast(past_cpu->data())[0]; + size_t total_seq_len_raw = reinterpret_cast(total_cpu->data())[0]; + total_seq_len = total_seq_len_raw; + // Some engine call sites pass `total_sequence_lengths` as the *input* length (e.g. 1 for decode), + // while `past_sequence_lengths` is the cached KV length. Attention needs total KV length. + // Use KV semantics: total_kv_len = cache_pos + current seq_len. + total_seq_len = cache_pos + seq_len; + } else if (total_sequence_lengths.has_value()) { + total_seq_len = reinterpret_cast(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0]; + } + + // Cache expects [B, n_kv, S, D]. Keep this as a strided view and let the caching op handle strides + // to avoid a full rearrange (permute->contiguous) copy on long-context prefill. + // Correctness: kv_caching_ / StaticKVCache::update is sensitive to input stride/layout. 
+ // Restore contiguous to match HF logits exactly before re-applying any strided optimizations. + auto k_permuted = k_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] + auto v_permuted = v_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] + + // HF-like dense KV caching using the engine-provided StaticKVCache. + infinicore::Tensor k_total = k_permuted; + infinicore::Tensor v_total = v_permuted; + std::shared_ptr static_kv_cache = nullptr; + if (kv_cache != nullptr && has_cache_meta) { + static_kv_cache = std::dynamic_pointer_cast(kv_cache); + if (!static_kv_cache) { + throw std::runtime_error("MiniCPMSALAAttention: Unsupported cache type (expected StaticKVCache)"); + } + // Default behavior: update cache here. For minicpm4 decode we may override and let InfLLM-v2 update. + auto [k_cached, v_cached] = static_kv_cache->update( + cache_layer_idx_, k_permuted, v_permuted, past_sequence_lengths.value()); + k_total = k_cached; + v_total = v_cached; + } else { + // No cache metadata => treat as prefill-only. + total_seq_len = seq_len; + } + + // Slice to total_seq_len (decode-only / cont-batch) + if (total_seq_len > k_total->shape()[2]) { + throw std::runtime_error("MiniCPMSALAAttention: total_seq_len exceeds available KV length (cache not correctly updated)"); } - if (use_output_norm_) { - INFINICORE_NN_MODULE_INIT(o_norm, num_attention_heads * head_dim_, rms_norm_eps, dtype, device); + k_total = k_total->narrow({{2, 0, total_seq_len}}); + v_total = v_total->narrow({{2, 0, total_seq_len}}); + + infinicore::Tensor attn_output; + if (!is_sparse_layer_) { + // Lightning-attn: Simple GLA (HF-aligned). + // simple_gla_attention(q,k,v,g_gamma,scale) expects [B, T, H, D]; g_gamma [H]. + const size_t n_h = num_attention_heads_; + const size_t n_kv = num_key_value_heads_; + infinicore::Tensor k_use = k_total; + infinicore::Tensor v_use = v_total; + if (n_kv < n_h) { + // Repeat KV heads to match n_h (same as HF repeat_kv / repeat_interleave). 
+ // Use as_strided view then contiguous() so one copy instead of n_h narrow/copy_from calls. + const size_t ngroup = n_h / n_kv; + const std::vector repeat_strides = { + static_cast(n_kv * total_seq_len * head_dim_), + static_cast(total_seq_len * head_dim_), + 0, + static_cast(head_dim_), + 1, + }; + k_use = k_total->as_strided( + {batch_size, n_kv, ngroup, total_seq_len, head_dim_}, repeat_strides) + ->contiguous() + ->view({batch_size, n_h, total_seq_len, head_dim_}); + v_use = v_total->as_strided( + {batch_size, n_kv, ngroup, total_seq_len, head_dim_}, repeat_strides) + ->contiguous() + ->view({batch_size, n_h, total_seq_len, head_dim_}); + } + // GLA expects [B, S, H, D]. `q_reshaped` is already [B, S, H, D], so avoid permute+contiguous. + auto q_bthd = q_reshaped; // [B, S_q, H, D] + // Correctness: restore contiguous layout for K/V before `simple_gla_attention`. + auto k_bthd = k_use->permute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] + auto v_bthd = v_use->permute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] + + // Lightning GLA decode must use recurrent state (StaticKVCache) whenever available. 
+ const bool is_lightning_decode = has_cache_meta && static_kv_cache && (seq_len < total_seq_len); + if (is_lightning_decode && !static_kv_cache->has_gla_recurrent_state()) { + throw std::runtime_error( + "MiniCPMSALAAttention(lightning): Lightning decode requires StaticKVCache gla_recurrent_state " + "(missing recurrent buffer in StaticKVCache)."); + } + + const bool recurrent_gla = static_kv_cache && static_kv_cache->has_gla_recurrent_state() && has_cache_meta; + + infinicore::Tensor gla_out; + if (recurrent_gla && seq_len == 1 && total_seq_len > 1) { + auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); + auto q_new = q_bthd; + auto k_new = k_bthd->narrow({{1, total_seq_len - 1, 1}}); + auto v_new = v_bthd->narrow({{1, total_seq_len - 1, 1}}); + gla_out = infinicore::op::simple_gla_decode_step(q_new, k_new, v_new, S, g_gamma_, scaling_); + } else { + infinicore::Tensor q_full; + if (seq_len == total_seq_len) { + q_full = q_bthd; + } else { + // Decode: q has seq_len (e.g. 1), kv has total_seq_len; pad q to [B, total_seq_len, H, D]. + q_full = infinicore::Tensor::zeros( + {batch_size, total_seq_len, n_h, head_dim_}, q_bthd->dtype(), q_bthd->device()); + auto q_slot = q_full->narrow({{1, total_seq_len - seq_len, seq_len}}); + q_slot->copy_from(q_bthd); + } + // Fused prefill: naive kernel for head_dim<=64; chunked/tiled kernel for head_dim>64 (e.g. 128). + bool use_fused_prefill = (batch_size == 1) && (seq_len == total_seq_len); + if (use_fused_prefill) { + gla_out = infinicore::op::simple_gla_prefill(q_full, k_bthd, v_bthd, g_gamma_, scaling_); + } else { + gla_out = infinicore::op::simple_gla_attention(q_full, k_bthd, v_bthd, g_gamma_, scaling_); + } + + // Keep per-layer recurrent state aligned with simple_gla_attention / prefill outputs. + // Use batched GEMM (CUDA+ATen) instead of O(seq_len) decode_step launches; see + // simple_gla_recurrent_state_append_segment (closed form: S <- g^L S + Σ g^{L-1-j} outer(k,v)). 
+ if (recurrent_gla) { + auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); + if (cache_pos == 0) { + infinicore::op::zeros_(S); + } + auto k_seg = k_bthd->narrow({{1, cache_pos, seq_len}}); + auto v_seg = v_bthd->narrow({{1, cache_pos, seq_len}}); + infinicore::op::simple_gla_recurrent_state_append_segment(S, k_seg, v_seg, g_gamma_); + } + } + + infinicore::Tensor out_slice = (recurrent_gla && seq_len == 1 && total_seq_len > 1) + ? gla_out + : gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); + attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); + } else { + // minicpm4 layers must use InfLLM-v2 attention (hard error if not available). + // NOTE: Lightning layers keep Simple GLA for correctness; only minicpm4 routes here. + try { + if (!total_sequence_lengths.has_value()) { + throw std::runtime_error( + "MiniCPMSALAAttention(minicpm4): total_sequence_lengths is required for InfLLM-v2 path"); + } + // `infllmv2_kvcache` expects the number of valid K/V entries in the + // provided cache tensors. Since we already appended the current + // token via StaticKVCache::update, the valid length is the total + // KV length (past + current token). + const auto cache_lens = total_sequence_lengths.value(); + + // Prefill: InfLLM-v2 varlen (Q and K packed lengths match `seq_len == total_seq_len` here). + // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after StaticKVCache::update + // (valid KV length == `total_seq_len`). Using varlen for decode (1 query vs long K) hit NaNs + // in practice for modest sequence lengths; kvcache matches operator tests and Flash path. 
+ const bool force_varlen_decode = [&]() { + const char *env = std::getenv("INFINI_MINICPM4_DECODE_VARLEN"); + return env && env[0] != '\0' && env[0] != '0'; + }(); + + if (seq_len == total_seq_len || (force_varlen_decode && batch_size == 1)) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAAttention(minicpm4): varlen prefill path currently requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); // [B, S, n_h, D] + auto k_btkd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + auto v_btkd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + auto q_var = q_bshd->view({static_cast(seq_len), static_cast(num_attention_heads_), static_cast(head_dim_)}); + auto k_var = k_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + auto v_var = v_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + + auto cuq_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuq_cpu->data())[0] = 0; + reinterpret_cast(cuq_cpu->data())[1] = static_cast(seq_len); + infinicore::Tensor cu_q = cuq_cpu->to(q_var->device()); + // cu_k corresponds to the full KV length used by k_var/v_var. + auto cuk_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuk_cpu->data())[0] = 0; + reinterpret_cast(cuk_cpu->data())[1] = static_cast(total_seq_len); + infinicore::Tensor cu_k = cuk_cpu->to(q_var->device()); + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 
0 : -1; + + auto out_var = infinicore::op::infllmv2_varlen( + q_var, k_var, v_var, + cu_q, cu_k, + static_cast(seq_len), + static_cast(total_seq_len), + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else if (static_kv_cache) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAAttention(minicpm4): kvcache decode path currently requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); // [B, S_q, n_h, D] + auto k_bthd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + auto v_bthd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 0 : -1; + + auto out_bshd = infinicore::op::infllmv2_kvcache( + q_bshd, + k_bthd, + v_bthd, + cache_lens, + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_bshd->contiguous()->view( + {batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else { + throw std::runtime_error( + "MiniCPMSALAAttention(minicpm4): decode requires StaticKVCache (missing cache metadata or cache)"); + } + } catch (const std::exception &e) { + throw std::runtime_error( + std::string("MiniCPMSALAAttention(minicpm4): InfLLM-v2 attention failed. ") + + "This build must provide InfLLM-v2 (ENABLE_INFLLMV2+ENABLE_ATEN) and the infllmv2_cuda_impl .so " + + "must be available via LD_PRELOAD/LD_LIBRARY_PATH. 
Original error: " + e.what()); + } } + + // Output norm + gate variants if (use_output_gate_) { - INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, num_attention_heads * head_dim_, - model_config->get_quantization_method(), use_bias_, dtype, device); + if (o_gate_) { + // Sparse (minicpm4): y = sigmoid(o_gate(x)) * attn_output + auto gate_in = hidden_states; + auto gate = o_gate_->forward(gate_in); + infinicore::op::sigmoid_(gate, gate); + attn_output = infinicore::op::mul(attn_output, gate); + } else if (z_proj_) { + // Lightning: match HF LightningAttention: o_norm(o) then o * sigmoid(z_proj(x)). + auto z_in = hidden_states; + auto z = z_proj_->forward(z_in); + infinicore::op::sigmoid_(z, z); + if (use_output_norm_ && o_norm_) { + attn_output = o_norm_->forward(attn_output); + } + attn_output = infinicore::op::mul(attn_output, z); + } + } else if (use_output_norm_ && o_norm_) { + attn_output = o_norm_->forward(attn_output); } -} -infinicore::Tensor LightningAttention::forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const { - spdlog::error("LightningAttention is not implemented"); - return hidden_states; + auto attn_out_mut = attn_output; + auto out = o_proj_->forward(attn_out_mut); + + return out; } } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 81a032b6..3cd8f284 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -1,88 +1,102 @@ #pragma once -#include "../../layers/common_modules.hpp" +#include "../../backends/attention_backends.hpp" +#include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" +#include "../../engine/distributed/distributed.hpp" -namespace infinilm::layers::attention { -class AttentionLayer; -} +#include "infinicore/nn/linear.hpp" +#include "infinicore/nn/module.hpp" +#include 
"infinicore/nn/rmsnorm.hpp" +#include "infinicore/nn/rope.hpp" +#include "infinicore/tensor.hpp" -namespace infinilm::models::minicpm_sala { +#include +#include -class AttentionBase : public infinicore::nn::Module { -protected: - AttentionBase(std::shared_ptr model_config, - size_t num_attention_heads, - size_t num_key_value_heads, - size_t layer_idx, - const infinicore::Device &device); +namespace infinilm::models::minicpm_sala { +// Dense attention fallback implementation used for Milestone 1. +// Parameter names are aligned with HF MiniCPM-SALA safetensors keys: +// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,...} +// TODO(refactor): KV cache is currently per-layer dense; refactor to use engine paged KV pool +// and block_tables/slot_mapping to match SGLang minicpm-sala pattern (see minicpm_sala_attention.cpp). +class MiniCPMSALAAttention : public infinicore::nn::Module { public: - size_t layer_idx() const { return layer_idx_; } - size_t num_heads() const { return num_attention_heads_; } - size_t num_kv_heads() const { return num_key_value_heads_; } - size_t head_dim() const { return head_dim_; } - size_t hidden_size() const { return hidden_size_; } + MiniCPMSALAAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const; + + void set_rotary_emb(const std::shared_ptr &rotary_emb); + void reset_cache(); + +private: + infinicore::Tensor forward_dense_(const infinicore::Tensor 
&hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional cu_seqlens) const; protected: - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, q_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, k_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, v_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, o_proj); + // Projections (HF-aligned naming) + INFINICORE_NN_MODULE(infinicore::nn::Linear, q_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, k_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, v_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_proj); + + // Optional (Lightning layers): q_norm/k_norm/o_norm + z_proj + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); + INFINICORE_NN_MODULE(infinicore::nn::Linear, z_proj); + + // Optional (Sparse layers): o_gate + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); - std::shared_ptr attn_; - ::infinilm::backends::AttentionBackend attention_backend_; + std::shared_ptr model_config_; std::shared_ptr rotary_emb_; + engine::distributed::RankInfo rank_info_; size_t layer_idx_; + // Layer index remapped into the cache instance (minicpm4-cache vs lightning-cache). + // StaticKVCache allocates a compact [num_layers, ...] slab per cache type. 
+ size_t cache_layer_idx_ = 0; size_t hidden_size_; size_t num_attention_heads_; size_t num_key_value_heads_; size_t head_dim_; - bool use_bias_; - bool use_output_bias_; - - // For off-line kv cache quantization - INFINICORE_NN_PARAMETER(kv_cache_k_scale); - INFINICORE_NN_PARAMETER(kv_cache_v_scale); -}; + float scaling_; -/** - * @brief InfLLMv2 attention with optional output gate - */ -class InfLLMv2Attention : public AttentionBase { -public: - InfLLMv2Attention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device); - - infinicore::Tensor forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const; - -protected: - bool use_output_gate_; - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, o_gate); -}; + bool use_qk_norm_ = false; + bool use_output_gate_ = false; + bool use_output_norm_ = false; + bool use_rope_ = false; + bool is_sparse_layer_ = false; -/** - * @brief Lightning attention with optional output norm and gate - */ -class LightningAttention : public AttentionBase { -public: - LightningAttention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device); + // InfLLM-v2 local-window masking plumbing for `mixer_type=="minicpm4"`. + // When enabled: causal=false + window_size_left=sparse_window_size + window_size_right=0. 
+ int infllmv2_window_left_ = -1; + int infllmv2_window_right_ = -1; + bool use_local_window_ = false; - infinicore::Tensor forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const; + backends::AttentionBackend attention_backend_; -protected: - bool qk_norm_; - bool use_output_norm_; - bool use_output_gate_; - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, z_proj); + // Lightning layers only: per-head log-decay for Simple GLA (HF _build_slope_tensor * -1). + infinicore::Tensor g_gamma_; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp deleted file mode 100644 index ff3c113f..00000000 --- a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "minicpm_sala_decoderLayer.hpp" - -#include "infinicore/ops.hpp" -#include -#include -#include - -namespace infinilm::models::minicpm_sala { - -MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device) - : layer_idx_(layer_idx) { - const auto &dtype{model_config->get_dtype()}; - size_t hidden_size = model_config->get("hidden_size"); - double rms_norm_eps = model_config->get("rms_norm_eps"); - - INFINICORE_NN_MODULE_INIT(input_layernorm, hidden_size, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, hidden_size, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(mlp, model_config, device); - - std::vector mixer_types = model_config->get>("mixer_types"); - std::string mixer_type = mixer_types[layer_idx]; - if ("minicpm4" == mixer_type) { - self_attn_ = std::make_shared(this->register_module("self_attn", model_config, layer_idx, 
device)); - } else if ("lightning" == mixer_type || "lightning_attn" == mixer_type || "lightning-attn" == mixer_type) { - self_attn_ = std::make_shared(this->register_module("self_attn", model_config, layer_idx, device)); - } else { - throw std::runtime_error("infinilm::models::minicpm_sala::MiniCPMSALADecoderLayer: unsupported mixer_type '" + mixer_type + "' for layer " + std::to_string(layer_idx)); - } -} - -std::tuple MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states, - infinicore::Tensor &residual) { - input_layernorm_->forward_inplace(hidden_states, residual); - hidden_states = std::visit( - [&](auto &attn_ptr) { return attn_ptr->forward(positions, hidden_states); }, *self_attn_); - - post_attention_layernorm_->forward_inplace(hidden_states, residual); - hidden_states = mlp_->forward(hidden_states); - return std::make_tuple(hidden_states, residual); -} - -infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states) { - auto residual = hidden_states; - hidden_states = input_layernorm_->forward(hidden_states); - hidden_states = std::visit( - [&](auto &attn_ptr) { return attn_ptr->forward(positions, hidden_states); }, *self_attn_); - - hidden_states = infinicore::op::add(residual, hidden_states); - - residual = hidden_states; - hidden_states = post_attention_layernorm_->forward(hidden_states); - hidden_states = mlp_->forward(hidden_states); - hidden_states = infinicore::op::add(residual, hidden_states); - return hidden_states; -} - -} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp deleted file mode 100644 index 5e8faafb..00000000 --- a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#include "../../layers/mlp/mlp.hpp" -#include "minicpm_sala_attention.hpp" -#include 
-#include - -namespace infinilm::models::minicpm_sala { -using MiniCPMMLP = infinilm::layers::MLP; -using MiniCPMSALAAttention = std::variant, std::shared_ptr>; - -class MiniCPMSALADecoderLayer : public infinicore::nn::Module { -public: - MiniCPMSALADecoderLayer(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device); - - std::tuple forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states, - infinicore::Tensor &residual); - - infinicore::Tensor forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states); - -protected: - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); - INFINICORE_NN_MODULE(MiniCPMSALAAttention, self_attn); - INFINICORE_NN_MODULE(MiniCPMMLP, mlp); - - size_t layer_idx_; -}; - -} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp new file mode 100644 index 00000000..391b626b --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -0,0 +1,83 @@ +#include "minicpm_sala_decoder_layer.hpp" + +#include "infinicore/ops.hpp" +#include "infinicore/context/context.hpp" +#include +#include +#include +#include +#include +#include + +namespace infinilm::models::minicpm_sala { + + +MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) { + layer_idx_ = layer_idx; + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + const auto dtype = model_config->get_dtype(); + const double eps = model_config->get("rms_norm_eps"); + + // MuP residual scaling at forward (o_proj/down_proj not scaled in loader for minicpm_sala). 
+ const double scale_depth = model_config->get_or("scale_depth", 1.0); + const size_t num_layers = model_config->get("num_hidden_layers"); + residual_scale_ = scale_depth / std::sqrt(static_cast(num_layers)); + + INFINICORE_NN_MODULE_INIT(input_layernorm, model_config->get("hidden_size"), eps, dtype, device); + INFINICORE_NN_MODULE_INIT(self_attn, model_config, device, layer_idx, mixer_type, rank_info, attention_backend); + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config->get("hidden_size"), eps, dtype, device); + INFINICORE_NN_MODULE_INIT(mlp, model_config, device); +} + +void MiniCPMSALADecoderLayer::set_rotary_emb(const std::shared_ptr &rotary_emb) { + self_attn_->set_rotary_emb(rotary_emb); +} + +void MiniCPMSALADecoderLayer::reset_cache() { + self_attn_->reset_cache(); +} + +infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const { + // Pre-norm attention + auto hs1 = input_layernorm_->forward(hidden_states); + auto attn_out = self_attn_->forward( + hs1, + position_ids, + kv_cache, + past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + + // residual + scale_down * attn_out (MuP) + auto ones_attn = infinicore::Tensor::empty(attn_out->shape(), attn_out->dtype(), attn_out->device()); + infinicore::op::ones_(ones_attn); + auto out1 = infinicore::op::addcmul(hidden_states, attn_out, ones_attn, static_cast(residual_scale_)); + + // Pre-norm MLP + auto hs2 = post_attention_layernorm_->forward(out1); + auto mlp_out = mlp_->forward(hs2); + // residual + scale_down * mlp_out (MuP) + auto ones_mlp = infinicore::Tensor::empty(mlp_out->shape(), mlp_out->dtype(), mlp_out->device()); + 
infinicore::op::ones_(ones_mlp); + auto out2 = infinicore::op::addcmul(out1, mlp_out, ones_mlp, static_cast(residual_scale_)); + + return out2; +} + +} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp new file mode 100644 index 00000000..948e4d97 --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include "minicpm_sala_attention.hpp" +#include "minicpm_sala_mlp.hpp" + +#include "../../backends/attention_backends.hpp" +#include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" +#include "../../engine/distributed/distributed.hpp" + +#include "infinicore/nn/module.hpp" +#include "infinicore/nn/rmsnorm.hpp" +#include "infinicore/tensor.hpp" + +#include +#include + +namespace infinilm::models::minicpm_sala { + +class MiniCPMSALADecoderLayer : public infinicore::nn::Module { +public: + MiniCPMSALADecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const; + + void set_rotary_emb(const std::shared_ptr &rotary_emb); + void reset_cache(); + +private: + double residual_scale_ = 1.0; + size_t layer_idx_ = 0; + +protected: + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); + INFINICORE_NN_MODULE(MiniCPMSALAAttention, self_attn); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, 
post_attention_layernorm); + INFINICORE_NN_MODULE(MiniCPMSALAMLP, mlp); +}; + +} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index 793f86bd..ce2e9474 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -1,56 +1,63 @@ #include "minicpm_sala_for_causal_lm.hpp" -#include "../../global_state/global_state.hpp" -#include "../models_registry.hpp" + +#include "infinicore/ops.hpp" +#include #include -#include namespace infinilm::models::minicpm_sala { -MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device) { - model_config_ = model_config; - size_t hidden_size = model_config->get("hidden_size"); - size_t vocab_size = model_config->get("vocab_size"); - const auto &dtype{model_config->get_dtype()}; +MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( + std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) { + device_ = device; + + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
+ const auto dtype = model_config->get_dtype(); + INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info, attention_backend); + + const size_t hidden_size = model_config->get("hidden_size"); + const size_t vocab_size = model_config->get("vocab_size"); - INFINICORE_NN_MODULE_INIT(model, model_config, device); INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device); } -infinilm::InfinilmModel::Output MiniCPMSALAForCausalLM::forward(const infinilm::InfinilmModel::Input &input) const { - auto hidden_states = model_->forward(input); +MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( + const Input &input) const { + auto input_ids = input.input_ids.value(); + auto position_ids = input.position_ids.value(); + + auto past_sequence_lengths = input.past_sequence_lengths; + auto total_sequence_lengths = input.total_sequence_lengths; + auto input_offsets = input.input_offsets; + auto cu_seqlens = input.cu_seqlens; + auto block_tables = input.block_tables; + auto slot_mapping = input.slot_mapping; + + auto hidden_states = model_->forward( + input_ids, + position_ids, + past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + + // MuP lm_head scale baked into lm_head.weight at load time; no forward scaling here. 
auto logits = lm_head_->forward(hidden_states); return {logits}; } void MiniCPMSALAForCausalLM::reset_cache(const cache::CacheConfig *cache_config) { - if (nullptr == cache_config) { - InfinilmModel::reset_cache(nullptr); - return; - } cache_config_ = cache_config->unique_copy(); - - auto &kv_cache_vec = infinilm::global_state::get_forward_context().kv_cache_vec; - kv_cache_vec.clear(); - const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; - - auto new_kv_cache_vec = minicpm_sala_allocate_kv_cache_tensors(cache_config, model_config_, attention_backend); - kv_cache_vec = std::move(new_kv_cache_vec); + model_->reset_cache(cache_config_.get()); } -std::shared_ptr create_minicpm_sala_model_config(std::shared_ptr model_config) { - const std::string &model_type = model_config->get("model_type"); - if ("minicpm_sala" != model_type) { - throw std::runtime_error("infinilm::models::minicpm_sala::create_minicpm_sala_model_config: model_type is not minicpm_sala"); - } - return model_config; +const cache::CacheConfig *MiniCPMSALAForCausalLM::get_cache_config() const { + return cache_config_.get(); } } // namespace infinilm::models::minicpm_sala -namespace { -INFINILM_REGISTER_CAUSAL_LM_MODEL( - minicpm_sala, - infinilm::models::minicpm_sala::MiniCPMSALAForCausalLM, - infinilm::models::minicpm_sala::create_minicpm_sala_model_config); -} // namespace diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index f0d0aaae..9bb3ec2b 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -1,31 +1,38 @@ #pragma once -#include "minicpm_sala_decoderLayer.hpp" -#include -#include +#include "../infinilm_model.hpp" +#include "minicpm_sala_model.hpp" -namespace infinilm::models::minicpm_sala { +#include "../../config/model_config.hpp" +#include 
"../../engine/distributed/distributed.hpp" +#include "../../backends/attention_backends.hpp" + +#include "infinicore/device.hpp" +#include "infinicore/nn/linear.hpp" -using MiniCPMSALAModel = infinilm::layers::causal_lm_templates::TextModel; +namespace infinilm::models::minicpm_sala { +// Milestone-0 stub. Full implementation will follow the MiniCPM-SALA design: +// - Lightning Attention (Simple GLA) layers + InfLLM-V2 sparse layers in a 1:3 ratio +// - HyPE (RoPE on linear layers; NoPE on sparse layers) class MiniCPMSALAForCausalLM : public InfinilmModel { public: MiniCPMSALAForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device); + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); Output forward(const Input &input) const override; void reset_cache(const cache::CacheConfig *cache_config) override; -protected: + const cache::CacheConfig *get_cache_config() const override; + +private: INFINICORE_NN_MODULE(MiniCPMSALAModel, model); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); + INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head); + std::unique_ptr cache_config_; }; -std::shared_ptr create_minicpm_sala_model_config(std::shared_ptr model_config); - -/** Implemented in `minicpm_sala_allocate_kv_cache_tensors.cpp`. 
*/ -std::vector minicpm_sala_allocate_kv_cache_tensors(const cache::CacheConfig *cache_config, - const std::shared_ptr &text_config, - const backends::AttentionBackend &attention_backend); } // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp new file mode 100644 index 00000000..649c0095 --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp @@ -0,0 +1,32 @@ +#include "minicpm_sala_mlp.hpp" + +#include "infinicore/ops.hpp" + +namespace infinilm::models::minicpm_sala { + +MiniCPMSALAMLP::MiniCPMSALAMLP(std::shared_ptr model_config, + const infinicore::Device &device) { + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + const auto dtype = model_config->get_dtype(); + hidden_size_ = model_config->get("hidden_size"); + intermediate_size_ = model_config->get("intermediate_size"); + + INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size_, intermediate_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(up_proj, hidden_size_, intermediate_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, false, dtype, device); +} + +infinicore::Tensor MiniCPMSALAMLP::forward(const infinicore::Tensor &x) const { + auto x_mut = x; + auto gate = gate_proj_->forward(x_mut); + auto up = up_proj_->forward(x_mut); + + // SwiGLU: silu(gate) * up — fused single kernel (swiglu(a,b) = a*b*sigmoid(b) => swiglu(up,gate)) + auto act = infinicore::op::swiglu(up, gate); + + auto act_mut = act; + return down_proj_->forward(act_mut); +} + +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp new file mode 100644 index 00000000..9a90527a --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include "../../config/model_config.hpp" + +#include 
"infinicore/nn/linear.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/tensor.hpp" + +#include + +namespace infinilm::models::minicpm_sala { + +class MiniCPMSALAMLP : public infinicore::nn::Module { +public: + MiniCPMSALAMLP(std::shared_ptr model_config, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &x) const; + +protected: + INFINICORE_NN_MODULE(infinicore::nn::Linear, gate_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, up_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, down_proj); + +private: + size_t hidden_size_; + size_t intermediate_size_; +}; + +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp new file mode 100644 index 00000000..a415915f --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -0,0 +1,171 @@ +#include "minicpm_sala_model.hpp" + +#include "infinicore/context/context.hpp" +#include "infinicore/ops.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace infinilm::models::minicpm_sala { + +MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) + : model_config_(std::move(model_config)), + rank_info_(rank_info), + attention_backend_(attention_backend) { + + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
+ const auto dtype = model_config_->get_dtype(); + compute_device_ = device; + + hidden_size_ = model_config_->get("hidden_size"); + dim_model_base_ = model_config_->get_or("dim_model_base", static_cast(hidden_size_)); + scale_emb_ = model_config_->get_or("scale_emb", 1.0); + + const size_t vocab_size = model_config_->get("vocab_size"); + const size_t num_layers = model_config_->get("num_hidden_layers"); + + INFINICORE_NN_MODULE_INIT(embed_tokens, vocab_size, hidden_size_, std::nullopt, dtype, device); + INFINICORE_NN_MODULE_INIT(norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); + + // Shared rotary embedding (used by lightning layers only) + INFINICORE_NN_MODULE_INIT(rotary_emb, + model_config_->get_head_dim(), + model_config_->get("max_position_embeddings"), + model_config_->get("rope_theta"), + infinicore::nn::RoPE::Algo::GPT_NEOX, + dtype, + device, + model_config_->get_rope_scaling()); + + // Mixer types per-layer decide attention flavor (minicpm4 vs lightning-attn). + std::vector mixer_types; + try { + mixer_types = model_config_->get>("mixer_types"); + } catch (...) { + mixer_types.assign(num_layers, "minicpm4"); + } + if (mixer_types.size() != num_layers) { + mixer_types.resize(num_layers, mixer_types.empty() ? "minicpm4" : mixer_types.back()); + } + mixer_types_ = mixer_types; + + layers_.reserve(num_layers); + for (size_t i = 0; i < num_layers; ++i) { + layers_.push_back(this->register_module( + "layers." + std::to_string(i), model_config_, device, i, mixer_types[i], rank_info_, attention_backend_)); + layers_.back()->set_rotary_emb(rotary_emb_); + } +} + +void MiniCPMSALAModel::reset_cache(const cache::CacheConfig *cache_config) { + if (cache_config == nullptr) { + kv_cache_minicpm4_ = nullptr; + kv_cache_lightning_ = nullptr; + for (auto &layer : layers_) { + layer->reset_cache(); + } + return; + } + + if (auto static_cfg = dynamic_cast(cache_config)) { + // Allocate separate caches by KV shape to avoid per-layer padding copies. 
+ const size_t num_hidden_layers = model_config_->get("num_hidden_layers"); + // mixer_types_ is filled in ctor from model_config_->get("mixer_types"). + const size_t minicpm4_layer_count = + !mixer_types_.empty() ? std::count(mixer_types_.begin(), mixer_types_.end(), "minicpm4") : num_hidden_layers; + const size_t lightning_layer_count = num_hidden_layers - minicpm4_layer_count; + + const size_t base_kv_heads = model_config_->get("num_key_value_heads"); + const size_t base_head_dim = model_config_->get("head_dim"); + const size_t lightning_kv_heads = model_config_->get_or("lightning_nkv", base_kv_heads); + const size_t lightning_head_dim = model_config_->get_or("lightning_head_dim", base_head_dim); + const size_t lightning_nh = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); + const int tp_sz = std::max(1, rank_info_.tp_size); + const size_t lightning_nh_rank = lightning_nh / static_cast(tp_sz); + + kv_cache_minicpm4_ = (minicpm4_layer_count > 0) + ? std::make_shared( + /*k_dim=*/base_head_dim, + /*v_dim=*/base_head_dim, + /*num_k_heads=*/base_kv_heads, + /*num_v_heads=*/base_kv_heads, + /*num_layers=*/minicpm4_layer_count, + /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), + /*dtype=*/model_config_->get_dtype(), + *static_cfg, + rank_info_) + : nullptr; + + kv_cache_lightning_ = (lightning_layer_count > 0) + ? std::make_shared( + /*k_dim=*/lightning_head_dim, + /*v_dim=*/lightning_head_dim, + /*num_k_heads=*/lightning_kv_heads, + /*num_v_heads=*/lightning_kv_heads, + /*num_layers=*/lightning_layer_count, + /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), + /*dtype=*/model_config_->get_dtype(), + *static_cfg, + rank_info_, + /*gla_recurrent_num_heads=*/lightning_nh_rank, + /*gla_recurrent_head_dim=*/lightning_head_dim) + : nullptr; + } else { + // This refactor implements HF-like dense caching only. 
+ throw std::runtime_error("MiniCPMSALAModel::reset_cache: Unsupported cache type (expected StaticKVCacheConfig)"); + } + + for (auto &layer : layers_) { + layer->reset_cache(); + } +} + +infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids, + const infinicore::Tensor &position_ids, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const { + // MuP scaling baked into weights at load time for minicpm_sala; no forward scaling here. + auto hs = embed_tokens_->forward(input_ids); + + for (size_t i = 0; i < layers_.size(); ++i) { + std::shared_ptr layer_cache; + if (!mixer_types_.empty() && mixer_types_[i] == "minicpm4") { + layer_cache = kv_cache_minicpm4_; + } else { + layer_cache = kv_cache_lightning_; + } + hs = layers_[i]->forward(hs, + position_ids, + layer_cache, + past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + if (const char *env = std::getenv("MINICPM_SALA_LAYER_TRACE")) { + if (env[0] != '\0' && env[0] != '0') { + fprintf(stderr, "[minicpm_sala][layer_trace] layer=%zu mixer=%s\n", + i, + mixer_types_.empty() ? 
"unknown" : mixer_types_[i].c_str()); + fflush(stderr); + } + } + } + + hs = norm_->forward(hs); + return hs; +} + +} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp new file mode 100644 index 00000000..d360dd3e --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include "minicpm_sala_decoder_layer.hpp" + +#include "../../backends/attention_backends.hpp" +#include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" +#include "../../engine/distributed/distributed.hpp" + +#include "infinicore/nn/embedding.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/nn/rmsnorm.hpp" +#include "infinicore/nn/rope.hpp" +#include "infinicore/tensor.hpp" + +#include +#include +#include + +namespace infinilm::models::minicpm_sala { + +class MiniCPMSALAModel : public infinicore::nn::Module { +public: + MiniCPMSALAModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + + infinicore::Tensor forward(const infinicore::Tensor &input_ids, + const infinicore::Tensor &position_ids, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const; + + void reset_cache(const cache::CacheConfig *cache_config); + + size_t hidden_size() const { return hidden_size_; } + double dim_model_base() const { return dim_model_base_; } + +protected: + INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); + INFINICORE_NN_MODULE_VEC(MiniCPMSALADecoderLayer, layers); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); + INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb); + +private: + 
std::shared_ptr model_config_; + engine::distributed::RankInfo rank_info_; + backends::AttentionBackend attention_backend_; + // MiniCPM-SALA is hybrid: minicpm4 vs lightning layers can have different KV shapes. + // Use two StaticKVCache instances to avoid per-layer padding/copies during long prefill. + std::shared_ptr kv_cache_minicpm4_; + std::shared_ptr kv_cache_lightning_; + std::vector mixer_types_; + infinicore::Device compute_device_; + + size_t hidden_size_; + double scale_emb_; + double dim_model_base_; +}; + +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 03734ac9..3c885fc5 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,6 +1,6 @@ #include "model_factory.hpp" -#include "llama/llama_for_causal_lm.hpp" -#include "models_registry.hpp" +#include "llama/llama.hpp" +#include "minicpm_sala/minicpm_sala_for_causal_lm.hpp" namespace infinilm { /** @@ -41,8 +41,13 @@ std::shared_ptr InfinilmModelFactory::createModel( engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache, backends::AttentionBackend attention_backend) { + std::shared_ptr model; - if (true) { + const auto model_type = model_config->get_or("model_type", "llama"); + if (model_type == "minicpm_sala") { + model = std::make_shared( + model_config, rank_info.device, rank_info, attention_backend); + } else if (true) { model = std::make_shared( model_config, rank_info.device, rank_info, attention_backend); } else { @@ -60,21 +65,8 @@ std::shared_ptr InfinilmModelFactory::createModel( std::shared_ptr model_config, const infinicore::Device &device, const cache::CacheConfig *cache) { - const std::string model_type = model_config->get("model_type"); - std::shared_ptr model; - const auto &model_map = models::get_causal_lm_model_map(); - auto it = model_map.find(model_type); - if (it != model_map.end()) { - // create model - auto &model_creator = it->second; - model = 
model_creator(model_config, device); - } else { - throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model_type"); - } - - if (cache) { - model->reset_cache(cache); - } - return model; + engine::distributed::RankInfo rank_info; + rank_info.device = device; + return createModel(model_config, rank_info, cache, backends::AttentionBackend::Default); } } // namespace infinilm diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index a784d69c..d27b9585 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -199,6 +199,6 @@ inline void bind_infer_engine(py::module &m) { py::class_(infer_engine, "Output") .def_readwrite("output_ids", &InferEngine::Output::output_ids, "Output tensor"); -} + } } // namespace infinilm::engine diff --git a/examples/collect_metrics_longtext_decode.py b/examples/collect_metrics_longtext_decode.py new file mode 100644 index 00000000..172b6f40 --- /dev/null +++ b/examples/collect_metrics_longtext_decode.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +""" +Collect long-context + decode metrics for metrics_longtext_mem.md. + +**OOM-safe workflow:** run each case in a **fresh Python process** so CUDA allocations +are released between runs: + + ./run_longtext_metrics_cases.sh + +Or manually: + + python3 collect_metrics_longtext_decode.py --case hf:16384 --append-jsonl profiling_runs/longtext_decode_rows.jsonl + +See also docstring at top of previous revisions for GPU selection (CUDA_VISIBLE_DEVICES + NVML_GPU_INDEX). 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +import threading +import time +from typing import Any, Callable, Dict, List, Optional, Tuple + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + + +def _poll_gpu_mem_mib(stop: threading.Event, gpu_index: int, out: List[int]) -> None: + while not stop.is_set(): + try: + r = subprocess.run( + [ + "nvidia-smi", + "-i", + str(gpu_index), + "--query-gpu=memory.used", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=5, + ) + if r.returncode == 0 and r.stdout.strip().isdigit(): + out.append(int(r.stdout.strip())) + except Exception: + pass + if stop.wait(timeout=1.0): + break + + +def _with_mem_poll(gpu_index: int, fn: Callable[[], Any]) -> Tuple[Any, Optional[int]]: + samples: List[int] = [] + stop = threading.Event() + th = threading.Thread(target=_poll_gpu_mem_mib, args=(stop, gpu_index, samples), daemon=True) + th.start() + err: Optional[BaseException] = None + result: Any = None + try: + result = fn() + except BaseException as e: + err = e + finally: + stop.set() + th.join(timeout=3.0) + peak = max(samples) if samples else None + if err is not None: + raise err + return result, peak + + +def _row_dict( + date: str, + backend: str, + target: int, + actual: int, + max_new: int, + peak: Optional[int], + gpu_smi: int, + r: Dict[str, Any], +) -> Dict[str, Any]: + return { + "date": date, + "backend": backend, + "target_input_tokens": target, + "actual_input_tokens": actual, + "max_new_tokens": max_new, + "peak_mem_mib": peak, + "gpu_smi_index": gpu_smi, + "total_time_ms": r.get("total_time_ms"), + "prefill_ttft_ms": r.get("prefill_ttft_ms"), + "prefill_throughput_tok_s": r.get("prefill_throughput_tok_s"), + "decode_itl_ms": r.get("decode_itl_ms"), + "decode_throughput_tok_s": r.get("decode_throughput_tok_s"), + "engine_reported_generation_ms": r.get("engine_reported_generation_ms"), + "error": 
r.get("error"), + } + + +def run_single_case( + case: str, + *, + model_path: str, + gpu_smi: int, + date: str, +) -> Dict[str, Any]: + """Run one measurement; returns a row dict (may contain error key).""" + examples_dir = os.path.dirname(os.path.abspath(__file__)) + sys.path.insert(0, examples_dir) + os.chdir(examples_dir) + + from transformers import AutoTokenizer + + from compare_inference_speed import ( + _make_prompt_with_target_tokens, + run_hf_decode_loop, + run_hf_forward_prefill, + run_infinilm_inprocess, + ) + + parts = case.strip().split(":") + kind = parts[0].lower() + if kind == "hf": + # Backward compatible: + # hf: -> max_new=1 (forward-prefill only) + # hf:: -> max_new= (decode-loop timing) + if len(parts) == 2: + target = int(parts[1]) + max_new = 1 + elif len(parts) == 3: + target = int(parts[1]) + max_new = int(parts[2]) + else: + raise ValueError("--case hf:[:] (e.g. hf:16384 or hf:16384:32)") + elif kind == "infinilm_rec": + if len(parts) != 3: + raise ValueError("--case infinilm_rec:: (e.g. infinilm_rec:32768:32)") + target = int(parts[1]) + max_new = int(parts[2]) + else: + raise ValueError( + f"Unknown case kind {kind!r}; use hf: or infinilm_rec:" + ) + + tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + prompt, actual = _make_prompt_with_target_tokens(tok, "How are you", target) + + if kind == "hf": + + def go() -> Dict[str, Any]: + # Always use hf decode-loop so total_time_ms can be end-to-end + # (prefill + decode), matching the InfiniLM generate semantics. 
+ return run_hf_decode_loop( + model_path, + prompt, + max_new, + device="cuda", + attn_implementation="flash_attention_2", + use_cache=True, + warmup=1, + iters=1, + ) + + try: + r, peak = _with_mem_poll(gpu_smi, go) + r = dict(r) + return _row_dict(date, "hf (decode_loop)", target, actual, max_new, peak, gpu_smi, r) + except Exception as e: + return _row_dict( + date, + "hf (decode_loop)", + target, + actual, + max_new, + None, + gpu_smi, + {"error": str(e)}, + ) + + recurrent = kind == "infinilm_rec" + if max_new == 1: + label = "infinilm (static_fit, recurrent GLA decode)" + else: + label = f"infinilm (static_fit, recurrent GLA, +{max_new} decode)" + + saved_lightning = os.environ.get("INFINI_LIGHTNING_GLA_RECURRENT_DECODE") + saved_skip = os.environ.get("INFINI_SKIP_LAST_LOGITS_CPU") + try: + if recurrent: + os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = "1" + else: + os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) + os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = "1" + + def go_inf() -> Dict[str, Any]: + return run_infinilm_inprocess( + model_path, + prompt, + max_new, + cache_mode="static_fit", + paged_block_size=256, + attn_backend="default", + ) + + r, peak = _with_mem_poll(gpu_smi, go_inf) + return _row_dict(date, label, target, actual, max_new, peak, gpu_smi, dict(r)) + except Exception as e: + return _row_dict(date, label, target, actual, max_new, None, gpu_smi, {"error": str(e)}) + finally: + if saved_lightning is None: + os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) + else: + os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = saved_lightning + if saved_skip is None: + os.environ.pop("INFINI_SKIP_LAST_LOGITS_CPU", None) + else: + os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = saved_skip + + +def print_markdown_table(rows: List[Dict[str, Any]]) -> None: + def fmt(x: Any) -> str: + if x is None: + return "—" + if isinstance(x, float): + s = f"{x:.2f}" + return s.rstrip("0").rstrip(".") + return str(x) + + gpu_smi = 
rows[0].get("gpu_smi_index", 0) if rows else 0 + print("\n### Markdown table (paste into metrics_longtext_mem.md)\n") + hdr = ( + "| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | " + "prefill_tok_s | decode_itl_ms | decode_tok_s | gpu |" + ) + sep = "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|" + print(hdr) + print(sep) + for row in rows: + if row.get("error"): + print( + f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | " + f"{row['max_new_tokens']} | {fmt(row.get('peak_mem_mib'))} | OOM/err | — | — | — | — | {gpu_smi} |" + ) + continue + dec_itl = fmt(row.get("decode_itl_ms")) if row["max_new_tokens"] > 1 else "—" + dec_tps = fmt(row.get("decode_throughput_tok_s")) if row["max_new_tokens"] > 1 else "—" + ptt = row.get("prefill_ttft_ms") + # Only forward-prefill runs use total_time_ms as a prefill-time proxy. + if ptt is None and row.get("backend") == "hf (forward_prefill)": + ptt = row.get("total_time_ms") + print( + f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | {row['max_new_tokens']} | " + f"{fmt(row.get('peak_mem_mib'))} | {fmt(row.get('total_time_ms'))} | {fmt(ptt)} | " + f"{fmt(row.get('prefill_throughput_tok_s'))} | {dec_itl} | {dec_tps} | {gpu_smi} |" + ) + + +def main() -> None: + ap = argparse.ArgumentParser(description="Long-context + decode metrics (OOM-safe --case mode)") + ap.add_argument( + "--case", + type=str, + default=None, + help="Single case: hf:16384 | infinilm_rec:32768:32", + ) + ap.add_argument( + "--append-jsonl", + type=str, + default=None, + help="Append one JSON line (--case mode only)", + ) + ap.add_argument( + "--from-jsonl", + type=str, + default=None, + help="Load rows from jsonl and print markdown table", + ) + ap.add_argument( + "--all-in-process", + action="store_true", + help="Run full matrix in one process (may OOM between cases)", + ) + args = ap.parse_args() + + model_path = os.environ.get( + "MODEL_PATH", 
"/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA" + ) + gpu_smi = int(os.environ.get("NVML_GPU_INDEX", os.environ.get("CUDA_VISIBLE_DEVICES", "0"))) + date = os.environ.get("METRICS_DATE", "2026-03-23") + decode_steps = int(os.environ.get("METRICS_DECODE_STEPS", "32")) + targets = [int(x) for x in os.environ.get("METRICS_TARGETS", "16384,32768,65536").split(",")] + + examples_dir = os.path.dirname(os.path.abspath(__file__)) + + if args.from_jsonl: + rows = [] + with open(args.from_jsonl) as f: + for line in f: + line = line.strip() + if line: + rows.append(json.loads(line)) + print_markdown_table(rows) + return + + if args.case: + row = run_single_case(args.case, model_path=model_path, gpu_smi=gpu_smi, date=date) + print(json.dumps(row, ensure_ascii=False)) + if args.append_jsonl: + ap = os.path.abspath(args.append_jsonl) + ad = os.path.dirname(ap) + if ad: + os.makedirs(ad, exist_ok=True) + with open(ap, "a") as f: + f.write(json.dumps(row, ensure_ascii=False) + "\n") + return + + if not args.all_in_process: + print( + "Specify --case CASE, --from-jsonl FILE, or --all-in-process.\n" + "For OOM safety use: ./run_longtext_metrics_cases.sh", + file=sys.stderr, + ) + sys.exit(2) + + # Legacy: all targets × all backends in one process + rows: List[Dict[str, Any]] = [] + for t in targets: + row = run_single_case(f"hf:{t}", model_path=model_path, gpu_smi=gpu_smi, date=date) + rows.append(row) + for t in targets: + rows.append( + run_single_case(f"infinilm_rec:{t}:1", model_path=model_path, gpu_smi=gpu_smi, date=date) + ) + for t in targets: + rows.append( + run_single_case( + f"infinilm_rec:{t}:{decode_steps}", + model_path=model_path, + gpu_smi=gpu_smi, + date=date, + ) + ) + + out_path = os.path.join(examples_dir, "profiling_runs", "longtext_decode_metrics.json") + os.makedirs(os.path.dirname(out_path), exist_ok=True) + with open(out_path, "w") as f: + json.dump({"gpu_smi_index": gpu_smi, "decode_steps": decode_steps, "rows": rows}, f, indent=2) + print(f"Wrote 
{out_path}") + print_markdown_table(rows) + + +if __name__ == "__main__": + main() diff --git a/examples/compare_inference_speed.py b/examples/compare_inference_speed.py new file mode 100644 index 00000000..06fad9a7 --- /dev/null +++ b/examples/compare_inference_speed.py @@ -0,0 +1,868 @@ +#!/usr/bin/env python3 +""" +Compare MiniCPM-SALA inference speed across HF, InfiniLM, and (optionally) SGLang. + +Usage: + # HF + InfiniLM only (InfiniLM runs in subprocess with same env as jiuge): + python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA [--prompt "How are you"] [--max_new_tokens 32] + + # Include SGLang (server must already be running with MiniCPM-SALA): + python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --sglang_url http://127.0.0.1:30000 + + # Optional: write JSON + python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --output results.json + +Requires: transformers, torch; for InfiniLM subprocess: PYTHONPATH and LD_LIBRARY_PATH as in jiuge. +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from typing import Optional, Tuple, Literal + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + +def _build_chat_input_ids(tokenizer, prompt: str): + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) + ids = tokenizer(text, add_special_tokens=True)["input_ids"] + return ids + + +def _make_prompt_with_target_tokens(tokenizer, base_prompt: str, target_input_tokens: int) -> Tuple[str, int]: + """ + Build a prompt (user content) such that the *chat-templated* input_ids length is >= target_input_tokens. + Returns (prompt, actual_input_tokens). + """ + if target_input_tokens <= 0: + raise ValueError("--target_input_tokens must be > 0") + + # Ensure boundaries don't merge tokens weirdly. 
+ chunk = (base_prompt.strip() + "\n") if base_prompt.strip() else "hello\n" + + # Exponential growth to find an upper bound. + rep = 1 + while True: + prompt = chunk * rep + ids = _build_chat_input_ids(tokenizer, prompt) + if len(ids) >= target_input_tokens: + break + rep *= 2 + if rep > 1_000_000: + raise RuntimeError("Failed to build prompt to target length (rep too large)") + + # Binary search for smallest rep that reaches target. + lo, hi = 1, rep + best_prompt = prompt + best_len = len(ids) + while lo <= hi: + mid = (lo + hi) // 2 + p = chunk * mid + l = len(_build_chat_input_ids(tokenizer, p)) + if l >= target_input_tokens: + best_prompt, best_len = p, l + hi = mid - 1 + else: + lo = mid + 1 + + return best_prompt, best_len + + +def run_hf( + model_path: str, + prompt: str, + max_new_tokens: int, + device: str = "cuda", + *, + attn_implementation: Optional[str] = None, +): + """Run HuggingFace generate and return metrics.""" + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model_kwargs = { + "torch_dtype": "auto", + "trust_remote_code": True, + } + # Prefer flash-attn when available; fall back silently if not supported. + if attn_implementation is not None: + model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] + try: + model = AutoModelForCausalLM.from_pretrained( + model_path, + **model_kwargs, + ).to(device) + except TypeError: + # Older transformers versions may not support attn_implementation kwarg. 
+ model_kwargs.pop("attn_implementation", None) + model = AutoModelForCausalLM.from_pretrained( + model_path, + **model_kwargs, + ).to(device) + model.eval() + + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) + inputs = tokenizer(text, return_tensors="pt").to(device) + input_len = inputs.input_ids.shape[1] + + start = time.perf_counter() + with torch.inference_mode(): + out = model.generate( + **inputs, + max_new_tokens=max_new_tokens, + do_sample=False, + pad_token_id=tokenizer.eos_token_id or 0, + ) + elapsed = time.perf_counter() - start + output_len = out.shape[1] - input_len + + return { + "backend": "hf", + "total_time_ms": round(elapsed * 1000, 2), + "input_tokens": input_len, + "output_tokens": output_len, + "prefill_ttft_ms": None, # HF generate() doesn't expose TTFT without streaming + "decode_throughput_tok_s": round(output_len / elapsed, 2) if elapsed > 0 else None, + "total_throughput_tok_s": round((input_len + output_len) / elapsed, 2) if elapsed > 0 else None, + } + + +def run_hf_forward_prefill( + model_path: str, + prompt: str, + device: str = "cuda", + *, + attn_implementation: Optional[str] = None, + use_cache: bool = True, + warmup: int = 1, + iters: int = 1, +): + """ + Run HuggingFace *forward-only* prefill (no decode loop). + Intended for kernel-level profiling to isolate prefill work. 
+ """ + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model_kwargs = { + "torch_dtype": "auto", + "trust_remote_code": True, + } + if attn_implementation is not None: + model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] + try: + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + except TypeError: + model_kwargs.pop("attn_implementation", None) + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + model.eval() + + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + inputs = tokenizer(text, return_tensors="pt").to(device) + input_len = inputs.input_ids.shape[1] + + # Warmup (reduces first-iter compilation / cache effects for profiling). + with torch.inference_mode(): + for _ in range(max(0, warmup)): + # Prefer last-token logits only (reduces memory at long context). + try: + _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) + except TypeError: + _ = model(**inputs, use_cache=use_cache) + torch.cuda.synchronize() + + # Timed iters. 
+ times = [] + with torch.inference_mode(): + for _ in range(max(1, iters)): + torch.cuda.synchronize() + try: + torch.cuda.nvtx.range_push("hf_forward_prefill") + except Exception: + pass + start = time.perf_counter() + try: + _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) + except TypeError: + _ = model(**inputs, use_cache=use_cache) + torch.cuda.synchronize() + elapsed = time.perf_counter() - start + try: + torch.cuda.nvtx.range_pop() + except Exception: + pass + times.append(elapsed) + + best = min(times) if times else 0.0 + return { + "backend": "hf_forward_prefill", + "total_time_ms": round(best * 1000, 2), + "input_tokens": int(input_len), + "output_tokens": 0, + "use_cache": bool(use_cache), + "warmup": int(warmup), + "iters": int(iters), + "prefill_throughput_tok_s": round(input_len / best, 2) if best > 0 else None, + } + + +def run_hf_decode_loop( + model_path: str, + prompt: str, + max_new_tokens: int, + device: str = "cuda", + *, + attn_implementation: Optional[str] = None, + use_cache: bool = True, + warmup: int = 8, + iters: int = 1, +): + """ + Measure HF *decode-only* per-token latency using a manual loop with past_key_values. + + Protocol: + - Prefill once on the full prompt (not included in decode timing). + - Then decode `max_new_tokens` tokens with 1-token steps, timing the whole decode loop + (optionally best-of `iters`). 
+ """ + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + if max_new_tokens <= 0: + raise ValueError("--max_new_tokens must be > 0 for hf decode_loop") + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model_kwargs = { + "torch_dtype": "auto", + "trust_remote_code": True, + } + if attn_implementation is not None: + model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] + try: + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + except TypeError: + model_kwargs.pop("attn_implementation", None) + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + model.eval() + + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + inputs = tokenizer(text, return_tensors="pt").to(device) + input_ids = inputs.input_ids + input_len = int(input_ids.shape[1]) + # Some decoder-only models require attention_mask even when no padding is used. + attention_mask = inputs.get("attention_mask", None) + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + attention_mask = attention_mask.to(device) + # Precompute full (input_len + max_new_tokens) causal attention mask for past-key decoding. + attention_mask_full = attention_mask.new_ones((attention_mask.shape[0], input_len + max_new_tokens)) + + # Prefill once to build cache. + with torch.inference_mode(): + try: + pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache, logits_to_keep=1) + except TypeError: + pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache) + past = getattr(pre, "past_key_values", None) + # Greedy next token from last logits. 
+ logits = pre.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + + # Warmup decode steps (not timed) to reduce first-step effects. + with torch.inference_mode(): + for warm_i in range(max(0, warmup)): + try: + # Attention mask must cover (past + current token). + attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + logits_to_keep=1, + ) + except TypeError: + attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + ) + past = getattr(out, "past_key_values", past) + logits = out.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + torch.cuda.synchronize() + + # Timed decode loops (best-of iters). + # We report total_time_ms as end-to-end (prefill + decode), but keep + # decode_itl_ms / decode_throughput_tok_s based on decode-only time. + total_times = [] + decode_times = [] + with torch.inference_mode(): + for _ in range(max(1, iters)): + # Re-prefill to avoid measuring a "warmed" cache from prior iteration. + # Time prefill separately so decode_itl_ms stays decode-only. + torch.cuda.synchronize() + prefill_start = time.perf_counter() + try: + pre = model( + input_ids=input_ids, + attention_mask=attention_mask, + use_cache=use_cache, + logits_to_keep=1, + ) + except TypeError: + # Some model/transformers combinations may not accept attention_mask. 
+ pre = model(input_ids=input_ids, use_cache=use_cache) + past = getattr(pre, "past_key_values", None) + logits = pre.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + + torch.cuda.synchronize() + prefill_elapsed = time.perf_counter() - prefill_start + + torch.cuda.synchronize() + start = time.perf_counter() # decode start + try: + torch.cuda.nvtx.range_push("hf_decode_loop") + except Exception: + pass + for t in range(max_new_tokens): + attn_mask_step = attention_mask_full[:, : input_len + t + 1] + try: + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + logits_to_keep=1, + ) + except TypeError: + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + ) + past = getattr(out, "past_key_values", past) + logits = out.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + torch.cuda.synchronize() + decode_elapsed = time.perf_counter() - start + total_elapsed = prefill_elapsed + decode_elapsed + try: + torch.cuda.nvtx.range_pop() + except Exception: + pass + total_times.append(total_elapsed) + decode_times.append(decode_elapsed) + + # Pick the iteration with the best end-to-end time; compute decode metrics + # from the corresponding decode-only time. 
+ if total_times: + best_idx = min(range(len(total_times)), key=lambda i: total_times[i]) + best_total = total_times[best_idx] + best_decode = decode_times[best_idx] + else: + best_total = 0.0 + best_decode = 0.0 + + itl_ms = (best_decode * 1000.0 / max_new_tokens) if best_decode > 0 else None + thr = (max_new_tokens / best_decode) if best_decode > 0 else None + return { + "backend": "hf_decode_loop", + "total_time_ms": round(best_total * 1000, 2), + "input_tokens": int(input_len), + "output_tokens": int(max_new_tokens), + "decode_itl_ms": round(itl_ms, 4) if itl_ms is not None else None, + "decode_throughput_tok_s": round(thr, 2) if thr is not None else None, + "use_cache": bool(use_cache), + "warmup": int(warmup), + "iters": int(iters), + } + + +def run_infinilm_inprocess( + model_path: str, + prompt: str, + max_new_tokens: int, + *, + cache_mode: Literal["static_fit", "static_maxpos", "paged"] = "paged", + paged_block_size: int = 256, + attn_backend: str = "flash-attn", +): + """ + Run InfiniLM in-process (no 2048-token truncation). Parses InferEngine's timing prints. + This expects PYTHONPATH to include InfiniLM/InfiniCore python packages (container runner does this). + """ + import io + import torch + import contextlib + + import infinicore + from transformers import AutoTokenizer + + from infinilm.cache import PagedKVCacheConfig, StaticKVCacheConfig + from infinilm.distributed import DistConfig + from infinilm.infer_engine import GenerationConfig, InferEngine + from infinilm.modeling_utils import load_model_state_dict_by_file + + model_path = os.path.expanduser(model_path) + # Prefer flash-attn when available; fall back to default. + try: + model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + attention_backend=attn_backend, + ) + except TypeError: + # Older InferEngine builds may not accept attention_backend. 
+ model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + ) + except Exception: + try: + model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + attention_backend="default", + ) + except TypeError: + model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + ) + load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + input_ids = _build_chat_input_ids(tokenizer, prompt) + input_ids_infini = infinicore.from_list([input_ids]) + + initial_capacity = len(input_ids) + max_new_tokens + if cache_mode == "paged": + num_blocks = (initial_capacity + (paged_block_size - 1)) // paged_block_size + cache_config = PagedKVCacheConfig( + num_blocks=num_blocks, + block_size=paged_block_size, + ) + else: + if cache_mode == "static_maxpos": + max_pos = getattr(model.config, "max_position_embeddings", 4096) + max_cache_len = max(initial_capacity, max_pos) + else: + # Fit cache to what we actually need for this run. + max_cache_len = initial_capacity + cache_config = StaticKVCacheConfig(max_batch_size=1, max_cache_len=max_cache_len) + # Basic GPU memory stats around cache construction (CUDA device assumed to be index 0). 
+ mem_before_cache = torch.cuda.memory_allocated(0) + max_mem_before_cache = torch.cuda.max_memory_allocated(0) + + model.reset_cache(cache_config) + + mem_after_cache = torch.cuda.memory_allocated(0) + max_mem_after_cache = torch.cuda.max_memory_allocated(0) + + buf = io.StringIO() + start = time.perf_counter() + with contextlib.redirect_stdout(buf): + try: + torch.cuda.nvtx.range_push("infinilm_generate") + except Exception: + pass + try: + model.generate( + input_ids_infini, + GenerationConfig( + max_new_tokens=max_new_tokens, + temperature=1.0, + top_k=1, + top_p=1.0, + # Profiling: avoid per-step EOS checks + early stop variability. + stop_on_eos=False, + ), + _measure_and_log_time=True, + ) + finally: + try: + torch.cuda.nvtx.range_pop() + except Exception: + pass + elapsed = time.perf_counter() - start + stdout = buf.getvalue() + + prefill_ttft_ms = None + prefill_throughput = None + decode_itl_ms = None + decode_throughput = None + gen_completed_ms = None + for line in stdout.splitlines(): + if "Prefill TTFT:" in line: + m = re.search( + r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line + ) + if m: + prefill_ttft_ms = float(m.group(1)) + prefill_throughput = float(m.group(2)) + if "Decode" in line and "ITL:" in line: + m = re.search( + r"Decode\s+Avg ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line + ) + if m: + decode_itl_ms = float(m.group(1)) + decode_throughput = float(m.group(2)) + if "Generation completed in" in line: + m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) + if m: + gen_completed_ms = float(m.group(1)) + + return { + "backend": "infinilm", + "total_time_ms": round(elapsed * 1000, 2), + "input_tokens": len(input_ids), + "output_tokens": max_new_tokens, + "prefill_ttft_ms": prefill_ttft_ms, + "prefill_throughput_tok_s": prefill_throughput, + "decode_itl_ms": decode_itl_ms, + "decode_throughput_tok_s": decode_throughput, + "engine_reported_generation_ms": gen_completed_ms, + # Cache / attention 
configuration + "cache_mode": cache_mode, + "paged_block_size": paged_block_size if cache_mode == "paged" else None, + "enable_paged_attn": getattr(model, "enable_paged_attn", False), + "static_max_cache_len": max_cache_len if cache_mode != "paged" else None, + "paged_num_blocks": num_blocks if cache_mode == "paged" else None, + # Torch CUDA memory snapshots (bytes) + "torch_memory_allocated_before_cache": int(mem_before_cache), + "torch_memory_allocated_after_cache": int(mem_after_cache), + "torch_max_memory_allocated_before_cache": int(max_mem_before_cache), + "torch_max_memory_allocated_after_cache": int(max_mem_after_cache), + } + + +def run_infinilm(model_path: str, prompt: str, max_new_tokens: int, env=None): + """Run InfiniLM jiuge via subprocess and parse stdout for metrics.""" + run_env = {**os.environ, **(env or {})} + examples_dir = os.path.dirname(os.path.abspath(__file__)) + jiuge_py = os.path.join(examples_dir, "jiuge.py") + cmd = [ + sys.executable, + jiuge_py, + "--nvidia", + "--model_path", model_path, + "--prompt", prompt, + "--max_new_tokens", str(max_new_tokens), + ] + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, + env=run_env, + cwd=examples_dir, + ) + stdout = result.stdout or "" + if result.returncode != 0 and not stdout: + return {"backend": "infinilm", "error": (result.stderr or f"exit code {result.returncode}")[:500]} + except Exception as e: + return {"backend": "infinilm", "error": str(e)} + + # Parse jiuge / InferEngine output + prefill_ttft_ms = None + prefill_throughput = None + decode_itl_ms = None + decode_throughput = None + total_time_ms = None + for line in stdout.splitlines(): + if "Prefill TTFT:" in line: + m = re.search(r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) + if m: + prefill_ttft_ms = float(m.group(1)) + prefill_throughput = float(m.group(2)) + if "Decode" in line and "ITL:" in line: + m = re.search(r"Decode\s+Avg 
ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) + if m: + decode_itl_ms = float(m.group(1)) + decode_throughput = float(m.group(2)) + if "total_time:" in line: + m = re.search(r"total_time:\s*([\d.]+)\s*ms", line) + if m: + total_time_ms = float(m.group(1)) + if "Generation completed in" in line: + m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) + if m: + total_time_ms = float(m.group(1)) + + return { + "backend": "infinilm", + "total_time_ms": total_time_ms, + "prefill_ttft_ms": prefill_ttft_ms, + "prefill_throughput_tok_s": prefill_throughput, + "decode_itl_ms": decode_itl_ms, + "decode_throughput_tok_s": decode_throughput, + } + + +def run_sglang_client(sglang_url: str, prompt: str, max_new_tokens: int): + """Send one request to SGLang server and return metrics.""" + try: + import requests + except ImportError: + return {"backend": "sglang", "error": "requests not installed"} + + url = sglang_url.rstrip("/") + "/generate" + payload = { + "text": prompt, + "sampling_params": {"max_new_tokens": max_new_tokens, "temperature": 0}, + } + start = time.perf_counter() + try: + r = requests.post(url, json=payload, timeout=120) + r.raise_for_status() + data = r.json() + except Exception as e: + return {"backend": "sglang", "error": str(e)} + elapsed_ms = (time.perf_counter() - start) * 1000 + + # SGLang response may have "meta_info" with "completion_tokens" or we use prompt + output length + output_text = (data.get("text") or data.get("choices", [{}])[0].get("text") or "") + completion_tokens = data.get("meta_info", {}).get("completion_tokens") or data.get("usage", {}).get("completion_tokens") + if completion_tokens is None and "usage" in data: + completion_tokens = data["usage"].get("completion_tokens") + if completion_tokens is None: + completion_tokens = max_new_tokens # fallback + + return { + "backend": "sglang", + "total_time_ms": round(elapsed_ms, 2), + "output_tokens": completion_tokens, + "total_throughput_tok_s": 
round(completion_tokens / (elapsed_ms / 1000), 2) if elapsed_ms > 0 else None, + } + + +def main(): + parser = argparse.ArgumentParser(description="Compare MiniCPM-SALA inference speed: HF, InfiniLM, SGLang") + parser.add_argument("--model_path", required=True, help="Path to MiniCPM-SALA model dir") + parser.add_argument("--prompt", default="How are you", help="Prompt for generation") + parser.add_argument("--max_new_tokens", type=int, default=32, help="Max new tokens to generate") + parser.add_argument( + "--target_input_tokens", + type=int, + default=None, + help="If set, synthesize a long prompt so chat-templated input tokens >= this value (e.g. 65536).", + ) + parser.add_argument( + "--infinilm_cache_mode", + type=str, + default="paged", + choices=["paged", "static_fit", "static_maxpos"], + help="InfiniLM KV cache mode when running long prompts in-process.", + ) + parser.add_argument( + "--infinilm_paged_block_size", + type=int, + default=256, + help="Paged KV block size (tokens per block).", + ) + parser.add_argument( + "--infinilm_attn_backend", + type=str, + default="flash-attn", + help="InfiniLM attention backend (e.g. flash-attn or default).", + ) + parser.add_argument( + "--hf_attn_implementation", + type=str, + default="flash_attention_2", + help="HF attention implementation to request (e.g. 
flash_attention_2 or eager).", + ) + parser.add_argument( + "--hf_mode", + type=str, + default="generate", + choices=["generate", "forward_prefill", "decode_loop"], + help="HF run mode: generate() end-to-end, forward-only prefill, or manual decode_loop timing with KV cache.", + ) + parser.add_argument( + "--hf_forward_use_cache", + action="store_true", + help="In HF forward_prefill mode, pass use_cache=True (recommended).", + ) + parser.add_argument( + "--hf_forward_warmup", + type=int, + default=1, + help="Warmup iterations for HF forward_prefill.", + ) + parser.add_argument( + "--hf_forward_iters", + type=int, + default=1, + help="Measured iterations for HF forward_prefill (best-of).", + ) + parser.add_argument( + "--hf_decode_warmup", + type=int, + default=8, + help="Warmup steps for HF decode_loop (not timed).", + ) + parser.add_argument( + "--hf_decode_iters", + type=int, + default=1, + help="Measured iterations for HF decode_loop (best-of).", + ) + parser.add_argument("--sglang_url", default=None, help="SGLang server URL (e.g. http://127.0.0.1:30000); if set, query SGLang") + parser.add_argument("--backends", default="hf,infinilm", help="Comma-separated: hf,infinilm,sglang") + parser.add_argument("--output", default=None, help="Write JSON results to this path") + parser.add_argument("--no_hf", action="store_true", help="Skip HF (e.g. if no GPU memory for two models)") + parser.add_argument("--no_infinilm", action="store_true", help="Skip InfiniLM") + parser.add_argument( + "--prefill_16k", + action="store_true", + help="Convenience flag: set --target_input_tokens=16384 and --max_new_tokens=1 (prefill-dominated).", + ) + parser.add_argument( + "--infinilm_inprocess", + action="store_true", + help="Run InfiniLM in-process (no jiuge subprocess). 
Use when PYTHONPATH/LD_LIBRARY_PATH are set in this process.", + ) + args = parser.parse_args() + + backends = [b.strip() for b in args.backends.split(",")] + results = [] + + # Normalize convenience prefill-only configuration. + if args.prefill_16k: + if args.target_input_tokens is None: + args.target_input_tokens = 16384 + # For prefill-dominated comparisons, prefer HF forward-only by default. + if args.hf_mode == "generate": + args.hf_mode = "forward_prefill" + if args.max_new_tokens != 1: + args.max_new_tokens = 1 + + # If requested, build a long prompt once using HF tokenizer. + if args.target_input_tokens is not None: + try: + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) + long_prompt, actual = _make_prompt_with_target_tokens(tok, args.prompt, args.target_input_tokens) + args.prompt = long_prompt + print(f"[prompt] synthesized chat input tokens: {actual} (target >= {args.target_input_tokens})") + except Exception as e: + print(f"[prompt] failed to synthesize long prompt: {e}") + + if "hf" in backends and not args.no_hf: + try: + import torch + if args.hf_mode == "forward_prefill": + r = run_hf_forward_prefill( + args.model_path, + args.prompt, + attn_implementation=args.hf_attn_implementation, + use_cache=args.hf_forward_use_cache, + warmup=args.hf_forward_warmup, + iters=args.hf_forward_iters, + ) + elif args.hf_mode == "decode_loop": + r = run_hf_decode_loop( + args.model_path, + args.prompt, + args.max_new_tokens, + attn_implementation=args.hf_attn_implementation, + use_cache=True, + warmup=args.hf_decode_warmup, + iters=args.hf_decode_iters, + ) + else: + r = run_hf( + args.model_path, + args.prompt, + args.max_new_tokens, + attn_implementation=args.hf_attn_implementation, + ) + results.append(r) + except Exception as e: + results.append({"backend": "hf", "error": str(e)}) + + if "infinilm" in backends and not args.no_infinilm: + # In-process: when env is set in this process or 
--infinilm_inprocess, avoid jiuge subprocess. + # Also use in-process for long prompts (target_input_tokens) to avoid 2048-token truncation. + use_inprocess = args.infinilm_inprocess or args.target_input_tokens is not None + if use_inprocess: + try: + r = run_infinilm_inprocess( + args.model_path, + args.prompt, + args.max_new_tokens, + cache_mode=args.infinilm_cache_mode, # type: ignore[arg-type] + paged_block_size=args.infinilm_paged_block_size, + attn_backend=args.infinilm_attn_backend, + ) + except Exception as e: + r = {"backend": "infinilm", "error": str(e)} + else: + r = run_infinilm(args.model_path, args.prompt, args.max_new_tokens) + results.append(r) + + if "sglang" in backends and args.sglang_url: + r = run_sglang_client(args.sglang_url, args.prompt, args.max_new_tokens) + results.append(r) + elif "sglang" in backends and not args.sglang_url: + results.append({"backend": "sglang", "error": "No --sglang_url provided; start SGLang server with MiniCPM-SALA first"}) + + # Print table + print("\n" + "=" * 60) + print("MiniCPM-SALA inference speed comparison") + print("=" * 60) + print(f" prompt = {repr(args.prompt[:500])} max_new_tokens = {args.max_new_tokens}") + print() + for r in results: + if "error" in r: + print(f" {r['backend']}: ERROR {r['error']}") + continue + print(f" {r['backend']}:") + for k, v in r.items(): + if k == "backend" or v is None: + continue + if isinstance(v, float): + print(f" {k}: {v}") + else: + print(f" {k}: {v}") + print() + print("=" * 60) + + if args.output: + with open(args.output, "w") as f: + json.dump({"prompt": args.prompt, "max_new_tokens": args.max_new_tokens, "results": results}, f, indent=2) + print(f"Wrote {args.output}") + + +if __name__ == "__main__": + import os + main() diff --git a/examples/jiuge.py b/examples/jiuge.py index fa547435..1fcba6c4 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -252,9 +252,13 @@ def test( # ---------------------------------------------------------------------------- # # 
Create KVCache # ---------------------------------------------------------------------------- # + batch_size = 1 if isinstance(prompts, str) else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) + # MiniCPM-SALA uses per-layer dense KV cache in C++; engine cache_config drives + # scheduling only. Static cache is recommended (no paged bookkeeping) unless + # --enable-paged-attn is explicitly set. if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) - max_total_tokens = max_new_tokens + len(input_ids_list[0]) + max_total_tokens = initial_capacity cache_config = PagedKVCacheConfig( num_blocks=( (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE @@ -263,10 +267,12 @@ def test( block_size=_PAGED_KV_BLOCK_SIZE, ) else: - batch_size = 1 if prompts is str else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) + max_cache_len = initial_capacity + if getattr(model.config, "model_type", None) == "minicpm_sala": + max_pos = getattr(model.config, "max_position_embeddings", 4096) + max_cache_len = max(initial_capacity, max_pos) cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=initial_capacity + max_batch_size=batch_size, max_cache_len=max_cache_len ) model.reset_cache(cache_config) diff --git a/examples/metrics_16k_prefill.md b/examples/metrics_16k_prefill.md new file mode 100644 index 00000000..2337fac0 --- /dev/null +++ b/examples/metrics_16k_prefill.md @@ -0,0 +1,152 @@ +### MiniCPM-SALA 16k long-prompt metrics (A/B cache modes) + +**Setup** + +- **Prompt construction**: `--target_input_tokens 16384` (actual synthesized **16386** chat-template tokens) +- **Workload**: `--max_new_tokens 1` (prefill-dominated) +- **Environment**: run via `scripts/run_compare_speed_in_container.sh` inside container `minicpm-sala` + +| backend | cache_mode | attn_backend | enable_paged_attn | cache sizing | prefill_ttft_ms | prefill_throughput_tok_s | total_time_ms | 
+|---|---|---|---:|---|---:|---:|---:| +| hf | — | — | — | — | — | 9325.01 | 1757.21 | +| infinilm | static_fit | default | False | static_max_cache_len=16387 | 33632.05 | 487.21 | 33632.29 | +| infinilm | static_maxpos | default | False | static_max_cache_len=524288 | 34067.49 | 480.99 | 34067.75 | +| infinilm | paged | default | True | paged_block_size=256, paged_num_blocks=65 | 35626.25 | 459.94 | 35627.10 | + +**Raw commands** + +```bash +./scripts/run_compare_speed_in_container.sh --backends hf --target_input_tokens 16384 --max_new_tokens 1 +./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_fit +./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_maxpos +./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode paged --infinilm_paged_block_size 256 +``` + +### Profiling methodology (nsys) for kernel attribution (HF vs InfiniLM prefill) + +**Goal**: attribute the 16k prefill gap to kernel families (attention vs GEMMs vs layout/copies/sync), using the same prompt and a prefill-dominated workload. + +**Environment**: all profiling commands in this section are run **inside the container `minicpm-sala`** (not on the host), so that PyTorch, InfiniCore, and the model path are available. Use `docker exec -it minicpm-sala bash` or the host script `./scripts/profile_prefill_torchprof_in_container.sh` to run in-container. + +**Workload** + +- HF: forward-only prefill (`--hf_mode forward_prefill`, `--max_new_tokens 1`) +- InfiniLM: prefill-dominated generation (`--target_input_tokens 16384 --max_new_tokens 1`) + +**Key requirements** + +- Use a free GPU to avoid allocator failures and noisy traces, e.g. `CUDA_VISIBLE_DEVICES=1`. 
+- Prefer `nsys stats` reports: + - `cuda_gpu_kern_sum` + - `cuda_gpu_mem_time_sum` + - `cuda_api_sum` + - `nvtx_sum` + +**Example (inside container `minicpm-sala`)** + +```bash +export CUDA_VISIBLE_DEVICES=1 +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +OUT=${REPO}/profiles +mkdir -p ${OUT} + +source /app/docker/nvidia/env-set.sh 2>/dev/null || true +export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} + +# HF forward-only prefill (single forward, best for kernel attribution) +nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ + -o ${OUT}/hf_forward_prefill_16k \ + python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ + --model_path "${MODEL}" --prefill_16k --backends hf \ + --hf_mode forward_prefill --hf_forward_use_cache \ + --hf_forward_warmup 1 --hf_forward_iters 1 \ + --hf_attn_implementation flash_attention_2 + +# InfiniLM prefill-dominated (max_new_tokens=1) +nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ + -o ${OUT}/infinilm_prefill_16k \ + python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ + --model_path "${MODEL}" --prefill_16k --backends infinilm \ + --infinilm_cache_mode static_fit --infinilm_attn_backend default + +# Summaries +nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_kern_sum.txt +nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_gpu_kern_sum.txt +nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_mem_time_sum.txt +nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_gpu_mem_time_sum.txt +nsys stats --report cuda_api_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > 
${OUT}/hf_forward_prefill_16k_cuda_api_sum.txt +nsys stats --report cuda_api_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_api_sum.txt +nsys stats --report nvtx_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_nvtx_sum.txt +nsys stats --report nvtx_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_nvtx_sum.txt +``` + +### Prefill kernel launch reduction: SiLU/SwiGLU evidence and change + +**Evidence that SiLU/SwiGLU contributed to launch count** + +- Prefill profiling (e.g. `profile_prefill_infinilm_torchprof.py` at seq_len=512) showed ~298k `cudaLaunchKernel` and many small **elementwise** kernels (~36k calls). The MLP path used two separate InfiniCore ops per layer for SwiGLU: + - `infinicore::op::silu_(gate, gate)` — one kernel per layer + - `infinicore::op::mul(gate, up)` — one kernel per layer +- With 32 layers that is **64 extra launches** from this pattern alone. InfiniCore provides a **fused** `swiglu(a, b)` (single kernel: `a * b * sigmoid(b)`), which matches SwiGLU as `silu(gate)*up` when called as `swiglu(up, gate)`. + +**Change applied** + +- **File**: `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp` +- **Before**: `silu_(gate, gate)` then `mul(gate, up)` (two kernel launches per layer). +- **After**: single `infinicore::op::swiglu(up, gate)` (one kernel per layer). +- **Effect**: 32 fewer kernel launches per prefill (one per layer). Re-run the same prefill profiler or nsys commands above and compare `cuda_api_sum` (e.g. `cudaLaunchKernel` count) and `cuda_gpu_kern_sum` to confirm. + +### Environment fix: run InfiniLM/InfiniCore with InfLLM-v2 without LD_PRELOAD (nsys-safe) + +When profiling with `nsys`, setting `LD_PRELOAD` to the `infllm_v2` extension can break `nsys` itself (loader errors from PyTorch's `libtorch_python.so`). To make `nsys profile ... 
python ...` work reliably, we preload the InfLLM-v2 `.so` **inside Python** (RTLD_GLOBAL) before importing `infinicore`, so that `libinfinicore_cpp_api.so` can resolve `mha_varlen_fwd` / `mha_fwd_kvcache` without using `LD_PRELOAD`. + +- **Note**: InfLLM-v2 is now linked normally via InfiniCore build; no Python-side preload helper is required. +- **Wired into scripts** (preload before `import infinicore`): + - `InfiniLM/examples/compare_inference_speed.py` + - `InfiniLM/examples/profile_prefill_infinilm_torchprof.py` + - `InfiniLM/examples/minicpm_sala_logits_sanity.py` + +This unblocks running both torchprof and `nsys profile` inside the `minicpm-sala` container with a consistent environment. + +### 16k prefill nsys numbers (post env-fix) + +**Workload:** `--prefill_16k` (prompt tokens 16386), `--max_new_tokens 1`, `--infinilm_cache_mode static_fit`, `--infinilm_attn_backend default` + +- **HF forward-only prefill** (from `compare_inference_speed.py`): `total_time_ms ≈ 1782.58` for 16386 tokens. +- **HF forward-only prefill (rerun)** (from `compare_inference_speed.py`): `total_time_ms = 1757.21`, `prefill_throughput_tok_s = 9325.01` for 16386 tokens. +- **InfiniLM prefill-dominated** (from `compare_inference_speed.py`): `prefill_ttft_ms ≈ 55646.11` (baseline run) and `prefill_ttft_ms ≈ 57623.64` (rerun after minor code changes). + +**InfiniLM 16k CUDA API summary** (nsys `cuda_api_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_api_sum.txt`): + +- `cudaLaunchKernel`: **3,147,266 calls** +- `cudaMemcpyAsync`: **394,155 calls** + +Top GPU kernels by time (nsys `cuda_gpu_kern_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_gpu_kern_sum.txt`) show very high call counts tied to the Lightning Simple GLA path: + +- Several `at::native::*elementwise_kernel*` entries at **393,264 instances each** (exactly `16386 * 24`), indicating a large per-token kernel launch budget in the current GLA implementation. 
+ +**Prefill profiling: run inside container `minicpm-sala`** + +All profiling commands below are intended to run **inside the container** (so PyTorch, InfiniCore, and the model are available). From the host you can either `docker exec -it minicpm-sala bash` and run the commands, or use the helper script that runs the torchprof prefill script in-container. + +- **Launch-count confirmation (torchprof, in-container)** + + From repo root on host: + + ```bash + ./scripts/profile_prefill_torchprof_in_container.sh + ``` + + Optional env: `SEQ_LEN=512` (default), `ACTIVE=1`, `MODEL_PATH`, `CUDA_VISIBLE_DEVICES`, `INFINILM_CUDA_INDEX`. The script prints `[launch_summary] cudaLaunchKernel_count=... cudaMemcpy_count=...` and the kernel table; compare after the SwiGLU fusion to confirm ~32 fewer launches per prefill. + + Or inside the container: + + ```bash + source /app/docker/nvidia/env-set.sh 2>/dev/null || true + export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} + cd ${REPO}/InfiniLM + INFINILM_CUDA_INDEX=0 python3 examples/profile_prefill_infinilm_torchprof.py --model_path "${MODEL}" --seq_len 512 --active 1 --out /tmp/torchprof_prefill_512.txt + ``` + +- **nsys prefill profiling** (see “Example (inside container minicpm-sala)” above) also runs in-container; use the same `REPO`, `MODEL`, `source env-set.sh`, and `PYTHONPATH` before `nsys profile` and `nsys stats`. diff --git a/examples/metrics_longtext_mem.md b/examples/metrics_longtext_mem.md new file mode 100644 index 00000000..28fe8f33 --- /dev/null +++ b/examples/metrics_longtext_mem.md @@ -0,0 +1,378 @@ +### MiniCPM-SALA long-context metrics + memory history + +**Goal**: record reproducible long-context runs with: + +- **time** (prefill TTFT / throughput) +- **peak GPU memory** (from 1s `nvidia-smi` polling) +- exact **command lines** and key env + +**Notes** + +- All commands are intended to run **inside** docker container `minicpm-sala`. 
+- Prefer an **idle** GPU (avoid indices that are already near full VRAM). Scan on the host (or `docker exec minicpm-sala nvidia-smi ...` if all GPUs are visible there): + `nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits` + Then set `export CUDA_VISIBLE_DEVICES=<idx>` for the run and, for scripts that poll VRAM (e.g. `collect_metrics_longtext_decode.py`), set `NVML_GPU_INDEX=<idx>` to the **same** index. Example when GPUs 2–4 are mostly free: `CUDA_VISIBLE_DEVICES=2` and `NVML_GPU_INDEX=2`. +- For InfiniLM + InfLLM-v2 builds, `libinfinicore_cpp_api.so` may require preloading `infllm_v2` with `RTLD_GLOBAL` before importing `infinicore`. + +### OOM-safe sweep: one case per process + +Running every long-context case in a **single** Python session can leave CUDA memory fragmented or peak across cases. Prefer **`run_longtext_metrics_cases.sh`**, which runs each `(backend × target × max_new)` as its **own** `python3 collect_metrics_longtext_decode.py --case ...` subprocess, appends one JSON line per row to `profiling_runs/longtext_decode_rows.jsonl`, then prints a markdown table via `--from-jsonl`. + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +export CUDA_VISIBLE_DEVICES=2 +export NVML_GPU_INDEX=2 +export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +export METRICS_DATE=2026-03-23 +cd $REPO/InfiniLM/examples +./run_longtext_metrics_cases.sh +``` + +Single case manually: `python3 collect_metrics_longtext_decode.py --case hf:16384` or `infinilm_rec:65536:1`. Monolithic (unsafe) full matrix: `python3 collect_metrics_longtext_decode.py --all-in-process`. 
+ +### Clean & Validate Status (post-cleanup: 2026-03-23) + +- Clean: removed unused debug helper `log_tensor_stats_to_file_if_enabled` and deprecated metrics padded-decode cases; `collect_metrics_longtext_decode.py` + `run_longtext_metrics_cases.sh` no longer sweep `infinilm_pad:*`. +- Validate: rebuilt `_infinilm`, ran `InfiniCore/test/infinicore/ops/test_simple_gla_decode_recurrent.py --nvidia`, `test_simple_gla_prefill.py --nvidia`, and `InfiniLM/examples/minicpm_sala_logits_sanity.py` in `prefill` + `decode1`; confirmed `collect_metrics_longtext_decode.py --case infinilm_pad:*` is rejected with an "Unknown case kind" error. + +--- + +## 2026-03-23 long-context + decode (`longtext_decode_rows.jsonl`) + +Subprocess sweep via `./run_longtext_metrics_cases.sh`. **GPU:** `CUDA_VISIBLE_DEVICES=0`, `NVML_GPU_INDEX=0`. **Targets:** `METRICS_TARGETS=16384,32768`. **Decode steps:** `METRICS_DECODE_STEPS=32`. Recurrent InfiniLM uses `INFINI_LIGHTNING_GLA_RECURRENT_DECODE=1` with batched GLA state sync (GEMM). HF `total_ms` is end-to-end (prefill + decode), matching InfiniLM. `hf (decode_loop)` rows for `max_new=32` are appended via `hf::32`. 
Regenerate this table: `python3 collect_metrics_longtext_decode.py --from-jsonl profiling_runs/longtext_decode_rows.jsonl` + + +| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | prefill_tok_s | decode_itl_ms | decode_tok_s | gpu | +| ---------- | ------------------------------------------------ | --------- | ------- | ------------ | -------- | --------------- | ------------- | ------------- | ------------ | --- | +| 2026-03-23 | hf (decode_loop) | 16384 | 1 | 38101 | 1821.53 | — | — | — | — | 0 | +| 2026-03-23 | hf (decode_loop) | 32768 | 1 | 51545 | 3711.99 | — | — | — | — | 0 | +| 2026-03-23 | hf (decode_loop) | 16384 | 32 | 38365 | 3435.09 | — | — | 52.24 | 19.14 | 0 | +| 2026-03-23 | hf (decode_loop) | 32768 | 32 | 41717 | 5247.77 | — | — | 52.90 | 18.90 | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 16384 | 1 | 33525 | 3162.11 | 3161.5 | 5182.98 | — | — | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 32768 | 1 | 44897 | 7139.12 | 7138.74 | 4590.45 | — | — | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 16384 | 32 | 33537 | 4111.32 | 3182.07 | 5149.48 | 29.94 | 33.4 | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 32768 | 32 | 44911 | 8357.39 | 7146.78 | 4585.28 | 39 | 25.64 | 0 | + + +--- + +## History table + + +| date | backend | target_input_tokens | max_new_tokens | cache_mode | peak_mem_mib | total_time_ms | prefill_ttft_ms | prefill_throughput_tok_s | gpu | +| ---------- | --------------------------------------------- | ------------------- | -------------- | ---------- | ------------ | ------------- | --------------- | ------------------------ | --- | +| 2026-03-18 | hf | 16384 | 1 | — | 38091 | 1757.21 | — | 9325.01 | 2 | +| 2026-03-19 | hf | 16384 | 1 | — | 38091 | 1760.08 | — | 9311.48 | 2 | +| 2026-03-18 | hf | 32768 | 1 | — | 41173 | 3537.65 | — | 9263.22 | 2 | +| 2026-03-19 | hf | 32768 | 1 | — | 41151 | 3516.06 | — | 9319.51 
| 2 | +| 2026-03-19 | infinilm(baseline) | 16384 | 1 | static_fit | 33570 | 2849.22 | 2849.03 | 5751.44 | 0 | +| 2026-03-19 | infinilm(baseline) | 32768 | 1 | static_fit | 44174 | 5960.41 | 5960.14 | 5498.19 | 0 | +| 2026-03-19 | infinilm(baseline) | 65536 | 1 | static_fit | 67195 | 13929.51 | 13929.12 | 4705.11 | 4 | +| 2026-03-19 | hf (consistent-batch) | 16384 | 1 | — | 38091 | 1782.63 | — | 9192.04 | 4 | +| 2026-03-19 | hf (consistent-batch) | 32768 | 1 | — | 41173 | 3585.96 | — | 9138.42 | 4 | +| 2026-03-19 | hf (consistent-batch) | 65536 | 1 | — | 47319 | 7426.98 | — | 8824.32 | 4 | +| 2026-03-19 | infinilm (consistent-batch) | 16384 | 1 | static_fit | 32605 | 2887.28 | 2887.06 | 5675.67 | 4 | +| 2026-03-19 | infinilm (consistent-batch) | 32768 | 1 | static_fit | 43209 | 6005.78 | 6005.57 | 5456.60 | 4 | +| 2026-03-19 | infinilm (consistent-batch) | 65536 | 1 | static_fit | 67195 | 13940.17 | 13939.90 | 4701.47 | 4 | +| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 32768 | 1 | static_fit | 38613 | 5993.70 | 5993.45 | 5467.64 | 4 | +| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 65536 | 1 | static_fit | 67195 | 13959.08 | 13958.78 | 4695.11 | 4 | +| 2026-03-19 | infinilm(baseline) | 131072 | 1 | static_fit | 79883 | OOM | — | — | 6 | +| 2026-03-18 | hf | 524288 | 1 | — | 59591 | OOM | — | — | 3 | +| 2026-03-18 | hf | 65536 | 1 | — | 47319 | 7340.99 | — | 8927.67 | 1 | +| 2026-03-18 | hf | 131072 | 1 | — | 61641 | 15290.39 | — | 8572.31 | 1 | +| 2026-03-18 | hf | 262144 | 1 | — | 80059 | OOM | — | — | 1 | + + +--- + +## 2026-03-19 consistent batch summary (GPU 4, 1s polling) + +Protocol used for both backends: + +- same physical GPU (`CUDA_VISIBLE_DEVICES=4`), same model, `max_new_tokens=1` +- same target lengths: 16k / 32k / 64k +- memory measured from 1s `nvidia-smi -i 4 --query-gpu=memory.used` polling +- HF path: `--hf_mode forward_prefill --hf_forward_use_cache --hf_forward_warmup 1 --hf_forward_iters 1` +- InfiniLM path: 
`--infinilm_inprocess --infinilm_cache_mode static_fit` + +### Growth deltas (16k->32k and 32k->64k) + +TTFT note: HF forward-prefill does not emit TTFT; `total_time_ms` is used as prefill-time proxy for HF deltas. + + +| backend | 16k->32k mem delta (MiB) | 32k->64k mem delta (MiB) | 16k->32k time delta (ms) | 32k->64k time delta (ms) | +| --------------------- | ------------------------ | ------------------------ | ------------------------ | ------------------------ | +| hf (forward-prefill) | +3082 | +6146 | +1803.33 | +3841.02 | +| infinilm (static_fit) | +10604 | +23986 | +3118.51 (TTFT) | +7934.33 (TTFT) | + + +### Attribution profiling (InfiniLM 32k / 64k) + +Artifacts are saved in `InfiniLM/examples/profiling_runs`: + +- allocator logs: `alloc_infinilm_32768_gpu4.log`, `alloc_infinilm_65536_gpu4.log` +- nsys logs: `nsys_infinilm_32768_gpu4.log`, `nsys_infinilm_65536_gpu4.log` + +Allocator observations (`INFINICORE_DEBUG_ALLOC=1`): + +- both runs show identical small/medium allocation patterns (e.g., many `32 MiB` and `128 MiB` class allocations), suggesting these are mostly fixed/runtime-structural. +- 64k introduces substantially larger "large" allocations than 32k (examples in logs include `12.0 GiB`, `9.0 GiB`, and `2.0 GiB`-class requests), consistent with context-length-driven persistent KV slab growth. +- 32k large allocations are present but markedly smaller (e.g., `~6.0 GiB`, `~4.5 GiB`, `~1.0 GiB`), aligning with lower persistent cache footprint. + +Nsight Systems observations (`nsys profile --trace=cuda,nvtx,osrt --stats=true`): + +- NVTX `infinilm_generate` range scales from `~6.18s` (32k) to `~14.17s` (64k), matching TTFT growth. 
+- CUDA API summary becomes more memcpy-dominated at 64k: + - 32k: `cudaMemcpy ~64.6%`, `cudaMemcpyAsync ~33.0%` + - 64k: `cudaMemcpy ~83.0%`, `cudaMemcpyAsync ~15.7%` +- GPU kernel summary shows both attention and GLA prefill kernels scaling up: + - `flash_fwd_kernel` total: `~1.03s` -> `~4.09s` + - `simple_gla_prefill_chunked_kernel` total: `~1.24s` -> `~2.45s` + +Attribution confidence: + +- **High**: persistent KV/cache-related allocations are the primary memory-growth driver from 32k to 64k. +- **Medium**: transient prefill compute/workspace growth contributes, but is secondary vs persistent slabs for memory. +- **Medium**: synchronization/memcpy behavior is a major TTFT growth contributor at 64k. + +### Short-context decode profiling (Nsight Systems, vs HF) + +**Artifacts** (under `InfiniLM/examples/profiling_runs/`): + +- HF manual decode: `nsys_decode_hf_tok256_gpu4.log` (`--hf_mode decode_loop`, short prompt, `max_new_tokens=256`). +- InfiniLM generate: `nsys_decode_infinilm_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_opt_tok256_gpu4.log` (same prompt / 256 new tokens; NVTX ranges from `infer_engine.generate`). +- Post–`write_i32`/`write_i64` rebuild (2026-03-20, GPU 4): `nsys_decode_infinilm_tok256_gpu4_pybind_run.log` (failed: stale `_infinicore` without `write_i32`), `nsys_decode_infinilm_tok256_gpu4_pybind_run2.log` + `decode_infinilm_tok256_gpu4_pybind_run2.nsys-rep` (**good** after `install.py` + `xmake build/install _infinicore` in container). Script `compare_inference_speed.py` preloads InfLLM-v2 (`RTLD_GLOBAL`) so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd`; bare `python -c import infinicore` without that preload can show an undefined-symbol error. + +**NVTX (InfiniLM)** — use these ranges in the Nsight UI / `nsys stats` to isolate prefill vs steady decode: + +- `infinilm_prefill_step` — first `generate` iteration. +- `infinilm_decode_total` — spans decode iterations 1..N-1 (opened on iter 1). 
+- `infinilm_decode_step` — one range per token step (high instance count). +- `infinilm_generate` — full `engine.generate()` call. + +**HF**: `hf_decode_loop` wraps the timed decode loop (prefill is outside this range). + +**Headline comparison** (same GPU, 256 decode steps, short prompt; numbers from the logs above): + + +| Metric (CUDA API sum) | HF `decode_loop` | InfiniLM `generate` | +| ------------------------ | ------------------- | ------------------- | +| `cudaLaunchKernel` calls | ~593k | ~7.44M | +| ~calls / decode step | ~2.3k | ~29k | +| `cudaMemcpyAsync` calls | lower than InfiniLM | ~988k | + + +**Memcpy time** (`cuda_gpu_mem_time_sum`): InfiniLM decode shows large **H2D** wall share (~63% of memcpy time in one run) with **many** small transfers; HF decode shows **fewer** H2D operations but they can dominate memcpy time when they occur. + +**Interpretation**: InfiniLM short decode is limited less by a single kernel and more by **per-step framework overhead** (launch count + small copies). Next wins are structural (fewer launches per token, true decode KV path, graph/capture where safe), not scalar metadata alone. + +**Continuing profiling — repro commands** (inside `minicpm-sala`, pick idle `GPU`; outputs go to `profiling_runs/`): + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +GPU=4 +export CUDA_VISIBLE_DEVICES=$GPU +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniLM/examples + +TAG=decode_infinilm_tok256_gpu${GPU} +nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ + python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --prompt "Write a short haiku about GPUs." 
\ + --max_new_tokens 256 \ + --backends infinilm \ + --no_hf \ + --infinilm_inprocess \ + --infinilm_cache_mode static_fit \ + 2>&1 | tee profiling_runs/nsys_${TAG}.log + +# Optional (InfiniLM decode): reduce D2H / Python overhead and A/B CPU metadata tensors +# export INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE=1 +# export INFINI_PROFILE_COLLECT_OUTPUT_IDS=0 +# export INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 # force per-step from_list() metadata vs reusable CPU+write_i* fast path + +TAG=decode_hf_tok256_gpu${GPU} +nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ + python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --prompt "Write a short haiku about GPUs." \ + --max_new_tokens 256 \ + --backends hf \ + --no_infinilm \ + --hf_mode decode_loop \ + --hf_decode_warmup 8 \ + --hf_decode_iters 1 \ + --hf_attn_implementation flash_attention_2 \ + 2>&1 | tee profiling_runs/nsys_${TAG}.log +``` + +**Long-context decode** (optional): add e.g. `--target_input_tokens 32768` to either command so NVTX still tags prefill vs decode; expect traces to be large. + +**Prefill-only nsys** (matches earlier 32k/64k attribution): + +```bash +TAG=infinilm_prefill_32768_gpu${GPU} +nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ + python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --target_input_tokens 32768 \ + --max_new_tokens 1 \ + --backends infinilm \ + --no_hf \ + --infinilm_inprocess \ + --infinilm_cache_mode static_fit \ + 2>&1 | tee profiling_runs/nsys_${TAG}.log +``` + +After code changes (e.g. pybind metadata path), re-run the **same** `TAG` with a suffix (`_run2`) and diff `cuda_api_sum` / `cuda_gpu_kern_sum` / NVTX tables. + +### Ranked next optimization experiments (minimal changes) + +1. **Constrain/reshape persistent KV growth first** +Expected impact: High memory reduction, likely best leverage on 32k->64k slope. 
+Minimal experiment: compare `static_fit` vs `paged` (small block sizes, e.g., 128/256) at 32k/64k and re-measure peaks + TTFT. +2. **Reduce transient prefill movement/workspace** +Expected impact: Medium TTFT gain, small-to-medium memory relief. +Minimal experiment: isolate `simple_gla_prefill` transform/workspace path and reduce extra copies/format conversions; confirm via reduced `cudaMemcpy` share in nsys. +3. **Trim synchronization/copy overhead around prefill** +Expected impact: Medium TTFT gain at long context. +Minimal experiment: profile before/after removing avoidable sync points or host-device transfers in attention/prefill orchestration; success criterion is lower `cudaMemcpy` wall share with unchanged logits. + +Applied (2026-03-19): removed `permute(...)->contiguous()` materialization for KV cache update and GLA prefill inputs in `minicpm_sala_attention.cpp` (pass strided views). +Result: 32k peak memory improved on GPU 4 (**43209 MiB → 38613 MiB**) with similar TTFT; 64k peak unchanged (dominated by persistent KV slabs). + +Validation gate for each experiment: + +- **Operator unit tests (CUDA) first** — InfLLM-v2 + Simple GLA prefill (see below). Failing ops almost always mean wasted time on full-model logits debugging. +- run `minicpm_sala_logits_sanity.py` (prefill mode) and compare ratio/max_diff/mean_diff against current baseline. +- run one prompt generation sanity and verify no functional regression. + +--- + +## Commands (repro) + +### InfiniCore operator tests (run before logits sanity) + +MiniCPM-SALA stack depends on `infllmv2_varlen` / `infllmv2_kvcache` and `simple_gla_prefill`. 
Run these inside `minicpm-sala` with `InfiniLM/python` on `PYTHONPATH` so InfLLM-v2 preloads before `import infinicore`: + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +export CUDA_VISIBLE_DEVICES=1 +export PYTHONPATH=$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniCore/test/infinicore/ops + +python3 test_infllmv2_attention.py --nvidia +python3 test_simple_gla_prefill.py --nvidia +``` + +One-liner wrapper (same env assumptions as the repo): + +```bash +bash $REPO/InfiniLM/examples/run_infinicore_ops_before_logits.sh +``` + +### Logits correctness gate (HF vs InfiniLM) + +Run (inside `minicpm-sala`) to sanity-check HF vs InfiniLM prefill logits on a short prompt: + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +export CUDA_VISIBLE_DEVICES=1 +export HF_CUDA_INDEX=0 +export INFINILM_CUDA_INDEX=0 +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniLM/examples + +python3 minicpm_sala_logits_sanity.py \ + --model_path "$MODEL" \ + --mode prefill \ + --prompt "How are you? Tell me a short joke." \ + --k 10 +``` + +Recorded output (2026-03-18, GPU=1): + +```text +SANITY_ONELINE ratio=0.9889 max_diff=0.1875 mean_diff=0.0682 +``` + +`--mode decode1` (prefill + one decode step): **prefill section** should match the prefill-only run. The **decode** section should now be finite (the previous `NaN` issue was traced to the CUDA embedding kernel leaving outputs uninitialized for out-of-range indices). Correctness can still diverge from HF for longer prompts due to decode/KV/attention parity work; treat **prefill** as the strongest HF parity gate for now. 
+ +### GPU scan + +```bash +nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits +``` + +### HF-only prefill (32k) with 1s memory polling + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +export CUDA_VISIBLE_DEVICES=2 +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +cd $REPO/InfiniLM/examples + +python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --target_input_tokens 32768 \ + --max_new_tokens 1 \ + --backends hf \ + --hf_mode forward_prefill \ + --hf_forward_use_cache \ + --hf_forward_warmup 1 \ + --hf_forward_iters 1 \ + --hf_attn_implementation flash_attention_2 \ + & pid=$! + +echo "[mem] polling physical GPU 2 while pid=$pid" +while kill -0 $pid 2>/dev/null; do + date +"%F %T" + nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits + sleep 1 +done +wait $pid +``` + +### InfiniLM-only (32k) with InfLLM-v2 preload + 1s memory polling + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +export MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +export CUDA_VISIBLE_DEVICES=2 +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:$REPO/InfiniLM/build/linux/x86_64/release:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniLM/examples + +python3 - <<'PY' & pid=$! 
+import ctypes, os, runpy, sys +ctypes.CDLL("/usr/local/lib/python3.12/dist-packages/infllm_v2/C.cpython-312-x86_64-linux-gnu.so", mode=ctypes.RTLD_GLOBAL) +sys.argv = [ + "compare_inference_speed.py", + "--model_path", os.environ["MODEL"], + "--target_input_tokens", "32768", + "--max_new_tokens", "1", + "--backends", "infinilm", + "--no_hf", + "--infinilm_inprocess", + "--infinilm_cache_mode", "static_fit", +] +runpy.run_path("compare_inference_speed.py", run_name="__main__") +PY + +echo "[mem] polling physical GPU 2 while pid=$pid" +while kill -0 $pid 2>/dev/null; do + date +"%F %T" + nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits + sleep 1 +done +wait $pid +``` + diff --git a/examples/run_infinicore_ops_before_logits.sh b/examples/run_infinicore_ops_before_logits.sh new file mode 100755 index 00000000..5a93fe11 --- /dev/null +++ b/examples/run_infinicore_ops_before_logits.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# InfiniCore CUDA operator smoke tests for MiniCPM-SALA-related ops. +# Run inside minicpm-sala docker before deeper HF-vs-InfiniLM alignment probes. 
+set -euo pipefail + +REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" +export PYTHONPATH="$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-}" +export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" + +OPS_DIR="$REPO/InfiniCore/test/infinicore/ops" +cd "$OPS_DIR" + +echo "[run_infinicore_ops] REPO=$REPO" +echo "[run_infinicore_ops] test_infllmv2_attention.py --nvidia" +python3 test_infllmv2_attention.py --nvidia +echo "[run_infinicore_ops] test_simple_gla_prefill.py --nvidia" +python3 test_simple_gla_prefill.py --nvidia +echo "[run_infinicore_ops] OK" diff --git a/examples/run_longtext_metrics_cases.sh b/examples/run_longtext_metrics_cases.sh new file mode 100755 index 00000000..dd595c7b --- /dev/null +++ b/examples/run_longtext_metrics_cases.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Run each longtext/decode metric case in a **separate** Python process to release CUDA +# memory between runs (reduces OOM when sweeping 16k/32k/64k × HF + InfiniLM). 
+# +# Usage (inside minicpm-sala, after picking an idle GPU): +# export CUDA_VISIBLE_DEVICES=2 +# export NVML_GPU_INDEX=2 +# export REPO=/home/zenghua/workspace/minicpm-sala-support +# export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python +# export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +# export METRICS_DATE=2026-03-23 +# cd $REPO/InfiniLM/examples && ./run_longtext_metrics_cases.sh +# +# Optional: +# METRICS_TARGETS=16384,32768 METRICS_DECODE_STEPS=32 ./run_longtext_metrics_cases.sh +# SLEEP_BETWEEN_SEC=3 # extra pause between subprocesses + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" +export PYTHONPATH="${SCRIPT_DIR}:${REPO}/InfiniCore/python:${REPO}/InfiniLM/python:${PYTHONPATH:-}" +export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" + +: "${CUDA_VISIBLE_DEVICES:=0}" +: "${NVML_GPU_INDEX:=${CUDA_VISIBLE_DEVICES}}" +: "${METRICS_DATE:=2026-03-23}" +: "${METRICS_DECODE_STEPS:=32}" +: "${METRICS_TARGETS:=16384,32768,65536}" +: "${SLEEP_BETWEEN_SEC:=2}" + +OUT_JSONL="${OUT_JSONL:-${SCRIPT_DIR}/profiling_runs/longtext_decode_rows.jsonl}" +mkdir -p "$(dirname "$OUT_JSONL")" +rm -f "$OUT_JSONL" +echo "[run_longtext_metrics] jsonl -> $OUT_JSONL GPU smi index=$NVML_GPU_INDEX" + +IFS=',' read -r -a TARGETS <<< "$METRICS_TARGETS" + +run_one() { + local c="$1" + echo "[run_longtext_metrics] case=$c" + python3 collect_metrics_longtext_decode.py --case "$c" --append-jsonl "$OUT_JSONL" || true + sleep "${SLEEP_BETWEEN_SEC}" +} + +for t in "${TARGETS[@]}"; do + run_one "hf:${t}" +done +for t in "${TARGETS[@]}"; do + run_one "infinilm_rec:${t}:1" +done +for t in "${TARGETS[@]}"; do + run_one "infinilm_rec:${t}:${METRICS_DECODE_STEPS}" +done + +echo "[run_longtext_metrics] merged table:" +python3 collect_metrics_longtext_decode.py --from-jsonl "$OUT_JSONL" diff --git a/include/infinicore_infer/cache.h 
b/include/infinicore_infer/cache.h index 522f2235..5f691c64 100644 --- a/include/infinicore_infer/cache.h +++ b/include/infinicore_infer/cache.h @@ -3,6 +3,11 @@ #include +#ifndef __INFINI_C +// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. +#define __INFINI_C __C +#endif + __INFINI_C __export struct KVCache *createKVCache( size_t nlayers, size_t max_len, diff --git a/include/infinicore_infer/weights_loader.h b/include/infinicore_infer/weights_loader.h index 82eafe59..057c3a1b 100644 --- a/include/infinicore_infer/weights_loader.h +++ b/include/infinicore_infer/weights_loader.h @@ -3,6 +3,11 @@ #include +#ifndef __INFINI_C +// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. +#define __INFINI_C __C +#endif + struct ModelWeights; __INFINI_C __export void diff --git a/python/infinilm/auto_config.py b/python/infinilm/auto_config.py index 7e2d4afd..b6f96ff5 100644 --- a/python/infinilm/auto_config.py +++ b/python/infinilm/auto_config.py @@ -27,6 +27,8 @@ def from_pretrained(model_path): return LlamaConfig(**config_dict) elif config_dict["model_type"] == "minicpm": return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "minicpm_sala": + return LlamaConfig(**config_dict) elif config_dict["model_type"] == "fm9g": return LlamaConfig(**config_dict) elif config_dict["model_type"] == "fm9g7b": diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index a67add6f..6552227c 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -1,4 +1,5 @@ import time +import os from dataclasses import dataclass import infinicore @@ -78,9 +79,7 @@ def forward( try: # TODO: Remove `_underlying` and simplify the corresponding code. 
input_ids = input_ids._underlying if input_ids is not None else None - position_ids = ( - position_ids._underlying if position_ids is not None else None - ) + position_ids = position_ids._underlying if position_ids is not None else None past_kv_lengths = ( past_kv_lengths._underlying if past_kv_lengths is not None else None ) @@ -134,6 +133,7 @@ def generate( eos_token_id = generation_config.eos_token_id past_seq_len = 0 + output_ids = [] initial_batch_size, initial_seqlen = input_ids.shape[:2] seq_len = initial_seqlen @@ -164,6 +164,42 @@ def generate( dtype=infinicore.int32, ) + # Decode metadata fast path (batch=1, static cache): + # avoid per-step from_list()/numpy allocations for tiny scalar tensors. + # Those tensors live on CPU and are H2D-copied each forward; for profiling + # comparisons vs `from_list` device metadata, set: + # INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 + disable_fast_decode_meta = os.environ.get( + "INFINI_PROFILE_DISABLE_FAST_DECODE_META", "0" + ) not in ("", "0", "false", "False") + fast_decode_meta = ( + (not self.enable_paged_attn) + and (initial_batch_size == 1) + and not disable_fast_decode_meta + ) + if fast_decode_meta: + cpu = infinicore.device("cpu", 0) + + # Reusable metadata tensors; values updated via pybind write_i32/write_i64. 
+ position_ids_decode = infinicore.empty( + [1, 1], dtype=infinicore.int64, device=cpu + ) + past_kv_lengths_decode = infinicore.empty( + [1], dtype=infinicore.int32, device=cpu + ) + total_kv_lengths_decode = infinicore.empty( + [1], dtype=infinicore.int32, device=cpu + ) + cu_seqlens_decode = infinicore.empty( + [2], dtype=infinicore.int32, device=cpu + ) + input_offsets_decode = infinicore.empty( + [2], dtype=infinicore.int32, device=cpu + ) + input_offsets_decode.write_i32(0, 0) + input_offsets_decode.write_i32(1, 1) + + decode_total_open = False for iter in range(0, generation_config.max_new_tokens): if _measure_and_log_time: start_time = time.perf_counter() @@ -203,29 +239,54 @@ def generate( dtype=infinicore.int64, ) else: - position_ids = infinicore.from_list( - [ - list(range(past_seq_len, past_seq_len + seq_len)) - for _ in range(batch_size) - ], - dtype=infinicore.int64, - ) + if fast_decode_meta and iter > 0 and batch_size == 1 and seq_len == 1: + position_ids_decode.write_i64(0, int(past_seq_len)) + past_kv_lengths_decode.write_i32(0, int(past_seq_len)) + total_kv_lengths_decode.write_i32(0, int(past_seq_len + seq_len)) + cu_seqlens_decode.write_i32(0, 0) + cu_seqlens_decode.write_i32(1, int(past_seq_len + seq_len)) + position_ids = position_ids_decode + past_kv_lengths = past_kv_lengths_decode + total_kv_lengths = total_kv_lengths_decode + cu_seqlens = cu_seqlens_decode + input_offsets = input_offsets_decode + else: + position_ids = infinicore.from_list( + [ + list(range(past_seq_len, past_seq_len + seq_len)) + for _ in range(batch_size) + ], + dtype=infinicore.int64, + ) + past_kv_lengths = infinicore.from_list( + [past_seq_len] * batch_size, dtype=infinicore.int32 + ) + total_kv_lengths = infinicore.from_list( + [past_seq_len + seq_len] * batch_size, dtype=infinicore.int32 + ) + cu_seqlens = infinicore.from_list( + [(past_seq_len + seq_len) * i for i in range(batch_size + 1)], + dtype=infinicore.int32, + ) + input_offsets = infinicore.from_list( + 
[seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int32 + ) slot_mapping = None - - past_kv_lengths = infinicore.from_list( - [past_seq_len] * batch_size, dtype=infinicore.int32 - ) - total_kv_lengths = infinicore.from_list( - [past_seq_len + seq_len] * batch_size, dtype=infinicore.int32 - ) - cu_seqlens = infinicore.from_list( - [(past_seq_len + seq_len) * i for i in range(batch_size + 1)], - dtype=infinicore.int32, - ) - input_offsets = infinicore.from_list( - [seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int32 - ) + if self.enable_paged_attn: + past_kv_lengths = infinicore.from_list( + [past_seq_len] * batch_size, dtype=infinicore.int32 + ) + total_kv_lengths = infinicore.from_list( + [past_seq_len + seq_len] * batch_size, dtype=infinicore.int32 + ) + cu_seqlens = infinicore.from_list( + [(past_seq_len + seq_len) * i for i in range(batch_size + 1)], + dtype=infinicore.int32, + ) + input_offsets = infinicore.from_list( + [seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int32 + ) output_id = self( input_ids=input_ids, @@ -240,7 +301,6 @@ def generate( top_k=generation_config.top_k, top_p=generation_config.top_p, ) - output_ids.append(output_id) if ( diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 7b6ceea4..e07e2155 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -95,8 +95,15 @@ def __init__(self, config: EngineConfig): ) # Load model weights + dtype_map = { + "float16": infinicore.float16, + "bfloat16": infinicore.bfloat16, + "float32": infinicore.float32, + } load_model_state_dict_by_file( - self.model_engine, config.model_path, dtype=self.model_engine.config.dtype + self.model_engine, + config.model_path, + dtype=dtype_map.get(config.dtype, self.model_engine.config.dtype), ) # Initialize tokenizer @@ -371,6 +378,7 @@ def apply_chat_template( conversation=messages, add_generation_prompt=add_generation_prompt, tokenize=False, + continue_final_message=not 
add_generation_prompt, **chat_template_kwargs, ) diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py index de4d9d35..25d64ae5 100644 --- a/python/infinilm/llm/static_scheduler.py +++ b/python/infinilm/llm/static_scheduler.py @@ -4,6 +4,7 @@ import logging import queue +import os import janus from typing import List, Optional @@ -60,9 +61,17 @@ def build_model_inputs( tokens = req.get_input_tokens() prefix_hit_len = self.prefix_hit_len input_tokens = tokens[prefix_hit_len:] - input_ids = [input_tokens] - position_ids = [list(range(prefix_hit_len, len(tokens)))] - past_kv_len = prefix_hit_len + if len(input_tokens) == 0: + # Full prefix hit: avoid empty tensor conversion in model input path. + # Recompute the last prompt token as a one-token prefill step. + input_tokens = [tokens[-1]] + input_ids = [input_tokens] + position_ids = [[len(tokens) - 1]] + past_kv_len = len(tokens) - 1 + else: + input_ids = [input_tokens] + position_ids = [list(range(prefix_hit_len, len(tokens)))] + past_kv_len = prefix_hit_len total_kv_len = len(tokens) input_offsets = [0, len(input_tokens)] else: @@ -106,6 +115,11 @@ def __init__(self, max_cache_len: int = 4096): self.max_cache_len = max_cache_len self.cached_block_hashes: List[int] = [] self.pending_block_hashes: List[int] = [] + # Safety switch: disable cross-request prefix reuse when investigating + # corrupted/contaminated generations. 
+ self.disable_prefix_reuse = os.getenv( + "INFINILM_STATIC_DISABLE_PREFIX_REUSE", "0" + ) in ("1", "true", "True", "yes", "on") def add_request(self, request: InferenceRequest): if request is not None: @@ -205,6 +219,8 @@ def schedule(self) -> Optional[StaticSchedulerOutput]: num_full_blocks = prompt_len // _BLOCK_SIZE matched = 0 + if self.disable_prefix_reuse and self.cached_block_hashes: + self.cached_block_hashes.clear() self.pending_block_hashes.clear() for i in range(num_full_blocks): diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 1d21f2d9..17a5fe58 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -1,4 +1,6 @@ import os +import json +import math from typing import Dict, Union import time import torch @@ -93,7 +95,8 @@ def load_state_dict( ) for k in f.keys(): - state_dict[k] = f.get_tensor(k).to(device=device) + # Explicitly cast dtype: some ops (e.g. embedding) may not support BF16 on all backends. + state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) return state_dict @@ -152,6 +155,35 @@ def load_model_state_dict_by_file( torch_dtype = infinicore.utils.to_torch_dtype(dtype) model_keys = model.state_dict_keyname() + # MiniCPM-style scaling (used by MiniCPM / FM9G; also applies to MiniCPM-SALA checkpoints). + # This matches `InfiniLM/scripts/jiuge.py` weight scaling behavior. 
+ scale_input = 1.0 + scale_output = 1.0 + scale_o = 1.0 + scale_down = 1.0 + scale_lm_head = 1.0 + try: + with open(os.path.join(model_path, "config.json")) as f: + cfg = json.load(f) + if ( + cfg.get("model_type") in ["fm9g", "minicpm", "minicpm_sala"] + and "scale_emb" in cfg + and "scale_depth" in cfg + ): + scale_input = float(cfg["scale_emb"]) + scale_o = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) + scale_down = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) + if cfg.get("model_type") in ["fm9g", "minicpm"] and "dim_model_base" in cfg: + scale_output = float(int(cfg["hidden_size"]) // int(cfg["dim_model_base"])) + if cfg.get("model_type") == "minicpm_sala" and "dim_model_base" in cfg and "hidden_size" in cfg: + scale_lm_head = float(cfg["dim_model_base"]) / float(cfg["hidden_size"]) + # minicpm_sala: only bake embed and lm_head; residual scaling done at forward in C++ + if cfg.get("model_type") == "minicpm_sala": + scale_o = 1.0 + scale_down = 1.0 + except Exception: + pass + already_loaded_keys = [] file_list = glob.glob(os.path.join(model_path, "*.safetensors")) @@ -167,6 +199,24 @@ def load_model_state_dict_by_file( ) already_loaded_keys.extend(model_param.keys()) + # Apply MiniCPM scaling to loaded tensors (in torch space). 
+ if scale_input != 1.0 and "model.embed_tokens.weight" in model_param: + model_param["model.embed_tokens.weight"] = ( + model_param["model.embed_tokens.weight"] * scale_input + ) + if scale_output != 1.0 and "model.norm.weight" in model_param: + model_param["model.norm.weight"] = ( + model_param["model.norm.weight"] * scale_output + ) + if scale_o != 1.0 or scale_down != 1.0: + for k, v in list(model_param.items()): + if scale_o != 1.0 and k.endswith(".self_attn.o_proj.weight"): + model_param[k] = v * scale_o + elif scale_down != 1.0 and k.endswith(".mlp.down_proj.weight"): + model_param[k] = v * scale_down + if scale_lm_head != 1.0 and "lm_head.weight" in model_param: + model_param["lm_head.weight"] = model_param["lm_head.weight"] * scale_lm_head + # --------------------------------------------------------- # # model_param_infini references torch.Tensor # --------------------------------------------------------- # @@ -180,6 +230,19 @@ def load_model_state_dict_by_file( file_path = os.path.join(model_path, "pytorch_model.bin") model_params = torch.load(file_path, weights_only=True, map_location="cpu") + if scale_input != 1.0 and "model.embed_tokens.weight" in model_params: + model_params["model.embed_tokens.weight"] = model_params["model.embed_tokens.weight"] * scale_input + if scale_output != 1.0 and "model.norm.weight" in model_params: + model_params["model.norm.weight"] = model_params["model.norm.weight"] * scale_output + if scale_o != 1.0 or scale_down != 1.0: + for k, v in list(model_params.items()): + if scale_o != 1.0 and k.endswith(".self_attn.o_proj.weight"): + model_params[k] = v * scale_o + elif scale_down != 1.0 and k.endswith(".mlp.down_proj.weight"): + model_params[k] = v * scale_down + if scale_lm_head != 1.0 and "lm_head.weight" in model_params: + model_params["lm_head.weight"] = model_params["lm_head.weight"] * scale_lm_head + model_param_infini = {} for key in model_params.keys(): model_param_infini[key] = infinicore.from_torch( diff --git 
a/python/infinilm/server/chat_message_normalize.py b/python/infinilm/server/chat_message_normalize.py new file mode 100644 index 00000000..04afe176 --- /dev/null +++ b/python/infinilm/server/chat_message_normalize.py @@ -0,0 +1,76 @@ +"""Normalize OpenAI-style chat messages before HuggingFace chat_template. + +Kept separate from ``inference_server`` so this logic can be smoke-tested without +loading InfiniCore / CUDA (see ``__main__`` block). +""" + + +def normalize_openai_messages_for_hf_template(messages: list) -> list: + """Strip lm-eval ``type: text`` wrappers; flatten multimodal text parts. + + lm-eval ``local-chat-completions`` with ``tokenized_requests=False`` JSON-encodes + each turn with an extra top-level ``"type": "text"`` (see ``TemplateAPI.apply_chat_template`` + in lm-eval). HuggingFace ``--model hf`` passes plain ``{role, content}`` dicts into + ``apply_chat_template``. Stripping unknown keys keeps server templating aligned with + the HF harness for text-only tasks. + """ + normalized: list = [] + for msg in messages: + if not isinstance(msg, dict): + normalized.append(msg) + continue + + role = msg.get("role") + if role is None: + normalized.append(msg) + continue + + content = msg.get("content") + if isinstance(content, list): + text_parts: list[str] = [] + for part in content: + if isinstance(part, dict): + if part.get("type") == "text" and "text" in part: + text_parts.append(part["text"]) + elif isinstance(part, str): + text_parts.append(part) + elif isinstance(part, str): + text_parts.append(part) + merged = "".join(text_parts) if text_parts else "" + core = {"role": role, "content": merged} + if msg.get("name") is not None: + core["name"] = msg["name"] + normalized.append(core) + elif isinstance(content, str): + core = {"role": role, "content": content} + if msg.get("name") is not None: + core["name"] = msg["name"] + normalized.append(core) + else: + normalized.append(msg) + + return normalized + + +if __name__ == "__main__": + # Smoke test (no 
InfiniCore): run as + # python3 -m infinilm.server.chat_message_normalize + lm_eval_style = [ + {"role": "system", "content": "sys", "type": "text"}, + {"role": "user", "content": "hi", "type": "text"}, + ] + out = normalize_openai_messages_for_hf_template(lm_eval_style) + assert out == [{"role": "system", "content": "sys"}, {"role": "user", "content": "hi"}], out + mm = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "a"}, + {"type": "text", "text": "b"}, + ], + } + ] + assert normalize_openai_messages_for_hf_template(mm) == [ + {"role": "user", "content": "ab"} + ] + print("chat_message_normalize: ok") diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index b5c49247..8c361c4e 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -17,6 +17,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from infinilm.llm import AsyncLLMEngine, SamplingParams, FinishReason +from infinilm.server.chat_message_normalize import normalize_openai_messages_for_hf_template logger = logging.getLogger(__name__) @@ -266,37 +267,8 @@ async def list_models_legacy(): return _models_payload() def _normalize_messages(self, messages: list) -> list: - """Normalize messages to handle multimodal content (list format). - - Converts content from list format [{"type": "text", "text": "..."}] - to string format for chat template compatibility. 
- """ - normalized = [] - for msg in messages: - if not isinstance(msg, dict): - normalized.append(msg) - continue - - content = msg.get("content") - if isinstance(content, list): - # Extract text from multimodal content list - text_parts = [] - for part in content: - if isinstance(part, dict): - if part.get("type") == "text" and "text" in part: - text_parts.append(part["text"]) - elif isinstance(part, str): - text_parts.append(part) - elif isinstance(part, str): - text_parts.append(part) - # Join all text parts - normalized_msg = msg.copy() - normalized_msg["content"] = "".join(text_parts) if text_parts else "" - normalized.append(normalized_msg) - else: - normalized.append(msg) - - return normalized + """Delegate to :func:`normalize_openai_messages_for_hf_template`.""" + return normalize_openai_messages_for_hf_template(messages) def _build_sampling_params(self, data: dict) -> SamplingParams: """Build SamplingParams from request data.""" diff --git a/xmake.lua b/xmake.lua index 2b1b51d3..5282f6a7 100644 --- a/xmake.lua +++ b/xmake.lua @@ -56,7 +56,7 @@ target_end() target("_infinilm") add_packages("pybind11") set_default(false) - add_rules("python.module", {soabi = true}) + add_rules("python.library", {soabi = true}) set_languages("cxx17") set_kind("shared") @@ -70,6 +70,7 @@ target("_infinilm") add_linkdirs(INFINI_ROOT.."/lib") add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl") + add_rpathdirs(INFINI_ROOT.."/lib") -- Add src files add_files("csrc/**.cpp") From d037eecc8b3f163e52272cf98ac776b08515ffd8 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:38:05 +0000 Subject: [PATCH 02/11] refactor minicpm-sala Signed-off-by: Ceng23333 <441651826@qq.com> --- csrc/cache/kv_cache.cpp | 67 ++----------- csrc/cache/kv_cache.hpp | 25 +---- csrc/config/config_factory.cpp | 2 +- csrc/engine/infer_engine.cpp | 10 +- csrc/engine/rank_worker.cpp | 3 +- .../minicpm_sala/minicpm_sala_attention.cpp | 95 +++++++++++++------ 
.../minicpm_sala/minicpm_sala_attention.hpp | 6 ++ .../minicpm_sala_for_causal_lm.cpp | 18 ++++ .../minicpm_sala_for_causal_lm.hpp | 7 ++ .../minicpm_sala/minicpm_sala_model.cpp | 7 +- csrc/models/model_factory.cpp | 30 +++--- csrc/pybind11/engine/engine.hpp | 2 +- 12 files changed, 129 insertions(+), 143 deletions(-) diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index a7220773..4c97edfa 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -4,7 +4,6 @@ #include "../utils.hpp" #include "infinicore/ops.hpp" #include -#include namespace infinilm::cache { // ========================== @@ -46,9 +45,7 @@ StaticKVCache::StaticKVCache( infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info, - infinicore::Size gla_recurrent_num_heads, - infinicore::Size gla_recurrent_head_dim) + const engine::distributed::RankInfo &rank_info) : Cache(), k_dim_(k_dim), v_dim_(v_dim), @@ -57,9 +54,7 @@ StaticKVCache::StaticKVCache( rank_batch_size_(config.max_batch_size()), cache_len_(config.max_cache_len() == std::numeric_limits::max() || config.max_cache_len() == 0 ? 
max_positional_embedding : config.max_cache_len()), rank_num_layers_(num_layers), - dtype_(dtype), - gla_recurrent_num_heads_(gla_recurrent_num_heads), - gla_recurrent_head_dim_(gla_recurrent_head_dim) { + dtype_(dtype) { // Allocate K cache k_caches_ = infinicore::Tensor::empty( @@ -80,17 +75,6 @@ StaticKVCache::StaticKVCache( v_dim_}, dtype_, rank_info.device); - - if (gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0) { - gla_state_ = infinicore::Tensor::zeros( - {rank_num_layers_, - rank_batch_size_, - gla_recurrent_num_heads_, - gla_recurrent_head_dim_, - gla_recurrent_head_dim_}, - infinicore::DataType::F32, - rank_info.device); - } } infinicore::Tensor StaticKVCache::create_layer_kv_cache( @@ -141,27 +125,12 @@ StaticKVCache::update(size_t layer_idx, auto device = k_cache_layer->device(); #ifdef ENABLE_KV_CACHING - // Some debug builds have shown incremental decode (update_len=1) may diverge - // from full-sequence recompute when using the optimized kv_caching_ kernel. - // Provide an env override to fall back to the simple (and slower) copy update. 
- const char *disable_kv_caching = std::getenv("INFINI_DISABLE_KV_CACHING"); - const bool force_copy_update = disable_kv_caching && disable_kv_caching[0] != '\0' && disable_kv_caching[0] != '0'; - if (force_copy_update) { - size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; - auto result_len = cache_pos + update_len; - ASSERT(result_len <= cache_len_); - auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}}); - auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}}); - k_cache_update->copy_from(k); - v_cache_update->copy_from(v); - } else { - infinicore::op::kv_caching_( - k_cache_layer, - v_cache_layer, - k, - v, - past_sequence_lengths); - } + infinicore::op::kv_caching_( + k_cache_layer, + v_cache_layer, + k, + v, + past_sequence_lengths); #else size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; auto result_len = cache_pos + update_len; @@ -177,26 +146,6 @@ StaticKVCache::update(size_t layer_idx, return {k_cache_layer, v_cache_layer}; } -std::tuple -StaticKVCache::get_layer_kv(size_t layer_idx) { - ASSERT(layer_idx < rank_num_layers_); - auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - return {k_cache_layer, v_cache_layer}; -} - -bool -StaticKVCache::has_gla_recurrent_state() const { - return gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0 && static_cast(gla_state_); -} - -infinicore::Tensor -StaticKVCache::gla_recurrent_state_for_layer(size_t layer_idx) { - ASSERT(layer_idx < rank_num_layers_); - ASSERT(has_gla_recurrent_state()); - return gla_state_->narrow({{0, layer_idx, 1}})->squeeze(0); -} - // ========================== // PagedKVCacheConfig // ========================== diff --git a/csrc/cache/kv_cache.hpp b/csrc/cache/kv_cache.hpp index cbef0722..e6e640df 100644 --- a/csrc/cache/kv_cache.hpp +++ 
b/csrc/cache/kv_cache.hpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -44,9 +43,7 @@ class StaticKVCache final : public Cache { infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info, - infinicore::Size gla_recurrent_num_heads = 0, - infinicore::Size gla_recurrent_head_dim = 0); + const engine::distributed::RankInfo &rank_info); static infinicore::Tensor create_layer_kv_cache( const infinicore::Size k_dim, @@ -75,20 +72,6 @@ class StaticKVCache final : public Cache { const infinicore::Tensor &v, const infinicore::Tensor &past_sequence_lengths); - /** - * @brief Get KV cache tensors for a layer (views). - * - * @return (k_cache_layer, v_cache_layer) - * k_cache_layer: [batch, num_rank_k_heads, max_cache_len, k_dim] - * v_cache_layer: [batch, num_rank_v_heads, max_cache_len, v_dim] - */ - std::tuple - get_layer_kv(size_t layer_idx); - - /** Per-layer Simple GLA recurrent state for lightning decode: [batch, H, D, D] float32 (in-place for decode_step). 
*/ - bool has_gla_recurrent_state() const; - infinicore::Tensor gla_recurrent_state_for_layer(size_t layer_idx); - ~StaticKVCache() override = default; private: @@ -106,12 +89,6 @@ class StaticKVCache final : public Cache { // [num_layers, max_batch, num_rank_v_heads, max_cache_len, v_dim] infinicore::Tensor v_caches_; - - infinicore::Size gla_recurrent_num_heads_{0}; - infinicore::Size gla_recurrent_head_dim_{0}; - // [num_layers, max_batch, gla_recurrent_num_heads, D, D], F32; empty when heads==0 - infinicore::Tensor gla_state_; - }; class PagedKVCacheConfig final : public CacheConfig { diff --git a/csrc/config/config_factory.cpp b/csrc/config/config_factory.cpp index aff8b986..c822983e 100644 --- a/csrc/config/config_factory.cpp +++ b/csrc/config/config_factory.cpp @@ -16,7 +16,7 @@ std::shared_ptr ConfigFactory::createConfig(const if (it != config_map.end()) { it->second(model_config); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; const std::string &model_type = model_config->get("model_type"); if (std::find(classic_models.begin(), classic_models.end(), model_type) == classic_models.end()) { throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type); diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 2a5c5ff4..f1afd84b 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -121,15 +121,7 @@ InferEngine::Input::to_model_input(infinicore::Device device) const { auto to_device = [&](const std::optional &t) -> std::optional { - if (!t.has_value()) { - return t; - } - auto ten = t.value(); - // Avoid redundant copies when the tensor is already on the target device. - if (ten->device() == device) { - return ten; - } - return ten->to(device); + return t.has_value() ? 
t.value()->to(device) : t; }; infinilm::InfinilmModel::Input input = { diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 1ba89ca1..1542c1e0 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -5,7 +5,6 @@ #include "../models/models_registry.hpp" #include "infinicore/ops.hpp" #include -#include #include #include @@ -262,7 +261,7 @@ void RankWorker::thread_loop() { rank_info_.device, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; if ((std::find(classic_models.begin(), classic_models.end(), model_type) != classic_models.end())) { model_ = InfinilmModelFactory::createModel( model_config_, diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index c426ec1c..001122e4 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -3,7 +3,9 @@ #include "infinicore/ops.hpp" #include "infinicore/ops/infllmv2_attention.hpp" #include "infinicore/ops/simple_gla_attention.hpp" +#include "infinicore/ops/simple_gla_decode_step.hpp" #include "infinicore/ops/simple_gla_prefill.hpp" +#include "infinicore/ops/simple_gla_recurrent_state_append.hpp" #include "infinicore/context/context.hpp" #include "../debug_utils/tensor_utils.hpp" @@ -45,6 +47,19 @@ std::vector build_slope_tensor(size_t n) { } // namespace +namespace { +void ensure_gla_state_allocated(infinicore::Tensor &state, + const infinicore::Device &device, + size_t batch_size, + size_t n_h, + size_t head_dim) { + const std::vector want = {batch_size, n_h, head_dim, head_dim}; + if (!state || state->shape() != want || state->dtype() != infinicore::DataType::F32 || state->device() != device) { + state = infinicore::Tensor::zeros(want, 
infinicore::DataType::F32, device); + } +} +} // namespace + MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, @@ -162,7 +177,11 @@ void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptrpermute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] auto v_bthd = v_use->permute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] - // Lightning GLA decode must use recurrent state (StaticKVCache) whenever available. - const bool is_lightning_decode = has_cache_meta && static_kv_cache && (seq_len < total_seq_len); - if (is_lightning_decode && !static_kv_cache->has_gla_recurrent_state()) { - throw std::runtime_error( - "MiniCPMSALAAttention(lightning): Lightning decode requires StaticKVCache gla_recurrent_state " - "(missing recurrent buffer in StaticKVCache)."); - } - - const bool recurrent_gla = static_kv_cache && static_kv_cache->has_gla_recurrent_state() && has_cache_meta; + // Lightning fast decode: maintain recurrent state locally (do NOT depend on StaticKVCache extensions). + // We rebuild state on-demand if it is out-of-sync with cache_pos. + const bool is_decode = has_cache_meta && static_kv_cache && (seq_len == 1) && (total_seq_len > 1); + if (is_decode) { + ensure_gla_state_allocated(gla_state_, q_bthd->device(), batch_size, n_h, head_dim_); + + // Ensure `state` corresponds to exactly `cache_pos` cached tokens (excluding current token). + if (!gla_state_valid_ || gla_state_cached_len_ != cache_pos) { + // Rebuild from available KV. This is O(T) once after reset / mismatch. 
+ infinicore::op::zeros_(gla_state_); + if (cache_pos > 0) { + auto k_prev = k_bthd->narrow({{1, 0, cache_pos}}); + auto v_prev = v_bthd->narrow({{1, 0, cache_pos}}); + infinicore::op::simple_gla_recurrent_state_append_segment(gla_state_, k_prev, v_prev, g_gamma_); + } + gla_state_cached_len_ = cache_pos; + gla_state_valid_ = true; + } - infinicore::Tensor gla_out; - if (recurrent_gla && seq_len == 1 && total_seq_len > 1) { - auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); - auto q_new = q_bthd; + // Decode-step uses only the newest KV at position (total_seq_len - 1). + auto q_new = q_bthd; // [B,1,H,D] auto k_new = k_bthd->narrow({{1, total_seq_len - 1, 1}}); auto v_new = v_bthd->narrow({{1, total_seq_len - 1, 1}}); - gla_out = infinicore::op::simple_gla_decode_step(q_new, k_new, v_new, S, g_gamma_, scaling_); + auto out_b1hd = infinicore::op::simple_gla_decode_step(q_new, k_new, v_new, gla_state_, g_gamma_, scaling_); + gla_state_cached_len_ = cache_pos + 1; + attn_output = out_b1hd->view({batch_size, seq_len, n_h * head_dim_}); + // Fall through to output norm/gate + o_proj below (do not run full-sequence GLA again). } else { + // Prefill / non-decode batching: non-recurrent kernels, then update local recurrent state. infinicore::Tensor q_full; if (seq_len == total_seq_len) { q_full = q_bthd; } else { - // Decode: q has seq_len (e.g. 1), kv has total_seq_len; pad q to [B, total_seq_len, H, D]. + // q shorter than KV: pad q to [B, total_seq_len, H, D]. q_full = infinicore::Tensor::zeros( {batch_size, total_seq_len, n_h, head_dim_}, q_bthd->dtype(), q_bthd->device()); auto q_slot = q_full->narrow({{1, total_seq_len - seq_len, seq_len}}); q_slot->copy_from(q_bthd); } + + infinicore::Tensor gla_out; // Fused prefill: naive kernel for head_dim<=64; chunked/tiled kernel for head_dim>64 (e.g. 128). 
bool use_fused_prefill = (batch_size == 1) && (seq_len == total_seq_len); if (use_fused_prefill) { @@ -354,24 +386,27 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor gla_out = infinicore::op::simple_gla_attention(q_full, k_bthd, v_bthd, g_gamma_, scaling_); } - // Keep per-layer recurrent state aligned with simple_gla_attention / prefill outputs. - // Use batched GEMM (CUDA+ATen) instead of O(seq_len) decode_step launches; see - // simple_gla_recurrent_state_append_segment (closed form: S <- g^L S + Σ g^{L-1-j} outer(k,v)). - if (recurrent_gla) { - auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); - if (cache_pos == 0) { - infinicore::op::zeros_(S); - } + // Keep local recurrent state in sync for subsequent decode steps. + ensure_gla_state_allocated(gla_state_, q_bthd->device(), batch_size, n_h, head_dim_); + if (cache_pos == 0) { + infinicore::op::zeros_(gla_state_); + gla_state_cached_len_ = 0; + gla_state_valid_ = true; + } + // Append the segment we just wrote: [cache_pos, cache_pos + seq_len) + if (gla_state_valid_ && gla_state_cached_len_ == cache_pos) { auto k_seg = k_bthd->narrow({{1, cache_pos, seq_len}}); auto v_seg = v_bthd->narrow({{1, cache_pos, seq_len}}); - infinicore::op::simple_gla_recurrent_state_append_segment(S, k_seg, v_seg, g_gamma_); + infinicore::op::simple_gla_recurrent_state_append_segment(gla_state_, k_seg, v_seg, g_gamma_); + gla_state_cached_len_ = cache_pos + seq_len; + } else { + // Out-of-sync; force rebuild next time we need recurrent decode. + gla_state_valid_ = false; } - } - infinicore::Tensor out_slice = (recurrent_gla && seq_len == 1 && total_seq_len > 1) - ? 
gla_out - : gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); - attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); + infinicore::Tensor out_slice = gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); + attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); + } } else { // minicpm4 layers must use InfLLM-v2 attention (hard error if not available). // NOTE: Lightning layers keep Simple GLA for correctness; only minicpm4 routes here. diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 3cd8f284..37dab7ec 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -97,6 +97,12 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { // Lightning layers only: per-head log-decay for Simple GLA (HF _build_slope_tensor * -1). infinicore::Tensor g_gamma_; + + // Lightning layers only: recurrent state for fast decode. + // Shape: [B, H, D, D] float32. Tracks how many KV tokens are folded into the state. 
+ mutable infinicore::Tensor gla_state_; + mutable size_t gla_state_cached_len_ = 0; + mutable bool gla_state_valid_ = false; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index ce2e9474..74ea4f9a 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -1,11 +1,22 @@ #include "minicpm_sala_for_causal_lm.hpp" +#include "../models_registry.hpp" #include "infinicore/ops.hpp" #include #include +#include namespace infinilm::models::minicpm_sala { +std::shared_ptr create_minicpm_sala_model_config( + std::shared_ptr model_config) { + const std::string &model_type = model_config->get("model_type"); + if ("minicpm_sala" != model_type) { + throw std::runtime_error("infinilm::models::minicpm_sala::create_minicpm_sala_model_config: model_type is not minicpm_sala"); + } + return model_config; +} + MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( std::shared_ptr model_config, const infinicore::Device &device, @@ -61,3 +72,10 @@ const cache::CacheConfig *MiniCPMSALAForCausalLM::get_cache_config() const { } // namespace infinilm::models::minicpm_sala +namespace { +INFINILM_REGISTER_CAUSAL_LM_MODEL( + minicpm_sala, + infinilm::models::minicpm_sala::MiniCPMSALAForCausalLM, + infinilm::models::minicpm_sala::create_minicpm_sala_model_config); +} // namespace + diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index 9bb3ec2b..33305b23 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -36,3 +36,10 @@ class MiniCPMSALAForCausalLM : public InfinilmModel { } // namespace infinilm::models::minicpm_sala +namespace infinilm::models::minicpm_sala { + +std::shared_ptr create_minicpm_sala_model_config( + std::shared_ptr model_config); 
+ +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index a415915f..6fd00bfe 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -86,9 +86,6 @@ void MiniCPMSALAModel::reset_cache(const cache::CacheConfig *cache_config) { const size_t base_head_dim = model_config_->get("head_dim"); const size_t lightning_kv_heads = model_config_->get_or("lightning_nkv", base_kv_heads); const size_t lightning_head_dim = model_config_->get_or("lightning_head_dim", base_head_dim); - const size_t lightning_nh = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); - const int tp_sz = std::max(1, rank_info_.tp_size); - const size_t lightning_nh_rank = lightning_nh / static_cast(tp_sz); kv_cache_minicpm4_ = (minicpm4_layer_count > 0) ? std::make_shared( @@ -113,9 +110,7 @@ void MiniCPMSALAModel::reset_cache(const cache::CacheConfig *cache_config) { /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), /*dtype=*/model_config_->get_dtype(), *static_cfg, - rank_info_, - /*gla_recurrent_num_heads=*/lightning_nh_rank, - /*gla_recurrent_head_dim=*/lightning_head_dim) + rank_info_) : nullptr; } else { // This refactor implements HF-like dense caching only. 
diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 3c885fc5..03734ac9 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,6 +1,6 @@ #include "model_factory.hpp" -#include "llama/llama.hpp" -#include "minicpm_sala/minicpm_sala_for_causal_lm.hpp" +#include "llama/llama_for_causal_lm.hpp" +#include "models_registry.hpp" namespace infinilm { /** @@ -41,13 +41,8 @@ std::shared_ptr InfinilmModelFactory::createModel( engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache, backends::AttentionBackend attention_backend) { - std::shared_ptr model; - const auto model_type = model_config->get_or("model_type", "llama"); - if (model_type == "minicpm_sala") { - model = std::make_shared( - model_config, rank_info.device, rank_info, attention_backend); - } else if (true) { + if (true) { model = std::make_shared( model_config, rank_info.device, rank_info, attention_backend); } else { @@ -65,8 +60,21 @@ std::shared_ptr InfinilmModelFactory::createModel( std::shared_ptr model_config, const infinicore::Device &device, const cache::CacheConfig *cache) { - engine::distributed::RankInfo rank_info; - rank_info.device = device; - return createModel(model_config, rank_info, cache, backends::AttentionBackend::Default); + const std::string model_type = model_config->get("model_type"); + std::shared_ptr model; + const auto &model_map = models::get_causal_lm_model_map(); + auto it = model_map.find(model_type); + if (it != model_map.end()) { + // create model + auto &model_creator = it->second; + model = model_creator(model_config, device); + } else { + throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model_type"); + } + + if (cache) { + model->reset_cache(cache); + } + return model; } } // namespace infinilm diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index d27b9585..a784d69c 100644 --- a/csrc/pybind11/engine/engine.hpp +++ 
b/csrc/pybind11/engine/engine.hpp @@ -199,6 +199,6 @@ inline void bind_infer_engine(py::module &m) { py::class_(infer_engine, "Output") .def_readwrite("output_ids", &InferEngine::Output::output_ids, "Output tensor"); - } +} } // namespace infinilm::engine From b93614901039c5f1dbecd81feaccdecd838b2aaf Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:40:55 +0000 Subject: [PATCH 03/11] cleanup code Signed-off-by: Ceng23333 <441651826@qq.com> --- examples/collect_metrics_longtext_decode.py | 355 -------- examples/compare_inference_speed.py | 868 ------------------- examples/metrics_16k_prefill.md | 152 ---- examples/metrics_longtext_mem.md | 378 -------- examples/run_infinicore_ops_before_logits.sh | 18 - examples/run_longtext_metrics_cases.sh | 59 -- 6 files changed, 1830 deletions(-) delete mode 100644 examples/collect_metrics_longtext_decode.py delete mode 100644 examples/compare_inference_speed.py delete mode 100644 examples/metrics_16k_prefill.md delete mode 100644 examples/metrics_longtext_mem.md delete mode 100755 examples/run_infinicore_ops_before_logits.sh delete mode 100755 examples/run_longtext_metrics_cases.sh diff --git a/examples/collect_metrics_longtext_decode.py b/examples/collect_metrics_longtext_decode.py deleted file mode 100644 index 172b6f40..00000000 --- a/examples/collect_metrics_longtext_decode.py +++ /dev/null @@ -1,355 +0,0 @@ -#!/usr/bin/env python3 -""" -Collect long-context + decode metrics for metrics_longtext_mem.md. - -**OOM-safe workflow:** run each case in a **fresh Python process** so CUDA allocations -are released between runs: - - ./run_longtext_metrics_cases.sh - -Or manually: - - python3 collect_metrics_longtext_decode.py --case hf:16384 --append-jsonl profiling_runs/longtext_decode_rows.jsonl - -See also docstring at top of previous revisions for GPU selection (CUDA_VISIBLE_DEVICES + NVML_GPU_INDEX). 
-""" - -from __future__ import annotations - -import argparse -import json -import os -import subprocess -import sys -import threading -import time -from typing import Any, Callable, Dict, List, Optional, Tuple - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - - -def _poll_gpu_mem_mib(stop: threading.Event, gpu_index: int, out: List[int]) -> None: - while not stop.is_set(): - try: - r = subprocess.run( - [ - "nvidia-smi", - "-i", - str(gpu_index), - "--query-gpu=memory.used", - "--format=csv,noheader,nounits", - ], - capture_output=True, - text=True, - timeout=5, - ) - if r.returncode == 0 and r.stdout.strip().isdigit(): - out.append(int(r.stdout.strip())) - except Exception: - pass - if stop.wait(timeout=1.0): - break - - -def _with_mem_poll(gpu_index: int, fn: Callable[[], Any]) -> Tuple[Any, Optional[int]]: - samples: List[int] = [] - stop = threading.Event() - th = threading.Thread(target=_poll_gpu_mem_mib, args=(stop, gpu_index, samples), daemon=True) - th.start() - err: Optional[BaseException] = None - result: Any = None - try: - result = fn() - except BaseException as e: - err = e - finally: - stop.set() - th.join(timeout=3.0) - peak = max(samples) if samples else None - if err is not None: - raise err - return result, peak - - -def _row_dict( - date: str, - backend: str, - target: int, - actual: int, - max_new: int, - peak: Optional[int], - gpu_smi: int, - r: Dict[str, Any], -) -> Dict[str, Any]: - return { - "date": date, - "backend": backend, - "target_input_tokens": target, - "actual_input_tokens": actual, - "max_new_tokens": max_new, - "peak_mem_mib": peak, - "gpu_smi_index": gpu_smi, - "total_time_ms": r.get("total_time_ms"), - "prefill_ttft_ms": r.get("prefill_ttft_ms"), - "prefill_throughput_tok_s": r.get("prefill_throughput_tok_s"), - "decode_itl_ms": r.get("decode_itl_ms"), - "decode_throughput_tok_s": r.get("decode_throughput_tok_s"), - "engine_reported_generation_ms": r.get("engine_reported_generation_ms"), - "error": 
r.get("error"), - } - - -def run_single_case( - case: str, - *, - model_path: str, - gpu_smi: int, - date: str, -) -> Dict[str, Any]: - """Run one measurement; returns a row dict (may contain error key).""" - examples_dir = os.path.dirname(os.path.abspath(__file__)) - sys.path.insert(0, examples_dir) - os.chdir(examples_dir) - - from transformers import AutoTokenizer - - from compare_inference_speed import ( - _make_prompt_with_target_tokens, - run_hf_decode_loop, - run_hf_forward_prefill, - run_infinilm_inprocess, - ) - - parts = case.strip().split(":") - kind = parts[0].lower() - if kind == "hf": - # Backward compatible: - # hf: -> max_new=1 (forward-prefill only) - # hf:: -> max_new= (decode-loop timing) - if len(parts) == 2: - target = int(parts[1]) - max_new = 1 - elif len(parts) == 3: - target = int(parts[1]) - max_new = int(parts[2]) - else: - raise ValueError("--case hf:[:] (e.g. hf:16384 or hf:16384:32)") - elif kind == "infinilm_rec": - if len(parts) != 3: - raise ValueError("--case infinilm_rec:: (e.g. infinilm_rec:32768:32)") - target = int(parts[1]) - max_new = int(parts[2]) - else: - raise ValueError( - f"Unknown case kind {kind!r}; use hf: or infinilm_rec:" - ) - - tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - prompt, actual = _make_prompt_with_target_tokens(tok, "How are you", target) - - if kind == "hf": - - def go() -> Dict[str, Any]: - # Always use hf decode-loop so total_time_ms can be end-to-end - # (prefill + decode), matching the InfiniLM generate semantics. 
- return run_hf_decode_loop( - model_path, - prompt, - max_new, - device="cuda", - attn_implementation="flash_attention_2", - use_cache=True, - warmup=1, - iters=1, - ) - - try: - r, peak = _with_mem_poll(gpu_smi, go) - r = dict(r) - return _row_dict(date, "hf (decode_loop)", target, actual, max_new, peak, gpu_smi, r) - except Exception as e: - return _row_dict( - date, - "hf (decode_loop)", - target, - actual, - max_new, - None, - gpu_smi, - {"error": str(e)}, - ) - - recurrent = kind == "infinilm_rec" - if max_new == 1: - label = "infinilm (static_fit, recurrent GLA decode)" - else: - label = f"infinilm (static_fit, recurrent GLA, +{max_new} decode)" - - saved_lightning = os.environ.get("INFINI_LIGHTNING_GLA_RECURRENT_DECODE") - saved_skip = os.environ.get("INFINI_SKIP_LAST_LOGITS_CPU") - try: - if recurrent: - os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = "1" - else: - os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) - os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = "1" - - def go_inf() -> Dict[str, Any]: - return run_infinilm_inprocess( - model_path, - prompt, - max_new, - cache_mode="static_fit", - paged_block_size=256, - attn_backend="default", - ) - - r, peak = _with_mem_poll(gpu_smi, go_inf) - return _row_dict(date, label, target, actual, max_new, peak, gpu_smi, dict(r)) - except Exception as e: - return _row_dict(date, label, target, actual, max_new, None, gpu_smi, {"error": str(e)}) - finally: - if saved_lightning is None: - os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) - else: - os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = saved_lightning - if saved_skip is None: - os.environ.pop("INFINI_SKIP_LAST_LOGITS_CPU", None) - else: - os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = saved_skip - - -def print_markdown_table(rows: List[Dict[str, Any]]) -> None: - def fmt(x: Any) -> str: - if x is None: - return "—" - if isinstance(x, float): - s = f"{x:.2f}" - return s.rstrip("0").rstrip(".") - return str(x) - - gpu_smi = 
rows[0].get("gpu_smi_index", 0) if rows else 0 - print("\n### Markdown table (paste into metrics_longtext_mem.md)\n") - hdr = ( - "| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | " - "prefill_tok_s | decode_itl_ms | decode_tok_s | gpu |" - ) - sep = "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|" - print(hdr) - print(sep) - for row in rows: - if row.get("error"): - print( - f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | " - f"{row['max_new_tokens']} | {fmt(row.get('peak_mem_mib'))} | OOM/err | — | — | — | — | {gpu_smi} |" - ) - continue - dec_itl = fmt(row.get("decode_itl_ms")) if row["max_new_tokens"] > 1 else "—" - dec_tps = fmt(row.get("decode_throughput_tok_s")) if row["max_new_tokens"] > 1 else "—" - ptt = row.get("prefill_ttft_ms") - # Only forward-prefill runs use total_time_ms as a prefill-time proxy. - if ptt is None and row.get("backend") == "hf (forward_prefill)": - ptt = row.get("total_time_ms") - print( - f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | {row['max_new_tokens']} | " - f"{fmt(row.get('peak_mem_mib'))} | {fmt(row.get('total_time_ms'))} | {fmt(ptt)} | " - f"{fmt(row.get('prefill_throughput_tok_s'))} | {dec_itl} | {dec_tps} | {gpu_smi} |" - ) - - -def main() -> None: - ap = argparse.ArgumentParser(description="Long-context + decode metrics (OOM-safe --case mode)") - ap.add_argument( - "--case", - type=str, - default=None, - help="Single case: hf:16384 | infinilm_rec:32768:32", - ) - ap.add_argument( - "--append-jsonl", - type=str, - default=None, - help="Append one JSON line (--case mode only)", - ) - ap.add_argument( - "--from-jsonl", - type=str, - default=None, - help="Load rows from jsonl and print markdown table", - ) - ap.add_argument( - "--all-in-process", - action="store_true", - help="Run full matrix in one process (may OOM between cases)", - ) - args = ap.parse_args() - - model_path = os.environ.get( - "MODEL_PATH", 
"/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA" - ) - gpu_smi = int(os.environ.get("NVML_GPU_INDEX", os.environ.get("CUDA_VISIBLE_DEVICES", "0"))) - date = os.environ.get("METRICS_DATE", "2026-03-23") - decode_steps = int(os.environ.get("METRICS_DECODE_STEPS", "32")) - targets = [int(x) for x in os.environ.get("METRICS_TARGETS", "16384,32768,65536").split(",")] - - examples_dir = os.path.dirname(os.path.abspath(__file__)) - - if args.from_jsonl: - rows = [] - with open(args.from_jsonl) as f: - for line in f: - line = line.strip() - if line: - rows.append(json.loads(line)) - print_markdown_table(rows) - return - - if args.case: - row = run_single_case(args.case, model_path=model_path, gpu_smi=gpu_smi, date=date) - print(json.dumps(row, ensure_ascii=False)) - if args.append_jsonl: - ap = os.path.abspath(args.append_jsonl) - ad = os.path.dirname(ap) - if ad: - os.makedirs(ad, exist_ok=True) - with open(ap, "a") as f: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - return - - if not args.all_in_process: - print( - "Specify --case CASE, --from-jsonl FILE, or --all-in-process.\n" - "For OOM safety use: ./run_longtext_metrics_cases.sh", - file=sys.stderr, - ) - sys.exit(2) - - # Legacy: all targets × all backends in one process - rows: List[Dict[str, Any]] = [] - for t in targets: - row = run_single_case(f"hf:{t}", model_path=model_path, gpu_smi=gpu_smi, date=date) - rows.append(row) - for t in targets: - rows.append( - run_single_case(f"infinilm_rec:{t}:1", model_path=model_path, gpu_smi=gpu_smi, date=date) - ) - for t in targets: - rows.append( - run_single_case( - f"infinilm_rec:{t}:{decode_steps}", - model_path=model_path, - gpu_smi=gpu_smi, - date=date, - ) - ) - - out_path = os.path.join(examples_dir, "profiling_runs", "longtext_decode_metrics.json") - os.makedirs(os.path.dirname(out_path), exist_ok=True) - with open(out_path, "w") as f: - json.dump({"gpu_smi_index": gpu_smi, "decode_steps": decode_steps, "rows": rows}, f, indent=2) - print(f"Wrote 
{out_path}") - print_markdown_table(rows) - - -if __name__ == "__main__": - main() diff --git a/examples/compare_inference_speed.py b/examples/compare_inference_speed.py deleted file mode 100644 index 06fad9a7..00000000 --- a/examples/compare_inference_speed.py +++ /dev/null @@ -1,868 +0,0 @@ -#!/usr/bin/env python3 -""" -Compare MiniCPM-SALA inference speed across HF, InfiniLM, and (optionally) SGLang. - -Usage: - # HF + InfiniLM only (InfiniLM runs in subprocess with same env as jiuge): - python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA [--prompt "How are you"] [--max_new_tokens 32] - - # Include SGLang (server must already be running with MiniCPM-SALA): - python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --sglang_url http://127.0.0.1:30000 - - # Optional: write JSON - python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --output results.json - -Requires: transformers, torch; for InfiniLM subprocess: PYTHONPATH and LD_LIBRARY_PATH as in jiuge. -""" - -import argparse -import json -import os -import re -import subprocess -import sys -import time -from typing import Optional, Tuple, Literal - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - -def _build_chat_input_ids(tokenizer, prompt: str): - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False - ) - ids = tokenizer(text, add_special_tokens=True)["input_ids"] - return ids - - -def _make_prompt_with_target_tokens(tokenizer, base_prompt: str, target_input_tokens: int) -> Tuple[str, int]: - """ - Build a prompt (user content) such that the *chat-templated* input_ids length is >= target_input_tokens. - Returns (prompt, actual_input_tokens). - """ - if target_input_tokens <= 0: - raise ValueError("--target_input_tokens must be > 0") - - # Ensure boundaries don't merge tokens weirdly. 
- chunk = (base_prompt.strip() + "\n") if base_prompt.strip() else "hello\n" - - # Exponential growth to find an upper bound. - rep = 1 - while True: - prompt = chunk * rep - ids = _build_chat_input_ids(tokenizer, prompt) - if len(ids) >= target_input_tokens: - break - rep *= 2 - if rep > 1_000_000: - raise RuntimeError("Failed to build prompt to target length (rep too large)") - - # Binary search for smallest rep that reaches target. - lo, hi = 1, rep - best_prompt = prompt - best_len = len(ids) - while lo <= hi: - mid = (lo + hi) // 2 - p = chunk * mid - l = len(_build_chat_input_ids(tokenizer, p)) - if l >= target_input_tokens: - best_prompt, best_len = p, l - hi = mid - 1 - else: - lo = mid + 1 - - return best_prompt, best_len - - -def run_hf( - model_path: str, - prompt: str, - max_new_tokens: int, - device: str = "cuda", - *, - attn_implementation: Optional[str] = None, -): - """Run HuggingFace generate and return metrics.""" - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model_kwargs = { - "torch_dtype": "auto", - "trust_remote_code": True, - } - # Prefer flash-attn when available; fall back silently if not supported. - if attn_implementation is not None: - model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] - try: - model = AutoModelForCausalLM.from_pretrained( - model_path, - **model_kwargs, - ).to(device) - except TypeError: - # Older transformers versions may not support attn_implementation kwarg. 
- model_kwargs.pop("attn_implementation", None) - model = AutoModelForCausalLM.from_pretrained( - model_path, - **model_kwargs, - ).to(device) - model.eval() - - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False - ) - inputs = tokenizer(text, return_tensors="pt").to(device) - input_len = inputs.input_ids.shape[1] - - start = time.perf_counter() - with torch.inference_mode(): - out = model.generate( - **inputs, - max_new_tokens=max_new_tokens, - do_sample=False, - pad_token_id=tokenizer.eos_token_id or 0, - ) - elapsed = time.perf_counter() - start - output_len = out.shape[1] - input_len - - return { - "backend": "hf", - "total_time_ms": round(elapsed * 1000, 2), - "input_tokens": input_len, - "output_tokens": output_len, - "prefill_ttft_ms": None, # HF generate() doesn't expose TTFT without streaming - "decode_throughput_tok_s": round(output_len / elapsed, 2) if elapsed > 0 else None, - "total_throughput_tok_s": round((input_len + output_len) / elapsed, 2) if elapsed > 0 else None, - } - - -def run_hf_forward_prefill( - model_path: str, - prompt: str, - device: str = "cuda", - *, - attn_implementation: Optional[str] = None, - use_cache: bool = True, - warmup: int = 1, - iters: int = 1, -): - """ - Run HuggingFace *forward-only* prefill (no decode loop). - Intended for kernel-level profiling to isolate prefill work. 
- """ - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model_kwargs = { - "torch_dtype": "auto", - "trust_remote_code": True, - } - if attn_implementation is not None: - model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] - try: - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - except TypeError: - model_kwargs.pop("attn_implementation", None) - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - model.eval() - - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - inputs = tokenizer(text, return_tensors="pt").to(device) - input_len = inputs.input_ids.shape[1] - - # Warmup (reduces first-iter compilation / cache effects for profiling). - with torch.inference_mode(): - for _ in range(max(0, warmup)): - # Prefer last-token logits only (reduces memory at long context). - try: - _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) - except TypeError: - _ = model(**inputs, use_cache=use_cache) - torch.cuda.synchronize() - - # Timed iters. 
- times = [] - with torch.inference_mode(): - for _ in range(max(1, iters)): - torch.cuda.synchronize() - try: - torch.cuda.nvtx.range_push("hf_forward_prefill") - except Exception: - pass - start = time.perf_counter() - try: - _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) - except TypeError: - _ = model(**inputs, use_cache=use_cache) - torch.cuda.synchronize() - elapsed = time.perf_counter() - start - try: - torch.cuda.nvtx.range_pop() - except Exception: - pass - times.append(elapsed) - - best = min(times) if times else 0.0 - return { - "backend": "hf_forward_prefill", - "total_time_ms": round(best * 1000, 2), - "input_tokens": int(input_len), - "output_tokens": 0, - "use_cache": bool(use_cache), - "warmup": int(warmup), - "iters": int(iters), - "prefill_throughput_tok_s": round(input_len / best, 2) if best > 0 else None, - } - - -def run_hf_decode_loop( - model_path: str, - prompt: str, - max_new_tokens: int, - device: str = "cuda", - *, - attn_implementation: Optional[str] = None, - use_cache: bool = True, - warmup: int = 8, - iters: int = 1, -): - """ - Measure HF *decode-only* per-token latency using a manual loop with past_key_values. - - Protocol: - - Prefill once on the full prompt (not included in decode timing). - - Then decode `max_new_tokens` tokens with 1-token steps, timing the whole decode loop - (optionally best-of `iters`). 
- """ - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer - - if max_new_tokens <= 0: - raise ValueError("--max_new_tokens must be > 0 for hf decode_loop") - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model_kwargs = { - "torch_dtype": "auto", - "trust_remote_code": True, - } - if attn_implementation is not None: - model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] - try: - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - except TypeError: - model_kwargs.pop("attn_implementation", None) - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - model.eval() - - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - inputs = tokenizer(text, return_tensors="pt").to(device) - input_ids = inputs.input_ids - input_len = int(input_ids.shape[1]) - # Some decoder-only models require attention_mask even when no padding is used. - attention_mask = inputs.get("attention_mask", None) - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - attention_mask = attention_mask.to(device) - # Precompute full (input_len + max_new_tokens) causal attention mask for past-key decoding. - attention_mask_full = attention_mask.new_ones((attention_mask.shape[0], input_len + max_new_tokens)) - - # Prefill once to build cache. - with torch.inference_mode(): - try: - pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache, logits_to_keep=1) - except TypeError: - pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache) - past = getattr(pre, "past_key_values", None) - # Greedy next token from last logits. 
- logits = pre.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - - # Warmup decode steps (not timed) to reduce first-step effects. - with torch.inference_mode(): - for warm_i in range(max(0, warmup)): - try: - # Attention mask must cover (past + current token). - attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - logits_to_keep=1, - ) - except TypeError: - attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - ) - past = getattr(out, "past_key_values", past) - logits = out.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - torch.cuda.synchronize() - - # Timed decode loops (best-of iters). - # We report total_time_ms as end-to-end (prefill + decode), but keep - # decode_itl_ms / decode_throughput_tok_s based on decode-only time. - total_times = [] - decode_times = [] - with torch.inference_mode(): - for _ in range(max(1, iters)): - # Re-prefill to avoid measuring a "warmed" cache from prior iteration. - # Time prefill separately so decode_itl_ms stays decode-only. - torch.cuda.synchronize() - prefill_start = time.perf_counter() - try: - pre = model( - input_ids=input_ids, - attention_mask=attention_mask, - use_cache=use_cache, - logits_to_keep=1, - ) - except TypeError: - # Some model/transformers combinations may not accept attention_mask. 
- pre = model(input_ids=input_ids, use_cache=use_cache) - past = getattr(pre, "past_key_values", None) - logits = pre.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - - torch.cuda.synchronize() - prefill_elapsed = time.perf_counter() - prefill_start - - torch.cuda.synchronize() - start = time.perf_counter() # decode start - try: - torch.cuda.nvtx.range_push("hf_decode_loop") - except Exception: - pass - for t in range(max_new_tokens): - attn_mask_step = attention_mask_full[:, : input_len + t + 1] - try: - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - logits_to_keep=1, - ) - except TypeError: - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - ) - past = getattr(out, "past_key_values", past) - logits = out.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - torch.cuda.synchronize() - decode_elapsed = time.perf_counter() - start - total_elapsed = prefill_elapsed + decode_elapsed - try: - torch.cuda.nvtx.range_pop() - except Exception: - pass - total_times.append(total_elapsed) - decode_times.append(decode_elapsed) - - # Pick the iteration with the best end-to-end time; compute decode metrics - # from the corresponding decode-only time. 
- if total_times: - best_idx = min(range(len(total_times)), key=lambda i: total_times[i]) - best_total = total_times[best_idx] - best_decode = decode_times[best_idx] - else: - best_total = 0.0 - best_decode = 0.0 - - itl_ms = (best_decode * 1000.0 / max_new_tokens) if best_decode > 0 else None - thr = (max_new_tokens / best_decode) if best_decode > 0 else None - return { - "backend": "hf_decode_loop", - "total_time_ms": round(best_total * 1000, 2), - "input_tokens": int(input_len), - "output_tokens": int(max_new_tokens), - "decode_itl_ms": round(itl_ms, 4) if itl_ms is not None else None, - "decode_throughput_tok_s": round(thr, 2) if thr is not None else None, - "use_cache": bool(use_cache), - "warmup": int(warmup), - "iters": int(iters), - } - - -def run_infinilm_inprocess( - model_path: str, - prompt: str, - max_new_tokens: int, - *, - cache_mode: Literal["static_fit", "static_maxpos", "paged"] = "paged", - paged_block_size: int = 256, - attn_backend: str = "flash-attn", -): - """ - Run InfiniLM in-process (no 2048-token truncation). Parses InferEngine's timing prints. - This expects PYTHONPATH to include InfiniLM/InfiniCore python packages (container runner does this). - """ - import io - import torch - import contextlib - - import infinicore - from transformers import AutoTokenizer - - from infinilm.cache import PagedKVCacheConfig, StaticKVCacheConfig - from infinilm.distributed import DistConfig - from infinilm.infer_engine import GenerationConfig, InferEngine - from infinilm.modeling_utils import load_model_state_dict_by_file - - model_path = os.path.expanduser(model_path) - # Prefer flash-attn when available; fall back to default. - try: - model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - attention_backend=attn_backend, - ) - except TypeError: - # Older InferEngine builds may not accept attention_backend. 
- model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - ) - except Exception: - try: - model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - attention_backend="default", - ) - except TypeError: - model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - ) - load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - input_ids = _build_chat_input_ids(tokenizer, prompt) - input_ids_infini = infinicore.from_list([input_ids]) - - initial_capacity = len(input_ids) + max_new_tokens - if cache_mode == "paged": - num_blocks = (initial_capacity + (paged_block_size - 1)) // paged_block_size - cache_config = PagedKVCacheConfig( - num_blocks=num_blocks, - block_size=paged_block_size, - ) - else: - if cache_mode == "static_maxpos": - max_pos = getattr(model.config, "max_position_embeddings", 4096) - max_cache_len = max(initial_capacity, max_pos) - else: - # Fit cache to what we actually need for this run. - max_cache_len = initial_capacity - cache_config = StaticKVCacheConfig(max_batch_size=1, max_cache_len=max_cache_len) - # Basic GPU memory stats around cache construction (CUDA device assumed to be index 0). 
- mem_before_cache = torch.cuda.memory_allocated(0) - max_mem_before_cache = torch.cuda.max_memory_allocated(0) - - model.reset_cache(cache_config) - - mem_after_cache = torch.cuda.memory_allocated(0) - max_mem_after_cache = torch.cuda.max_memory_allocated(0) - - buf = io.StringIO() - start = time.perf_counter() - with contextlib.redirect_stdout(buf): - try: - torch.cuda.nvtx.range_push("infinilm_generate") - except Exception: - pass - try: - model.generate( - input_ids_infini, - GenerationConfig( - max_new_tokens=max_new_tokens, - temperature=1.0, - top_k=1, - top_p=1.0, - # Profiling: avoid per-step EOS checks + early stop variability. - stop_on_eos=False, - ), - _measure_and_log_time=True, - ) - finally: - try: - torch.cuda.nvtx.range_pop() - except Exception: - pass - elapsed = time.perf_counter() - start - stdout = buf.getvalue() - - prefill_ttft_ms = None - prefill_throughput = None - decode_itl_ms = None - decode_throughput = None - gen_completed_ms = None - for line in stdout.splitlines(): - if "Prefill TTFT:" in line: - m = re.search( - r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line - ) - if m: - prefill_ttft_ms = float(m.group(1)) - prefill_throughput = float(m.group(2)) - if "Decode" in line and "ITL:" in line: - m = re.search( - r"Decode\s+Avg ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line - ) - if m: - decode_itl_ms = float(m.group(1)) - decode_throughput = float(m.group(2)) - if "Generation completed in" in line: - m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) - if m: - gen_completed_ms = float(m.group(1)) - - return { - "backend": "infinilm", - "total_time_ms": round(elapsed * 1000, 2), - "input_tokens": len(input_ids), - "output_tokens": max_new_tokens, - "prefill_ttft_ms": prefill_ttft_ms, - "prefill_throughput_tok_s": prefill_throughput, - "decode_itl_ms": decode_itl_ms, - "decode_throughput_tok_s": decode_throughput, - "engine_reported_generation_ms": gen_completed_ms, - # Cache / attention 
configuration - "cache_mode": cache_mode, - "paged_block_size": paged_block_size if cache_mode == "paged" else None, - "enable_paged_attn": getattr(model, "enable_paged_attn", False), - "static_max_cache_len": max_cache_len if cache_mode != "paged" else None, - "paged_num_blocks": num_blocks if cache_mode == "paged" else None, - # Torch CUDA memory snapshots (bytes) - "torch_memory_allocated_before_cache": int(mem_before_cache), - "torch_memory_allocated_after_cache": int(mem_after_cache), - "torch_max_memory_allocated_before_cache": int(max_mem_before_cache), - "torch_max_memory_allocated_after_cache": int(max_mem_after_cache), - } - - -def run_infinilm(model_path: str, prompt: str, max_new_tokens: int, env=None): - """Run InfiniLM jiuge via subprocess and parse stdout for metrics.""" - run_env = {**os.environ, **(env or {})} - examples_dir = os.path.dirname(os.path.abspath(__file__)) - jiuge_py = os.path.join(examples_dir, "jiuge.py") - cmd = [ - sys.executable, - jiuge_py, - "--nvidia", - "--model_path", model_path, - "--prompt", prompt, - "--max_new_tokens", str(max_new_tokens), - ] - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300, - env=run_env, - cwd=examples_dir, - ) - stdout = result.stdout or "" - if result.returncode != 0 and not stdout: - return {"backend": "infinilm", "error": (result.stderr or f"exit code {result.returncode}")[:500]} - except Exception as e: - return {"backend": "infinilm", "error": str(e)} - - # Parse jiuge / InferEngine output - prefill_ttft_ms = None - prefill_throughput = None - decode_itl_ms = None - decode_throughput = None - total_time_ms = None - for line in stdout.splitlines(): - if "Prefill TTFT:" in line: - m = re.search(r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) - if m: - prefill_ttft_ms = float(m.group(1)) - prefill_throughput = float(m.group(2)) - if "Decode" in line and "ITL:" in line: - m = re.search(r"Decode\s+Avg 
ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) - if m: - decode_itl_ms = float(m.group(1)) - decode_throughput = float(m.group(2)) - if "total_time:" in line: - m = re.search(r"total_time:\s*([\d.]+)\s*ms", line) - if m: - total_time_ms = float(m.group(1)) - if "Generation completed in" in line: - m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) - if m: - total_time_ms = float(m.group(1)) - - return { - "backend": "infinilm", - "total_time_ms": total_time_ms, - "prefill_ttft_ms": prefill_ttft_ms, - "prefill_throughput_tok_s": prefill_throughput, - "decode_itl_ms": decode_itl_ms, - "decode_throughput_tok_s": decode_throughput, - } - - -def run_sglang_client(sglang_url: str, prompt: str, max_new_tokens: int): - """Send one request to SGLang server and return metrics.""" - try: - import requests - except ImportError: - return {"backend": "sglang", "error": "requests not installed"} - - url = sglang_url.rstrip("/") + "/generate" - payload = { - "text": prompt, - "sampling_params": {"max_new_tokens": max_new_tokens, "temperature": 0}, - } - start = time.perf_counter() - try: - r = requests.post(url, json=payload, timeout=120) - r.raise_for_status() - data = r.json() - except Exception as e: - return {"backend": "sglang", "error": str(e)} - elapsed_ms = (time.perf_counter() - start) * 1000 - - # SGLang response may have "meta_info" with "completion_tokens" or we use prompt + output length - output_text = (data.get("text") or data.get("choices", [{}])[0].get("text") or "") - completion_tokens = data.get("meta_info", {}).get("completion_tokens") or data.get("usage", {}).get("completion_tokens") - if completion_tokens is None and "usage" in data: - completion_tokens = data["usage"].get("completion_tokens") - if completion_tokens is None: - completion_tokens = max_new_tokens # fallback - - return { - "backend": "sglang", - "total_time_ms": round(elapsed_ms, 2), - "output_tokens": completion_tokens, - "total_throughput_tok_s": 
round(completion_tokens / (elapsed_ms / 1000), 2) if elapsed_ms > 0 else None, - } - - -def main(): - parser = argparse.ArgumentParser(description="Compare MiniCPM-SALA inference speed: HF, InfiniLM, SGLang") - parser.add_argument("--model_path", required=True, help="Path to MiniCPM-SALA model dir") - parser.add_argument("--prompt", default="How are you", help="Prompt for generation") - parser.add_argument("--max_new_tokens", type=int, default=32, help="Max new tokens to generate") - parser.add_argument( - "--target_input_tokens", - type=int, - default=None, - help="If set, synthesize a long prompt so chat-templated input tokens >= this value (e.g. 65536).", - ) - parser.add_argument( - "--infinilm_cache_mode", - type=str, - default="paged", - choices=["paged", "static_fit", "static_maxpos"], - help="InfiniLM KV cache mode when running long prompts in-process.", - ) - parser.add_argument( - "--infinilm_paged_block_size", - type=int, - default=256, - help="Paged KV block size (tokens per block).", - ) - parser.add_argument( - "--infinilm_attn_backend", - type=str, - default="flash-attn", - help="InfiniLM attention backend (e.g. flash-attn or default).", - ) - parser.add_argument( - "--hf_attn_implementation", - type=str, - default="flash_attention_2", - help="HF attention implementation to request (e.g. 
flash_attention_2 or eager).", - ) - parser.add_argument( - "--hf_mode", - type=str, - default="generate", - choices=["generate", "forward_prefill", "decode_loop"], - help="HF run mode: generate() end-to-end, forward-only prefill, or manual decode_loop timing with KV cache.", - ) - parser.add_argument( - "--hf_forward_use_cache", - action="store_true", - help="In HF forward_prefill mode, pass use_cache=True (recommended).", - ) - parser.add_argument( - "--hf_forward_warmup", - type=int, - default=1, - help="Warmup iterations for HF forward_prefill.", - ) - parser.add_argument( - "--hf_forward_iters", - type=int, - default=1, - help="Measured iterations for HF forward_prefill (best-of).", - ) - parser.add_argument( - "--hf_decode_warmup", - type=int, - default=8, - help="Warmup steps for HF decode_loop (not timed).", - ) - parser.add_argument( - "--hf_decode_iters", - type=int, - default=1, - help="Measured iterations for HF decode_loop (best-of).", - ) - parser.add_argument("--sglang_url", default=None, help="SGLang server URL (e.g. http://127.0.0.1:30000); if set, query SGLang") - parser.add_argument("--backends", default="hf,infinilm", help="Comma-separated: hf,infinilm,sglang") - parser.add_argument("--output", default=None, help="Write JSON results to this path") - parser.add_argument("--no_hf", action="store_true", help="Skip HF (e.g. if no GPU memory for two models)") - parser.add_argument("--no_infinilm", action="store_true", help="Skip InfiniLM") - parser.add_argument( - "--prefill_16k", - action="store_true", - help="Convenience flag: set --target_input_tokens=16384 and --max_new_tokens=1 (prefill-dominated).", - ) - parser.add_argument( - "--infinilm_inprocess", - action="store_true", - help="Run InfiniLM in-process (no jiuge subprocess). 
Use when PYTHONPATH/LD_LIBRARY_PATH are set in this process.", - ) - args = parser.parse_args() - - backends = [b.strip() for b in args.backends.split(",")] - results = [] - - # Normalize convenience prefill-only configuration. - if args.prefill_16k: - if args.target_input_tokens is None: - args.target_input_tokens = 16384 - # For prefill-dominated comparisons, prefer HF forward-only by default. - if args.hf_mode == "generate": - args.hf_mode = "forward_prefill" - if args.max_new_tokens != 1: - args.max_new_tokens = 1 - - # If requested, build a long prompt once using HF tokenizer. - if args.target_input_tokens is not None: - try: - from transformers import AutoTokenizer - - tok = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) - long_prompt, actual = _make_prompt_with_target_tokens(tok, args.prompt, args.target_input_tokens) - args.prompt = long_prompt - print(f"[prompt] synthesized chat input tokens: {actual} (target >= {args.target_input_tokens})") - except Exception as e: - print(f"[prompt] failed to synthesize long prompt: {e}") - - if "hf" in backends and not args.no_hf: - try: - import torch - if args.hf_mode == "forward_prefill": - r = run_hf_forward_prefill( - args.model_path, - args.prompt, - attn_implementation=args.hf_attn_implementation, - use_cache=args.hf_forward_use_cache, - warmup=args.hf_forward_warmup, - iters=args.hf_forward_iters, - ) - elif args.hf_mode == "decode_loop": - r = run_hf_decode_loop( - args.model_path, - args.prompt, - args.max_new_tokens, - attn_implementation=args.hf_attn_implementation, - use_cache=True, - warmup=args.hf_decode_warmup, - iters=args.hf_decode_iters, - ) - else: - r = run_hf( - args.model_path, - args.prompt, - args.max_new_tokens, - attn_implementation=args.hf_attn_implementation, - ) - results.append(r) - except Exception as e: - results.append({"backend": "hf", "error": str(e)}) - - if "infinilm" in backends and not args.no_infinilm: - # In-process: when env is set in this process or 
--infinilm_inprocess, avoid jiuge subprocess. - # Also use in-process for long prompts (target_input_tokens) to avoid 2048-token truncation. - use_inprocess = args.infinilm_inprocess or args.target_input_tokens is not None - if use_inprocess: - try: - r = run_infinilm_inprocess( - args.model_path, - args.prompt, - args.max_new_tokens, - cache_mode=args.infinilm_cache_mode, # type: ignore[arg-type] - paged_block_size=args.infinilm_paged_block_size, - attn_backend=args.infinilm_attn_backend, - ) - except Exception as e: - r = {"backend": "infinilm", "error": str(e)} - else: - r = run_infinilm(args.model_path, args.prompt, args.max_new_tokens) - results.append(r) - - if "sglang" in backends and args.sglang_url: - r = run_sglang_client(args.sglang_url, args.prompt, args.max_new_tokens) - results.append(r) - elif "sglang" in backends and not args.sglang_url: - results.append({"backend": "sglang", "error": "No --sglang_url provided; start SGLang server with MiniCPM-SALA first"}) - - # Print table - print("\n" + "=" * 60) - print("MiniCPM-SALA inference speed comparison") - print("=" * 60) - print(f" prompt = {repr(args.prompt[:500])} max_new_tokens = {args.max_new_tokens}") - print() - for r in results: - if "error" in r: - print(f" {r['backend']}: ERROR {r['error']}") - continue - print(f" {r['backend']}:") - for k, v in r.items(): - if k == "backend" or v is None: - continue - if isinstance(v, float): - print(f" {k}: {v}") - else: - print(f" {k}: {v}") - print() - print("=" * 60) - - if args.output: - with open(args.output, "w") as f: - json.dump({"prompt": args.prompt, "max_new_tokens": args.max_new_tokens, "results": results}, f, indent=2) - print(f"Wrote {args.output}") - - -if __name__ == "__main__": - import os - main() diff --git a/examples/metrics_16k_prefill.md b/examples/metrics_16k_prefill.md deleted file mode 100644 index 2337fac0..00000000 --- a/examples/metrics_16k_prefill.md +++ /dev/null @@ -1,152 +0,0 @@ -### MiniCPM-SALA 16k long-prompt metrics (A/B 
cache modes) - -**Setup** - -- **Prompt construction**: `--target_input_tokens 16384` (actual synthesized **16386** chat-template tokens) -- **Workload**: `--max_new_tokens 1` (prefill-dominated) -- **Environment**: run via `scripts/run_compare_speed_in_container.sh` inside container `minicpm-sala` - -| backend | cache_mode | attn_backend | enable_paged_attn | cache sizing | prefill_ttft_ms | prefill_throughput_tok_s | total_time_ms | -|---|---|---|---:|---|---:|---:|---:| -| hf | — | — | — | — | — | 9325.01 | 1757.21 | -| infinilm | static_fit | default | False | static_max_cache_len=16387 | 33632.05 | 487.21 | 33632.29 | -| infinilm | static_maxpos | default | False | static_max_cache_len=524288 | 34067.49 | 480.99 | 34067.75 | -| infinilm | paged | default | True | paged_block_size=256, paged_num_blocks=65 | 35626.25 | 459.94 | 35627.10 | - -**Raw commands** - -```bash -./scripts/run_compare_speed_in_container.sh --backends hf --target_input_tokens 16384 --max_new_tokens 1 -./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_fit -./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_maxpos -./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode paged --infinilm_paged_block_size 256 -``` - -### Profiling methodology (nsys) for kernel attribution (HF vs InfiniLM prefill) - -**Goal**: attribute the 16k prefill gap to kernel families (attention vs GEMMs vs layout/copies/sync), using the same prompt and a prefill-dominated workload. - -**Environment**: all profiling commands in this section are run **inside the container `minicpm-sala`** (not on the host), so that PyTorch, InfiniCore, and the model path are available. 
Use `docker exec -it minicpm-sala bash` or the host script `./scripts/profile_prefill_torchprof_in_container.sh` to run in-container. - -**Workload** - -- HF: forward-only prefill (`--hf_mode forward_prefill`, `--max_new_tokens 1`) -- InfiniLM: prefill-dominated generation (`--target_input_tokens 16384 --max_new_tokens 1`) - -**Key requirements** - -- Use a free GPU to avoid allocator failures and noisy traces, e.g. `CUDA_VISIBLE_DEVICES=1`. -- Prefer `nsys stats` reports: - - `cuda_gpu_kern_sum` - - `cuda_gpu_mem_time_sum` - - `cuda_api_sum` - - `nvtx_sum` - -**Example (inside container `minicpm-sala`)** - -```bash -export CUDA_VISIBLE_DEVICES=1 -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -OUT=${REPO}/profiles -mkdir -p ${OUT} - -source /app/docker/nvidia/env-set.sh 2>/dev/null || true -export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} - -# HF forward-only prefill (single forward, best for kernel attribution) -nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ - -o ${OUT}/hf_forward_prefill_16k \ - python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ - --model_path "${MODEL}" --prefill_16k --backends hf \ - --hf_mode forward_prefill --hf_forward_use_cache \ - --hf_forward_warmup 1 --hf_forward_iters 1 \ - --hf_attn_implementation flash_attention_2 - -# InfiniLM prefill-dominated (max_new_tokens=1) -nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ - -o ${OUT}/infinilm_prefill_16k \ - python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ - --model_path "${MODEL}" --prefill_16k --backends infinilm \ - --infinilm_cache_mode static_fit --infinilm_attn_backend default - -# Summaries -nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_kern_sum.txt -nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > 
${OUT}/infinilm_prefill_16k_cuda_gpu_kern_sum.txt -nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_mem_time_sum.txt -nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_gpu_mem_time_sum.txt -nsys stats --report cuda_api_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_api_sum.txt -nsys stats --report cuda_api_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_api_sum.txt -nsys stats --report nvtx_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_nvtx_sum.txt -nsys stats --report nvtx_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_nvtx_sum.txt -``` - -### Prefill kernel launch reduction: SiLU/SwiGLU evidence and change - -**Evidence that SiLU/SwiGLU contributed to launch count** - -- Prefill profiling (e.g. `profile_prefill_infinilm_torchprof.py` at seq_len=512) showed ~298k `cudaLaunchKernel` and many small **elementwise** kernels (~36k calls). The MLP path used two separate InfiniCore ops per layer for SwiGLU: - - `infinicore::op::silu_(gate, gate)` — one kernel per layer - - `infinicore::op::mul(gate, up)` — one kernel per layer -- With 32 layers that is **64 extra launches** from this pattern alone. InfiniCore provides a **fused** `swiglu(a, b)` (single kernel: `a * b * sigmoid(b)`), which matches SwiGLU as `silu(gate)*up` when called as `swiglu(up, gate)`. - -**Change applied** - -- **File**: `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp` -- **Before**: `silu_(gate, gate)` then `mul(gate, up)` (two kernel launches per layer). -- **After**: single `infinicore::op::swiglu(up, gate)` (one kernel per layer). -- **Effect**: 32 fewer kernel launches per prefill (one per layer). 
Re-run the same prefill profiler or nsys commands above and compare `cuda_api_sum` (e.g. `cudaLaunchKernel` count) and `cuda_gpu_kern_sum` to confirm. - -### Environment fix: run InfiniLM/InfiniCore with InfLLM-v2 without LD_PRELOAD (nsys-safe) - -When profiling with `nsys`, setting `LD_PRELOAD` to the `infllm_v2` extension can break `nsys` itself (loader errors from PyTorch's `libtorch_python.so`). To make `nsys profile ... python ...` work reliably, we preload the InfLLM-v2 `.so` **inside Python** (RTLD_GLOBAL) before importing `infinicore`, so that `libinfinicore_cpp_api.so` can resolve `mha_varlen_fwd` / `mha_fwd_kvcache` without using `LD_PRELOAD`. - -- **Note**: InfLLM-v2 is now linked normally via InfiniCore build; no Python-side preload helper is required. -- **Wired into scripts** (preload before `import infinicore`): - - `InfiniLM/examples/compare_inference_speed.py` - - `InfiniLM/examples/profile_prefill_infinilm_torchprof.py` - - `InfiniLM/examples/minicpm_sala_logits_sanity.py` - -This unblocks running both torchprof and `nsys profile` inside the `minicpm-sala` container with a consistent environment. - -### 16k prefill nsys numbers (post env-fix) - -**Workload:** `--prefill_16k` (prompt tokens 16386), `--max_new_tokens 1`, `--infinilm_cache_mode static_fit`, `--infinilm_attn_backend default` - -- **HF forward-only prefill** (from `compare_inference_speed.py`): `total_time_ms ≈ 1782.58` for 16386 tokens. -- **HF forward-only prefill (rerun)** (from `compare_inference_speed.py`): `total_time_ms = 1757.21`, `prefill_throughput_tok_s = 9325.01` for 16386 tokens. -- **InfiniLM prefill-dominated** (from `compare_inference_speed.py`): `prefill_ttft_ms ≈ 55646.11` (baseline run) and `prefill_ttft_ms ≈ 57623.64` (rerun after minor code changes). 
- -**InfiniLM 16k CUDA API summary** (nsys `cuda_api_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_api_sum.txt`): - -- `cudaLaunchKernel`: **3,147,266 calls** -- `cudaMemcpyAsync`: **394,155 calls** - -Top GPU kernels by time (nsys `cuda_gpu_kern_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_gpu_kern_sum.txt`) show very high call counts tied to the Lightning Simple GLA path: - -- Several `at::native::*elementwise_kernel*` entries at **393,264 instances each** (exactly `16386 * 24`), indicating a large per-token kernel launch budget in the current GLA implementation. - -**Prefill profiling: run inside container `minicpm-sala`** - -All profiling commands below are intended to run **inside the container** (so PyTorch, InfiniCore, and the model are available). From the host you can either `docker exec -it minicpm-sala bash` and run the commands, or use the helper script that runs the torchprof prefill script in-container. - -- **Launch-count confirmation (torchprof, in-container)** - - From repo root on host: - - ```bash - ./scripts/profile_prefill_torchprof_in_container.sh - ``` - - Optional env: `SEQ_LEN=512` (default), `ACTIVE=1`, `MODEL_PATH`, `CUDA_VISIBLE_DEVICES`, `INFINILM_CUDA_INDEX`. The script prints `[launch_summary] cudaLaunchKernel_count=... cudaMemcpy_count=...` and the kernel table; compare after the SwiGLU fusion to confirm ~32 fewer launches per prefill. 
- - Or inside the container: - - ```bash - source /app/docker/nvidia/env-set.sh 2>/dev/null || true - export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} - cd ${REPO}/InfiniLM - INFINILM_CUDA_INDEX=0 python3 examples/profile_prefill_infinilm_torchprof.py --model_path "${MODEL}" --seq_len 512 --active 1 --out /tmp/torchprof_prefill_512.txt - ``` - -- **nsys prefill profiling** (see “Example (inside container minicpm-sala)” above) also runs in-container; use the same `REPO`, `MODEL`, `source env-set.sh`, and `PYTHONPATH` before `nsys profile` and `nsys stats`. diff --git a/examples/metrics_longtext_mem.md b/examples/metrics_longtext_mem.md deleted file mode 100644 index 28fe8f33..00000000 --- a/examples/metrics_longtext_mem.md +++ /dev/null @@ -1,378 +0,0 @@ -### MiniCPM-SALA long-context metrics + memory history - -**Goal**: record reproducible long-context runs with: - -- **time** (prefill TTFT / throughput) -- **peak GPU memory** (from 1s `nvidia-smi` polling) -- exact **command lines** and key env - -**Notes** - -- All commands are intended to run **inside** docker container `minicpm-sala`. -- Prefer an **idle** GPU (avoid indices that are already near full VRAM). Scan on the host (or `docker exec minicpm-sala nvidia-smi ...` if all GPUs are visible there): - `nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits` - Then set `export CUDA_VISIBLE_DEVICES=` for the run and, for scripts that poll VRAM (e.g. `collect_metrics_longtext_decode.py`), set `NVML_GPU_INDEX=` to the **same** index. Example when GPUs 2–4 are mostly free: `CUDA_VISIBLE_DEVICES=2` and `NVML_GPU_INDEX=2`. -- For InfiniLM + InfLLM-v2 builds, `libinfinicore_cpp_api.so` may require preloading `infllm_v2` with `RTLD_GLOBAL` before importing `infinicore`. - -### OOM-safe sweep: one case per process - -Running every long-context case in a **single** Python session can leave CUDA memory fragmented or peak across cases. 
Prefer `**run_longtext_metrics_cases.sh`**, which runs each `(backend × target × max_new)` as its **own** `python3 collect_metrics_longtext_decode.py --case ...` subprocess, appends one JSON line per row to `profiling_runs/longtext_decode_rows.jsonl`, then prints a markdown table via `--from-jsonl`. - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -export CUDA_VISIBLE_DEVICES=2 -export NVML_GPU_INDEX=2 -export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -export METRICS_DATE=2026-03-23 -cd $REPO/InfiniLM/examples -./run_longtext_metrics_cases.sh -``` - -Single case manually: `python3 collect_metrics_longtext_decode.py --case hf:16384` or `infinilm_rec:65536:1`. Monolithic (unsafe) full matrix: `python3 collect_metrics_longtext_decode.py --all-in-process`. - -### Clean & Validate Status (post-cleanup: 2026-03-23) - -- Clean: removed unused debug helper `log_tensor_stats_to_file_if_enabled` and deprecated metrics padded-decode cases; `collect_metrics_longtext_decode.py` + `run_longtext_metrics_cases.sh` no longer sweep `infinilm_pad:*`. -- Validate: rebuilt `_infinilm`, ran `InfiniCore/test/infinicore/ops/test_simple_gla_decode_recurrent.py --nvidia`, `test_simple_gla_prefill.py --nvidia`, and `InfiniLM/examples/minicpm_sala_logits_sanity.py` in `prefill` + `decode1`; confirmed `collect_metrics_longtext_decode.py --case infinilm_pad:*` is rejected with an "Unknown case kind" error. - ---- - -## 2026-03-23 long-context + decode (`longtext_decode_rows.jsonl`) - -Subprocess sweep via `./run_longtext_metrics_cases.sh`. **GPU:** `CUDA_VISIBLE_DEVICES=0`, `NVML_GPU_INDEX=0`. **Targets:** `METRICS_TARGETS=16384,32768`. **Decode steps:** `METRICS_DECODE_STEPS=32`. Recurrent InfiniLM uses `INFINI_LIGHTNING_GLA_RECURRENT_DECODE=1` with batched GLA state sync (GEMM). HF `total_ms` is end-to-end (prefill + decode), matching InfiniLM. 
`hf (decode_loop)` rows for `max_new=32` are appended via `hf::32`. Regenerate this table: `python3 collect_metrics_longtext_decode.py --from-jsonl profiling_runs/longtext_decode_rows.jsonl` - - -| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | prefill_tok_s | decode_itl_ms | decode_tok_s | gpu | -| ---------- | ------------------------------------------------ | --------- | ------- | ------------ | -------- | --------------- | ------------- | ------------- | ------------ | --- | -| 2026-03-23 | hf (decode_loop) | 16384 | 1 | 38101 | 1821.53 | — | — | — | — | 0 | -| 2026-03-23 | hf (decode_loop) | 32768 | 1 | 51545 | 3711.99 | — | — | — | — | 0 | -| 2026-03-23 | hf (decode_loop) | 16384 | 32 | 38365 | 3435.09 | — | — | 52.24 | 19.14 | 0 | -| 2026-03-23 | hf (decode_loop) | 32768 | 32 | 41717 | 5247.77 | — | — | 52.90 | 18.90 | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 16384 | 1 | 33525 | 3162.11 | 3161.5 | 5182.98 | — | — | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 32768 | 1 | 44897 | 7139.12 | 7138.74 | 4590.45 | — | — | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 16384 | 32 | 33537 | 4111.32 | 3182.07 | 5149.48 | 29.94 | 33.4 | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 32768 | 32 | 44911 | 8357.39 | 7146.78 | 4585.28 | 39 | 25.64 | 0 | - - ---- - -## History table - - -| date | backend | target_input_tokens | max_new_tokens | cache_mode | peak_mem_mib | total_time_ms | prefill_ttft_ms | prefill_throughput_tok_s | gpu | -| ---------- | --------------------------------------------- | ------------------- | -------------- | ---------- | ------------ | ------------- | --------------- | ------------------------ | --- | -| 2026-03-18 | hf | 16384 | 1 | — | 38091 | 1757.21 | — | 9325.01 | 2 | -| 2026-03-19 | hf | 16384 | 1 | — | 38091 | 1760.08 | — | 9311.48 | 2 | -| 2026-03-18 | hf | 32768 | 1 | — | 41173 | 3537.65 | — | 9263.22 | 2 
| -| 2026-03-19 | hf | 32768 | 1 | — | 41151 | 3516.06 | — | 9319.51 | 2 | -| 2026-03-19 | infinilm(baseline) | 16384 | 1 | static_fit | 33570 | 2849.22 | 2849.03 | 5751.44 | 0 | -| 2026-03-19 | infinilm(baseline) | 32768 | 1 | static_fit | 44174 | 5960.41 | 5960.14 | 5498.19 | 0 | -| 2026-03-19 | infinilm(baseline) | 65536 | 1 | static_fit | 67195 | 13929.51 | 13929.12 | 4705.11 | 4 | -| 2026-03-19 | hf (consistent-batch) | 16384 | 1 | — | 38091 | 1782.63 | — | 9192.04 | 4 | -| 2026-03-19 | hf (consistent-batch) | 32768 | 1 | — | 41173 | 3585.96 | — | 9138.42 | 4 | -| 2026-03-19 | hf (consistent-batch) | 65536 | 1 | — | 47319 | 7426.98 | — | 8824.32 | 4 | -| 2026-03-19 | infinilm (consistent-batch) | 16384 | 1 | static_fit | 32605 | 2887.28 | 2887.06 | 5675.67 | 4 | -| 2026-03-19 | infinilm (consistent-batch) | 32768 | 1 | static_fit | 43209 | 6005.78 | 6005.57 | 5456.60 | 4 | -| 2026-03-19 | infinilm (consistent-batch) | 65536 | 1 | static_fit | 67195 | 13940.17 | 13939.90 | 4701.47 | 4 | -| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 32768 | 1 | static_fit | 38613 | 5993.70 | 5993.45 | 5467.64 | 4 | -| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 65536 | 1 | static_fit | 67195 | 13959.08 | 13958.78 | 4695.11 | 4 | -| 2026-03-19 | infinilm(baseline) | 131072 | 1 | static_fit | 79883 | OOM | — | — | 6 | -| 2026-03-18 | hf | 524288 | 1 | — | 59591 | OOM | — | — | 3 | -| 2026-03-18 | hf | 65536 | 1 | — | 47319 | 7340.99 | — | 8927.67 | 1 | -| 2026-03-18 | hf | 131072 | 1 | — | 61641 | 15290.39 | — | 8572.31 | 1 | -| 2026-03-18 | hf | 262144 | 1 | — | 80059 | OOM | — | — | 1 | - - ---- - -## 2026-03-19 consistent batch summary (GPU 4, 1s polling) - -Protocol used for both backends: - -- same physical GPU (`CUDA_VISIBLE_DEVICES=4`), same model, `max_new_tokens=1` -- same target lengths: 16k / 32k / 64k -- memory measured from 1s `nvidia-smi -i 4 --query-gpu=memory.used` polling -- HF path: `--hf_mode forward_prefill 
--hf_forward_use_cache --hf_forward_warmup 1 --hf_forward_iters 1` -- InfiniLM path: `--infinilm_inprocess --infinilm_cache_mode static_fit` - -### Growth deltas (16k->32k and 32k->64k) - -TTFT note: HF forward-prefill does not emit TTFT; `total_time_ms` is used as prefill-time proxy for HF deltas. - - -| backend | 16k->32k mem delta (MiB) | 32k->64k mem delta (MiB) | 16k->32k time delta (ms) | 32k->64k time delta (ms) | -| --------------------- | ------------------------ | ------------------------ | ------------------------ | ------------------------ | -| hf (forward-prefill) | +3082 | +6146 | +1803.33 | +3841.02 | -| infinilm (static_fit) | +10604 | +23986 | +3118.51 (TTFT) | +7934.33 (TTFT) | - - -### Attribution profiling (InfiniLM 32k / 64k) - -Artifacts are saved in `InfiniLM/examples/profiling_runs`: - -- allocator logs: `alloc_infinilm_32768_gpu4.log`, `alloc_infinilm_65536_gpu4.log` -- nsys logs: `nsys_infinilm_32768_gpu4.log`, `nsys_infinilm_65536_gpu4.log` - -Allocator observations (`INFINICORE_DEBUG_ALLOC=1`): - -- both runs show identical small/medium allocation patterns (e.g., many `32 MiB` and `128 MiB` class allocations), suggesting these are mostly fixed/runtime-structural. -- 64k introduces substantially larger "large" allocations than 32k (examples in logs include `12.0 GiB`, `9.0 GiB`, and `2.0 GiB`-class requests), consistent with context-length-driven persistent KV slab growth. -- 32k large allocations are present but markedly smaller (e.g., `~6.0 GiB`, `~4.5 GiB`, `~1.0 GiB`), aligning with lower persistent cache footprint. - -Nsight Systems observations (`nsys profile --trace=cuda,nvtx,osrt --stats=true`): - -- NVTX `infinilm_generate` range scales from `~6.18s` (32k) to `~14.17s` (64k), matching TTFT growth. 
-- CUDA API summary becomes more memcpy-dominated at 64k: - - 32k: `cudaMemcpy ~64.6%`, `cudaMemcpyAsync ~33.0%` - - 64k: `cudaMemcpy ~83.0%`, `cudaMemcpyAsync ~15.7%` -- GPU kernel summary shows both attention and GLA prefill kernels scaling up: - - `flash_fwd_kernel` total: `~1.03s` -> `~4.09s` - - `simple_gla_prefill_chunked_kernel` total: `~1.24s` -> `~2.45s` - -Attribution confidence: - -- **High**: persistent KV/cache-related allocations are the primary memory-growth driver from 32k to 64k. -- **Medium**: transient prefill compute/workspace growth contributes, but is secondary vs persistent slabs for memory. -- **Medium**: synchronization/memcpy behavior is a major TTFT growth contributor at 64k. - -### Short-context decode profiling (Nsight Systems, vs HF) - -**Artifacts** (under `InfiniLM/examples/profiling_runs/`): - -- HF manual decode: `nsys_decode_hf_tok256_gpu4.log` (`--hf_mode decode_loop`, short prompt, `max_new_tokens=256`). -- InfiniLM generate: `nsys_decode_infinilm_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_opt_tok256_gpu4.log` (same prompt / 256 new tokens; NVTX ranges from `infer_engine.generate`). -- Post–`write_i32`/`write_i64` rebuild (2026-03-20, GPU 4): `nsys_decode_infinilm_tok256_gpu4_pybind_run.log` (failed: stale `_infinicore` without `write_i32`), `nsys_decode_infinilm_tok256_gpu4_pybind_run2.log` + `decode_infinilm_tok256_gpu4_pybind_run2.nsys-rep` (**good** after `install.py` + `xmake build/install _infinicore` in container). Script `compare_inference_speed.py` preloads InfLLM-v2 (`RTLD_GLOBAL`) so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd`; bare `python -c import infinicore` without that preload can show an undefined-symbol error. - -**NVTX (InfiniLM)** — use these ranges in the Nsight UI / `nsys stats` to isolate prefill vs steady decode: - -- `infinilm_prefill_step` — first `generate` iteration. -- `infinilm_decode_total` — spans decode iterations 1..N-1 (opened on iter 1). 
-- `infinilm_decode_step` — one range per token step (high instance count). -- `infinilm_generate` — full `engine.generate()` call. - -**HF**: `hf_decode_loop` wraps the timed decode loop (prefill is outside this range). - -**Headline comparison** (same GPU, 256 decode steps, short prompt; numbers from the logs above): - - -| Metric (CUDA API sum) | HF `decode_loop` | InfiniLM `generate` | -| ------------------------ | ------------------- | ------------------- | -| `cudaLaunchKernel` calls | ~593k | ~7.44M | -| ~calls / decode step | ~2.3k | ~29k | -| `cudaMemcpyAsync` calls | lower than InfiniLM | ~988k | - - -**Memcpy time** (`cuda_gpu_mem_time_sum`): InfiniLM decode shows large **H2D** wall share (~63% of memcpy time in one run) with **many** small transfers; HF decode shows **fewer** H2D operations but they can dominate memcpy time when they occur. - -**Interpretation**: InfiniLM short decode is limited less by a single kernel and more by **per-step framework overhead** (launch count + small copies). Next wins are structural (fewer launches per token, true decode KV path, graph/capture where safe), not scalar metadata alone. - -**Continuing profiling — repro commands** (inside `minicpm-sala`, pick idle `GPU`; outputs go to `profiling_runs/`): - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -GPU=4 -export CUDA_VISIBLE_DEVICES=$GPU -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniLM/examples - -TAG=decode_infinilm_tok256_gpu${GPU} -nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ - python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --prompt "Write a short haiku about GPUs." 
\ - --max_new_tokens 256 \ - --backends infinilm \ - --no_hf \ - --infinilm_inprocess \ - --infinilm_cache_mode static_fit \ - 2>&1 | tee profiling_runs/nsys_${TAG}.log - -# Optional (InfiniLM decode): reduce D2H / Python overhead and A/B CPU metadata tensors -# export INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE=1 -# export INFINI_PROFILE_COLLECT_OUTPUT_IDS=0 -# export INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 # force per-step from_list() metadata vs reusable CPU+write_i* fast path - -TAG=decode_hf_tok256_gpu${GPU} -nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ - python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --prompt "Write a short haiku about GPUs." \ - --max_new_tokens 256 \ - --backends hf \ - --no_infinilm \ - --hf_mode decode_loop \ - --hf_decode_warmup 8 \ - --hf_decode_iters 1 \ - --hf_attn_implementation flash_attention_2 \ - 2>&1 | tee profiling_runs/nsys_${TAG}.log -``` - -**Long-context decode** (optional): add e.g. `--target_input_tokens 32768` to either command so NVTX still tags prefill vs decode; expect traces to be large. - -**Prefill-only nsys** (matches earlier 32k/64k attribution): - -```bash -TAG=infinilm_prefill_32768_gpu${GPU} -nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ - python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --target_input_tokens 32768 \ - --max_new_tokens 1 \ - --backends infinilm \ - --no_hf \ - --infinilm_inprocess \ - --infinilm_cache_mode static_fit \ - 2>&1 | tee profiling_runs/nsys_${TAG}.log -``` - -After code changes (e.g. pybind metadata path), re-run the **same** `TAG` with a suffix (`_run2`) and diff `cuda_api_sum` / `cuda_gpu_kern_sum` / NVTX tables. - -### Ranked next optimization experiments (minimal changes) - -1. **Constrain/reshape persistent KV growth first** -Expected impact: High memory reduction, likely best leverage on 32k->64k slope. 
-Minimal experiment: compare `static_fit` vs `paged` (small block sizes, e.g., 128/256) at 32k/64k and re-measure peaks + TTFT. -2. **Reduce transient prefill movement/workspace** -Expected impact: Medium TTFT gain, small-to-medium memory relief. -Minimal experiment: isolate `simple_gla_prefill` transform/workspace path and reduce extra copies/format conversions; confirm via reduced `cudaMemcpy` share in nsys. -3. **Trim synchronization/copy overhead around prefill** -Expected impact: Medium TTFT gain at long context. -Minimal experiment: profile before/after removing avoidable sync points or host-device transfers in attention/prefill orchestration; success criterion is lower `cudaMemcpy` wall share with unchanged logits. - -Applied (2026-03-19): removed `permute(...)->contiguous()` materialization for KV cache update and GLA prefill inputs in `minicpm_sala_attention.cpp` (pass strided views). -Result: 32k peak memory improved on GPU 4 (**43209 MiB → 38613 MiB**) with similar TTFT; 64k peak unchanged (dominated by persistent KV slabs). - -Validation gate for each experiment: - -- **Operator unit tests (CUDA) first** — InfLLM-v2 + Simple GLA prefill (see below). Failing ops almost always mean wasted time on full-model logits debugging. -- run `minicpm_sala_logits_sanity.py` (prefill mode) and compare ratio/max_diff/mean_diff against current baseline. -- run one prompt generation sanity and verify no functional regression. - ---- - -## Commands (repro) - -### InfiniCore operator tests (run before logits sanity) - -MiniCPM-SALA stack depends on `infllmv2_varlen` / `infllmv2_kvcache` and `simple_gla_prefill`. 
Run these inside `minicpm-sala` with `InfiniLM/python` on `PYTHONPATH` so InfLLM-v2 preloads before `import infinicore`: - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -export CUDA_VISIBLE_DEVICES=1 -export PYTHONPATH=$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniCore/test/infinicore/ops - -python3 test_infllmv2_attention.py --nvidia -python3 test_simple_gla_prefill.py --nvidia -``` - -One-liner wrapper (same env assumptions as the repo): - -```bash -bash $REPO/InfiniLM/examples/run_infinicore_ops_before_logits.sh -``` - -### Logits correctness gate (HF vs InfiniLM) - -Run (inside `minicpm-sala`) to sanity-check HF vs InfiniLM prefill logits on a short prompt: - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -export CUDA_VISIBLE_DEVICES=1 -export HF_CUDA_INDEX=0 -export INFINILM_CUDA_INDEX=0 -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniLM/examples - -python3 minicpm_sala_logits_sanity.py \ - --model_path "$MODEL" \ - --mode prefill \ - --prompt "How are you? Tell me a short joke." \ - --k 10 -``` - -Recorded output (2026-03-18, GPU=1): - -```text -SANITY_ONELINE ratio=0.9889 max_diff=0.1875 mean_diff=0.0682 -``` - -`--mode decode1` (prefill + one decode step): **prefill section** should match the prefill-only run. The **decode** section should now be finite (the previous `NaN` issue was traced to the CUDA embedding kernel leaving outputs uninitialized for out-of-range indices). Correctness can still diverge from HF for longer prompts due to decode/KV/attention parity work; treat **prefill** as the strongest HF parity gate for now. 
- -### GPU scan - -```bash -nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits -``` - -### HF-only prefill (32k) with 1s memory polling - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -export CUDA_VISIBLE_DEVICES=2 -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -cd $REPO/InfiniLM/examples - -python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --target_input_tokens 32768 \ - --max_new_tokens 1 \ - --backends hf \ - --hf_mode forward_prefill \ - --hf_forward_use_cache \ - --hf_forward_warmup 1 \ - --hf_forward_iters 1 \ - --hf_attn_implementation flash_attention_2 \ - & pid=$! - -echo "[mem] polling physical GPU 2 while pid=$pid" -while kill -0 $pid 2>/dev/null; do - date +"%F %T" - nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits - sleep 1 -done -wait $pid -``` - -### InfiniLM-only (32k) with InfLLM-v2 preload + 1s memory polling - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -export CUDA_VISIBLE_DEVICES=2 -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:$REPO/InfiniLM/build/linux/x86_64/release:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniLM/examples - -python3 - <<'PY' & pid=$! 
-import ctypes, os, runpy, sys -ctypes.CDLL("/usr/local/lib/python3.12/dist-packages/infllm_v2/C.cpython-312-x86_64-linux-gnu.so", mode=ctypes.RTLD_GLOBAL) -sys.argv = [ - "compare_inference_speed.py", - "--model_path", os.environ["MODEL"], - "--target_input_tokens", "32768", - "--max_new_tokens", "1", - "--backends", "infinilm", - "--no_hf", - "--infinilm_inprocess", - "--infinilm_cache_mode", "static_fit", -] -runpy.run_path("compare_inference_speed.py", run_name="__main__") -PY - -echo "[mem] polling physical GPU 2 while pid=$pid" -while kill -0 $pid 2>/dev/null; do - date +"%F %T" - nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits - sleep 1 -done -wait $pid -``` - diff --git a/examples/run_infinicore_ops_before_logits.sh b/examples/run_infinicore_ops_before_logits.sh deleted file mode 100755 index 5a93fe11..00000000 --- a/examples/run_infinicore_ops_before_logits.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash -# InfiniCore CUDA operator smoke tests for MiniCPM-SALA-related ops. -# Run inside minicpm-sala docker before deeper HF-vs-InfiniLM alignment probes. 
-set -euo pipefail - -REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" -export PYTHONPATH="$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-}" -export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" - -OPS_DIR="$REPO/InfiniCore/test/infinicore/ops" -cd "$OPS_DIR" - -echo "[run_infinicore_ops] REPO=$REPO" -echo "[run_infinicore_ops] test_infllmv2_attention.py --nvidia" -python3 test_infllmv2_attention.py --nvidia -echo "[run_infinicore_ops] test_simple_gla_prefill.py --nvidia" -python3 test_simple_gla_prefill.py --nvidia -echo "[run_infinicore_ops] OK" diff --git a/examples/run_longtext_metrics_cases.sh b/examples/run_longtext_metrics_cases.sh deleted file mode 100755 index dd595c7b..00000000 --- a/examples/run_longtext_metrics_cases.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash -# Run each longtext/decode metric case in a **separate** Python process to release CUDA -# memory between runs (reduces OOM when sweeping 16k/32k/64k × HF + InfiniLM). 
-# -# Usage (inside minicpm-sala, after picking an idle GPU): -# export CUDA_VISIBLE_DEVICES=2 -# export NVML_GPU_INDEX=2 -# export REPO=/home/zenghua/workspace/minicpm-sala-support -# export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python -# export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -# export METRICS_DATE=2026-03-23 -# cd $REPO/InfiniLM/examples && ./run_longtext_metrics_cases.sh -# -# Optional: -# METRICS_TARGETS=16384,32768 METRICS_DECODE_STEPS=32 ./run_longtext_metrics_cases.sh -# SLEEP_BETWEEN_SEC=3 # extra pause between subprocesses - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" -export PYTHONPATH="${SCRIPT_DIR}:${REPO}/InfiniCore/python:${REPO}/InfiniLM/python:${PYTHONPATH:-}" -export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" - -: "${CUDA_VISIBLE_DEVICES:=0}" -: "${NVML_GPU_INDEX:=${CUDA_VISIBLE_DEVICES}}" -: "${METRICS_DATE:=2026-03-23}" -: "${METRICS_DECODE_STEPS:=32}" -: "${METRICS_TARGETS:=16384,32768,65536}" -: "${SLEEP_BETWEEN_SEC:=2}" - -OUT_JSONL="${OUT_JSONL:-${SCRIPT_DIR}/profiling_runs/longtext_decode_rows.jsonl}" -mkdir -p "$(dirname "$OUT_JSONL")" -rm -f "$OUT_JSONL" -echo "[run_longtext_metrics] jsonl -> $OUT_JSONL GPU smi index=$NVML_GPU_INDEX" - -IFS=',' read -r -a TARGETS <<< "$METRICS_TARGETS" - -run_one() { - local c="$1" - echo "[run_longtext_metrics] case=$c" - python3 collect_metrics_longtext_decode.py --case "$c" --append-jsonl "$OUT_JSONL" || true - sleep "${SLEEP_BETWEEN_SEC}" -} - -for t in "${TARGETS[@]}"; do - run_one "hf:${t}" -done -for t in "${TARGETS[@]}"; do - run_one "infinilm_rec:${t}:1" -done -for t in "${TARGETS[@]}"; do - run_one "infinilm_rec:${t}:${METRICS_DECODE_STEPS}" -done - -echo "[run_longtext_metrics] merged table:" -python3 collect_metrics_longtext_decode.py --from-jsonl "$OUT_JSONL" From 8f85cb726cdd99423a07a416059aac7f619de00b 
Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:45:25 +0000 Subject: [PATCH 04/11] revert server Signed-off-by: Ceng23333 <441651826@qq.com> --- .../infinilm/server/chat_message_normalize.py | 76 ------------------- python/infinilm/server/inference_server.py | 34 ++++++++- 2 files changed, 31 insertions(+), 79 deletions(-) delete mode 100644 python/infinilm/server/chat_message_normalize.py diff --git a/python/infinilm/server/chat_message_normalize.py b/python/infinilm/server/chat_message_normalize.py deleted file mode 100644 index 04afe176..00000000 --- a/python/infinilm/server/chat_message_normalize.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Normalize OpenAI-style chat messages before HuggingFace chat_template. - -Kept separate from ``inference_server`` so this logic can be smoke-tested without -loading InfiniCore / CUDA (see ``__main__`` block). -""" - - -def normalize_openai_messages_for_hf_template(messages: list) -> list: - """Strip lm-eval ``type: text`` wrappers; flatten multimodal text parts. - - lm-eval ``local-chat-completions`` with ``tokenized_requests=False`` JSON-encodes - each turn with an extra top-level ``"type": "text"`` (see ``TemplateAPI.apply_chat_template`` - in lm-eval). HuggingFace ``--model hf`` passes plain ``{role, content}`` dicts into - ``apply_chat_template``. Stripping unknown keys keeps server templating aligned with - the HF harness for text-only tasks. 
- """ - normalized: list = [] - for msg in messages: - if not isinstance(msg, dict): - normalized.append(msg) - continue - - role = msg.get("role") - if role is None: - normalized.append(msg) - continue - - content = msg.get("content") - if isinstance(content, list): - text_parts: list[str] = [] - for part in content: - if isinstance(part, dict): - if part.get("type") == "text" and "text" in part: - text_parts.append(part["text"]) - elif isinstance(part, str): - text_parts.append(part) - elif isinstance(part, str): - text_parts.append(part) - merged = "".join(text_parts) if text_parts else "" - core = {"role": role, "content": merged} - if msg.get("name") is not None: - core["name"] = msg["name"] - normalized.append(core) - elif isinstance(content, str): - core = {"role": role, "content": content} - if msg.get("name") is not None: - core["name"] = msg["name"] - normalized.append(core) - else: - normalized.append(msg) - - return normalized - - -if __name__ == "__main__": - # Smoke test (no InfiniCore): run as - # python3 -m infinilm.server.chat_message_normalize - lm_eval_style = [ - {"role": "system", "content": "sys", "type": "text"}, - {"role": "user", "content": "hi", "type": "text"}, - ] - out = normalize_openai_messages_for_hf_template(lm_eval_style) - assert out == [{"role": "system", "content": "sys"}, {"role": "user", "content": "hi"}], out - mm = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "a"}, - {"type": "text", "text": "b"}, - ], - } - ] - assert normalize_openai_messages_for_hf_template(mm) == [ - {"role": "user", "content": "ab"} - ] - print("chat_message_normalize: ok") diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 8c361c4e..b5c49247 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -17,7 +17,6 @@ from fastapi.responses import JSONResponse, StreamingResponse from infinilm.llm import AsyncLLMEngine, 
SamplingParams, FinishReason -from infinilm.server.chat_message_normalize import normalize_openai_messages_for_hf_template logger = logging.getLogger(__name__) @@ -267,8 +266,37 @@ async def list_models_legacy(): return _models_payload() def _normalize_messages(self, messages: list) -> list: - """Delegate to :func:`normalize_openai_messages_for_hf_template`.""" - return normalize_openai_messages_for_hf_template(messages) + """Normalize messages to handle multimodal content (list format). + + Converts content from list format [{"type": "text", "text": "..."}] + to string format for chat template compatibility. + """ + normalized = [] + for msg in messages: + if not isinstance(msg, dict): + normalized.append(msg) + continue + + content = msg.get("content") + if isinstance(content, list): + # Extract text from multimodal content list + text_parts = [] + for part in content: + if isinstance(part, dict): + if part.get("type") == "text" and "text" in part: + text_parts.append(part["text"]) + elif isinstance(part, str): + text_parts.append(part) + elif isinstance(part, str): + text_parts.append(part) + # Join all text parts + normalized_msg = msg.copy() + normalized_msg["content"] = "".join(text_parts) if text_parts else "" + normalized.append(normalized_msg) + else: + normalized.append(msg) + + return normalized def _build_sampling_params(self, data: dict) -> SamplingParams: """Build SamplingParams from request data.""" From 0d98e759b69e884e1dec99ed00d8829bf79b1980 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:47:42 +0000 Subject: [PATCH 05/11] revert some code Signed-off-by: Ceng23333 <441651826@qq.com> --- include/infinicore_infer/cache.h | 5 ----- include/infinicore_infer/weights_loader.h | 5 ----- 2 files changed, 10 deletions(-) diff --git a/include/infinicore_infer/cache.h b/include/infinicore_infer/cache.h index 5f691c64..522f2235 100644 --- a/include/infinicore_infer/cache.h +++ b/include/infinicore_infer/cache.h @@ -3,11 
+3,6 @@ #include -#ifndef __INFINI_C -// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. -#define __INFINI_C __C -#endif - __INFINI_C __export struct KVCache *createKVCache( size_t nlayers, size_t max_len, diff --git a/include/infinicore_infer/weights_loader.h b/include/infinicore_infer/weights_loader.h index 057c3a1b..82eafe59 100644 --- a/include/infinicore_infer/weights_loader.h +++ b/include/infinicore_infer/weights_loader.h @@ -3,11 +3,6 @@ #include -#ifndef __INFINI_C -// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. -#define __INFINI_C __C -#endif - struct ModelWeights; __INFINI_C __export void From 0583ab5be7b134a1aa0c708c0c1266b5b5928f24 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 03:21:44 +0000 Subject: [PATCH 06/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- ...minicpm_sala_allocate_kv_cache_tensors.cpp | 4 +- .../minicpm_sala/minicpm_sala_attention.cpp | 133 +++++++++--------- .../minicpm_sala/minicpm_sala_attention.hpp | 26 +--- .../minicpm_sala_decoder_layer.cpp | 26 ++-- .../minicpm_sala_decoder_layer.hpp | 5 +- .../minicpm_sala_for_causal_lm.cpp | 28 +++- .../minicpm_sala_for_causal_lm.hpp | 8 +- .../minicpm_sala/minicpm_sala_model.cpp | 85 +---------- .../minicpm_sala/minicpm_sala_model.hpp | 12 +- 9 files changed, 120 insertions(+), 207 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp b/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp index f4cb3b55..3ad0b506 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp @@ -32,7 +32,7 @@ std::vector minicpm_sala_allocate_kv_cache_tensors(const cac const size_t num_key_value_heads = text_config->get("num_key_value_heads"); const size_t max_position_embeddings = text_config->get("max_position_embeddings"); - const auto &dtype{text_config->get_dtype()}; 
+ const auto &dtype{text_config->get_kv_cache_dtype()}; std::vector mixer_types = text_config->get>("mixer_types"); size_t current_layer_head_dim, current_layer_num_key_value_heads; for (size_t layer_idx = 0; layer_idx < num_hidden_layers; ++layer_idx) { @@ -70,7 +70,7 @@ std::vector minicpm_sala_allocate_kv_cache_tensors(const cac const size_t head_dim = text_config->get("head_dim"); const size_t num_key_value_heads = text_config->get("num_key_value_heads"); - const auto &dtype{text_config->get_dtype()}; + const auto &dtype{text_config->get_kv_cache_dtype()}; std::vector mixer_types = text_config->get>("mixer_types"); size_t current_layer_head_dim, current_layer_num_key_value_heads; for (size_t layer_idx = 0; layer_idx < num_hidden_layers; ++layer_idx) { diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index 001122e4..f437f9e9 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -7,6 +7,7 @@ #include "infinicore/ops/simple_gla_prefill.hpp" #include "infinicore/ops/simple_gla_recurrent_state_append.hpp" #include "infinicore/context/context.hpp" +#include "../../global_state/global_state.hpp" #include "../debug_utils/tensor_utils.hpp" #include @@ -18,6 +19,35 @@ namespace infinilm::models::minicpm_sala { namespace { + +// Per-layer KV tensor layout from `StaticKVCache::create_layer_kv_cache`: [2, B, n_kv, max_len, D]. 
+void minicpm_sala_update_layer_kv_tensor(infinicore::Tensor &kv_bundle, + const infinicore::Tensor &k_permuted, + const infinicore::Tensor &v_permuted, + const infinicore::Tensor &past_sequence_lengths) { + auto k_cache_layer = kv_bundle->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_bundle->narrow({{0, 1, 1}})->squeeze(0); + +#ifdef ENABLE_KV_CACHING + infinicore::op::kv_caching_( + k_cache_layer, + v_cache_layer, + k_permuted, + v_permuted, + past_sequence_lengths); +#else + const size_t cache_pos = static_cast( + reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]); + const size_t update_len = k_permuted->size(2); + const size_t result_len = cache_pos + update_len; + if (result_len > k_cache_layer->size(2)) { + throw std::runtime_error("MiniCPMSALAAttention: KV cache length exceeded"); + } + k_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(k_permuted); + v_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(v_permuted); +#endif +} + // Same as HF MiniCPM-SALA _build_slope_tensor (used for Simple GLA decay). std::vector build_slope_tensor(size_t n) { auto get_slopes_power_of_2 = [](size_t n) -> std::vector { @@ -105,35 +135,6 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr(1.0 / std::sqrt(static_cast(head_dim_))); - // StaticKVCache is allocated as a compact slab per cache type: - // - minicpm4-cache stores only layers where mixer_types[i] == "minicpm4" - // - lightning-cache stores only layers where mixer_types[i] != "minicpm4" - // - // Compute this attention instance's local cache index (0-based) from its - // absolute layer_idx_. - { - bool this_is_minicpm4_cache = (mixer_type == "minicpm4"); - std::vector mixer_types; - try { - mixer_types = model_config_->get>("mixer_types"); - } catch (...) { - mixer_types.assign(model_config_->get("num_hidden_layers"), "minicpm4"); - } - // Be defensive if mixer_types size mismatches. 
- if (mixer_types.size() != model_config_->get("num_hidden_layers")) { - mixer_types.resize(model_config_->get("num_hidden_layers"), "minicpm4"); - } - size_t count = 0; - for (size_t i = 0; i <= layer_idx_ && i < mixer_types.size(); ++i) { - const bool is_minicpm4_layer = (mixer_types[i] == "minicpm4"); - if (is_minicpm4_layer == this_is_minicpm4_cache) { - ++count; - } - } - // layer_idx_ is always a valid layer, so count should be >= 1. - cache_layer_idx_ = count > 0 ? (count - 1) : 0; - } - // HyPE: RoPE in lightning layers, NoPE in sparse (minicpm4) layers. // We treat all non-minicpm4 as "linear" (lightning-attn) for M1 dense fallback. use_rope_ = (mixer_type != "minicpm4") && model_config_->get_or("lightning_use_rope", true); @@ -176,7 +177,7 @@ void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - (void)input_offsets; - (void)block_tables; - (void)slot_mapping; - return forward_dense_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, cu_seqlens); -} - -infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional cu_seqlens) const { +infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const { + const auto &attn_meta = infinilm::global_state::get_forward_context().attn_metadata; + auto past_sequence_lengths = attn_meta.past_sequence_lengths; + auto total_sequence_lengths = attn_meta.total_sequence_lengths; + auto cu_seqlens = attn_meta.cu_seqlens; + // input_offsets/block_tables/slot_mapping are not used in this 
dense/per-layer-kv implementation yet. + (void)cu_seqlens; // Input: [B, S, H] auto shape = hidden_states->shape(); const size_t batch_size = shape[0]; @@ -277,22 +265,28 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor auto k_permuted = k_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] auto v_permuted = v_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] - // HF-like dense KV caching using the engine-provided StaticKVCache. + // Per-layer KV tensors in `global_state::get_forward_context().kv_cache_vec` (same pattern as + // `InfinilmModel::reset_cache` / `StaticAttentionImpl`). infinicore::Tensor k_total = k_permuted; infinicore::Tensor v_total = v_permuted; - std::shared_ptr static_kv_cache = nullptr; - if (kv_cache != nullptr && has_cache_meta) { - static_kv_cache = std::dynamic_pointer_cast(kv_cache); - if (!static_kv_cache) { - throw std::runtime_error("MiniCPMSALAAttention: Unsupported cache type (expected StaticKVCache)"); + bool use_forward_kv = false; + if (has_cache_meta) { + auto &kv_vec = infinilm::global_state::get_forward_context().kv_cache_vec; + if (layer_idx_ >= kv_vec.size()) { + throw std::runtime_error( + "MiniCPMSALAAttention: forward_context.kv_cache_vec is unset or too small (call reset_cache / align layer count)"); } - // Default behavior: update cache here. For minicpm4 decode we may override and let InfLLM-v2 update. 
- auto [k_cached, v_cached] = static_kv_cache->update( - cache_layer_idx_, k_permuted, v_permuted, past_sequence_lengths.value()); - k_total = k_cached; - v_total = v_cached; + use_forward_kv = true; + minicpm_sala_update_layer_kv_tensor( + kv_vec[layer_idx_], + k_permuted, + v_permuted, + past_sequence_lengths.value()); + auto k_cache_layer = kv_vec[layer_idx_]->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_vec[layer_idx_]->narrow({{0, 1, 1}})->squeeze(0); + k_total = k_cache_layer; + v_total = v_cache_layer; } else { - // No cache metadata => treat as prefill-only. total_seq_len = seq_len; } @@ -339,7 +333,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor // Lightning fast decode: maintain recurrent state locally (do NOT depend on StaticKVCache extensions). // We rebuild state on-demand if it is out-of-sync with cache_pos. - const bool is_decode = has_cache_meta && static_kv_cache && (seq_len == 1) && (total_seq_len > 1); + const bool is_decode = has_cache_meta && use_forward_kv && (seq_len == 1) && (total_seq_len > 1); if (is_decode) { ensure_gla_state_allocated(gla_state_, q_bthd->device(), batch_size, n_h, head_dim_); @@ -416,13 +410,12 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor "MiniCPMSALAAttention(minicpm4): total_sequence_lengths is required for InfLLM-v2 path"); } // `infllmv2_kvcache` expects the number of valid K/V entries in the - // provided cache tensors. Since we already appended the current - // token via StaticKVCache::update, the valid length is the total - // KV length (past + current token). + // provided cache tensors. After per-layer KV update, valid length is + // total KV length (past + current token). const auto cache_lens = total_sequence_lengths.value(); // Prefill: InfLLM-v2 varlen (Q and K packed lengths match `seq_len == total_seq_len` here). 
- // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after StaticKVCache::update + // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after KV tensor update // (valid KV length == `total_seq_len`). Using varlen for decode (1 query vs long K) hit NaNs // in practice for modest sequence lengths; kvcache matches operator tests and Flash path. const bool force_varlen_decode = [&]() { @@ -465,7 +458,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor /*window_size_left=*/window_left, /*window_size_right=*/window_right); attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); - } else if (static_kv_cache) { + } else if (use_forward_kv) { if (batch_size != 1) { throw std::runtime_error("MiniCPMSALAAttention(minicpm4): kvcache decode path currently requires batch_size=1"); } @@ -490,7 +483,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor {batch_size, seq_len, num_attention_heads_ * head_dim_}); } else { throw std::runtime_error( - "MiniCPMSALAAttention(minicpm4): decode requires StaticKVCache (missing cache metadata or cache)"); + "MiniCPMSALAAttention(minicpm4): decode requires KV cache (missing cache metadata or kv_cache_vec)"); } } catch (const std::exception &e) { throw std::runtime_error( diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 37dab7ec..d11a6037 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -30,26 +30,13 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - infinicore::Tensor forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional 
past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; + // Match `infinilm::layers::attention::Attention` API: metadata is pulled from + // `global_state::get_forward_context().attn_metadata`. + infinicore::Tensor forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const; void set_rotary_emb(const std::shared_ptr &rotary_emb); - void reset_cache(); - -private: - infinicore::Tensor forward_dense_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional cu_seqlens) const; + void reset_state(); protected: // Projections (HF-aligned naming) @@ -72,9 +59,6 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { engine::distributed::RankInfo rank_info_; size_t layer_idx_; - // Layer index remapped into the cache instance (minicpm4-cache vs lightning-cache). - // StaticKVCache allocates a compact [num_layers, ...] slab per cache type. 
- size_t cache_layer_idx_ = 0; size_t hidden_size_; size_t num_attention_heads_; size_t num_key_value_heads_; diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index 391b626b..feacb3d3 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,5 +1,6 @@ #include "minicpm_sala_decoder_layer.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -38,10 +39,6 @@ void MiniCPMSALADecoderLayer::set_rotary_emb(const std::shared_ptrset_rotary_emb(rotary_emb); } -void MiniCPMSALADecoderLayer::reset_cache() { - self_attn_->reset_cache(); -} - infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids, std::shared_ptr kv_cache, @@ -51,18 +48,19 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi std::optional cu_seqlens, std::optional block_tables, std::optional slot_mapping) const { + // Match `layers/attention/Attention`: stash attention metadata in global forward context. 
+ infinilm::global_state::get_forward_context().attn_metadata = + infinilm::global_state::AttentionMetadata(past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + // Pre-norm attention auto hs1 = input_layernorm_->forward(hidden_states); - auto attn_out = self_attn_->forward( - hs1, - position_ids, - kv_cache, - past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); + (void)kv_cache; + auto attn_out = self_attn_->forward(position_ids, hs1); // residual + scale_down * attn_out (MuP) auto ones_attn = infinicore::Tensor::empty(attn_out->shape(), attn_out->dtype(), attn_out->device()); diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 948e4d97..094e8650 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -17,6 +17,8 @@ namespace infinilm::models::minicpm_sala { +class MiniCPMSALAModel; + class MiniCPMSALADecoderLayer : public infinicore::nn::Module { public: MiniCPMSALADecoderLayer(std::shared_ptr model_config, @@ -37,9 +39,10 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { std::optional slot_mapping) const; void set_rotary_emb(const std::shared_ptr &rotary_emb); - void reset_cache(); private: + friend class MiniCPMSALAModel; + double residual_scale_ = 1.0; size_t layer_idx_ = 0; diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index 74ea4f9a..bcbb9f6f 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -1,6 +1,7 @@ #include "minicpm_sala_for_causal_lm.hpp" #include "../models_registry.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include #include @@ -8,6 +9,11 @@ namespace 
infinilm::models::minicpm_sala { +std::vector minicpm_sala_allocate_kv_cache_tensors( + const cache::CacheConfig *cache_config, + const std::shared_ptr &text_config, + const backends::AttentionBackend &attention_backend); + std::shared_ptr create_minicpm_sala_model_config( std::shared_ptr model_config) { const std::string &model_type = model_config->get("model_type"); @@ -23,6 +29,7 @@ MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( engine::distributed::RankInfo rank_info, backends::AttentionBackend attention_backend) { device_ = device; + model_config_ = model_config; // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config->get_dtype(); @@ -62,12 +69,23 @@ MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( } void MiniCPMSALAForCausalLM::reset_cache(const cache::CacheConfig *cache_config) { + // Match `InfinilmModel::reset_cache`: own `cache_config_` + `kv_cache_vec` here; inner model only + // resets per-layer attention state. MiniCPM uses `minicpm_sala_allocate_kv_cache_tensors` instead of + // `default_allocate_kv_cache_tensors`. 
+ if (cache_config == nullptr) { + cache_config_.reset(); + infinilm::global_state::get_forward_context().kv_cache_vec.clear(); + model_->reset_state(); + return; + } cache_config_ = cache_config->unique_copy(); - model_->reset_cache(cache_config_.get()); -} - -const cache::CacheConfig *MiniCPMSALAForCausalLM::get_cache_config() const { - return cache_config_.get(); + auto &kv_cache_vec = infinilm::global_state::get_forward_context().kv_cache_vec; + kv_cache_vec.clear(); + const backends::AttentionBackend attention_backend = + infinilm::global_state::get_infinilm_config().attention_backend; + kv_cache_vec = std::move( + minicpm_sala_allocate_kv_cache_tensors(cache_config, model_config_, attention_backend)); + model_->reset_state(); } } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index 33305b23..9344dfd3 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -6,9 +6,9 @@ #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" #include "../../backends/attention_backends.hpp" +#include "../../layers/linear/linear.hpp" #include "infinicore/device.hpp" -#include "infinicore/nn/linear.hpp" namespace infinilm::models::minicpm_sala { @@ -26,12 +26,9 @@ class MiniCPMSALAForCausalLM : public InfinilmModel { void reset_cache(const cache::CacheConfig *cache_config) override; - const cache::CacheConfig *get_cache_config() const override; - private: INFINICORE_NN_MODULE(MiniCPMSALAModel, model); - INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head); - std::unique_ptr cache_config_; + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); }; } // namespace infinilm::models::minicpm_sala @@ -42,4 +39,3 @@ std::shared_ptr create_minicpm_sala_model_config( std::shared_ptr model_config); } // namespace 
infinilm::models::minicpm_sala - diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index 6fd00bfe..de63831a 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace infinilm::models::minicpm_sala { @@ -34,15 +33,8 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrget("rms_norm_eps"), dtype, device); - // Shared rotary embedding (used by lightning layers only) - INFINICORE_NN_MODULE_INIT(rotary_emb, - model_config_->get_head_dim(), - model_config_->get("max_position_embeddings"), - model_config_->get("rope_theta"), - infinicore::nn::RoPE::Algo::GPT_NEOX, - dtype, - device, - model_config_->get_rope_scaling()); + // Shared rotary embedding (used by lightning layers only) — match `get_rope` pattern. + rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); // Mixer types per-layer decide attention flavor (minicpm4 vs lightning-attn). std::vector mixer_types; @@ -54,7 +46,6 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrreset_cache(); - } - return; - } - - if (auto static_cfg = dynamic_cast(cache_config)) { - // Allocate separate caches by KV shape to avoid per-layer padding copies. - const size_t num_hidden_layers = model_config_->get("num_hidden_layers"); - // mixer_types_ is filled in ctor from model_config_->get("mixer_types"). - const size_t minicpm4_layer_count = - !mixer_types_.empty() ? 
std::count(mixer_types_.begin(), mixer_types_.end(), "minicpm4") : num_hidden_layers; - const size_t lightning_layer_count = num_hidden_layers - minicpm4_layer_count; - - const size_t base_kv_heads = model_config_->get("num_key_value_heads"); - const size_t base_head_dim = model_config_->get("head_dim"); - const size_t lightning_kv_heads = model_config_->get_or("lightning_nkv", base_kv_heads); - const size_t lightning_head_dim = model_config_->get_or("lightning_head_dim", base_head_dim); - - kv_cache_minicpm4_ = (minicpm4_layer_count > 0) - ? std::make_shared( - /*k_dim=*/base_head_dim, - /*v_dim=*/base_head_dim, - /*num_k_heads=*/base_kv_heads, - /*num_v_heads=*/base_kv_heads, - /*num_layers=*/minicpm4_layer_count, - /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), - /*dtype=*/model_config_->get_dtype(), - *static_cfg, - rank_info_) - : nullptr; - - kv_cache_lightning_ = (lightning_layer_count > 0) - ? std::make_shared( - /*k_dim=*/lightning_head_dim, - /*v_dim=*/lightning_head_dim, - /*num_k_heads=*/lightning_kv_heads, - /*num_v_heads=*/lightning_kv_heads, - /*num_layers=*/lightning_layer_count, - /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), - /*dtype=*/model_config_->get_dtype(), - *static_cfg, - rank_info_) - : nullptr; - } else { - // This refactor implements HF-like dense caching only. 
- throw std::runtime_error("MiniCPMSALAModel::reset_cache: Unsupported cache type (expected StaticKVCacheConfig)"); - } - +void MiniCPMSALAModel::reset_state() { for (auto &layer : layers_) { - layer->reset_cache(); + layer->self_attn_->reset_state(); } } @@ -134,29 +73,15 @@ infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids auto hs = embed_tokens_->forward(input_ids); for (size_t i = 0; i < layers_.size(); ++i) { - std::shared_ptr layer_cache; - if (!mixer_types_.empty() && mixer_types_[i] == "minicpm4") { - layer_cache = kv_cache_minicpm4_; - } else { - layer_cache = kv_cache_lightning_; - } hs = layers_[i]->forward(hs, position_ids, - layer_cache, + nullptr, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping); - if (const char *env = std::getenv("MINICPM_SALA_LAYER_TRACE")) { - if (env[0] != '\0' && env[0] != '0') { - fprintf(stderr, "[minicpm_sala][layer_trace] layer=%zu mixer=%s\n", - i, - mixer_types_.empty() ? 
"unknown" : mixer_types_[i].c_str()); - fflush(stderr); - } - } } hs = norm_->forward(hs); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index d360dd3e..93f0a7e7 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -3,10 +3,11 @@ #include "minicpm_sala_decoder_layer.hpp" #include "../../backends/attention_backends.hpp" -#include "../../cache/kv_cache.hpp" +#include "../../cache/cache.hpp" #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" +#include "../../layers/rotary_embedding/rotary_embedding.hpp" #include "infinicore/nn/embedding.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" @@ -35,7 +36,7 @@ class MiniCPMSALAModel : public infinicore::nn::Module { std::optional block_tables, std::optional slot_mapping) const; - void reset_cache(const cache::CacheConfig *cache_config); + void reset_state(); size_t hidden_size() const { return hidden_size_; } double dim_model_base() const { return dim_model_base_; } @@ -44,17 +45,12 @@ class MiniCPMSALAModel : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); INFINICORE_NN_MODULE_VEC(MiniCPMSALADecoderLayer, layers); INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); - INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb); private: std::shared_ptr model_config_; + std::shared_ptr rotary_emb_; engine::distributed::RankInfo rank_info_; backends::AttentionBackend attention_backend_; - // MiniCPM-SALA is hybrid: minicpm4 vs lightning layers can have different KV shapes. - // Use two StaticKVCache instances to avoid per-layer padding/copies during long prefill. 
- std::shared_ptr kv_cache_minicpm4_; - std::shared_ptr kv_cache_lightning_; - std::vector mixer_types_; infinicore::Device compute_device_; size_t hidden_size_; From e11223f1a7c4deaae1e3fcbc12ab1c6a5da2fb2e Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 06:12:39 +0000 Subject: [PATCH 07/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- .../minicpm_sala/minicpm_sala_attention.cpp | 5 +-- .../minicpm_sala/minicpm_sala_attention.hpp | 2 +- .../minicpm_sala_decoder_layer.cpp | 24 +------------- .../minicpm_sala_decoder_layer.hpp | 12 +------ .../minicpm_sala_for_causal_lm.cpp | 4 ++- .../minicpm_sala/minicpm_sala_model.cpp | 31 +++++++++---------- .../minicpm_sala/minicpm_sala_model.hpp | 7 ++--- python/infinilm/infer_engine.py | 16 ++-------- python/infinilm/llm/llm.py | 9 +----- python/infinilm/llm/static_scheduler.py | 7 ----- 10 files changed, 27 insertions(+), 90 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index f437f9e9..f36b84c5 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -138,6 +138,7 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptrget_or("lightning_use_rope", true); + rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); // MiniCPM-SALA uses QK-norm and output gates by default. use_qk_norm_ = model_config_->get_or("qk_norm", true) && (mixer_type != "minicpm4"); @@ -173,10 +174,6 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptrto(device); } -void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptr &rotary_emb) { - rotary_emb_ = rotary_emb; -} - void MiniCPMSALAAttention::reset_state() { // KV tensors are maintained by the shared engine cache (StaticKVCache). // Lightning decode recurrent state is maintained locally for performance. 
diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index d11a6037..2013d678 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -4,6 +4,7 @@ #include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" +#include "../../layers/rotary_embedding/rotary_embedding.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -35,7 +36,6 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { infinicore::Tensor forward(const infinicore::Tensor &position_ids, const infinicore::Tensor &hidden_states) const; - void set_rotary_emb(const std::shared_ptr &rotary_emb); void reset_state(); protected: diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index feacb3d3..7a44704e 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,6 +1,5 @@ #include "minicpm_sala_decoder_layer.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -35,31 +34,10 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr &rotary_emb) { - self_attn_->set_rotary_emb(rotary_emb); -} - infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - // Match `layers/attention/Attention`: stash attention metadata in global forward context. 
- infinilm::global_state::get_forward_context().attn_metadata = - infinilm::global_state::AttentionMetadata(past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); - + const infinicore::Tensor &position_ids) const { // Pre-norm attention auto hs1 = input_layernorm_->forward(hidden_states); - (void)kv_cache; auto attn_out = self_attn_->forward(position_ids, hs1); // residual + scale_down * attn_out (MuP) diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 094e8650..44d320c9 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -4,7 +4,6 @@ #include "minicpm_sala_mlp.hpp" #include "../../backends/attention_backends.hpp" -#include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" @@ -29,16 +28,7 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); infinicore::Tensor forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; - - void set_rotary_emb(const std::shared_ptr &rotary_emb); + const infinicore::Tensor &position_ids) const; private: friend class MiniCPMSALAModel; diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index bcbb9f6f..fb55556f 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -33,7 +33,9 @@ MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( // Match 
parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config->get_dtype(); - INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info, attention_backend); + (void)rank_info; + (void)attention_backend; + INFINICORE_NN_MODULE_INIT(model, model_config, device); const size_t hidden_size = model_config->get("hidden_size"); const size_t vocab_size = model_config->get("vocab_size"); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index de63831a..f6d9bb4d 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -12,16 +12,14 @@ namespace infinilm::models::minicpm_sala { MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) - : model_config_(std::move(model_config)), - rank_info_(rank_info), - attention_backend_(attention_backend) { + const infinicore::Device &device) + : model_config_(std::move(model_config)) { // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config_->get_dtype(); compute_device_ = device; + const engine::distributed::RankInfo &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info(); + const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; hidden_size_ = model_config_->get("hidden_size"); dim_model_base_ = model_config_->get_or("dim_model_base", static_cast(hidden_size_)); @@ -50,8 +48,7 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrregister_module( - "layers." + std::to_string(i), model_config_, device, i, mixer_types[i], rank_info_, attention_backend_)); - layers_.back()->set_rotary_emb(rotary_emb_); + "layers." 
+ std::to_string(i), model_config_, device, i, mixer_types[i], rank_info, attention_backend)); } } @@ -69,19 +66,19 @@ infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids std::optional cu_seqlens, std::optional block_tables, std::optional slot_mapping) const { + infinilm::global_state::get_forward_context().attn_metadata = + infinilm::global_state::AttentionMetadata(past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + // MuP scaling baked into weights at load time for minicpm_sala; no forward scaling here. auto hs = embed_tokens_->forward(input_ids); for (size_t i = 0; i < layers_.size(); ++i) { - hs = layers_[i]->forward(hs, - position_ids, - nullptr, - past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); + hs = layers_[i]->forward(hs, position_ids); } hs = norm_->forward(hs); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index 93f0a7e7..9b4a81c2 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -8,6 +8,7 @@ #include "../../engine/distributed/distributed.hpp" #include "../../layers/rotary_embedding/rotary_embedding.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/nn/embedding.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" @@ -23,9 +24,7 @@ namespace infinilm::models::minicpm_sala { class MiniCPMSALAModel : public infinicore::nn::Module { public: MiniCPMSALAModel(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + const infinicore::Device &device); infinicore::Tensor forward(const infinicore::Tensor &input_ids, const infinicore::Tensor &position_ids, @@ 
-49,8 +48,6 @@ class MiniCPMSALAModel : public infinicore::nn::Module { private: std::shared_ptr model_config_; std::shared_ptr rotary_emb_; - engine::distributed::RankInfo rank_info_; - backends::AttentionBackend attention_backend_; infinicore::Device compute_device_; size_t hidden_size_; diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index 6552227c..9046b790 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -164,19 +164,9 @@ def generate( dtype=infinicore.int32, ) - # Decode metadata fast path (batch=1, static cache): - # avoid per-step from_list()/numpy allocations for tiny scalar tensors. - # Those tensors live on CPU and are H2D-copied each forward; for profiling - # comparisons vs `from_list` device metadata, set: - # INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 - disable_fast_decode_meta = os.environ.get( - "INFINI_PROFILE_DISABLE_FAST_DECODE_META", "0" - ) not in ("", "0", "false", "False") - fast_decode_meta = ( - (not self.enable_paged_attn) - and (initial_batch_size == 1) - and not disable_fast_decode_meta - ) + # Decode metadata fast path (batch=1, static cache): avoid per-step from_list() allocations + # for tiny scalar tensors (these live on CPU and are H2D-copied each forward). 
+ fast_decode_meta = (not self.enable_paged_attn) and (initial_batch_size == 1) if fast_decode_meta: cpu = infinicore.device("cpu", 0) diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index e07e2155..07ada981 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -95,15 +95,8 @@ def __init__(self, config: EngineConfig): ) # Load model weights - dtype_map = { - "float16": infinicore.float16, - "bfloat16": infinicore.bfloat16, - "float32": infinicore.float32, - } load_model_state_dict_by_file( - self.model_engine, - config.model_path, - dtype=dtype_map.get(config.dtype, self.model_engine.config.dtype), + self.model_engine, config.model_path, dtype=self.model_engine.config.dtype ) # Initialize tokenizer diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py index 25d64ae5..860bf7b9 100644 --- a/python/infinilm/llm/static_scheduler.py +++ b/python/infinilm/llm/static_scheduler.py @@ -115,11 +115,6 @@ def __init__(self, max_cache_len: int = 4096): self.max_cache_len = max_cache_len self.cached_block_hashes: List[int] = [] self.pending_block_hashes: List[int] = [] - # Safety switch: disable cross-request prefix reuse when investigating - # corrupted/contaminated generations. 
- self.disable_prefix_reuse = os.getenv( - "INFINILM_STATIC_DISABLE_PREFIX_REUSE", "0" - ) in ("1", "true", "True", "yes", "on") def add_request(self, request: InferenceRequest): if request is not None: @@ -219,8 +214,6 @@ def schedule(self) -> Optional[StaticSchedulerOutput]: num_full_blocks = prompt_len // _BLOCK_SIZE matched = 0 - if self.disable_prefix_reuse and self.cached_block_hashes: - self.cached_block_hashes.clear() self.pending_block_hashes.clear() for i in range(num_full_blocks): From 33eb78dd781bd4e3ae50ae83131c64bdf482879e Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 06:27:15 +0000 Subject: [PATCH 08/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- MINICPM_SALA_BUILD_AND_CHANGES.md | 244 ---------------- MiniCPM_SALA_alignment_progress.md | 359 ------------------------ examples/jiuge.py | 16 +- python/infinilm/llm/llm.py | 1 - python/infinilm/llm/static_scheduler.py | 1 - python/infinilm/modeling_utils.py | 20 +- 6 files changed, 12 insertions(+), 629 deletions(-) delete mode 100644 MINICPM_SALA_BUILD_AND_CHANGES.md delete mode 100644 MiniCPM_SALA_alignment_progress.md diff --git a/MINICPM_SALA_BUILD_AND_CHANGES.md b/MINICPM_SALA_BUILD_AND_CHANGES.md deleted file mode 100644 index 1ec53fad..00000000 --- a/MINICPM_SALA_BUILD_AND_CHANGES.md +++ /dev/null @@ -1,244 +0,0 @@ -# MiniCPM-SALA on InfiniLM: Build Guide and Change Summary - -This document describes the changes in **InfiniCore** and **InfiniLM** from their baseline commits to support MiniCPM-SALA with InfLLM-v2, the **prerequisites**, and a **step-by-step build and run guide**. With these changes, `InfiniLM/examples/jiuge.py` produces **reasonable MiniCPM-SALA generation output** when run with the correct environment. 
- -**Baseline commits (for reference):** - -- **InfiniLM:** `main` -- **InfiniCore:** `5fc85c8b1e6728839993f1b743a525a066da585f` - -To see the exact diff from baseline: -`git diff 5fc85c8b1e6728839993f1b743a525a066da585f -- InfiniCore` and -`git diff main -- InfiniLM`. - ---- - -## 1. Changes in InfiniCore (from `5fc85c8b1e6728839993f1b743a525a066da585f`) - -InfiniCore was extended to **wire InfLLM-v2** (Stage-2 sparse attention) so that when built with `--infllmv2=y`, the C++ API calls `mha_varlen_fwd` and `mha_fwd_kvcache` from the infllmv2_cuda_impl .so. - -### 1.1 New or modified files (summary) - -| Area | Path | Purpose | -|------|------|--------| -| API (decl) | `include/infinicore/ops/infllmv2_api.hpp` | Declares `mha_varlen_fwd`, `mha_fwd_kvcache` (must be provided by infllmv2 .so at link/runtime). | -| API (decl) | `include/infinicore/ops/infllmv2_attention.hpp` | Public op header for infllmv2 attention. | -| Ops impl | `src/infinicore/ops/infllmv2_attention/infllmv2_attention.cc` | Implements `infllmv2_varlen` and `infllmv2_kvcache` by calling the above APIs when `ENABLE_INFLLMV2` and `ENABLE_ATEN` are set. | -| Pybind | `src/infinicore/pybind11/ops/infllmv2_attention.hpp` | Exposes infllmv2 ops to Python. | -| Pybind | `src/infinicore/pybind11/ops.hpp` | Includes infllmv2 op bindings. | -| Python | `python/infinicore/ops/infllmv2_attention.py` | Python wrapper for `infllmv2_varlen` / `infllmv2_kvcache`. | -| Python | `python/infinicore/__init__.py` | Exports `infllmv2_varlen`, `infllmv2_kvcache`. | -| Build | `xmake.lua` | New option `--infllmv2=y`; when set with `--aten=y`, defines `ENABLE_INFLLMV2` and links/rpath to the auto-detected .so. | -| Test | `test/infinicore/ops/test_infllmv2_attention.py` | Unit tests for infllmv2 varlen/kvcache (skipped if not built or no CUDA). | -| Example | `examples/infllmv2_sanity.py` | Sanity script for InfLLM-v2 (skips if .so absent or no CUDA). 
| - -### 1.2 Build option - -- **Option:** `infllmv2` (enable InfLLM-v2; xmake auto-detects `infllm_v2/*.so` under `InfiniCore/third_party/infllmv2_cuda_impl/build/...`). -- **Requires:** `aten=y` (InfiniCore must be built with PyTorch/ATen). -- **Effect:** Defines `ENABLE_INFLLMV2`, adds link and rpath to the auto-detected infllmv2 .so. At runtime, `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` / `mha_fwd_kvcache` from that .so (via `LD_LIBRARY_PATH` or `LD_PRELOAD`). - ---- - -## 2. Changes in InfiniLM (from `main`) - -InfiniLM was extended to support the **MiniCPM-SALA** model (embedding, layers, attention, MLP, LM head) and to use InfiniCore (including InfLLM-v2 when available) for inference. - -### 2.1 New or modified files (summary) - -| Area | Path | Purpose | -|------|------|--------| -| C++ model | `csrc/models/minicpm_sala/*.cpp`, `*.hpp` | MiniCPM-SALA model: `minicpm_sala_attention`, `minicpm_sala_decoder_layer`, `minicpm_sala_model`, `minicpm_sala_for_causal_lm`, `minicpm_sala_mlp`. Per-layer dense KV cache; lightning (GLA) and optional InfLLM-v2 (minicpm4) attention paths. | -| C++ factory | `csrc/models/model_factory.cpp` | Registers MiniCPM-SALA model type. | -| Config | `python/infinilm/auto_config.py` | MiniCPM-SALA config handling. | -| Weights | `python/infinilm/modeling_utils.py` | MiniCPM-SALA weight loading (MuP scaling, etc.). | -| Examples | `examples/jiuge.py` | Generic InferEngine generation script; docstring updated with env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) for MiniCPM-SALA. | -| Examples | `examples/minicpm_sala_logits_sanity.py` | HF vs InfiniLM logits sanity (prefill/decode1/decodeN); single-token decode for correct KV cache; one-prompt output comparison. | -| Examples | `examples/modeling_minicpm_sala.py` | HF-side MiniCPM-SALA modeling (reference). | -| Docs | `MiniCPM_SALA_alignment_progress.md` | Alignment and debugging notes. 
| - -### 2.2 Behaviour notes - -- **Attention:** Layer 0 (minicpm4) can use compiled InfLLM-v2 when InfiniCore is built with `--infllmv2=y` and the .so is preloaded; other layers use lightning (GLA) path. -- **Attention overhead optimizations:** In `minicpm_sala_attention.cpp`: (1) sequence lengths are read in one place when both `past_sequence_lengths` and `total_sequence_lengths` are present (`has_cache_meta`), avoiding duplicate logic; (2) Q/K/V use a single `contiguous()->view` chain after projections; (3) lightning path builds `q_bthd` via one `permute->contiguous` from `q_perm`; (4) sparse path uses `q_perm` directly (already contiguous) and only calls `contiguous()` on K/V when repeating heads. Semantics and logits are unchanged. -- **KV cache:** Decode must use **single-token input** per step; passing the full sequence each step would misalign the per-layer KV cache (see sanity script). -- **Engine / KV cache config:** MiniCPM-SALA uses per-layer dense KV cache in C++; the engine’s `cache_config` is used only for scheduling (e.g. `past_sequence_lengths` / `total_sequence_lengths`). **Static cache** is recommended (default in `jiuge.py` when not passing `--enable-paged-attn`). For static, `jiuge.py` sets `max_cache_len = max(initial_capacity, max_position_embeddings)` when `model_type == "minicpm_sala"` so long contexts are supported without re-alloc. - ---- - -## 3. Prerequisites - -### 3.1 System and toolchain - -- **OS:** Linux. -- **Python:** 3.12 recommended (match the infllmv2 .so and InfiniCore pybind ABI). -- **CUDA:** 11.6+ (e.g. 12.x); `nvcc` in `PATH` (e.g. via `CUDA_HOME=/usr/local/cuda` and `PATH=$CUDA_HOME/bin:$PATH`). -- **C++:** GCC (e.g. `CC=gcc CXX=g++`) for infllmv2_cuda_impl and InfiniCore. -- **xmake:** For building InfiniCore (install from https://xmake.io or use a project-provided path). 
-- **PyTorch:** Installed in the same Python env used to build infllmv2 and to run InfiniLM (InfiniCore with `aten=y` links against this PyTorch’s libs). - -### 3.2 Python environment - -Use a **single venv** (or env) that has: - -- `torch` -- `transformers` -- `triton` (e.g. 3.2.0; for MiniCPM-SALA HF path; if CUDA 12.8, a small patch may be needed for Triton’s `ptx_get_version` or use a Triton version that supports 12.8) -- `flash-linear-attention` (or HF deps for MiniCPM-SALA) -- Other InfiniLM/InfiniCore runtime deps - -Build **infllmv2_cuda_impl** and **InfiniCore** with this same Python (and thus same PyTorch ABI). - -### 3.3 Repo layout - -- **minicpm-sala-support** (repo root) contains: - - **InfiniCore/** — InfiniCore with InfLLM-v2 wiring. - - **InfiniLM/** — InfiniLM with MiniCPM-SALA. - - **InfiniCore/third_party/infllmv2_cuda_impl/** — InfLLM-v2 CUDA kernel implementation (provides `mha_varlen_fwd`, `mha_fwd_kvcache`). - ---- - -## 4. Build Guide - -### 4.1 Build InfLLM-v2 (infllmv2_cuda_impl) - -This produces the `.so` that provides `mha_varlen_fwd` and `mha_fwd_kvcache`. InfiniCore must be built with a PyTorch/ABI-compatible env (same Python/torch as here). - -1. **From repo root:** - ```bash - cd InfiniCore/third_party/infllmv2_cuda_impl - ``` -2. **Submodules:** - ```bash - git submodule update --init --recursive - ``` -3. **Env (recommended):** - ```bash - export CC=gcc CXX=g++ - export CUDA_HOME=/usr/local/cuda # or your CUDA path - export PATH=$CUDA_HOME/bin:$PATH - ``` -4. **Build/install** (use the Python that has torch and that you will use for InfiniLM): - ```bash - python setup.py install - ``` - Or: `pip install -e .` -5. **Locate the .so:** - Typically under `build/lib.linux-x86_64-cpython-312/infllm_v2/` (name like `C.cpython-312-x86_64-linux-gnu.so`). 
Set: - ```bash - INFLLMV2_SO_DIR="/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" - ``` - -### 4.2 Build InfiniCore (with InfLLM-v2) - -InfiniCore must be built with **aten** and, for MiniCPM-SALA with InfLLM-v2, with **infllmv2=y** enabled (xmake auto-detects the .so). - -1. **Install Infini dependencies** (if not already): - Build and install Infini libs so they are under `$INFINI_ROOT` (default `~/.infini`). InfiniCore’s xmake expects `include/` and `lib/` there (e.g. `libinfinicore_cpp_api.so`, `libinfiniop.so`, etc.). - -2. **From repo root:** - ```bash - cd InfiniCore - ``` -3. **Configure** (use the same Python/torch as infllmv2): - ```bash - xmake config -y --root --nv-gpu=y --aten=y --infllmv2=y - ``` - Omit `--infllmv2=y` for a build without InfLLM-v2 (then no MiniCPM-SALA layer0 infllmv2 path). -4. **Build the Python extension:** - ```bash - xmake --root _infinicore - ``` -5. **Optional – install to ~/.infini:** - ```bash - xmake install - ``` - The Python loadable is also copied under `InfiniCore/python/infinicore/lib/` by the build. - -### 4.3 Run jiuge.py (MiniCPM-SALA) - -Use the **same venv** that has `torch`, `transformers`, etc., and set env so InfiniCore and the infllmv2 .so are found and symbols resolve. - -**Required:** - -- `PYTHONPATH`: InfiniLM and InfiniCore Python packages. -- `LD_LIBRARY_PATH`: Torch lib, Infini lib (`/root/.infini/lib` or your `INFINI_ROOT/lib`), and optionally `INFLLMV2_SO_DIR` (if not using `LD_PRELOAD`). -- If InfiniCore was built with InfLLM-v2: **`LD_PRELOAD`** the infllmv2 .so so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` (and `mha_fwd_kvcache`). 
- -**Example (from repo root):** - -```bash -INFLLMV2_SO_DIR="$(pwd)/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" - -PYTHONPATH="$(pwd)/InfiniLM/python:$(pwd)/InfiniCore/python:$PYTHONPATH" \ -LD_LIBRARY_PATH="$(python -c 'import torch; print(torch.__path__[0])')/lib:/root/.infini/lib:${INFLLMV2_SO_DIR}:$LD_LIBRARY_PATH" \ -LD_PRELOAD="${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so" \ -python InfiniLM/examples/jiuge.py --nvidia --model_path /root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA -``` - -Use the **venv** Python explicitly if needed, e.g.: - -```bash -/path/to/venv/bin/python InfiniLM/examples/jiuge.py ... -``` - -For Triton (HF path) on CUDA 12.8 you may need: - -```bash -TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas -``` - ---- - -## 5. Verification - -- **InfiniCore InfLLM-v2 ops:** - `PYTHONPATH=InfiniCore/python:InfiniCore/test/infinicore LD_LIBRARY_PATH=:${INFLLMV2_SO_DIR}:/root/.infini/lib LD_PRELOAD=${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so python InfiniCore/test/infinicore/ops/test_infllmv2_attention.py --nvidia` - -- **HF vs InfiniLM logits (one-prompt decode):** - Same env + `LD_PRELOAD` and (if needed) `TRITON_PTXAS_PATH`: - `python InfiniLM/examples/minicpm_sala_logits_sanity.py --model_path --mode decodeN --decode_steps 64` - -- **Generation:** - `jiuge.py` with the same env should produce **reasonable MiniCPM-SALA output** (e.g. for prompt "How are you"). - ---- - -## 6. Related docs - -- **CURRENT_PROGRESS.md** — Local progress, InfLLM-v2 plan, and run commands. -- **InfiniLM/MiniCPM_SALA_alignment_progress.md** — Alignment and debugging details. -- **InfiniCore/third_party/infllmv2_cuda_impl/README.md** — InfLLM-v2 kernel design and install. -- **InfiniLM/examples/jiuge.py** — Docstring at top with env summary. - ---- - -## 7. TODO - -- **Remove temporal log and dump code** — Strip or gate debug logging, `INFINI_DEBUG_*`, and temporary dump paths (e.g. 
`/tmp/` tensor dumps, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`) from InfiniLM/InfiniCore once alignment and bring-up are stable. -- **Adapt inference_server.py** — Wire MiniCPM-SALA (and InfiniLM InferEngine) into the inference server (e.g. `inference_server.py` or equivalent in the workspace) so that the server can load and serve MiniCPM-SALA with the same env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) and run generation endpoints. - -### 7.1 Debug and sanity env and code (for future erasing) - -When removing temporal log and dump code, use this as the reference for **env parsing** and **locations to erase or gate**. - -**Environment variables (debug / sanity):** - -| Env var | Parsing / behavior | Purpose | -|---------|---------------------|--------| -| `INFINI_DEBUG_LOG` | Set to a file path (e.g. `/tmp/minicpm_sala_sanity_debug.log`). When set, C++ and Python append JSON/text lines to this file. | Text log for alignment debugging. | -| `INFINI_DEBUG_ATTN_DUMP` | Presence = enable (e.g. `"1"` or any). When set, tensors are written to fixed `/tmp/` paths below. | Enable binary tensor dumps and per-layer stats. 
| - -**Where they are read:** - -- **InfiniLM C++:** `std::getenv("INFINI_DEBUG_LOG")`, `std::getenv("INFINI_DEBUG_ATTN_DUMP")` in: - - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_attention.cpp` (dump_tensor_f32, layer q/k/v/g_gamma and attn out dumps) - - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp` (log_tensor_stats_if_enabled, tensor_to_f32_and_dump, layer input/out dumps) - - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_model.cpp` (dump_tensor_to_bin_if_enabled, log_tensor_stats_if_enabled; embed and final hidden dumps) -- **InfiniLM Python (sanity script):** `os.environ["INFINI_DEBUG_LOG"]`, `os.environ["INFINI_DEBUG_ATTN_DUMP"]` set in `InfiniLM/examples/minicpm_sala_logits_sanity.py` before runs; `os.getenv("INFINI_DEBUG_*")` in `InfiniLM/examples/modeling_minicpm_sala.py` (HF-side hooks that write `/tmp/hf_*.pt` and log to `INFINI_DEBUG_LOG`). - -**Temporary paths to remove or stop writing:** - -- **C++ dumps (binary):** `/tmp/inf_embed_out.bin`, `/tmp/inf_final_hidden.bin`, `/tmp/inf_layer0_q.bin`, `/tmp/inf_layer0_k.bin`, `/tmp/inf_layer0_v.bin`, `/tmp/inf_layer0_g_gamma.bin`, `/tmp/inf_layer1_q.bin`, `/tmp/inf_layer1_k.bin`, `/tmp/inf_layer1_v.bin`, `/tmp/inf_layer1_g_gamma.bin`, `/tmp/inf_layer0_attn_input.bin`, `/tmp/inf_attn_out_layer0.bin`, `/tmp/inf_attn_out_layer1.bin`, `/tmp/inf_layer_out_.bin`. -- **Python (sanity) writes:** `DEBUG_LOG_PATH` (e.g. `/tmp/minicpm_sala_sanity_debug.log`); `/tmp/hf_embed_out.pt`, `/tmp/hf_final_hidden.pt`, `/tmp/hf_layer0_attn_input.pt`, `/tmp/hf_layer_out_.pt`, `/tmp/hf_layer0_q.pt`, `/tmp/hf_layer0_k.pt`, `/tmp/hf_layer0_v.pt`, `/tmp/hf_attn_out_layer0.pt`, `/tmp/hf_layer1_q.pt`, `/tmp/hf_layer1_k.pt`, `/tmp/hf_layer1_v.pt`, `/tmp/hf_attn_out_layer1.pt`. 
-- **Helpers to remove or gate:** `dump_tensor_f32`, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`, `tensor_to_f32_and_dump`; sanity script’s `_append_debug_log`, and all `torch.save(..., "/tmp/...")` / `np.fromfile("/tmp/...")` / `os.path.isfile("/tmp/...")` blocks that exist only for alignment comparison. diff --git a/MiniCPM_SALA_alignment_progress.md b/MiniCPM_SALA_alignment_progress.md deleted file mode 100644 index 538208c9..00000000 --- a/MiniCPM_SALA_alignment_progress.md +++ /dev/null @@ -1,359 +0,0 @@ -### MiniCPM‑SALA sanity alignment – current status - -### Scope - -- **Goal**: Align InfiniLM MiniCPM‑SALA logits with HF reference on the dense/GLA (non‑sparse) path, using the `examples/minicpm_sala_logits_sanity.py` script running inside the `minicpm-sala` container. - ---- - -### Instrumentation and plumbing - -- **Sanity script (`minicpm_sala_logits_sanity.py`)** - - **Backend lock**: All InfiniLM `InferEngine` paths now use `attention_backend="default"` so they hit the dense/GLA fallback. - - **Debug log target**: The script sets `INFINI_DEBUG_LOG=/home/zenghua/repos/.cursor/debug-9146ea.log` and `INFINI_DEBUG_ATTN_DUMP=1` so both Python and C++ write to the same NDJSON file. - - **HF per-layer hooks**: - - `_register_hf_layer_hooks` walks the model (`hf.transformer.layers`, `hf.model.layers`, or `hf.layers`) and registers forward hooks on the first 3 layers. - - For each layer \(i\), it logs: - - `min`, `max`, `mean`, `l2` of the layer output, as `hypothesisId="HF_L"`, `data.layer = i`. - - Hooks are installed for `run_prefill_only` and removed after the forward pass. - -- **InfiniLM attention (`minicpm_sala_attention.cpp`)** - - Existing **layer‑0** diagnostics: - - At entry to `forward_dense_`: `forward_dense_entry` logs env/config, including `INFINI_DEBUG_ATTN_DUMP`, `use_rope`, `use_qk_norm`, `use_output_gate`, `use_output_norm`, `is_sparse_layer`, and shapes. 
- - For layer 0, logs stats for: - - Pre‑gate attention output (`attn_pre_gate`): full tensor min/max/mean, `l2`, shape and scaling. - - Post‑gate/norm (`attn_post_gate`), and post‑`o_proj` (`attn_post_oproj`). - - **Planned / partially implemented**: extended logging for `layer_idx_ < 2` (layers 0 and 1) with: - - `attn_pre_gate_l0` / `attn_pre_gate_l1`. - - `attn_post_gate_l0` / `attn_post_gate_l1`. - - `attn_post_oproj_l0` / `attn_post_oproj_l1`. - - Current runs still only show layer‑0 entries; the `_infinilm` binary in use has not yet picked up the `_l1` variants (see below). - -- **InfiniLM decoder layer (`minicpm_sala_decoder_layer.cpp/.hpp`)** - - **MuP residual scaling**: - - `residual_scale_ = scale_depth / sqrt(num_hidden_layers)` using `scale_depth` from `ModelConfig` (matches HF path). - - `forward` applies: - - `out1 = hidden_states + residual_scale_ * attn_out`. - - `out2 = out1 + residual_scale_ * mlp_out`. - - **Per-layer Inf output stats**: - - New member `size_t layer_idx_` stored from constructor. - - For `layer_idx_ < 3`, after computing `out2`, it: - - Copies to CPU, converts BF16/F16/F32 to float, computes `min`, `max`, `mean`, `l2` and shape. - - Logs as `hypothesisId="INF_L"`, with `data.layer = layer_idx_`. - -- **Weight scaling / MuP configuration (`modeling_utils.py`)** - - Loader reads `config.json` and applies MiniCPM‑style scaling: - - `scale_input = scale_emb`, `scale_depth`, `num_hidden_layers`, `dim_model_base`, `hidden_size`. - - For `model_type == "minicpm_sala"`: - - `scale_o` and `scale_down` are reset to 1.0 (residual scaling is done at C++ forward time). - - `scale_lm_head = dim_model_base / hidden_size` is baked into `lm_head.weight`. - - Embedding and norm weights are scaled as in the MiniCPM scripts. - -- **Rebuild and install (`rebuild.sh`, xmake)** - - `rebuild.sh`: - - `InfiniCore`: `python scripts/install.py --nv-gpu=y --ccl=y --aten=y`, then `xmake build _infinicore` and `xmake install _infinicore`. 
- - `InfiniLM`: optional `xmake clean`, then `xmake build _infinilm` and `xmake install _infinilm`. - - Verified inside container: - - Shared libs in `/root/.infini/lib` are updated (e.g. `libinfiniop.so`, `libinfinicore_cpp_api.so` with current timestamps). - - Python sees `infinilm` from `/home/zenghua/repos/InfiniLM/python/infinilm`. - - The extension in use is `_infinilm` at: - - `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so`. - ---- - -### Sanity run behavior and current misalignment - -- **Command used (container, GPU 1)**: - ```bash - docker exec -e CUDA_VISIBLE_DEVICES=1 minicpm-sala bash -lc ' - source /app/docker/nvidia/env-set.sh - cd /home/zenghua/repos/InfiniLM - python3 examples/minicpm_sala_logits_sanity.py \ - --model_path /data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA \ - --mode prefill \ - --prompt "How are you" - ' - ``` -- **HF vs Inf logits (from `SANITY_ONELINE`)** - - `inf_norm ≈ 387.66` - - `hf_norm ≈ 1588.89` - - **ratio_inf_hf ≈ 0.244** - - `max_diff ≈ 12.77`, `mean_diff ≈ 4.64` - - Top‑1 token IDs differ (HF: 74, Inf: 59358). - -- **HF early layers (from `HF_L` logs)** - - Using the HF hooks in the sanity script: - - Layer 0: `l2 ≈ 59.49` - - Layer 1: `l2 ≈ 73.91` (first GLA layer) - - Layer 2: `l2 ≈ 87.38` - - Norms grow smoothly with depth; nothing obviously pathological on HF side. - -- **Inf attention layer‑0 vs HF** - - HF layer‑0 pre‑gate attention (`modeling_minicpm_sala.py:attn_pre_gate`): - - Shape `[1, 4, 4096]`, `min=-8.375`, `max=9.0`, `mean≈-0.1273`. - - Inf layer‑0: - - **Pre‑gate (`attn_pre_gate`)**: - - `l2 ≈ 105.50`, `min=-8.375`, `max=9.0`. - - Python’s comparison (`compare_attn`) reports `norm_ratio_inf_hf ≈ 0.4487`, i.e. Inf pre‑gate norm ≈ 0.45× HF’s. - - **Post‑gate/norm (`attn_post_gate`)**: - - `l2 ≈ 60.38`, very close to HF layer‑0 output `l2 ≈ 59.49`. - - **Post‑o_proj (`attn_post_oproj`)**: - - `l2 ≈ 98.66` (used as input to the decoder’s residual path). 
- - Interpretation: - - By the end of the **layer‑0 attention block**, Inf and HF are roughly matched in scale at the decoder output (norms ≈ 60). - - The severe **0.244 logits norm ratio** is therefore not due to an immediate blow‑up/vanish at layer‑0 attention output; it accumulates later (likely starting at the first GLA layer and/or via MuP/residual/MLP scaling). - ---- - -### Binary / build state - -- **Extension module mapping** - - In container, importing `infinilm` shows: - - `infinilm.__file__` → `/home/zenghua/repos/InfiniLM/python/infinilm/__init__.py` - - `_infinilm` (top‑level) → `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so` - - That is the `.so` used by the sanity script. - -- **Why new attention logs for layer 1 don’t appear yet** - - `strings _infinilm.cpython-312-...so | grep 'attn_pre_gate_l1'` currently returns **no matches**: - - This confirms the loaded `_infinilm` was built **before** we added the `_l1` logging strings. - - We attempted a fresh `_infinilm` build and initially hit: - - C++ error in `MiniCPMSALADecoderLayer::forward`: `layer_idx_` not declared. - - That prevented `_infinilm` from rebuilding/overwriting the old `.so`, so your layer‑1 logging changes never reached runtime. - -- **Decoder fix applied to unblock rebuild** - - Added `size_t layer_idx_ = 0;` as a private member in `minicpm_sala_decoder_layer.hpp`. - - Set `layer_idx_ = layer_idx;` in the decoder layer constructor. - - After this fix, `_infinilm` can compile; `rebuild.sh` now proceeds past the decoder layer and updates the core libraries (and should be able to update `_infinilm` when the entire build/install completes successfully). - ---- - -### Open issues / next steps - -- **1. Get the new `_infinilm` into use** - - Ensure `rebuild.sh` completes the `_infinilm` build + install step successfully (no early termination due to missing libffi/openssl/ca‑certificates link checks). 
- - Confirm via: - ```bash - strings /home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so \ - | grep -E 'attn_pre_gate_l1|attn_post_gate_l1|attn_post_oproj_l1' - ``` - If this prints the `_l1` labels, the new binary is in place. - -- **2. Re‑run sanity and capture layer‑1 attention logs** - - With the updated `_infinilm`, re‑run the prefill sanity script and inspect `debug-9146ea.log` for: - - `minicpm_sala_attention.cpp:attn_pre_gate_l1` - - `minicpm_sala_attention.cpp:attn_post_gate_l1` - - `minicpm_sala_attention.cpp:attn_post_oproj_l1` - - Compare their `l2` to HF layer‑1 (`HF_L` `l2 ≈ 73.9`). - - This will tell us whether the **first GLA layer** is where Inf starts to diverge in norm, or whether norms remain close through layer 1 and drift later. - -- **3. Use decoder `INF_L` logs to see per‑layer drift** - - Once `_infinilm` is rebuilt, `MiniCPMSALADecoderLayer`’s per‑layer `INF_L` logs for `layer_idx_ < 3` should appear in `debug-9146ea.log`. - - By comparing HF (`HF_L`) vs Inf (`INF_L`) for layers 0/1/2, we can see exactly where norm ratios deviate from ~1 and head toward ~0.244 at the logits. - - That will guide targeted fixes in: - - GLA gating / normalization (in `minicpm_sala_attention.cpp`), and/or - - MuP residual & MLP scaling (still matching HF in formula, but potentially interacting differently with the SALA configuration). - ---- - -### Summary - -- **Plumbing**: Shared log path and HF/Inf instrumentation are in place; per‑layer HF stats and layer‑0 Inf attention stats work and confirm that **layer‑0 attention output scale is roughly aligned**. -- **Mismatch**: Final logits norm is still **Inf/HF ≈ 0.244**, so the discrepancy is accumulating across layers, likely starting at or after the first GLA layer. -- **Blocking issue**: The `_infinilm` C++ extension in use predates the layer‑1 logging changes; an earlier C++ compile error prevented a fresh install. 
That decode‑layer bug has been fixed so we can now rebuild and get the new diagnostics into the runtime. -- **Next milestone**: Successfully rebuild `_infinilm`, confirm the `_l1` log strings are present, rerun sanity, and use the new layer‑1 and decoder `INF_L` stats to precisely locate where Inf’s norms start drifting away from HF. - ---- - -### Host follow-up (2026-03-14) - -- Ran `examples/minicpm_sala_logits_sanity.py --mode prefill --prompt "How are you"` directly on the host using the local venv and the same base env as the documented `jiuge.py` run. -- Extra host-only prep required for the HF reference path: - - installed `flash-linear-attention` to provide the `fla` module - - installed `triton==3.2.0` to avoid the Triton `STAGE` autotune import failure - - created `/home/zenghua/repos/.cursor/` because the script hardcodes `DEBUG_LOG_PATH` there -- Result on host: - - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` - - HF top-1 token id `74`, Inf top-1 token id `23917` -- Interpretation: - - The host environment now reproduces the alignment issue without Docker. - - The ratio is better than the older container snapshot (`~0.244`) but still far from aligned, so the poor generation quality remains consistent with a real logits mismatch. -- Full reproducibility details for this host run were appended to `CURRENT_PROGRESS.md`. - ---- - -### HF MiniCPM4 dense-fallback experiment (2026-03-14) - -- Goal: - - Test whether the remaining mismatch is coming from the HF `minicpm4` sparse-vs-dense code path by forcing `minicpm4` layers onto the standard dense attention implementation. -- HF model-file change: - - Patched both cached copies of `modeling_minicpm_sala.py` so `MiniCPMSALADecoderLayer` uses `MINICPM_ATTENTION_CLASSES[config._attn_implementation]` for `mixer_type == "minicpm4"` instead of `MiniCPMInfLLMv2Attention`. 
- - Backups: - - `/root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210428` - - `/root/.cache/huggingface/modules/transformers_modules/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210619` -- Rerun result: - - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` - - HF top-1 token id `74`, Inf top-1 token id `23917` - - These numbers are unchanged from the earlier host run. -- Fresh per-layer log from `debug-9146ea.log`: - - HF decoder output `l2`: - - layer 0: `59.49` - - layer 1: `73.91` - - layer 2: `87.38` - - Inf decoder output `l2`: - - layer 0: `35.08` - - layer 1: `295.86` - - layer 2: `531.38` - - Inf layer-1 attention stats: - - pre-gate `l2 ~= 749.58` - - post-gate `l2 ~= 745.29` - - post-`o_proj` `l2 ~= 1112.6` -- Interpretation: - - For this short prefill case, forcing HF `minicpm4` to the dense fallback path does not move the mismatch at all. - - The strongest current evidence is that the large norm drift starts in the InfiniLM implementation at or immediately after the first `lightning-attn` layer, not in the HF `minicpm4` branch. - ---- - -### InfiniLM MiniCPM4 HF-math experiment (2026-03-14) - -- Goal: - - Make the InfiniLM `minicpm4` layer compute the same dense attention math as the HF reference path and see whether layer 0 aligns at the start of sanity. -- C++ change: - - In `csrc/models/minicpm_sala/minicpm_sala_attention.cpp`, replaced the `minicpm4` sparse/varlen/grouped fallback branch with an explicit HF-style dense path: - - repeat KV heads to `num_attention_heads` - - compute per-head dense causal attention - - keep the same sigmoid output gate and `o_proj` -- Rebuild: - - Rebuilt and reinstalled `_infinilm` successfully using the local `xmake` toolchain. -- Rerun result: - - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` - - HF top-1 token id `74`, Inf top-1 token id `23917` - - These numbers are unchanged. 
-- Fresh layer stats after the InfiniLM-side change: - - HF decoder output `l2`: `59.49 -> 73.91 -> 87.38` - - Inf decoder output `l2`: `35.08 -> 295.86 -> 531.38` - - Inf layer-0 attention: - - pre-gate `142.87` - - post-gate `80.43` - - post-`o_proj` `135.39` -- Interpretation: - - Even after making the InfiniLM `minicpm4` branch follow the HF dense attention structure, layer 0 does not move toward HF. - - This strongly suggests the remaining mismatch is not in the `minicpm4` attention branch itself; attention should shift to other decoder-path components and especially the first `lightning-attn` layer. - ---- - -### Temporary all-lightning experiment (2026-03-14) - -- Goal: - - Force both HF and InfiniLM to use lightning-style attention math for former `minicpm4` layers as a temporary precision-alignment probe, without changing checkpoint tensor shapes. -- Why not use `config.json` only: - - A direct `mixer_types -> all lightning-attn` config edit failed during HF weight load because former `minicpm4` layers have incompatible checkpoint shapes for the stock `LightningAttention` module (e.g. `256 x 4096` vs `4096 x 4096`). - - The original `mixer_types` config was restored. -- Temporary override implementation: - - Added env flag `MINICPM_SALA_FORCE_ALL_LIGHTNING=1`. 
- - HF side: - - former `minicpm4` layers instantiate `MiniCPMAttention` under the flag - - `MiniCPMAttention.forward()` switches to lightning-style GLA computation under the flag, while keeping original q/k/v/o_proj/o_gate weights - - InfiniLM side: - - `minicpm_sala_attention.cpp` routes sparse layers through `gla_attention` under the same flag - - Sanity script: - - `examples/minicpm_sala_logits_sanity.py` now sets `MINICPM_SALA_FORCE_ALL_LIGHTNING=1` for this experiment -- Result: - - `SANITY_ONELINE ratio=0.4728 max_diff=12.1406 mean_diff=1.9942` - - HF top-1 token id `59375`, Inf top-1 token id `59358` -- Fresh per-layer stats under the override: - - HF decoder output `l2`: - - layer 0: `385.10` - - layer 1: `374.87` - - layer 2: `426.87` - - Inf decoder output `l2`: - - layer 0: `26.23` - - layer 1: `208.72` - - layer 2: `403.90` - - Inf layer-0 attention: - - pre-gate `105.50` - - post-gate `60.38` - - post-`o_proj` `98.66` - - Inf layer-1 attention: - - pre-gate `672.74` - - post-gate `459.67` - - post-`o_proj` `737.03` -- Interpretation: - - The override is definitely active on both sides, because HF logits/top-1 and HF early-layer norms changed substantially. - - However, the former `minicpm4` layers still do not align numerically with InfiniLM under lightning-style attention. - - This points to a mismatch in the lightning formulation itself (decay/slopes, layout, gating, norm/casting, or related details), not just in the original mixed `mixer_types` layout. - ---- - -### Layer-0 narrowing after matched temporary semantics (2026-03-14) - -- Change: - - Updated the temporary HF override so its former `minicpm4` path uses the same grouped causal-softmax math as `InfiniCore` `gla_attention`, instead of `simple_gla` with decay. 
- - Added layer-0 sub-stage logging on both sides: - - HF: `inputs_embeds`, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` - - Inf: embedding output, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` -- Result: - - Layer-0 pre-gate attention still mismatches strongly: - - HF `attn_pre_gate l2 ~= 235.11` - - Inf `attn_pre_gate l2 ~= 105.50` - - `Inf/HF ~= 0.4487` - - But this is no longer the earliest divergence. -- New root-cause evidence: - - Embedding output already differs: - - HF `inputs_embeds l2 ~= 44.09` - - Inf embed output `l2 ~= 25.51` - - First decoder layer pre-norm output also differs: - - HF layer0 `input_layernorm l2 ~= 95.88` - - Inf layer0 `input_layernorm l2 ~= 70.94` -- Interpretation: - - The mismatch starts before layer-0 attention. - - Attention, gating, and `o_proj` are downstream amplifiers, but not the first source. - - The next priority should be MiniCPM-SALA embedding behavior in InfiniLM: - - verify `model.embed_tokens.weight` load/scaling, - - verify runtime embedding lookup output against HF for the same token ids, - - then re-check whether layer-0 attention comes into line automatically. - ---- - -### Multi-layer alignment after embed fix (2026-03-14) - -- Instrumentation added: - - InfiniLM dumps decoder layer outputs (out2) for layers 0–2 to `/tmp/inf_layer_out_{0,1,2}.bin` and final hidden (after norm) to `/tmp/inf_final_hidden.bin` when `INFINI_DEBUG_ATTN_DUMP=1`. - - HF hooks save layer outputs to `/tmp/hf_layer_out_{0,1,2}.pt` and final hidden to `/tmp/hf_final_hidden.pt`. - - Sanity script prints per-layer and final-hidden norm_ratio and max/mean diff. -- Result (prefill "How are you", int32 input_ids workaround): - - **Layer 0**: norm_ratio ≈ 1.0002, max_diff ≈ 0.0625 → aligned. - - **Layer 1**: norm_ratio ≈ 3.24, max_diff ≈ 28.4 → large divergence. - - **Layer 2**: norm_ratio ≈ 5.73 → further drift. -- Root cause for layer 1+: - - Config: layer 0 = `minicpm4` (sparse/dense), layer 1+ = `lightning-attn`. 
- - HF `LightningAttention` uses **Simple GLA** (`chunk_simple_gla` / `fused_recurrent_simple_gla`): linear/recurrent attention with decay (g_gamma), not causal softmax. - - InfiniLM now routes lightning layers through **Simple GLA** (InfiniCore `simple_gla_*` ops), matching HF’s formulation (recurrent with decay). -- Next step to align after layer 0: - - Implement Simple GLA (chunk or fused_recurrent) in InfiniCore and route lightning layers through it, matching HF’s `attn_fn` (decay, scale=1/sqrt(d), layout). - ---- - -### MMLU-Pro validation mismatches vs logit work (2026-03-24) - -Paired lm-eval `--log_samples` runs (HF vs local chat / Infini server) often disagree for **heterogeneous** reasons. Treat them differently before spending time on logits: - -| Heuristic tag (export script) | Meaning | Use logits / greedy trace? | -|------------------------------|---------|----------------------------| -| `model_disagreement` | Both sides return a valid letter choice but disagree; text is on-topic. | **Yes** — same `input_ids` + `run_prefill_and_greedy_trace` localizes numerical / decode divergence. | -| `parse_or_format` | One side `[invalid]` or regex extraction differs though the model may agree. | **No** (first fix template, stops, or metric extraction). | -| `garbage` | Off-topic or corrupted completion (e.g. wrong language / spam). | **No** — serving hygiene, batching, or cache contamination. | - -**Repo tooling** - -- `InfiniLM/examples/eval_tasks/mmlu_pro_val/export_mismatch_subset.py` — join two `samples_*.jsonl` dirs on `doc_hash`, optional filters, heuristic tag, write `mismatch_subset.json` + `.md` (includes `arguments_a` / `arguments_b` for replay). -- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_prompt.py` — rebuild `input_ids` from logged rows (rendered string vs JsonChat message list) like lm-eval. 
-- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_logit_probe.py` — drive `minicpm_sala_logits_sanity.run_prefill_and_greedy_trace` on subset rows (in-process HF + `InferEngine` only; HTTP cannot return logits). -- `InfiniLM/examples/minicpm_sala_logits_sanity.py` — `--mode greedy_trace` for ad-hoc prompts; shared `run_prefill_and_greedy_trace()` for subset probes. - -If greedy trace matches HF on a row but the API eval still differs, diff **chat template**, **stop sequences**, **max_tokens**, or server batching — not the GLA kernel alone. - -**HF vs `local-chat-completions` harness (practical parity)** - -- For the same `doc_hash`, the **rendered prompt string** from `--model hf` can match **byte-for-byte** re-templating the JSON messages the API path logs (verified on a biology mismatch example). -- Differences that still moved scores: **regex extraction** used the *first* `answer is (X)` in long CoT while the model’s final line said another letter; `_default_template_yaml` now uses `group_select: -1` (last match) and case-insensitive pattern. -- **Server**: strip lm-eval’s per-message `type: text` wrapper to `{role, content}` before `apply_chat_template`, and set `continue_final_message=not add_generation_prompt` like lm-eval’s HF model class (`inference_server.py`, `llm.py`). diff --git a/examples/jiuge.py b/examples/jiuge.py index 1fcba6c4..fa547435 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -252,13 +252,9 @@ def test( # ---------------------------------------------------------------------------- # # Create KVCache # ---------------------------------------------------------------------------- # - batch_size = 1 if isinstance(prompts, str) else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) - # MiniCPM-SALA uses per-layer dense KV cache in C++; engine cache_config drives - # scheduling only. Static cache is recommended (no paged bookkeeping) unless - # --enable-paged-attn is explicitly set. 
if enable_paged_attn: - max_total_tokens = initial_capacity + batch_size = 1 if prompts is str else len(prompts) + max_total_tokens = max_new_tokens + len(input_ids_list[0]) cache_config = PagedKVCacheConfig( num_blocks=( (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE @@ -267,12 +263,10 @@ def test( block_size=_PAGED_KV_BLOCK_SIZE, ) else: - max_cache_len = initial_capacity - if getattr(model.config, "model_type", None) == "minicpm_sala": - max_pos = getattr(model.config, "max_position_embeddings", 4096) - max_cache_len = max(initial_capacity, max_pos) + batch_size = 1 if prompts is str else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=max_cache_len + max_batch_size=batch_size, max_cache_len=initial_capacity ) model.reset_cache(cache_config) diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 07ada981..7b6ceea4 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -371,7 +371,6 @@ def apply_chat_template( conversation=messages, add_generation_prompt=add_generation_prompt, tokenize=False, - continue_final_message=not add_generation_prompt, **chat_template_kwargs, ) diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py index 860bf7b9..c9b0cb30 100644 --- a/python/infinilm/llm/static_scheduler.py +++ b/python/infinilm/llm/static_scheduler.py @@ -4,7 +4,6 @@ import logging import queue -import os import janus from typing import List, Optional diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 17a5fe58..ec045185 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -155,32 +155,26 @@ def load_model_state_dict_by_file( torch_dtype = infinicore.utils.to_torch_dtype(dtype) model_keys = model.state_dict_keyname() - # MiniCPM-style scaling (used by MiniCPM / FM9G; also applies to 
MiniCPM-SALA checkpoints). - # This matches `InfiniLM/scripts/jiuge.py` weight scaling behavior. + # MiniCPM-SALA scaling (bake selected MuP scales into weights). + # This matches `InfiniLM/scripts/jiuge.py` weight scaling behavior for `model_type=="minicpm_sala"`. scale_input = 1.0 scale_output = 1.0 scale_o = 1.0 scale_down = 1.0 scale_lm_head = 1.0 try: + # TODO: fetch config from model rather than file directly with open(os.path.join(model_path, "config.json")) as f: cfg = json.load(f) - if ( - cfg.get("model_type") in ["fm9g", "minicpm", "minicpm_sala"] - and "scale_emb" in cfg - and "scale_depth" in cfg - ): + if cfg.get("model_type") == "minicpm_sala" and "scale_emb" in cfg and "scale_depth" in cfg: scale_input = float(cfg["scale_emb"]) scale_o = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) scale_down = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) - if cfg.get("model_type") in ["fm9g", "minicpm"] and "dim_model_base" in cfg: - scale_output = float(int(cfg["hidden_size"]) // int(cfg["dim_model_base"])) - if cfg.get("model_type") == "minicpm_sala" and "dim_model_base" in cfg and "hidden_size" in cfg: + if "dim_model_base" in cfg and "hidden_size" in cfg: scale_lm_head = float(cfg["dim_model_base"]) / float(cfg["hidden_size"]) # minicpm_sala: only bake embed and lm_head; residual scaling done at forward in C++ - if cfg.get("model_type") == "minicpm_sala": - scale_o = 1.0 - scale_down = 1.0 + scale_o = 1.0 + scale_down = 1.0 except Exception: pass From 54a07dd6e7c742ba0198daa82f1bd6d956bd5396 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 06:34:31 +0000 Subject: [PATCH 09/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- python/infinilm/infer_engine.py | 7 ++++--- python/infinilm/modeling_utils.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index 9046b790..f25b97b9 100644 --- 
a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -1,5 +1,4 @@ import time -import os from dataclasses import dataclass import infinicore @@ -79,7 +78,9 @@ def forward( try: # TODO: Remove `_underlying` and simplify the corresponding code. input_ids = input_ids._underlying if input_ids is not None else None - position_ids = position_ids._underlying if position_ids is not None else None + position_ids = ( + position_ids._underlying if position_ids is not None else None + ) past_kv_lengths = ( past_kv_lengths._underlying if past_kv_lengths is not None else None ) @@ -133,7 +134,6 @@ def generate( eos_token_id = generation_config.eos_token_id past_seq_len = 0 - output_ids = [] initial_batch_size, initial_seqlen = input_ids.shape[:2] seq_len = initial_seqlen @@ -291,6 +291,7 @@ def generate( top_k=generation_config.top_k, top_p=generation_config.top_p, ) + output_ids.append(output_id) if ( diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index ec045185..03d3c062 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -95,8 +95,7 @@ def load_state_dict( ) for k in f.keys(): - # Explicitly cast dtype: some ops (e.g. embedding) may not support BF16 on all backends. 
- state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) + state_dict[k] = f.get_tensor(k).to(device=device) return state_dict From f9f6a120412f4e8654f20e246aaed8673b65f83e Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 07:11:28 +0000 Subject: [PATCH 10/11] seperate 2 attn Signed-off-by: Ceng23333 <441651826@qq.com> --- .../minicpm_sala/minicpm_sala_attention.cpp | 398 ++++++++++-------- .../minicpm_sala/minicpm_sala_attention.hpp | 80 ++-- .../minicpm_sala_decoder_layer.cpp | 13 +- .../minicpm_sala_decoder_layer.hpp | 7 +- .../minicpm_sala_for_causal_lm.cpp | 18 +- .../minicpm_sala/minicpm_sala_model.cpp | 18 +- .../minicpm_sala/minicpm_sala_model.hpp | 8 +- 7 files changed, 297 insertions(+), 245 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index f36b84c5..af346445 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -41,7 +41,7 @@ void minicpm_sala_update_layer_kv_tensor(infinicore::Tensor &kv_bundle, const size_t update_len = k_permuted->size(2); const size_t result_len = cache_pos + update_len; if (result_len > k_cache_layer->size(2)) { - throw std::runtime_error("MiniCPMSALAAttention: KV cache length exceeded"); + throw std::runtime_error("MiniCPMSALAAttention(KV update): KV cache length exceeded"); } k_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(k_permuted); v_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(v_permuted); @@ -90,81 +90,39 @@ void ensure_gla_state_allocated(infinicore::Tensor &state, } } // namespace -MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) 
+MiniCPMSALALightningAttention::MiniCPMSALALightningAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx) : model_config_(std::move(model_config)), - rank_info_(rank_info), - layer_idx_(layer_idx), - attention_backend_(attention_backend) { - - // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + layer_idx_(layer_idx) { const auto dtype = model_config_->get_dtype(); + attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; hidden_size_ = model_config_->get("hidden_size"); - if (mixer_type == "minicpm4") { - is_sparse_layer_ = true; - num_attention_heads_ = model_config_->get("num_attention_heads"); - num_key_value_heads_ = model_config_->get("num_key_value_heads"); - head_dim_ = model_config_->get("head_dim"); - - // InfLLM-v2 local-window masking (causal-local semantics) for minicpm4. - // Prefer `sparse_window_size`, but fall back to `window_size` if needed. - int sparse_window_size = model_config_->get_or("sparse_window_size", -1); - if (sparse_window_size <= 0) { - // Some HF configs store this under `sparse_config.window_size`. - auto sparse_cfg = model_config_->get_or("sparse_config", nlohmann::json{}); - if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { - sparse_window_size = sparse_cfg["window_size"].get(); - } else { - sparse_window_size = model_config_->get_or("window_size", -1); - } - } - if (sparse_window_size > 0) { - infllmv2_window_left_ = sparse_window_size; - infllmv2_window_right_ = 0; - use_local_window_ = true; - } - } else { - // Lightning layers have their own head config. 
- num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); - num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); - head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); - } + + num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); + num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); + head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); - // HyPE: RoPE in lightning layers, NoPE in sparse (minicpm4) layers. - // We treat all non-minicpm4 as "linear" (lightning-attn) for M1 dense fallback. - use_rope_ = (mixer_type != "minicpm4") && model_config_->get_or("lightning_use_rope", true); + use_rope_ = model_config_->get_or("lightning_use_rope", true); rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); - // MiniCPM-SALA uses QK-norm and output gates by default. 
- use_qk_norm_ = model_config_->get_or("qk_norm", true) && (mixer_type != "minicpm4"); + use_qk_norm_ = model_config_->get_or("qk_norm", true); use_output_gate_ = model_config_->get_or("use_output_gate", true); - // Projections INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); - if (mixer_type == "minicpm4") { - // Sparse layers use o_gate (sigmoid gate on attention output) - INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); - } else { - // Lightning layers use q/k norm + output norm and z-projection gate - if (use_qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - } - use_output_norm_ = true; - // Checkpoint uses o_norm over hidden_size (shape [hidden_size]). - INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, dtype, device); + if (use_qk_norm_) { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); } - // Simple GLA decay for lightning path: g_gamma = _build_slope_tensor * -1. 
+ use_output_norm_ = true; + INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, dtype, device); + std::vector slopes = build_slope_tensor(num_attention_heads_); auto g_cpu = infinicore::Tensor::empty( {num_attention_heads_}, infinicore::DataType::F32, infinicore::Device::cpu()); @@ -174,17 +132,14 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptrto(device); } -void MiniCPMSALAAttention::reset_state() { - // KV tensors are maintained by the shared engine cache (StaticKVCache). - // Lightning decode recurrent state is maintained locally for performance. +void MiniCPMSALALightningAttention::reset_state() { gla_state_valid_ = false; gla_state_cached_len_ = 0; gla_state_ = {}; } - -infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &position_ids, - const infinicore::Tensor &hidden_states) const { +infinicore::Tensor MiniCPMSALALightningAttention::forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const { const auto &attn_meta = infinilm::global_state::get_forward_context().attn_metadata; auto past_sequence_lengths = attn_meta.past_sequence_lengths; auto total_sequence_lengths = attn_meta.total_sequence_lengths; @@ -218,7 +173,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit // RoPE only for lightning layers (HyPE) if (use_rope_) { if (!rotary_emb_) { - throw std::runtime_error("MiniCPMSALAAttention: rotary_emb is not set but use_rope=true"); + throw std::runtime_error("MiniCPMSALALightningAttention: rotary_emb is not set but use_rope=true"); } // position_ids can be [B,S] or [S]; follow LlamaAttention behavior. 
auto pos_shape = position_ids->shape(); @@ -229,7 +184,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit } else if (pos_shape.size() == 1) { pos_ids_for_rope = position_ids->contiguous(); } else { - throw std::runtime_error("MiniCPMSALAAttention: Unexpected position_ids shape"); + throw std::runtime_error("MiniCPMSALALightningAttention: Unexpected position_ids shape"); } rotary_emb_->forward(q_reshaped, pos_ids_for_rope, true); @@ -271,7 +226,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit auto &kv_vec = infinilm::global_state::get_forward_context().kv_cache_vec; if (layer_idx_ >= kv_vec.size()) { throw std::runtime_error( - "MiniCPMSALAAttention: forward_context.kv_cache_vec is unset or too small (call reset_cache / align layer count)"); + "MiniCPMSALALightningAttention: forward_context.kv_cache_vec is unset or too small (call reset_cache / align layer count)"); } use_forward_kv = true; minicpm_sala_update_layer_kv_tensor( @@ -289,14 +244,14 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit // Slice to total_seq_len (decode-only / cont-batch) if (total_seq_len > k_total->shape()[2]) { - throw std::runtime_error("MiniCPMSALAAttention: total_seq_len exceeds available KV length (cache not correctly updated)"); + throw std::runtime_error("MiniCPMSALALightningAttention: total_seq_len exceeds available KV length (cache not correctly updated)"); } k_total = k_total->narrow({{2, 0, total_seq_len}}); v_total = v_total->narrow({{2, 0, total_seq_len}}); infinicore::Tensor attn_output; - if (!is_sparse_layer_) { - // Lightning-attn: Simple GLA (HF-aligned). + { + // Lightning-attn only: Simple GLA (HF-aligned). // simple_gla_attention(q,k,v,g_gamma,scale) expects [B, T, H, D]; g_gamma [H]. 
const size_t n_h = num_attention_heads_; const size_t n_kv = num_key_value_heads_; @@ -398,116 +353,17 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit infinicore::Tensor out_slice = gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); } - } else { - // minicpm4 layers must use InfLLM-v2 attention (hard error if not available). - // NOTE: Lightning layers keep Simple GLA for correctness; only minicpm4 routes here. - try { - if (!total_sequence_lengths.has_value()) { - throw std::runtime_error( - "MiniCPMSALAAttention(minicpm4): total_sequence_lengths is required for InfLLM-v2 path"); - } - // `infllmv2_kvcache` expects the number of valid K/V entries in the - // provided cache tensors. After per-layer KV update, valid length is - // total KV length (past + current token). - const auto cache_lens = total_sequence_lengths.value(); - - // Prefill: InfLLM-v2 varlen (Q and K packed lengths match `seq_len == total_seq_len` here). - // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after KV tensor update - // (valid KV length == `total_seq_len`). Using varlen for decode (1 query vs long K) hit NaNs - // in practice for modest sequence lengths; kvcache matches operator tests and Flash path. 
- const bool force_varlen_decode = [&]() { - const char *env = std::getenv("INFINI_MINICPM4_DECODE_VARLEN"); - return env && env[0] != '\0' && env[0] != '0'; - }(); - - if (seq_len == total_seq_len || (force_varlen_decode && batch_size == 1)) { - if (batch_size != 1) { - throw std::runtime_error("MiniCPMSALAAttention(minicpm4): varlen prefill path currently requires batch_size=1"); - } - auto q_bshd = q_reshaped->contiguous(); // [B, S, n_h, D] - auto k_btkd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - auto v_btkd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - auto q_var = q_bshd->view({static_cast(seq_len), static_cast(num_attention_heads_), static_cast(head_dim_)}); - auto k_var = k_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); - auto v_var = v_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); - - auto cuq_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); - reinterpret_cast(cuq_cpu->data())[0] = 0; - reinterpret_cast(cuq_cpu->data())[1] = static_cast(seq_len); - infinicore::Tensor cu_q = cuq_cpu->to(q_var->device()); - // cu_k corresponds to the full KV length used by k_var/v_var. - auto cuk_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); - reinterpret_cast(cuk_cpu->data())[0] = 0; - reinterpret_cast(cuk_cpu->data())[1] = static_cast(total_seq_len); - infinicore::Tensor cu_k = cuk_cpu->to(q_var->device()); - - const bool infllmv2_causal = !use_local_window_; - const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; - const int window_right = use_local_window_ ? 
0 : -1; - - auto out_var = infinicore::op::infllmv2_varlen( - q_var, k_var, v_var, - cu_q, cu_k, - static_cast(seq_len), - static_cast(total_seq_len), - scaling_, - /*causal=*/infllmv2_causal, - /*window_size_left=*/window_left, - /*window_size_right=*/window_right); - attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); - } else if (use_forward_kv) { - if (batch_size != 1) { - throw std::runtime_error("MiniCPMSALAAttention(minicpm4): kvcache decode path currently requires batch_size=1"); - } - auto q_bshd = q_reshaped->contiguous(); // [B, S_q, n_h, D] - auto k_bthd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - auto v_bthd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - - const bool infllmv2_causal = !use_local_window_; - const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; - const int window_right = use_local_window_ ? 0 : -1; - - auto out_bshd = infinicore::op::infllmv2_kvcache( - q_bshd, - k_bthd, - v_bthd, - cache_lens, - scaling_, - /*causal=*/infllmv2_causal, - /*window_size_left=*/window_left, - /*window_size_right=*/window_right); - attn_output = out_bshd->contiguous()->view( - {batch_size, seq_len, num_attention_heads_ * head_dim_}); - } else { - throw std::runtime_error( - "MiniCPMSALAAttention(minicpm4): decode requires KV cache (missing cache metadata or kv_cache_vec)"); - } - } catch (const std::exception &e) { - throw std::runtime_error( - std::string("MiniCPMSALAAttention(minicpm4): InfLLM-v2 attention failed. ") - + "This build must provide InfLLM-v2 (ENABLE_INFLLMV2+ENABLE_ATEN) and the infllmv2_cuda_impl .so " - + "must be available via LD_PRELOAD/LD_LIBRARY_PATH. 
Original error: " + e.what()); - } } - // Output norm + gate variants + // Lightning output gate/norm if (use_output_gate_) { - if (o_gate_) { - // Sparse (minicpm4): y = sigmoid(o_gate(x)) * attn_output - auto gate_in = hidden_states; - auto gate = o_gate_->forward(gate_in); - infinicore::op::sigmoid_(gate, gate); - attn_output = infinicore::op::mul(attn_output, gate); - } else if (z_proj_) { - // Lightning: match HF LightningAttention: o_norm(o) then o * sigmoid(z_proj(x)). - auto z_in = hidden_states; - auto z = z_proj_->forward(z_in); - infinicore::op::sigmoid_(z, z); - if (use_output_norm_ && o_norm_) { - attn_output = o_norm_->forward(attn_output); - } - attn_output = infinicore::op::mul(attn_output, z); + auto z_in = hidden_states; + auto z = z_proj_->forward(z_in); + infinicore::op::sigmoid_(z, z); + if (use_output_norm_ && o_norm_) { + attn_output = o_norm_->forward(attn_output); } + attn_output = infinicore::op::mul(attn_output, z); } else if (use_output_norm_ && o_norm_) { attn_output = o_norm_->forward(attn_output); } @@ -518,4 +374,188 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit return out; } +MiniCPMSALAMinicpm4Attention::MiniCPMSALAMinicpm4Attention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx) + : model_config_(std::move(model_config)), + layer_idx_(layer_idx) { + (void)device; + const auto dtype = model_config_->get_dtype(); + attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; + hidden_size_ = model_config_->get("hidden_size"); + num_attention_heads_ = model_config_->get("num_attention_heads"); + num_key_value_heads_ = model_config_->get("num_key_value_heads"); + head_dim_ = model_config_->get("head_dim"); + scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); + + int sparse_window_size = model_config_->get_or("sparse_window_size", -1); + if (sparse_window_size <= 0) { + auto sparse_cfg = 
model_config_->get_or("sparse_config", nlohmann::json{}); + if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { + sparse_window_size = sparse_cfg["window_size"].get(); + } else { + sparse_window_size = model_config_->get_or("window_size", -1); + } + } + if (sparse_window_size > 0) { + infllmv2_window_left_ = sparse_window_size; + infllmv2_window_right_ = 0; + use_local_window_ = true; + } + + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); +} + +void MiniCPMSALAMinicpm4Attention::reset_state() { + // no local recurrent state +} + +infinicore::Tensor MiniCPMSALAMinicpm4Attention::forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const { + (void)position_ids; + const auto &attn_meta = infinilm::global_state::get_forward_context().attn_metadata; + auto past_sequence_lengths = attn_meta.past_sequence_lengths; + auto total_sequence_lengths = attn_meta.total_sequence_lengths; + + auto shape = hidden_states->shape(); + const size_t batch_size = shape[0]; + const size_t seq_len = shape[1]; + + auto hs_mut = hidden_states; + auto q = q_proj_->forward(hs_mut); + auto k = k_proj_->forward(hs_mut); + auto v = v_proj_->forward(hs_mut); + auto q_reshaped = q->contiguous()->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + auto k_reshaped = k->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + auto v_reshaped = v->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + + // KV update via per-layer 
kv_cache_vec when metadata present + size_t total_seq_len = seq_len; + size_t cache_pos = 0; + const bool has_cache_meta = past_sequence_lengths.has_value() && total_sequence_lengths.has_value(); + if (has_cache_meta) { + auto past_cpu = past_sequence_lengths.value()->to(infinicore::Device::cpu()); + cache_pos = reinterpret_cast(past_cpu->data())[0]; + total_seq_len = cache_pos + seq_len; + } + auto k_permuted = k_reshaped->permute({0, 2, 1, 3})->contiguous(); + auto v_permuted = v_reshaped->permute({0, 2, 1, 3})->contiguous(); + + infinicore::Tensor k_total = k_permuted; + infinicore::Tensor v_total = v_permuted; + bool use_forward_kv = false; + if (has_cache_meta) { + auto &kv_vec = infinilm::global_state::get_forward_context().kv_cache_vec; + if (layer_idx_ >= kv_vec.size()) { + throw std::runtime_error( + "MiniCPMSALAMinicpm4Attention: forward_context.kv_cache_vec is unset or too small"); + } + use_forward_kv = true; + minicpm_sala_update_layer_kv_tensor( + kv_vec[layer_idx_], + k_permuted, + v_permuted, + past_sequence_lengths.value()); + auto k_cache_layer = kv_vec[layer_idx_]->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_vec[layer_idx_]->narrow({{0, 1, 1}})->squeeze(0); + k_total = k_cache_layer; + v_total = v_cache_layer; + } else { + total_seq_len = seq_len; + } + + if (total_seq_len > k_total->shape()[2]) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: total_seq_len exceeds available KV length"); + } + k_total = k_total->narrow({{2, 0, total_seq_len}}); + v_total = v_total->narrow({{2, 0, total_seq_len}}); + + try { + if (!total_sequence_lengths.has_value()) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: total_sequence_lengths is required for InfLLM-v2 path"); + } + const auto cache_lens = total_sequence_lengths.value(); + const bool force_varlen_decode = [&]() { + const char *env = std::getenv("INFINI_MINICPM4_DECODE_VARLEN"); + return env && env[0] != '\0' && env[0] != '0'; + }(); + + infinicore::Tensor 
attn_output; + if (seq_len == total_seq_len || (force_varlen_decode && batch_size == 1)) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: varlen path requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); + auto k_btkd = k_total->permute({0, 2, 1, 3})->contiguous(); + auto v_btkd = v_total->permute({0, 2, 1, 3})->contiguous(); + auto q_var = q_bshd->view({static_cast(seq_len), static_cast(num_attention_heads_), static_cast(head_dim_)}); + auto k_var = k_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + auto v_var = v_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + + auto cuq_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuq_cpu->data())[0] = 0; + reinterpret_cast(cuq_cpu->data())[1] = static_cast(seq_len); + infinicore::Tensor cu_q = cuq_cpu->to(q_var->device()); + auto cuk_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuk_cpu->data())[0] = 0; + reinterpret_cast(cuk_cpu->data())[1] = static_cast(total_seq_len); + infinicore::Tensor cu_k = cuk_cpu->to(q_var->device()); + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 
0 : -1; + + auto out_var = infinicore::op::infllmv2_varlen( + q_var, k_var, v_var, + cu_q, cu_k, + static_cast(seq_len), + static_cast(total_seq_len), + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else if (use_forward_kv) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: kvcache decode requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); + auto k_bthd = k_total->permute({0, 2, 1, 3})->contiguous(); + auto v_bthd = v_total->permute({0, 2, 1, 3})->contiguous(); + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 0 : -1; + + auto out_bshd = infinicore::op::infllmv2_kvcache( + q_bshd, + k_bthd, + v_bthd, + cache_lens, + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_bshd->contiguous()->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: decode requires KV cache"); + } + + // Sparse gate + o_proj + auto gate = o_gate_->forward(hs_mut); + infinicore::op::sigmoid_(gate, gate); + attn_output = infinicore::op::mul(attn_output, gate); + auto out = o_proj_->forward(attn_output); + return out; + } catch (const std::exception &e) { + throw std::runtime_error( + std::string("MiniCPMSALAMinicpm4Attention: InfLLM-v2 attention failed. 
") + + "Original error: " + e.what()); + } +} + } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 2013d678..43784627 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -3,8 +3,8 @@ #include "../../backends/attention_backends.hpp" #include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" #include "../../layers/rotary_embedding/rotary_embedding.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -17,26 +17,28 @@ namespace infinilm::models::minicpm_sala { -// Dense attention fallback implementation used for Milestone 1. -// Parameter names are aligned with HF MiniCPM-SALA safetensors keys: -// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,...} -// TODO(refactor): KV cache is currently per-layer dense; refactor to use engine paged KV pool -// and block_tables/slot_mapping to match SGLang minicpm-sala pattern (see minicpm_sala_attention.cpp). -class MiniCPMSALAAttention : public infinicore::nn::Module { +class MiniCPMSALAAttentionBase : public infinicore::nn::Module { public: - MiniCPMSALAAttention(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + virtual infinicore::Tensor forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const = 0; + virtual void reset_state() = 0; + virtual ~MiniCPMSALAAttentionBase() = default; +}; + +// Lightning attention path (Simple GLA). 
Parameter names align with HF: +// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,q_norm,k_norm,o_norm,z_proj,...} +class MiniCPMSALALightningAttention : public MiniCPMSALAAttentionBase { +public: + MiniCPMSALALightningAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx); // Match `infinilm::layers::attention::Attention` API: metadata is pulled from // `global_state::get_forward_context().attn_metadata`. infinicore::Tensor forward(const infinicore::Tensor &position_ids, - const infinicore::Tensor &hidden_states) const; + const infinicore::Tensor &hidden_states) const override; - void reset_state(); + void reset_state() override; protected: // Projections (HF-aligned naming) @@ -51,12 +53,8 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); INFINICORE_NN_MODULE(infinicore::nn::Linear, z_proj); - // Optional (Sparse layers): o_gate - INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); - std::shared_ptr model_config_; std::shared_ptr rotary_emb_; - engine::distributed::RankInfo rank_info_; size_t layer_idx_; size_t hidden_size_; @@ -69,13 +67,6 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { bool use_output_gate_ = false; bool use_output_norm_ = false; bool use_rope_ = false; - bool is_sparse_layer_ = false; - - // InfLLM-v2 local-window masking plumbing for `mixer_type=="minicpm4"`. - // When enabled: causal=false + window_size_left=sparse_window_size + window_size_right=0. - int infllmv2_window_left_ = -1; - int infllmv2_window_right_ = -1; - bool use_local_window_ = false; backends::AttentionBackend attention_backend_; @@ -89,4 +80,41 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { mutable bool gla_state_valid_ = false; }; +// Sparse attention path (`mixer_type=="minicpm4"`) using InfLLM-v2 operators. 
+// Parameter names align with HF: +// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,o_gate,...} +class MiniCPMSALAMinicpm4Attention : public MiniCPMSALAAttentionBase { +public: + MiniCPMSALAMinicpm4Attention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx); + + infinicore::Tensor forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const override; + + void reset_state() override; + +protected: + INFINICORE_NN_MODULE(infinicore::nn::Linear, q_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, k_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, v_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); + + std::shared_ptr model_config_; + size_t layer_idx_; + size_t hidden_size_; + size_t num_attention_heads_; + size_t num_key_value_heads_; + size_t head_dim_; + float scaling_; + + // InfLLM-v2 local-window masking plumbing. + int infllmv2_window_left_ = -1; + int infllmv2_window_right_ = -1; + bool use_local_window_ = false; + + backends::AttentionBackend attention_backend_; +}; + } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index 7a44704e..b565cf47 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,5 +1,6 @@ #include "minicpm_sala_decoder_layer.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -15,9 +16,7 @@ namespace infinilm::models::minicpm_sala { MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) { + const 
std::string &mixer_type) { layer_idx_ = layer_idx; // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config->get_dtype(); @@ -29,7 +28,13 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr(num_layers)); INFINICORE_NN_MODULE_INIT(input_layernorm, model_config->get("hidden_size"), eps, dtype, device); - INFINICORE_NN_MODULE_INIT(self_attn, model_config, device, layer_idx, mixer_type, rank_info, attention_backend); + if (mixer_type == "minicpm4") { + self_attn_ = this->register_module( + "self_attn", model_config, device, layer_idx); + } else { + self_attn_ = this->register_module( + "self_attn", model_config, device, layer_idx); + } INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config->get("hidden_size"), eps, dtype, device); INFINICORE_NN_MODULE_INIT(mlp, model_config, device); } diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 44d320c9..52f31000 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -23,9 +23,7 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { MiniCPMSALADecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + const std::string &mixer_type); infinicore::Tensor forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids) const; @@ -38,7 +36,8 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { protected: INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(MiniCPMSALAAttention, self_attn); + // Registered under the HF-compatible name "self_attn" in ctor. 
+ std::shared_ptr self_attn_; INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); INFINICORE_NN_MODULE(MiniCPMSALAMLP, mlp); }; diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index fb55556f..791a7832 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -55,15 +55,15 @@ MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( auto block_tables = input.block_tables; auto slot_mapping = input.slot_mapping; - auto hidden_states = model_->forward( - input_ids, - position_ids, - past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); + infinilm::global_state::get_forward_context().attn_metadata = + infinilm::global_state::AttentionMetadata(past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + + auto hidden_states = model_->forward(input_ids, position_ids); // MuP lm_head scale baked into lm_head.weight at load time; no forward scaling here. auto logits = lm_head_->forward(hidden_states); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index f6d9bb4d..f665ce0a 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -48,7 +48,7 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrregister_module( - "layers." + std::to_string(i), model_config_, device, i, mixer_types[i], rank_info, attention_backend)); + "layers." 
+ std::to_string(i), model_config_, device, i, mixer_types[i])); } } @@ -59,21 +59,7 @@ void MiniCPMSALAModel::reset_state() { } infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - infinilm::global_state::get_forward_context().attn_metadata = - infinilm::global_state::AttentionMetadata(past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); - + const infinicore::Tensor &position_ids) const { // MuP scaling baked into weights at load time for minicpm_sala; no forward scaling here. auto hs = embed_tokens_->forward(input_ids); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index 9b4a81c2..ed79cd76 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -27,13 +27,7 @@ class MiniCPMSALAModel : public infinicore::nn::Module { const infinicore::Device &device); infinicore::Tensor forward(const infinicore::Tensor &input_ids, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; + const infinicore::Tensor &position_ids) const; void reset_state(); From fe79f914fa2b89629840c88ce2a7d49cf8e918b4 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Fri, 10 Apr 2026 03:07:00 +0000 Subject: [PATCH 11/11] cleanup code Signed-off-by: Ceng23333 <441651826@qq.com> --- .../minicpm_sala/minicpm_sala_attention.cpp | 79 ++++++++----------- .../minicpm_sala/minicpm_sala_attention.hpp | 12 --- .../minicpm_sala_decoder_layer.cpp | 6 
+- .../minicpm_sala_decoder_layer.hpp | 9 +-- .../minicpm_sala_for_causal_lm.cpp | 21 +---- .../minicpm_sala_for_causal_lm.hpp | 6 +- csrc/models/minicpm_sala/minicpm_sala_mlp.cpp | 10 +-- csrc/models/minicpm_sala/minicpm_sala_mlp.hpp | 4 - .../minicpm_sala/minicpm_sala_model.cpp | 27 +++---- .../minicpm_sala/minicpm_sala_model.hpp | 14 ---- 10 files changed, 56 insertions(+), 132 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index af346445..27cb2275 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -93,35 +93,33 @@ void ensure_gla_state_allocated(infinicore::Tensor &state, MiniCPMSALALightningAttention::MiniCPMSALALightningAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx) - : model_config_(std::move(model_config)), - layer_idx_(layer_idx) { - const auto dtype = model_config_->get_dtype(); - attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; - hidden_size_ = model_config_->get("hidden_size"); - - num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); - num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); - head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); + : layer_idx_(layer_idx) { + const auto dtype = model_config->get_dtype(); + const size_t hidden_size = model_config->get("hidden_size"); + + num_attention_heads_ = model_config->get_or("lightning_nh", model_config->get("num_attention_heads")); + num_key_value_heads_ = model_config->get_or("lightning_nkv", model_config->get("num_key_value_heads")); + head_dim_ = model_config->get_or("lightning_head_dim", model_config->get("head_dim")); scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); - use_rope_ = 
model_config_->get_or("lightning_use_rope", true); - rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); + use_rope_ = model_config->get_or("lightning_use_rope", true); + rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device); - use_qk_norm_ = model_config_->get_or("qk_norm", true); - use_output_gate_ = model_config_->get_or("use_output_gate", true); + use_qk_norm_ = model_config->get_or("qk_norm", true); + use_output_gate_ = model_config->get_or("use_output_gate", true); - INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size, false, dtype, device); if (use_qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config->get("rms_norm_eps"), dtype, device); } use_output_norm_ = true; - INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, 
dtype, device); + INFINICORE_NN_MODULE_INIT(o_norm, hidden_size, model_config->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(z_proj, hidden_size, hidden_size, false, dtype, device); std::vector slopes = build_slope_tensor(num_attention_heads_); auto g_cpu = infinicore::Tensor::empty( @@ -196,15 +194,9 @@ infinicore::Tensor MiniCPMSALALightningAttention::forward(const infinicore::Tens size_t cache_pos = 0; const bool has_cache_meta = past_sequence_lengths.has_value() && total_sequence_lengths.has_value(); if (has_cache_meta) { - // Single device-to-host sync: read both scalars (engine could pass these as scalars later). auto past_cpu = past_sequence_lengths.value()->to(infinicore::Device::cpu()); - auto total_cpu = total_sequence_lengths.value()->to(infinicore::Device::cpu()); cache_pos = reinterpret_cast(past_cpu->data())[0]; - size_t total_seq_len_raw = reinterpret_cast(total_cpu->data())[0]; - total_seq_len = total_seq_len_raw; - // Some engine call sites pass `total_sequence_lengths` as the *input* length (e.g. 1 for decode), - // while `past_sequence_lengths` is the cached KV length. Attention needs total KV length. - // Use KV semantics: total_kv_len = cache_pos + current seq_len. + // `total_sequence_lengths` may be input length (e.g. 1 on decode); KV length is cache_pos + seq_len. 
total_seq_len = cache_pos + seq_len; } else if (total_sequence_lengths.has_value()) { total_seq_len = reinterpret_cast(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0]; @@ -377,37 +369,34 @@ infinicore::Tensor MiniCPMSALALightningAttention::forward(const infinicore::Tens MiniCPMSALAMinicpm4Attention::MiniCPMSALAMinicpm4Attention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx) - : model_config_(std::move(model_config)), - layer_idx_(layer_idx) { + : layer_idx_(layer_idx) { (void)device; - const auto dtype = model_config_->get_dtype(); - attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; - hidden_size_ = model_config_->get("hidden_size"); - num_attention_heads_ = model_config_->get("num_attention_heads"); - num_key_value_heads_ = model_config_->get("num_key_value_heads"); - head_dim_ = model_config_->get("head_dim"); + const auto dtype = model_config->get_dtype(); + const size_t hidden_size = model_config->get("hidden_size"); + num_attention_heads_ = model_config->get("num_attention_heads"); + num_key_value_heads_ = model_config->get("num_key_value_heads"); + head_dim_ = model_config->get("head_dim"); scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); - int sparse_window_size = model_config_->get_or("sparse_window_size", -1); + int sparse_window_size = model_config->get_or("sparse_window_size", -1); if (sparse_window_size <= 0) { - auto sparse_cfg = model_config_->get_or("sparse_config", nlohmann::json{}); + auto sparse_cfg = model_config->get_or("sparse_config", nlohmann::json{}); if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { sparse_window_size = sparse_cfg["window_size"].get(); } else { - sparse_window_size = model_config_->get_or("window_size", -1); + sparse_window_size = model_config->get_or("window_size", -1); } } if (sparse_window_size > 0) { infllmv2_window_left_ = sparse_window_size; - infllmv2_window_right_ = 0; use_local_window_ = 
true; } - INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_gate, hidden_size, hidden_size, false, dtype, device); } void MiniCPMSALAMinicpm4Attention::reset_state() { diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 43784627..9af665aa 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -1,10 +1,7 @@ #pragma once -#include "../../backends/attention_backends.hpp" -#include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" #include "../../layers/rotary_embedding/rotary_embedding.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -53,11 +50,9 @@ class MiniCPMSALALightningAttention : public MiniCPMSALAAttentionBase { INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); INFINICORE_NN_MODULE(infinicore::nn::Linear, z_proj); - std::shared_ptr model_config_; std::shared_ptr rotary_emb_; size_t layer_idx_; - size_t hidden_size_; 
size_t num_attention_heads_; size_t num_key_value_heads_; size_t head_dim_; @@ -68,8 +63,6 @@ class MiniCPMSALALightningAttention : public MiniCPMSALAAttentionBase { bool use_output_norm_ = false; bool use_rope_ = false; - backends::AttentionBackend attention_backend_; - // Lightning layers only: per-head log-decay for Simple GLA (HF _build_slope_tensor * -1). infinicore::Tensor g_gamma_; @@ -101,9 +94,7 @@ class MiniCPMSALAMinicpm4Attention : public MiniCPMSALAAttentionBase { INFINICORE_NN_MODULE(infinicore::nn::Linear, o_proj); INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); - std::shared_ptr model_config_; size_t layer_idx_; - size_t hidden_size_; size_t num_attention_heads_; size_t num_key_value_heads_; size_t head_dim_; @@ -111,10 +102,7 @@ class MiniCPMSALAMinicpm4Attention : public MiniCPMSALAAttentionBase { // InfLLM-v2 local-window masking plumbing. int infllmv2_window_left_ = -1; - int infllmv2_window_right_ = -1; bool use_local_window_ = false; - - backends::AttentionBackend attention_backend_; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index b565cf47..6c04b480 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,6 +1,5 @@ #include "minicpm_sala_decoder_layer.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -17,7 +16,6 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptrget_dtype(); const double eps = model_config->get("rms_norm_eps"); @@ -39,6 +37,10 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptrreset_state(); +} + infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids) const { // Pre-norm attention diff --git 
a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 52f31000..305ab967 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -3,9 +3,7 @@ #include "minicpm_sala_attention.hpp" #include "minicpm_sala_mlp.hpp" -#include "../../backends/attention_backends.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" @@ -16,8 +14,6 @@ namespace infinilm::models::minicpm_sala { -class MiniCPMSALAModel; - class MiniCPMSALADecoderLayer : public infinicore::nn::Module { public: MiniCPMSALADecoderLayer(std::shared_ptr model_config, @@ -28,11 +24,10 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { infinicore::Tensor forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids) const; -private: - friend class MiniCPMSALAModel; + void reset_attn_state(); +private: double residual_scale_ = 1.0; - size_t layer_idx_ = 0; protected: INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index 791a7832..de6f34e1 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -25,16 +25,12 @@ std::shared_ptr create_minicpm_sala_model_config( MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) { + const infinicore::Device &device) { device_ = device; model_config_ = model_config; // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
const auto dtype = model_config->get_dtype(); - (void)rank_info; - (void)attention_backend; INFINICORE_NN_MODULE_INIT(model, model_config, device); const size_t hidden_size = model_config->get("hidden_size"); @@ -48,21 +44,6 @@ MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( auto input_ids = input.input_ids.value(); auto position_ids = input.position_ids.value(); - auto past_sequence_lengths = input.past_sequence_lengths; - auto total_sequence_lengths = input.total_sequence_lengths; - auto input_offsets = input.input_offsets; - auto cu_seqlens = input.cu_seqlens; - auto block_tables = input.block_tables; - auto slot_mapping = input.slot_mapping; - - infinilm::global_state::get_forward_context().attn_metadata = - infinilm::global_state::AttentionMetadata(past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); - auto hidden_states = model_->forward(input_ids, position_ids); // MuP lm_head scale baked into lm_head.weight at load time; no forward scaling here. 
diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index 9344dfd3..0a53e101 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -4,8 +4,6 @@ #include "minicpm_sala_model.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" -#include "../../backends/attention_backends.hpp" #include "../../layers/linear/linear.hpp" #include "infinicore/device.hpp" @@ -18,9 +16,7 @@ namespace infinilm::models::minicpm_sala { class MiniCPMSALAForCausalLM : public InfinilmModel { public: MiniCPMSALAForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + const infinicore::Device &device); Output forward(const Input &input) const override; diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp index 649c0095..b9ebd3c6 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp @@ -8,12 +8,12 @@ MiniCPMSALAMLP::MiniCPMSALAMLP(std::shared_ptr mo const infinicore::Device &device) { // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
const auto dtype = model_config->get_dtype(); - hidden_size_ = model_config->get("hidden_size"); - intermediate_size_ = model_config->get("intermediate_size"); + const size_t hidden_size = model_config->get("hidden_size"); + const size_t intermediate_size = model_config->get("intermediate_size"); - INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size_, intermediate_size_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(up_proj, hidden_size_, intermediate_size_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size, intermediate_size, false, dtype, device); + INFINICORE_NN_MODULE_INIT(up_proj, hidden_size, intermediate_size, false, dtype, device); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size, hidden_size, false, dtype, device); } infinicore::Tensor MiniCPMSALAMLP::forward(const infinicore::Tensor &x) const { diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp index 9a90527a..3150670b 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp @@ -21,10 +21,6 @@ class MiniCPMSALAMLP : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::Linear, gate_proj); INFINICORE_NN_MODULE(infinicore::nn::Linear, up_proj); INFINICORE_NN_MODULE(infinicore::nn::Linear, down_proj); - -private: - size_t hidden_size_; - size_t intermediate_size_; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index f665ce0a..20c6d420 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -12,32 +12,23 @@ namespace infinilm::models::minicpm_sala { MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptr model_config, - const infinicore::Device &device) - : 
model_config_(std::move(model_config)) { + const infinicore::Device &device) { // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). - const auto dtype = model_config_->get_dtype(); - compute_device_ = device; - const engine::distributed::RankInfo &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info(); - const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; + const auto dtype = model_config->get_dtype(); - hidden_size_ = model_config_->get("hidden_size"); - dim_model_base_ = model_config_->get_or("dim_model_base", static_cast(hidden_size_)); - scale_emb_ = model_config_->get_or("scale_emb", 1.0); + hidden_size_ = model_config->get("hidden_size"); - const size_t vocab_size = model_config_->get("vocab_size"); - const size_t num_layers = model_config_->get("num_hidden_layers"); + const size_t vocab_size = model_config->get("vocab_size"); + const size_t num_layers = model_config->get("num_hidden_layers"); INFINICORE_NN_MODULE_INIT(embed_tokens, vocab_size, hidden_size_, std::nullopt, dtype, device); - INFINICORE_NN_MODULE_INIT(norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); - - // Shared rotary embedding (used by lightning layers only) — match `get_rope` pattern. - rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); + INFINICORE_NN_MODULE_INIT(norm, hidden_size_, model_config->get("rms_norm_eps"), dtype, device); // Mixer types per-layer decide attention flavor (minicpm4 vs lightning-attn). std::vector mixer_types; try { - mixer_types = model_config_->get>("mixer_types"); + mixer_types = model_config->get>("mixer_types"); } catch (...) { mixer_types.assign(num_layers, "minicpm4"); } @@ -48,13 +39,13 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrregister_module( - "layers." + std::to_string(i), model_config_, device, i, mixer_types[i])); + "layers." 
+ std::to_string(i), model_config, device, i, mixer_types[i])); } } void MiniCPMSALAModel::reset_state() { for (auto &layer : layers_) { - layer->self_attn_->reset_state(); + layer->reset_attn_state(); } } diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index ed79cd76..811ecbf7 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -2,17 +2,10 @@ #include "minicpm_sala_decoder_layer.hpp" -#include "../../backends/attention_backends.hpp" -#include "../../cache/cache.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" - -#include "../../layers/rotary_embedding/rotary_embedding.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/nn/embedding.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" -#include "infinicore/nn/rope.hpp" #include "infinicore/tensor.hpp" #include @@ -32,7 +25,6 @@ class MiniCPMSALAModel : public infinicore::nn::Module { void reset_state(); size_t hidden_size() const { return hidden_size_; } - double dim_model_base() const { return dim_model_base_; } protected: INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); @@ -40,13 +32,7 @@ class MiniCPMSALAModel : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); private: - std::shared_ptr model_config_; - std::shared_ptr rotary_emb_; - infinicore::Device compute_device_; - size_t hidden_size_; - double scale_emb_; - double dim_model_base_; }; } // namespace infinilm::models::minicpm_sala