From c8e6f32331ad37c231a8f57839c1209344fa8f2c Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Wed, 8 Apr 2026 11:11:02 +0000 Subject: [PATCH 01/11] squash for refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- .gitignore | 5 +- MINICPM_SALA_BUILD_AND_CHANGES.md | 244 +++++ MiniCPM_SALA_alignment_progress.md | 359 ++++++++ csrc/cache/kv_cache.cpp | 67 +- csrc/cache/kv_cache.hpp | 25 +- csrc/config/config_factory.cpp | 2 +- csrc/engine/infer_engine.cpp | 10 +- csrc/engine/rank_worker.cpp | 3 +- .../minicpm_sala/minicpm_sala_attention.cpp | 575 +++++++++--- .../minicpm_sala/minicpm_sala_attention.hpp | 142 +-- .../minicpm_sala_decoderLayer.cpp | 61 -- .../minicpm_sala_decoderLayer.hpp | 34 - .../minicpm_sala_decoder_layer.cpp | 83 ++ .../minicpm_sala_decoder_layer.hpp | 53 ++ .../minicpm_sala_for_causal_lm.cpp | 77 +- .../minicpm_sala_for_causal_lm.hpp | 35 +- csrc/models/minicpm_sala/minicpm_sala_mlp.cpp | 32 + csrc/models/minicpm_sala/minicpm_sala_mlp.hpp | 31 + .../minicpm_sala/minicpm_sala_model.cpp | 171 ++++ .../minicpm_sala/minicpm_sala_model.hpp | 66 ++ csrc/models/model_factory.cpp | 30 +- csrc/pybind11/engine/engine.hpp | 2 +- examples/collect_metrics_longtext_decode.py | 355 +++++++ examples/compare_inference_speed.py | 868 ++++++++++++++++++ examples/jiuge.py | 16 +- examples/metrics_16k_prefill.md | 152 +++ examples/metrics_longtext_mem.md | 378 ++++++++ examples/run_infinicore_ops_before_logits.sh | 18 + examples/run_longtext_metrics_cases.sh | 59 ++ include/infinicore_infer/cache.h | 5 + include/infinicore_infer/weights_loader.h | 5 + python/infinilm/auto_config.py | 2 + python/infinilm/infer_engine.py | 110 ++- python/infinilm/llm/llm.py | 10 +- python/infinilm/llm/static_scheduler.py | 22 +- python/infinilm/modeling_utils.py | 65 +- .../infinilm/server/chat_message_normalize.py | 76 ++ python/infinilm/server/inference_server.py | 34 +- xmake.lua | 3 +- 39 files changed, 3871 insertions(+), 414 deletions(-) create mode 100644 
MINICPM_SALA_BUILD_AND_CHANGES.md create mode 100644 MiniCPM_SALA_alignment_progress.md delete mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp delete mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_mlp.cpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_mlp.hpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_model.cpp create mode 100644 csrc/models/minicpm_sala/minicpm_sala_model.hpp create mode 100644 examples/collect_metrics_longtext_decode.py create mode 100644 examples/compare_inference_speed.py create mode 100644 examples/metrics_16k_prefill.md create mode 100644 examples/metrics_longtext_mem.md create mode 100755 examples/run_infinicore_ops_before_logits.sh create mode 100755 examples/run_longtext_metrics_cases.sh create mode 100644 python/infinilm/server/chat_message_normalize.py diff --git a/.gitignore b/.gitignore index b728e6ea..1d4781b7 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,7 @@ __pycache__/ *.http -*.nsys-rep +**/*.nsys-rep +**/*.jsonl +*.jsonl +**/*.mem diff --git a/MINICPM_SALA_BUILD_AND_CHANGES.md b/MINICPM_SALA_BUILD_AND_CHANGES.md new file mode 100644 index 00000000..1ec53fad --- /dev/null +++ b/MINICPM_SALA_BUILD_AND_CHANGES.md @@ -0,0 +1,244 @@ +# MiniCPM-SALA on InfiniLM: Build Guide and Change Summary + +This document describes the changes in **InfiniCore** and **InfiniLM** from their baseline commits to support MiniCPM-SALA with InfLLM-v2, the **prerequisites**, and a **step-by-step build and run guide**. With these changes, `InfiniLM/examples/jiuge.py` produces **reasonable MiniCPM-SALA generation output** when run with the correct environment. 
+ +**Baseline commits (for reference):** + +- **InfiniLM:** `main` +- **InfiniCore:** `5fc85c8b1e6728839993f1b743a525a066da585f` + +To see the exact diff from baseline: +`git diff 5fc85c8b1e6728839993f1b743a525a066da585f -- InfiniCore` and +`git diff main -- InfiniLM`. + +--- + +## 1. Changes in InfiniCore (from `5fc85c8b1e6728839993f1b743a525a066da585f`) + +InfiniCore was extended to **wire InfLLM-v2** (Stage-2 sparse attention) so that when built with `--infllmv2=y`, the C++ API calls `mha_varlen_fwd` and `mha_fwd_kvcache` from the infllmv2_cuda_impl .so. + +### 1.1 New or modified files (summary) + +| Area | Path | Purpose | +|------|------|--------| +| API (decl) | `include/infinicore/ops/infllmv2_api.hpp` | Declares `mha_varlen_fwd`, `mha_fwd_kvcache` (must be provided by infllmv2 .so at link/runtime). | +| API (decl) | `include/infinicore/ops/infllmv2_attention.hpp` | Public op header for infllmv2 attention. | +| Ops impl | `src/infinicore/ops/infllmv2_attention/infllmv2_attention.cc` | Implements `infllmv2_varlen` and `infllmv2_kvcache` by calling the above APIs when `ENABLE_INFLLMV2` and `ENABLE_ATEN` are set. | +| Pybind | `src/infinicore/pybind11/ops/infllmv2_attention.hpp` | Exposes infllmv2 ops to Python. | +| Pybind | `src/infinicore/pybind11/ops.hpp` | Includes infllmv2 op bindings. | +| Python | `python/infinicore/ops/infllmv2_attention.py` | Python wrapper for `infllmv2_varlen` / `infllmv2_kvcache`. | +| Python | `python/infinicore/__init__.py` | Exports `infllmv2_varlen`, `infllmv2_kvcache`. | +| Build | `xmake.lua` | New option `--infllmv2=y`; when set with `--aten=y`, defines `ENABLE_INFLLMV2` and links/rpath to the auto-detected .so. | +| Test | `test/infinicore/ops/test_infllmv2_attention.py` | Unit tests for infllmv2 varlen/kvcache (skipped if not built or no CUDA). | +| Example | `examples/infllmv2_sanity.py` | Sanity script for InfLLM-v2 (skips if .so absent or no CUDA). 
| + +### 1.2 Build option + +- **Option:** `infllmv2` (enable InfLLM-v2; xmake auto-detects `infllm_v2/*.so` under `InfiniCore/third_party/infllmv2_cuda_impl/build/...`). +- **Requires:** `aten=y` (InfiniCore must be built with PyTorch/ATen). +- **Effect:** Defines `ENABLE_INFLLMV2`, adds link and rpath to the auto-detected infllmv2 .so. At runtime, `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` / `mha_fwd_kvcache` from that .so (via `LD_LIBRARY_PATH` or `LD_PRELOAD`). + +--- + +## 2. Changes in InfiniLM (from `main`) + +InfiniLM was extended to support the **MiniCPM-SALA** model (embedding, layers, attention, MLP, LM head) and to use InfiniCore (including InfLLM-v2 when available) for inference. + +### 2.1 New or modified files (summary) + +| Area | Path | Purpose | +|------|------|--------| +| C++ model | `csrc/models/minicpm_sala/*.cpp`, `*.hpp` | MiniCPM-SALA model: `minicpm_sala_attention`, `minicpm_sala_decoder_layer`, `minicpm_sala_model`, `minicpm_sala_for_causal_lm`, `minicpm_sala_mlp`. Per-layer dense KV cache; lightning (GLA) and optional InfLLM-v2 (minicpm4) attention paths. | +| C++ factory | `csrc/models/model_factory.cpp` | Registers MiniCPM-SALA model type. | +| Config | `python/infinilm/auto_config.py` | MiniCPM-SALA config handling. | +| Weights | `python/infinilm/modeling_utils.py` | MiniCPM-SALA weight loading (MuP scaling, etc.). | +| Examples | `examples/jiuge.py` | Generic InferEngine generation script; docstring updated with env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) for MiniCPM-SALA. | +| Examples | `examples/minicpm_sala_logits_sanity.py` | HF vs InfiniLM logits sanity (prefill/decode1/decodeN); single-token decode for correct KV cache; one-prompt output comparison. | +| Examples | `examples/modeling_minicpm_sala.py` | HF-side MiniCPM-SALA modeling (reference). | +| Docs | `MiniCPM_SALA_alignment_progress.md` | Alignment and debugging notes. 
| + +### 2.2 Behaviour notes + +- **Attention:** Layer 0 (minicpm4) can use compiled InfLLM-v2 when InfiniCore is built with `--infllmv2=y` and the .so is preloaded; other layers use lightning (GLA) path. +- **Attention overhead optimizations:** In `minicpm_sala_attention.cpp`: (1) sequence lengths are read in one place when both `past_sequence_lengths` and `total_sequence_lengths` are present (`has_cache_meta`), avoiding duplicate logic; (2) Q/K/V use a single `contiguous()->view` chain after projections; (3) lightning path builds `q_bthd` via one `permute->contiguous` from `q_perm`; (4) sparse path uses `q_perm` directly (already contiguous) and only calls `contiguous()` on K/V when repeating heads. Semantics and logits are unchanged. +- **KV cache:** Decode must use **single-token input** per step; passing the full sequence each step would misalign the per-layer KV cache (see sanity script). +- **Engine / KV cache config:** MiniCPM-SALA uses per-layer dense KV cache in C++; the engine’s `cache_config` is used only for scheduling (e.g. `past_sequence_lengths` / `total_sequence_lengths`). **Static cache** is recommended (default in `jiuge.py` when not passing `--enable-paged-attn`). For static, `jiuge.py` sets `max_cache_len = max(initial_capacity, max_position_embeddings)` when `model_type == "minicpm_sala"` so long contexts are supported without re-alloc. + +--- + +## 3. Prerequisites + +### 3.1 System and toolchain + +- **OS:** Linux. +- **Python:** 3.12 recommended (match the infllmv2 .so and InfiniCore pybind ABI). +- **CUDA:** 11.6+ (e.g. 12.x); `nvcc` in `PATH` (e.g. via `CUDA_HOME=/usr/local/cuda` and `PATH=$CUDA_HOME/bin:$PATH`). +- **C++:** GCC (e.g. `CC=gcc CXX=g++`) for infllmv2_cuda_impl and InfiniCore. +- **xmake:** For building InfiniCore (install from https://xmake.io or use a project-provided path). 
+- **PyTorch:** Installed in the same Python env used to build infllmv2 and to run InfiniLM (InfiniCore with `aten=y` links against this PyTorch’s libs). + +### 3.2 Python environment + +Use a **single venv** (or env) that has: + +- `torch` +- `transformers` +- `triton` (e.g. 3.2.0; for MiniCPM-SALA HF path; if CUDA 12.8, a small patch may be needed for Triton’s `ptx_get_version` or use a Triton version that supports 12.8) +- `flash-linear-attention` (or HF deps for MiniCPM-SALA) +- Other InfiniLM/InfiniCore runtime deps + +Build **infllmv2_cuda_impl** and **InfiniCore** with this same Python (and thus same PyTorch ABI). + +### 3.3 Repo layout + +- **minicpm-sala-support** (repo root) contains: + - **InfiniCore/** — InfiniCore with InfLLM-v2 wiring. + - **InfiniLM/** — InfiniLM with MiniCPM-SALA. + - **InfiniCore/third_party/infllmv2_cuda_impl/** — InfLLM-v2 CUDA kernel implementation (provides `mha_varlen_fwd`, `mha_fwd_kvcache`). + +--- + +## 4. Build Guide + +### 4.1 Build InfLLM-v2 (infllmv2_cuda_impl) + +This produces the `.so` that provides `mha_varlen_fwd` and `mha_fwd_kvcache`. InfiniCore must be built with a PyTorch/ABI-compatible env (same Python/torch as here). + +1. **From repo root:** + ```bash + cd InfiniCore/third_party/infllmv2_cuda_impl + ``` +2. **Submodules:** + ```bash + git submodule update --init --recursive + ``` +3. **Env (recommended):** + ```bash + export CC=gcc CXX=g++ + export CUDA_HOME=/usr/local/cuda # or your CUDA path + export PATH=$CUDA_HOME/bin:$PATH + ``` +4. **Build/install** (use the Python that has torch and that you will use for InfiniLM): + ```bash + python setup.py install + ``` + Or: `pip install -e .` +5. **Locate the .so:** + Typically under `build/lib.linux-x86_64-cpython-312/infllm_v2/` (name like `C.cpython-312-x86_64-linux-gnu.so`). 
Set: + ```bash + INFLLMV2_SO_DIR="/path/to/repo/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" + ``` + +### 4.2 Build InfiniCore (with InfLLM-v2) + +InfiniCore must be built with **aten** and, for MiniCPM-SALA with InfLLM-v2, with **infllmv2=y** enabled (xmake auto-detects the .so). + +1. **Install Infini dependencies** (if not already): + Build and install Infini libs so they are under `$INFINI_ROOT` (default `~/.infini`). InfiniCore’s xmake expects `include/` and `lib/` there (e.g. `libinfinicore_cpp_api.so`, `libinfiniop.so`, etc.). + +2. **From repo root:** + ```bash + cd InfiniCore + ``` +3. **Configure** (use the same Python/torch as infllmv2): + ```bash + xmake config -y --root --nv-gpu=y --aten=y --infllmv2=y + ``` + Omit `--infllmv2=y` for a build without InfLLM-v2 (then no MiniCPM-SALA layer0 infllmv2 path). +4. **Build the Python extension:** + ```bash + xmake --root _infinicore + ``` +5. **Optional – install to ~/.infini:** + ```bash + xmake install + ``` + The Python loadable is also copied under `InfiniCore/python/infinicore/lib/` by the build. + +### 4.3 Run jiuge.py (MiniCPM-SALA) + +Use the **same venv** that has `torch`, `transformers`, etc., and set env so InfiniCore and the infllmv2 .so are found and symbols resolve. + +**Required:** + +- `PYTHONPATH`: InfiniLM and InfiniCore Python packages. +- `LD_LIBRARY_PATH`: Torch lib, Infini lib (`/root/.infini/lib` or your `INFINI_ROOT/lib`), and optionally `INFLLMV2_SO_DIR` (if not using `LD_PRELOAD`). +- If InfiniCore was built with InfLLM-v2: **`LD_PRELOAD`** the infllmv2 .so so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` (and `mha_fwd_kvcache`). 
+ +**Example (from repo root):** + +```bash +INFLLMV2_SO_DIR="$(pwd)/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" + +PYTHONPATH="$(pwd)/InfiniLM/python:$(pwd)/InfiniCore/python:$PYTHONPATH" \ +LD_LIBRARY_PATH="$(python -c 'import torch; print(torch.__path__[0])')/lib:/root/.infini/lib:${INFLLMV2_SO_DIR}:$LD_LIBRARY_PATH" \ +LD_PRELOAD="${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so" \ +python InfiniLM/examples/jiuge.py --nvidia --model_path /root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA +``` + +Use the **venv** Python explicitly if needed, e.g.: + +```bash +/path/to/venv/bin/python InfiniLM/examples/jiuge.py ... +``` + +For Triton (HF path) on CUDA 12.8 you may need: + +```bash +TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas +``` + +--- + +## 5. Verification + +- **InfiniCore InfLLM-v2 ops:** + `PYTHONPATH=InfiniCore/python:InfiniCore/test/infinicore LD_LIBRARY_PATH=${INFLLMV2_SO_DIR}:/root/.infini/lib LD_PRELOAD=${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so python InfiniCore/test/infinicore/ops/test_infllmv2_attention.py --nvidia` + +- **HF vs InfiniLM logits (one-prompt decode):** + Same env + `LD_PRELOAD` and (if needed) `TRITON_PTXAS_PATH`: + `python InfiniLM/examples/minicpm_sala_logits_sanity.py --model_path <model_path> --mode decodeN --decode_steps 64` + +- **Generation:** + `jiuge.py` with the same env should produce **reasonable MiniCPM-SALA output** (e.g. for prompt "How are you"). + +--- + +## 6. Related docs + +- **CURRENT_PROGRESS.md** — Local progress, InfLLM-v2 plan, and run commands. +- **InfiniLM/MiniCPM_SALA_alignment_progress.md** — Alignment and debugging details. +- **InfiniCore/third_party/infllmv2_cuda_impl/README.md** — InfLLM-v2 kernel design and install. +- **InfiniLM/examples/jiuge.py** — Docstring at top with env summary. + +--- + +## 7. TODO + +- **Remove temporary log and dump code** — Strip or gate debug logging, `INFINI_DEBUG_*`, and temporary dump paths (e.g. 
`/tmp/` tensor dumps, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`) from InfiniLM/InfiniCore once alignment and bring-up are stable. +- **Adapt inference_server.py** — Wire MiniCPM-SALA (and InfiniLM InferEngine) into the inference server (e.g. `inference_server.py` or equivalent in the workspace) so that the server can load and serve MiniCPM-SALA with the same env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) and run generation endpoints. + +### 7.1 Debug and sanity env and code (for future removal) + +When removing temporary log and dump code, use this as the reference for **env parsing** and **locations to erase or gate**. + +**Environment variables (debug / sanity):** + +| Env var | Parsing / behavior | Purpose | +|---------|---------------------|--------| +| `INFINI_DEBUG_LOG` | Set to a file path (e.g. `/tmp/minicpm_sala_sanity_debug.log`). When set, C++ and Python append JSON/text lines to this file. | Text log for alignment debugging. | +| `INFINI_DEBUG_ATTN_DUMP` | Presence = enable (e.g. `"1"` or any). When set, tensors are written to fixed `/tmp/` paths below. 
| + +**Where they are read:** + +- **InfiniLM C++:** `std::getenv("INFINI_DEBUG_LOG")`, `std::getenv("INFINI_DEBUG_ATTN_DUMP")` in: + - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_attention.cpp` (dump_tensor_f32, layer q/k/v/g_gamma and attn out dumps) + - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp` (log_tensor_stats_if_enabled, tensor_to_f32_and_dump, layer input/out dumps) + - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_model.cpp` (dump_tensor_to_bin_if_enabled, log_tensor_stats_if_enabled; embed and final hidden dumps) +- **InfiniLM Python (sanity script):** `os.environ["INFINI_DEBUG_LOG"]`, `os.environ["INFINI_DEBUG_ATTN_DUMP"]` set in `InfiniLM/examples/minicpm_sala_logits_sanity.py` before runs; `os.getenv("INFINI_DEBUG_*")` in `InfiniLM/examples/modeling_minicpm_sala.py` (HF-side hooks that write `/tmp/hf_*.pt` and log to `INFINI_DEBUG_LOG`). + +**Temporary paths to remove or stop writing:** + +- **C++ dumps (binary):** `/tmp/inf_embed_out.bin`, `/tmp/inf_final_hidden.bin`, `/tmp/inf_layer0_q.bin`, `/tmp/inf_layer0_k.bin`, `/tmp/inf_layer0_v.bin`, `/tmp/inf_layer0_g_gamma.bin`, `/tmp/inf_layer1_q.bin`, `/tmp/inf_layer1_k.bin`, `/tmp/inf_layer1_v.bin`, `/tmp/inf_layer1_g_gamma.bin`, `/tmp/inf_layer0_attn_input.bin`, `/tmp/inf_attn_out_layer0.bin`, `/tmp/inf_attn_out_layer1.bin`, `/tmp/inf_layer_out_.bin`. +- **Python (sanity) writes:** `DEBUG_LOG_PATH` (e.g. `/tmp/minicpm_sala_sanity_debug.log`); `/tmp/hf_embed_out.pt`, `/tmp/hf_final_hidden.pt`, `/tmp/hf_layer0_attn_input.pt`, `/tmp/hf_layer_out_.pt`, `/tmp/hf_layer0_q.pt`, `/tmp/hf_layer0_k.pt`, `/tmp/hf_layer0_v.pt`, `/tmp/hf_attn_out_layer0.pt`, `/tmp/hf_layer1_q.pt`, `/tmp/hf_layer1_k.pt`, `/tmp/hf_layer1_v.pt`, `/tmp/hf_attn_out_layer1.pt`. 
+- **Helpers to remove or gate:** `dump_tensor_f32`, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`, `tensor_to_f32_and_dump`; sanity script’s `_append_debug_log`, and all `torch.save(..., "/tmp/...")` / `np.fromfile("/tmp/...")` / `os.path.isfile("/tmp/...")` blocks that exist only for alignment comparison. diff --git a/MiniCPM_SALA_alignment_progress.md b/MiniCPM_SALA_alignment_progress.md new file mode 100644 index 00000000..538208c9 --- /dev/null +++ b/MiniCPM_SALA_alignment_progress.md @@ -0,0 +1,359 @@ +### MiniCPM‑SALA sanity alignment – current status + +### Scope + +- **Goal**: Align InfiniLM MiniCPM‑SALA logits with HF reference on the dense/GLA (non‑sparse) path, using the `examples/minicpm_sala_logits_sanity.py` script running inside the `minicpm-sala` container. + +--- + +### Instrumentation and plumbing + +- **Sanity script (`minicpm_sala_logits_sanity.py`)** + - **Backend lock**: All InfiniLM `InferEngine` paths now use `attention_backend="default"` so they hit the dense/GLA fallback. + - **Debug log target**: The script sets `INFINI_DEBUG_LOG=/home/zenghua/repos/.cursor/debug-9146ea.log` and `INFINI_DEBUG_ATTN_DUMP=1` so both Python and C++ write to the same NDJSON file. + - **HF per-layer hooks**: + - `_register_hf_layer_hooks` walks the model (`hf.transformer.layers`, `hf.model.layers`, or `hf.layers`) and registers forward hooks on the first 3 layers. + - For each layer \(i\), it logs: + - `min`, `max`, `mean`, `l2` of the layer output, as `hypothesisId="HF_L"`, `data.layer = i`. + - Hooks are installed for `run_prefill_only` and removed after the forward pass. + +- **InfiniLM attention (`minicpm_sala_attention.cpp`)** + - Existing **layer‑0** diagnostics: + - At entry to `forward_dense_`: `forward_dense_entry` logs env/config, including `INFINI_DEBUG_ATTN_DUMP`, `use_rope`, `use_qk_norm`, `use_output_gate`, `use_output_norm`, `is_sparse_layer`, and shapes. 
+ - For layer 0, logs stats for: + - Pre‑gate attention output (`attn_pre_gate`): full tensor min/max/mean, `l2`, shape and scaling. + - Post‑gate/norm (`attn_post_gate`), and post‑`o_proj` (`attn_post_oproj`). + - **Planned / partially implemented**: extended logging for `layer_idx_ < 2` (layers 0 and 1) with: + - `attn_pre_gate_l0` / `attn_pre_gate_l1`. + - `attn_post_gate_l0` / `attn_post_gate_l1`. + - `attn_post_oproj_l0` / `attn_post_oproj_l1`. + - Current runs still only show layer‑0 entries; the `_infinilm` binary in use has not yet picked up the `_l1` variants (see below). + +- **InfiniLM decoder layer (`minicpm_sala_decoder_layer.cpp/.hpp`)** + - **MuP residual scaling**: + - `residual_scale_ = scale_depth / sqrt(num_hidden_layers)` using `scale_depth` from `ModelConfig` (matches HF path). + - `forward` applies: + - `out1 = hidden_states + residual_scale_ * attn_out`. + - `out2 = out1 + residual_scale_ * mlp_out`. + - **Per-layer Inf output stats**: + - New member `size_t layer_idx_` stored from constructor. + - For `layer_idx_ < 3`, after computing `out2`, it: + - Copies to CPU, converts BF16/F16/F32 to float, computes `min`, `max`, `mean`, `l2` and shape. + - Logs as `hypothesisId="INF_L"`, with `data.layer = layer_idx_`. + +- **Weight scaling / MuP configuration (`modeling_utils.py`)** + - Loader reads `config.json` and applies MiniCPM‑style scaling: + - `scale_input = scale_emb`, `scale_depth`, `num_hidden_layers`, `dim_model_base`, `hidden_size`. + - For `model_type == "minicpm_sala"`: + - `scale_o` and `scale_down` are reset to 1.0 (residual scaling is done at C++ forward time). + - `scale_lm_head = dim_model_base / hidden_size` is baked into `lm_head.weight`. + - Embedding and norm weights are scaled as in the MiniCPM scripts. + +- **Rebuild and install (`rebuild.sh`, xmake)** + - `rebuild.sh`: + - `InfiniCore`: `python scripts/install.py --nv-gpu=y --ccl=y --aten=y`, then `xmake build _infinicore` and `xmake install _infinicore`. 
+ - `InfiniLM`: optional `xmake clean`, then `xmake build _infinilm` and `xmake install _infinilm`. + - Verified inside container: + - Shared libs in `/root/.infini/lib` are updated (e.g. `libinfiniop.so`, `libinfinicore_cpp_api.so` with current timestamps). + - Python sees `infinilm` from `/home/zenghua/repos/InfiniLM/python/infinilm`. + - The extension in use is `_infinilm` at: + - `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so`. + +--- + +### Sanity run behavior and current misalignment + +- **Command used (container, GPU 1)**: + ```bash + docker exec -e CUDA_VISIBLE_DEVICES=1 minicpm-sala bash -lc ' + source /app/docker/nvidia/env-set.sh + cd /home/zenghua/repos/InfiniLM + python3 examples/minicpm_sala_logits_sanity.py \ + --model_path /data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA \ + --mode prefill \ + --prompt "How are you" + ' + ``` +- **HF vs Inf logits (from `SANITY_ONELINE`)** + - `inf_norm ≈ 387.66` + - `hf_norm ≈ 1588.89` + - **ratio_inf_hf ≈ 0.244** + - `max_diff ≈ 12.77`, `mean_diff ≈ 4.64` + - Top‑1 token IDs differ (HF: 74, Inf: 59358). + +- **HF early layers (from `HF_L` logs)** + - Using the HF hooks in the sanity script: + - Layer 0: `l2 ≈ 59.49` + - Layer 1: `l2 ≈ 73.91` (first GLA layer) + - Layer 2: `l2 ≈ 87.38` + - Norms grow smoothly with depth; nothing obviously pathological on HF side. + +- **Inf attention layer‑0 vs HF** + - HF layer‑0 pre‑gate attention (`modeling_minicpm_sala.py:attn_pre_gate`): + - Shape `[1, 4, 4096]`, `min=-8.375`, `max=9.0`, `mean≈-0.1273`. + - Inf layer‑0: + - **Pre‑gate (`attn_pre_gate`)**: + - `l2 ≈ 105.50`, `min=-8.375`, `max=9.0`. + - Python’s comparison (`compare_attn`) reports `norm_ratio_inf_hf ≈ 0.4487`, i.e. Inf pre‑gate norm ≈ 0.45× HF’s. + - **Post‑gate/norm (`attn_post_gate`)**: + - `l2 ≈ 60.38`, very close to HF layer‑0 output `l2 ≈ 59.49`. + - **Post‑o_proj (`attn_post_oproj`)**: + - `l2 ≈ 98.66` (used as input to the decoder’s residual path). 
+ - Interpretation: + - By the end of the **layer‑0 attention block**, Inf and HF are roughly matched in scale at the decoder output (norms ≈ 60). + - The severe **0.244 logits norm ratio** is therefore not due to an immediate blow‑up/vanish at layer‑0 attention output; it accumulates later (likely starting at the first GLA layer and/or via MuP/residual/MLP scaling). + +--- + +### Binary / build state + +- **Extension module mapping** + - In container, importing `infinilm` shows: + - `infinilm.__file__` → `/home/zenghua/repos/InfiniLM/python/infinilm/__init__.py` + - `_infinilm` (top‑level) → `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so` + - That is the `.so` used by the sanity script. + +- **Why new attention logs for layer 1 don’t appear yet** + - `strings _infinilm.cpython-312-...so | grep 'attn_pre_gate_l1'` currently returns **no matches**: + - This confirms the loaded `_infinilm` was built **before** we added the `_l1` logging strings. + - We attempted a fresh `_infinilm` build and initially hit: + - C++ error in `MiniCPMSALADecoderLayer::forward`: `layer_idx_` not declared. + - That prevented `_infinilm` from rebuilding/overwriting the old `.so`, so your layer‑1 logging changes never reached runtime. + +- **Decoder fix applied to unblock rebuild** + - Added `size_t layer_idx_ = 0;` as a private member in `minicpm_sala_decoder_layer.hpp`. + - Set `layer_idx_ = layer_idx;` in the decoder layer constructor. + - After this fix, `_infinilm` can compile; `rebuild.sh` now proceeds past the decoder layer and updates the core libraries (and should be able to update `_infinilm` when the entire build/install completes successfully). + +--- + +### Open issues / next steps + +- **1. Get the new `_infinilm` into use** + - Ensure `rebuild.sh` completes the `_infinilm` build + install step successfully (no early termination due to missing libffi/openssl/ca‑certificates link checks). 
+ - Confirm via: + ```bash + strings /home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so \ + | grep -E 'attn_pre_gate_l1|attn_post_gate_l1|attn_post_oproj_l1' + ``` + If this prints the `_l1` labels, the new binary is in place. + +- **2. Re‑run sanity and capture layer‑1 attention logs** + - With the updated `_infinilm`, re‑run the prefill sanity script and inspect `debug-9146ea.log` for: + - `minicpm_sala_attention.cpp:attn_pre_gate_l1` + - `minicpm_sala_attention.cpp:attn_post_gate_l1` + - `minicpm_sala_attention.cpp:attn_post_oproj_l1` + - Compare their `l2` to HF layer‑1 (`HF_L` `l2 ≈ 73.9`). + - This will tell us whether the **first GLA layer** is where Inf starts to diverge in norm, or whether norms remain close through layer 1 and drift later. + +- **3. Use decoder `INF_L` logs to see per‑layer drift** + - Once `_infinilm` is rebuilt, `MiniCPMSALADecoderLayer`’s per‑layer `INF_L` logs for `layer_idx_ < 3` should appear in `debug-9146ea.log`. + - By comparing HF (`HF_L`) vs Inf (`INF_L`) for layers 0/1/2, we can see exactly where norm ratios deviate from ~1 and head toward ~0.244 at the logits. + - That will guide targeted fixes in: + - GLA gating / normalization (in `minicpm_sala_attention.cpp`), and/or + - MuP residual & MLP scaling (still matching HF in formula, but potentially interacting differently with the SALA configuration). + +--- + +### Summary + +- **Plumbing**: Shared log path and HF/Inf instrumentation are in place; per‑layer HF stats and layer‑0 Inf attention stats work and confirm that **layer‑0 attention output scale is roughly aligned**. +- **Mismatch**: Final logits norm is still **Inf/HF ≈ 0.244**, so the discrepancy is accumulating across layers, likely starting at or after the first GLA layer. +- **Blocking issue**: The `_infinilm` C++ extension in use predates the layer‑1 logging changes; an earlier C++ compile error prevented a fresh install. 
That decode‑layer bug has been fixed so we can now rebuild and get the new diagnostics into the runtime. +- **Next milestone**: Successfully rebuild `_infinilm`, confirm the `_l1` log strings are present, rerun sanity, and use the new layer‑1 and decoder `INF_L` stats to precisely locate where Inf’s norms start drifting away from HF. + +--- + +### Host follow-up (2026-03-14) + +- Ran `examples/minicpm_sala_logits_sanity.py --mode prefill --prompt "How are you"` directly on the host using the local venv and the same base env as the documented `jiuge.py` run. +- Extra host-only prep required for the HF reference path: + - installed `flash-linear-attention` to provide the `fla` module + - installed `triton==3.2.0` to avoid the Triton `STAGE` autotune import failure + - created `/home/zenghua/repos/.cursor/` because the script hardcodes `DEBUG_LOG_PATH` there +- Result on host: + - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` + - HF top-1 token id `74`, Inf top-1 token id `23917` +- Interpretation: + - The host environment now reproduces the alignment issue without Docker. + - The ratio is better than the older container snapshot (`~0.244`) but still far from aligned, so the poor generation quality remains consistent with a real logits mismatch. +- Full reproducibility details for this host run were appended to `CURRENT_PROGRESS.md`. + +--- + +### HF MiniCPM4 dense-fallback experiment (2026-03-14) + +- Goal: + - Test whether the remaining mismatch is coming from the HF `minicpm4` sparse-vs-dense code path by forcing `minicpm4` layers onto the standard dense attention implementation. +- HF model-file change: + - Patched both cached copies of `modeling_minicpm_sala.py` so `MiniCPMSALADecoderLayer` uses `MINICPM_ATTENTION_CLASSES[config._attn_implementation]` for `mixer_type == "minicpm4"` instead of `MiniCPMInfLLMv2Attention`. 
+ - Backups: + - `/root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210428` + - `/root/.cache/huggingface/modules/transformers_modules/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210619` +- Rerun result: + - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` + - HF top-1 token id `74`, Inf top-1 token id `23917` + - These numbers are unchanged from the earlier host run. +- Fresh per-layer log from `debug-9146ea.log`: + - HF decoder output `l2`: + - layer 0: `59.49` + - layer 1: `73.91` + - layer 2: `87.38` + - Inf decoder output `l2`: + - layer 0: `35.08` + - layer 1: `295.86` + - layer 2: `531.38` + - Inf layer-1 attention stats: + - pre-gate `l2 ~= 749.58` + - post-gate `l2 ~= 745.29` + - post-`o_proj` `l2 ~= 1112.6` +- Interpretation: + - For this short prefill case, forcing HF `minicpm4` to the dense fallback path does not move the mismatch at all. + - The strongest current evidence is that the large norm drift starts in the InfiniLM implementation at or immediately after the first `lightning-attn` layer, not in the HF `minicpm4` branch. + +--- + +### InfiniLM MiniCPM4 HF-math experiment (2026-03-14) + +- Goal: + - Make the InfiniLM `minicpm4` layer compute the same dense attention math as the HF reference path and see whether layer 0 aligns at the start of sanity. +- C++ change: + - In `csrc/models/minicpm_sala/minicpm_sala_attention.cpp`, replaced the `minicpm4` sparse/varlen/grouped fallback branch with an explicit HF-style dense path: + - repeat KV heads to `num_attention_heads` + - compute per-head dense causal attention + - keep the same sigmoid output gate and `o_proj` +- Rebuild: + - Rebuilt and reinstalled `_infinilm` successfully using the local `xmake` toolchain. +- Rerun result: + - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` + - HF top-1 token id `74`, Inf top-1 token id `23917` + - These numbers are unchanged. 
+- Fresh layer stats after the InfiniLM-side change: + - HF decoder output `l2`: `59.49 -> 73.91 -> 87.38` + - Inf decoder output `l2`: `35.08 -> 295.86 -> 531.38` + - Inf layer-0 attention: + - pre-gate `142.87` + - post-gate `80.43` + - post-`o_proj` `135.39` +- Interpretation: + - Even after making the InfiniLM `minicpm4` branch follow the HF dense attention structure, layer 0 does not move toward HF. + - This strongly suggests the remaining mismatch is not in the `minicpm4` attention branch itself; attention should shift to other decoder-path components and especially the first `lightning-attn` layer. + +--- + +### Temporary all-lightning experiment (2026-03-14) + +- Goal: + - Force both HF and InfiniLM to use lightning-style attention math for former `minicpm4` layers as a temporary precision-alignment probe, without changing checkpoint tensor shapes. +- Why not use `config.json` only: + - A direct `mixer_types -> all lightning-attn` config edit failed during HF weight load because former `minicpm4` layers have incompatible checkpoint shapes for the stock `LightningAttention` module (e.g. `256 x 4096` vs `4096 x 4096`). + - The original `mixer_types` config was restored. +- Temporary override implementation: + - Added env flag `MINICPM_SALA_FORCE_ALL_LIGHTNING=1`. 
+ - HF side: + - former `minicpm4` layers instantiate `MiniCPMAttention` under the flag + - `MiniCPMAttention.forward()` switches to lightning-style GLA computation under the flag, while keeping original q/k/v/o_proj/o_gate weights + - InfiniLM side: + - `minicpm_sala_attention.cpp` routes sparse layers through `gla_attention` under the same flag + - Sanity script: + - `examples/minicpm_sala_logits_sanity.py` now sets `MINICPM_SALA_FORCE_ALL_LIGHTNING=1` for this experiment +- Result: + - `SANITY_ONELINE ratio=0.4728 max_diff=12.1406 mean_diff=1.9942` + - HF top-1 token id `59375`, Inf top-1 token id `59358` +- Fresh per-layer stats under the override: + - HF decoder output `l2`: + - layer 0: `385.10` + - layer 1: `374.87` + - layer 2: `426.87` + - Inf decoder output `l2`: + - layer 0: `26.23` + - layer 1: `208.72` + - layer 2: `403.90` + - Inf layer-0 attention: + - pre-gate `105.50` + - post-gate `60.38` + - post-`o_proj` `98.66` + - Inf layer-1 attention: + - pre-gate `672.74` + - post-gate `459.67` + - post-`o_proj` `737.03` +- Interpretation: + - The override is definitely active on both sides, because HF logits/top-1 and HF early-layer norms changed substantially. + - However, the former `minicpm4` layers still do not align numerically with InfiniLM under lightning-style attention. + - This points to a mismatch in the lightning formulation itself (decay/slopes, layout, gating, norm/casting, or related details), not just in the original mixed `mixer_types` layout. + +--- + +### Layer-0 narrowing after matched temporary semantics (2026-03-14) + +- Change: + - Updated the temporary HF override so its former `minicpm4` path uses the same grouped causal-softmax math as `InfiniCore` `gla_attention`, instead of `simple_gla` with decay. 
+ - Added layer-0 sub-stage logging on both sides: + - HF: `inputs_embeds`, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` + - Inf: embedding output, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` +- Result: + - Layer-0 pre-gate attention still mismatches strongly: + - HF `attn_pre_gate l2 ~= 235.11` + - Inf `attn_pre_gate l2 ~= 105.50` + - `Inf/HF ~= 0.4487` + - But this is no longer the earliest divergence. +- New root-cause evidence: + - Embedding output already differs: + - HF `inputs_embeds l2 ~= 44.09` + - Inf embed output `l2 ~= 25.51` + - First decoder layer pre-norm output also differs: + - HF layer0 `input_layernorm l2 ~= 95.88` + - Inf layer0 `input_layernorm l2 ~= 70.94` +- Interpretation: + - The mismatch starts before layer-0 attention. + - Attention, gating, and `o_proj` are downstream amplifiers, but not the first source. + - The next priority should be MiniCPM-SALA embedding behavior in InfiniLM: + - verify `model.embed_tokens.weight` load/scaling, + - verify runtime embedding lookup output against HF for the same token ids, + - then re-check whether layer-0 attention comes into line automatically. + +--- + +### Multi-layer alignment after embed fix (2026-03-14) + +- Instrumentation added: + - InfiniLM dumps decoder layer outputs (out2) for layers 0–2 to `/tmp/inf_layer_out_{0,1,2}.bin` and final hidden (after norm) to `/tmp/inf_final_hidden.bin` when `INFINI_DEBUG_ATTN_DUMP=1`. + - HF hooks save layer outputs to `/tmp/hf_layer_out_{0,1,2}.pt` and final hidden to `/tmp/hf_final_hidden.pt`. + - Sanity script prints per-layer and final-hidden norm_ratio and max/mean diff. +- Result (prefill "How are you", int32 input_ids workaround): + - **Layer 0**: norm_ratio ≈ 1.0002, max_diff ≈ 0.0625 → aligned. + - **Layer 1**: norm_ratio ≈ 3.24, max_diff ≈ 28.4 → large divergence. + - **Layer 2**: norm_ratio ≈ 5.73 → further drift. +- Root cause for layer 1+: + - Config: layer 0 = `minicpm4` (sparse/dense), layer 1+ = `lightning-attn`. 
+ - HF `LightningAttention` uses **Simple GLA** (`chunk_simple_gla` / `fused_recurrent_simple_gla`): linear/recurrent attention with decay (g_gamma), not causal softmax.
+ - InfiniLM now routes lightning layers through **Simple GLA** (InfiniCore `simple_gla_*` ops), matching HF’s formulation (recurrent with decay).
+- Earlier next step to align after layer 0 (since implemented via the Simple GLA routing noted above):
+ - Implement Simple GLA (chunk or fused_recurrent) in InfiniCore and route lightning layers through it, matching HF’s `attn_fn` (decay, scale=1/sqrt(d), layout).
+
+---
+
+### MMLU-Pro validation mismatches vs logit work (2026-03-24)
+
+Paired lm-eval `--log_samples` runs (HF vs local chat / Infini server) often disagree for **heterogeneous** reasons. Treat them differently before spending time on logits:
+
+| Heuristic tag (export script) | Meaning | Use logits / greedy trace? |
+|------------------------------|---------|----------------------------|
+| `model_disagreement` | Both sides return a valid letter choice but disagree; text is on-topic. | **Yes** — same `input_ids` + `run_prefill_and_greedy_trace` localizes numerical / decode divergence. |
+| `parse_or_format` | One side `[invalid]` or regex extraction differs though the model may agree. | **No** (first fix template, stops, or metric extraction). |
+| `garbage` | Off-topic or corrupted completion (e.g. wrong language / spam). | **No** — serving hygiene, batching, or cache contamination. |
+
+**Repo tooling**
+
+- `InfiniLM/examples/eval_tasks/mmlu_pro_val/export_mismatch_subset.py` — join two `samples_*.jsonl` dirs on `doc_hash`, optional filters, heuristic tag, write `mismatch_subset.json` + `.md` (includes `arguments_a` / `arguments_b` for replay).
+- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_prompt.py` — rebuild `input_ids` from logged rows (rendered string vs JsonChat message list) like lm-eval.
+- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_logit_probe.py` — drive `minicpm_sala_logits_sanity.run_prefill_and_greedy_trace` on subset rows (in-process HF + `InferEngine` only; HTTP cannot return logits).
+- `InfiniLM/examples/minicpm_sala_logits_sanity.py` — `--mode greedy_trace` for ad-hoc prompts; shared `run_prefill_and_greedy_trace()` for subset probes.
+
+If greedy trace matches HF on a row but the API eval still differs, diff **chat template**, **stop sequences**, **max_tokens**, or server batching — not the GLA kernel alone.
+
+**HF vs `local-chat-completions` harness (practical parity)**
+
+- For the same `doc_hash`, the **rendered prompt string** from `--model hf` can match **byte-for-byte** the string produced by re-templating the JSON messages the API path logs (verified on a biology mismatch example).
+- Differences that still moved scores: **regex extraction** used the *first* `answer is (X)` in long CoT while the model’s final line said another letter; `_default_template_yaml` now uses `group_select: -1` (last match) and case-insensitive pattern.
+- **Server**: strip lm-eval’s per-message `type: text` wrapper to `{role, content}` before `apply_chat_template`, and set `continue_final_message=not add_generation_prompt` like lm-eval’s HF model class (`inference_server.py`, `llm.py`).
diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index 4c97edfa..a7220773 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -4,6 +4,7 @@ #include "../utils.hpp" #include "infinicore/ops.hpp" #include +#include namespace infinilm::cache { // ========================== @@ -45,7 +46,9 @@ StaticKVCache::StaticKVCache( infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info) + const engine::distributed::RankInfo &rank_info, + infinicore::Size gla_recurrent_num_heads, + infinicore::Size gla_recurrent_head_dim) : Cache(), k_dim_(k_dim), v_dim_(v_dim), @@ -54,7 +57,9 @@ StaticKVCache::StaticKVCache( rank_batch_size_(config.max_batch_size()), cache_len_(config.max_cache_len() == std::numeric_limits::max() || config.max_cache_len() == 0 ? max_positional_embedding : config.max_cache_len()), rank_num_layers_(num_layers), - dtype_(dtype) { + dtype_(dtype), + gla_recurrent_num_heads_(gla_recurrent_num_heads), + gla_recurrent_head_dim_(gla_recurrent_head_dim) { // Allocate K cache k_caches_ = infinicore::Tensor::empty( @@ -75,6 +80,17 @@ StaticKVCache::StaticKVCache( v_dim_}, dtype_, rank_info.device); + + if (gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0) { + gla_state_ = infinicore::Tensor::zeros( + {rank_num_layers_, + rank_batch_size_, + gla_recurrent_num_heads_, + gla_recurrent_head_dim_, + gla_recurrent_head_dim_}, + infinicore::DataType::F32, + rank_info.device); + } } infinicore::Tensor StaticKVCache::create_layer_kv_cache( @@ -125,12 +141,27 @@ StaticKVCache::update(size_t layer_idx, auto device = k_cache_layer->device(); #ifdef ENABLE_KV_CACHING - infinicore::op::kv_caching_( - k_cache_layer, - v_cache_layer, - k, - v, - past_sequence_lengths); + // Some debug builds have shown incremental decode (update_len=1) may diverge + // from full-sequence recompute when using the optimized kv_caching_ kernel. 
+ // Provide an env override to fall back to the simple (and slower) copy update. + const char *disable_kv_caching = std::getenv("INFINI_DISABLE_KV_CACHING"); + const bool force_copy_update = disable_kv_caching && disable_kv_caching[0] != '\0' && disable_kv_caching[0] != '0'; + if (force_copy_update) { + size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; + auto result_len = cache_pos + update_len; + ASSERT(result_len <= cache_len_); + auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}}); + auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}}); + k_cache_update->copy_from(k); + v_cache_update->copy_from(v); + } else { + infinicore::op::kv_caching_( + k_cache_layer, + v_cache_layer, + k, + v, + past_sequence_lengths); + } #else size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; auto result_len = cache_pos + update_len; @@ -146,6 +177,26 @@ StaticKVCache::update(size_t layer_idx, return {k_cache_layer, v_cache_layer}; } +std::tuple +StaticKVCache::get_layer_kv(size_t layer_idx) { + ASSERT(layer_idx < rank_num_layers_); + auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); + auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); + return {k_cache_layer, v_cache_layer}; +} + +bool +StaticKVCache::has_gla_recurrent_state() const { + return gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0 && static_cast(gla_state_); +} + +infinicore::Tensor +StaticKVCache::gla_recurrent_state_for_layer(size_t layer_idx) { + ASSERT(layer_idx < rank_num_layers_); + ASSERT(has_gla_recurrent_state()); + return gla_state_->narrow({{0, layer_idx, 1}})->squeeze(0); +} + // ========================== // PagedKVCacheConfig // ========================== diff --git a/csrc/cache/kv_cache.hpp b/csrc/cache/kv_cache.hpp index e6e640df..cbef0722 100644 --- a/csrc/cache/kv_cache.hpp +++ b/csrc/cache/kv_cache.hpp @@ 
-12,6 +12,7 @@ #include #include #include +#include #include #include @@ -43,7 +44,9 @@ class StaticKVCache final : public Cache { infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info); + const engine::distributed::RankInfo &rank_info, + infinicore::Size gla_recurrent_num_heads = 0, + infinicore::Size gla_recurrent_head_dim = 0); static infinicore::Tensor create_layer_kv_cache( const infinicore::Size k_dim, @@ -72,6 +75,20 @@ class StaticKVCache final : public Cache { const infinicore::Tensor &v, const infinicore::Tensor &past_sequence_lengths); + /** + * @brief Get KV cache tensors for a layer (views). + * + * @return (k_cache_layer, v_cache_layer) + * k_cache_layer: [batch, num_rank_k_heads, max_cache_len, k_dim] + * v_cache_layer: [batch, num_rank_v_heads, max_cache_len, v_dim] + */ + std::tuple + get_layer_kv(size_t layer_idx); + + /** Per-layer Simple GLA recurrent state for lightning decode: [batch, H, D, D] float32 (in-place for decode_step). 
*/ + bool has_gla_recurrent_state() const; + infinicore::Tensor gla_recurrent_state_for_layer(size_t layer_idx); + ~StaticKVCache() override = default; private: @@ -89,6 +106,12 @@ class StaticKVCache final : public Cache { // [num_layers, max_batch, num_rank_v_heads, max_cache_len, v_dim] infinicore::Tensor v_caches_; + + infinicore::Size gla_recurrent_num_heads_{0}; + infinicore::Size gla_recurrent_head_dim_{0}; + // [num_layers, max_batch, gla_recurrent_num_heads, D, D], F32; empty when heads==0 + infinicore::Tensor gla_state_; + }; class PagedKVCacheConfig final : public CacheConfig { diff --git a/csrc/config/config_factory.cpp b/csrc/config/config_factory.cpp index c822983e..aff8b986 100644 --- a/csrc/config/config_factory.cpp +++ b/csrc/config/config_factory.cpp @@ -16,7 +16,7 @@ std::shared_ptr ConfigFactory::createConfig(const if (it != config_map.end()) { it->second(model_config); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; const std::string &model_type = model_config->get("model_type"); if (std::find(classic_models.begin(), classic_models.end(), model_type) == classic_models.end()) { throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type); diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index f1afd84b..2a5c5ff4 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -121,7 +121,15 @@ InferEngine::Input::to_model_input(infinicore::Device device) const { auto to_device = [&](const std::optional &t) -> std::optional { - return t.has_value() ? t.value()->to(device) : t; + if (!t.has_value()) { + return t; + } + auto ten = t.value(); + // Avoid redundant copies when the tensor is already on the target device. 
+ if (ten->device() == device) { + return ten; + } + return ten->to(device); }; infinilm::InfinilmModel::Input input = { diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 1542c1e0..1ba89ca1 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -5,6 +5,7 @@ #include "../models/models_registry.hpp" #include "infinicore/ops.hpp" #include +#include #include #include @@ -261,7 +262,7 @@ void RankWorker::thread_loop() { rank_info_.device, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; if ((std::find(classic_models.begin(), classic_models.end(), model_type) != classic_models.end())) { model_ = InfinilmModelFactory::createModel( model_config_, diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index c1e20f76..c426ec1c 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -1,133 +1,496 @@ #include "minicpm_sala_attention.hpp" -#include "../../global_state/global_state.hpp" + +#include "infinicore/ops.hpp" +#include "infinicore/ops/infllmv2_attention.hpp" +#include "infinicore/ops/simple_gla_attention.hpp" +#include "infinicore/ops/simple_gla_prefill.hpp" +#include "infinicore/context/context.hpp" +#include "../debug_utils/tensor_utils.hpp" + +#include +#include +#include #include +#include namespace infinilm::models::minicpm_sala { -AttentionBase::AttentionBase(std::shared_ptr model_config, - size_t num_attention_heads, - size_t num_key_value_heads, - size_t layer_idx, - const infinicore::Device &device) - : layer_idx_(layer_idx), - hidden_size_(model_config->get("hidden_size")), - head_dim_(model_config->get("head_dim")) { - - const auto &dtype{model_config->get_dtype()}; - - 
use_bias_ = model_config->get_or("attention_bias", true); - use_output_bias_ = model_config->get_or("attention_output_bias", false); - - attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; - const engine::distributed::RankInfo &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info(); - int tp_rank = infinilm::global_state::get_tensor_model_parallel_rank(); - int tp_size = infinilm::global_state::get_tensor_model_parallel_world_size(); - - const size_t total_num_heads = num_attention_heads; - const size_t total_num_kv_heads = num_key_value_heads; - if ((total_num_kv_heads < static_cast(tp_size)) || (0 != (total_num_kv_heads % static_cast(tp_size)))) { - throw std::runtime_error("infinilm::models::minicpm_sala::AttentionBase: num_key_value_heads must be divisible by tp_size"); +namespace { +// Same as HF MiniCPM-SALA _build_slope_tensor (used for Simple GLA decay). +std::vector build_slope_tensor(size_t n) { + auto get_slopes_power_of_2 = [](size_t n) -> std::vector { + double log2n = std::log2(static_cast(n)); + double start = std::pow(2.0, -(std::pow(2.0, -(log2n - 3)))); + double ratio = start; + std::vector out; + out.reserve(n); + for (size_t i = 0; i < n; ++i) { + out.push_back(static_cast(start * std::pow(ratio, static_cast(i)))); + } + return out; + }; + if (n == 0) return {}; + double log2n = std::log2(static_cast(n)); + if (std::abs(log2n - std::floor(log2n)) < 1e-9) { + return get_slopes_power_of_2(n); } - - num_attention_heads_ = total_num_heads / static_cast(tp_size); - num_key_value_heads_ = total_num_kv_heads / static_cast(tp_size); - - auto quant_scheme = model_config->get_quant_scheme(); - auto quantization_method = model_config->get_quantization_method(); - switch (quant_scheme) { - case infinicore::quantization::QuantScheme::NONE: - INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, total_num_heads * head_dim_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size); - 
INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, total_num_kv_heads * head_dim_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, total_num_kv_heads * head_dim_, quantization_method, - use_bias_, dtype, device, tp_rank, tp_size); - INFINICORE_NN_MODULE_INIT(o_proj, total_num_heads * head_dim_, hidden_size_, quantization_method, - use_output_bias_, dtype, device, tp_rank, tp_size, rank_info.comm); - break; - default: - throw std::runtime_error("infinilm::models::minicpm_sala::AttentionBase: unsupported quantization scheme"); - break; + size_t closest = static_cast(std::pow(2.0, std::floor(log2n))); + auto first = get_slopes_power_of_2(closest); + auto rest = build_slope_tensor(2 * closest); + for (size_t i = 0; i < n - closest; ++i) { + first.push_back(rest[i * 2]); } + return first; +} - rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device); +} // namespace - float scaling = 1.0f / std::sqrt(static_cast(head_dim_)); - attn_ = std::make_shared(num_attention_heads_, head_dim_, scaling, - num_key_value_heads_, layer_idx_, - kv_cache_k_scale_, kv_cache_v_scale_, attention_backend_); +MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) + : model_config_(std::move(model_config)), + rank_info_(rank_info), + layer_idx_(layer_idx), + attention_backend_(attention_backend) { - auto kv_quant_scheme = infinilm::global_state::get_infinilm_config().model_config->get_kv_quant_scheme(); - switch (kv_quant_scheme) { - case (infinicore::quantization::KVQuantAlgo::NONE): { - break; - } - case (infinicore::quantization::KVQuantAlgo::INT8): { - INFINICORE_NN_PARAMETER_INIT(kv_cache_k_scale, ({1}, infinicore::DataType::F32, device, 0, 0, 1)); - INFINICORE_NN_PARAMETER_INIT(kv_cache_v_scale, ({1}, 
infinicore::DataType::F32, device, 0, 0, 1)); - break; + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + const auto dtype = model_config_->get_dtype(); + hidden_size_ = model_config_->get("hidden_size"); + if (mixer_type == "minicpm4") { + is_sparse_layer_ = true; + num_attention_heads_ = model_config_->get("num_attention_heads"); + num_key_value_heads_ = model_config_->get("num_key_value_heads"); + head_dim_ = model_config_->get("head_dim"); + + // InfLLM-v2 local-window masking (causal-local semantics) for minicpm4. + // Prefer `sparse_window_size`, but fall back to `window_size` if needed. + int sparse_window_size = model_config_->get_or("sparse_window_size", -1); + if (sparse_window_size <= 0) { + // Some HF configs store this under `sparse_config.window_size`. + auto sparse_cfg = model_config_->get_or("sparse_config", nlohmann::json{}); + if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { + sparse_window_size = sparse_cfg["window_size"].get(); + } else { + sparse_window_size = model_config_->get_or("window_size", -1); + } + } + if (sparse_window_size > 0) { + infllmv2_window_left_ = sparse_window_size; + infllmv2_window_right_ = 0; + use_local_window_ = true; + } + } else { + // Lightning layers have their own head config. 
+ num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); + num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); + head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); } - default: { - throw std::runtime_error("infinilm::layers::attention: unsupported kv_quant_scheme"); - break; + scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); + + // StaticKVCache is allocated as a compact slab per cache type: + // - minicpm4-cache stores only layers where mixer_types[i] == "minicpm4" + // - lightning-cache stores only layers where mixer_types[i] != "minicpm4" + // + // Compute this attention instance's local cache index (0-based) from its + // absolute layer_idx_. + { + bool this_is_minicpm4_cache = (mixer_type == "minicpm4"); + std::vector mixer_types; + try { + mixer_types = model_config_->get>("mixer_types"); + } catch (...) { + mixer_types.assign(model_config_->get("num_hidden_layers"), "minicpm4"); + } + // Be defensive if mixer_types size mismatches. + if (mixer_types.size() != model_config_->get("num_hidden_layers")) { + mixer_types.resize(model_config_->get("num_hidden_layers"), "minicpm4"); + } + size_t count = 0; + for (size_t i = 0; i <= layer_idx_ && i < mixer_types.size(); ++i) { + const bool is_minicpm4_layer = (mixer_types[i] == "minicpm4"); + if (is_minicpm4_layer == this_is_minicpm4_cache) { + ++count; + } + } + // layer_idx_ is always a valid layer, so count should be >= 1. + cache_layer_idx_ = count > 0 ? (count - 1) : 0; } + + // HyPE: RoPE in lightning layers, NoPE in sparse (minicpm4) layers. + // We treat all non-minicpm4 as "linear" (lightning-attn) for M1 dense fallback. + use_rope_ = (mixer_type != "minicpm4") && model_config_->get_or("lightning_use_rope", true); + + // MiniCPM-SALA uses QK-norm and output gates by default. 
+ use_qk_norm_ = model_config_->get_or("qk_norm", true) && (mixer_type != "minicpm4"); + use_output_gate_ = model_config_->get_or("use_output_gate", true); + + // Projections + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); + + if (mixer_type == "minicpm4") { + // Sparse layers use o_gate (sigmoid gate on attention output) + INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); + } else { + // Lightning layers use q/k norm + output norm and z-projection gate + if (use_qk_norm_) { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + } + use_output_norm_ = true; + // Checkpoint uses o_norm over hidden_size (shape [hidden_size]). + INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, dtype, device); } + // Simple GLA decay for lightning path: g_gamma = _build_slope_tensor * -1. 
+ std::vector slopes = build_slope_tensor(num_attention_heads_); + auto g_cpu = infinicore::Tensor::empty( + {num_attention_heads_}, infinicore::DataType::F32, infinicore::Device::cpu()); + float *ptr = reinterpret_cast(g_cpu->data()); + for (size_t h = 0; h < num_attention_heads_; ++h) + ptr[h] = -slopes[h]; + g_gamma_ = g_cpu->to(device); } -InfLLMv2Attention::InfLLMv2Attention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device) - : AttentionBase(model_config, - model_config->get("num_attention_heads"), - model_config->get("num_key_value_heads"), - layer_idx, device) { - use_output_gate_ = model_config->get_or("use_output_gate", false); - const auto &dtype{model_config->get_dtype()}; - size_t num_attention_heads = model_config->get("num_attention_heads"); - if (use_output_gate_) { - INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, num_attention_heads * head_dim_, - model_config->get_quantization_method(), use_bias_, dtype, device); - } +void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptr &rotary_emb) { + rotary_emb_ = rotary_emb; +} + +void MiniCPMSALAAttention::reset_cache() { + // KV state is maintained by the shared engine cache (StaticKVCache). 
} -infinicore::Tensor InfLLMv2Attention::forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const { - spdlog::error("InfLLMv2Attention is not implemented"); - return hidden_states; + +infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const { + (void)input_offsets; + (void)block_tables; + (void)slot_mapping; + return forward_dense_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, cu_seqlens); } -LightningAttention::LightningAttention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device) - : AttentionBase(model_config, - model_config->get("num_attention_heads"), - model_config->get("lightning_nkv"), - layer_idx, device) { - - qk_norm_ = model_config->get_or("qk_norm", false); - use_output_norm_ = model_config->get_or("use_output_norm", false); - use_output_gate_ = model_config->get_or("use_output_gate", false); - const auto &dtype{model_config->get_dtype()}; - double rms_norm_eps = model_config->get("rms_norm_eps"); - size_t num_attention_heads = model_config->get("num_attention_heads"); - - if (qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, rms_norm_eps, dtype, device); +infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional cu_seqlens) const { + // Input: [B, S, H] + auto shape = hidden_states->shape(); + const size_t batch_size = shape[0]; + const size_t seq_len = 
shape[1]; + + auto hs_mut = hidden_states; + auto q = q_proj_->forward(hs_mut); + auto k = k_proj_->forward(hs_mut); + auto v = v_proj_->forward(hs_mut); + // View requires contiguous layout; only call contiguous when needed (proj output often already contiguous). + auto q_reshaped = q->contiguous()->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + auto k_reshaped = k->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + auto v_reshaped = v->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + + if (use_qk_norm_) { + // RMSNorm op only supports 2D/3D; normalize over head_dim with a 3D view. + auto q3 = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_}); + auto k3 = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_}); + q3 = q_norm_->forward(q3); + k3 = k_norm_->forward(k3); + q_reshaped = q3->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + k_reshaped = k3->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + } + + // RoPE only for lightning layers (HyPE) + if (use_rope_) { + if (!rotary_emb_) { + throw std::runtime_error("MiniCPMSALAAttention: rotary_emb is not set but use_rope=true"); + } + // position_ids can be [B,S] or [S]; follow LlamaAttention behavior. 
+ auto pos_shape = position_ids->shape(); + infinicore::Tensor pos_ids_for_rope = position_ids; + if (pos_shape.size() == 2) { + auto pos_narrowed = position_ids->narrow({{0, 0, 1}}); + pos_ids_for_rope = pos_narrowed->contiguous()->view({pos_shape[1]}); + } else if (pos_shape.size() == 1) { + pos_ids_for_rope = position_ids->contiguous(); + } else { + throw std::runtime_error("MiniCPMSALAAttention: Unexpected position_ids shape"); + } + + rotary_emb_->forward(q_reshaped, pos_ids_for_rope, true); + rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); + } + + // Compute dense attention (GQA): reshape as LlamaAttention does + size_t total_seq_len = seq_len; + size_t cache_pos = 0; + const bool has_cache_meta = past_sequence_lengths.has_value() && total_sequence_lengths.has_value(); + if (has_cache_meta) { + // Single device-to-host sync: read both scalars (engine could pass these as scalars later). + auto past_cpu = past_sequence_lengths.value()->to(infinicore::Device::cpu()); + auto total_cpu = total_sequence_lengths.value()->to(infinicore::Device::cpu()); + cache_pos = reinterpret_cast(past_cpu->data())[0]; + size_t total_seq_len_raw = reinterpret_cast(total_cpu->data())[0]; + total_seq_len = total_seq_len_raw; + // Some engine call sites pass `total_sequence_lengths` as the *input* length (e.g. 1 for decode), + // while `past_sequence_lengths` is the cached KV length. Attention needs total KV length. + // Use KV semantics: total_kv_len = cache_pos + current seq_len. + total_seq_len = cache_pos + seq_len; + } else if (total_sequence_lengths.has_value()) { + total_seq_len = reinterpret_cast(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0]; + } + + // Cache expects [B, n_kv, S, D]. Keep this as a strided view and let the caching op handle strides + // to avoid a full rearrange (permute->contiguous) copy on long-context prefill. + // Correctness: kv_caching_ / StaticKVCache::update is sensitive to input stride/layout. 
+ // Restore contiguous to match HF logits exactly before re-applying any strided optimizations. + auto k_permuted = k_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] + auto v_permuted = v_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] + + // HF-like dense KV caching using the engine-provided StaticKVCache. + infinicore::Tensor k_total = k_permuted; + infinicore::Tensor v_total = v_permuted; + std::shared_ptr static_kv_cache = nullptr; + if (kv_cache != nullptr && has_cache_meta) { + static_kv_cache = std::dynamic_pointer_cast(kv_cache); + if (!static_kv_cache) { + throw std::runtime_error("MiniCPMSALAAttention: Unsupported cache type (expected StaticKVCache)"); + } + // Default behavior: update cache here. For minicpm4 decode we may override and let InfLLM-v2 update. + auto [k_cached, v_cached] = static_kv_cache->update( + cache_layer_idx_, k_permuted, v_permuted, past_sequence_lengths.value()); + k_total = k_cached; + v_total = v_cached; + } else { + // No cache metadata => treat as prefill-only. + total_seq_len = seq_len; + } + + // Slice to total_seq_len (decode-only / cont-batch) + if (total_seq_len > k_total->shape()[2]) { + throw std::runtime_error("MiniCPMSALAAttention: total_seq_len exceeds available KV length (cache not correctly updated)"); } - if (use_output_norm_) { - INFINICORE_NN_MODULE_INIT(o_norm, num_attention_heads * head_dim_, rms_norm_eps, dtype, device); + k_total = k_total->narrow({{2, 0, total_seq_len}}); + v_total = v_total->narrow({{2, 0, total_seq_len}}); + + infinicore::Tensor attn_output; + if (!is_sparse_layer_) { + // Lightning-attn: Simple GLA (HF-aligned). + // simple_gla_attention(q,k,v,g_gamma,scale) expects [B, T, H, D]; g_gamma [H]. + const size_t n_h = num_attention_heads_; + const size_t n_kv = num_key_value_heads_; + infinicore::Tensor k_use = k_total; + infinicore::Tensor v_use = v_total; + if (n_kv < n_h) { + // Repeat KV heads to match n_h (same as HF repeat_kv / repeat_interleave). 
+ // Use as_strided view then contiguous() so one copy instead of n_h narrow/copy_from calls. + const size_t ngroup = n_h / n_kv; + const std::vector repeat_strides = { + static_cast(n_kv * total_seq_len * head_dim_), + static_cast(total_seq_len * head_dim_), + 0, + static_cast(head_dim_), + 1, + }; + k_use = k_total->as_strided( + {batch_size, n_kv, ngroup, total_seq_len, head_dim_}, repeat_strides) + ->contiguous() + ->view({batch_size, n_h, total_seq_len, head_dim_}); + v_use = v_total->as_strided( + {batch_size, n_kv, ngroup, total_seq_len, head_dim_}, repeat_strides) + ->contiguous() + ->view({batch_size, n_h, total_seq_len, head_dim_}); + } + // GLA expects [B, S, H, D]. `q_reshaped` is already [B, S, H, D], so avoid permute+contiguous. + auto q_bthd = q_reshaped; // [B, S_q, H, D] + // Correctness: restore contiguous layout for K/V before `simple_gla_attention`. + auto k_bthd = k_use->permute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] + auto v_bthd = v_use->permute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] + + // Lightning GLA decode must use recurrent state (StaticKVCache) whenever available. 
+ const bool is_lightning_decode = has_cache_meta && static_kv_cache && (seq_len < total_seq_len); + if (is_lightning_decode && !static_kv_cache->has_gla_recurrent_state()) { + throw std::runtime_error( + "MiniCPMSALAAttention(lightning): Lightning decode requires StaticKVCache gla_recurrent_state " + "(missing recurrent buffer in StaticKVCache)."); + } + + const bool recurrent_gla = static_kv_cache && static_kv_cache->has_gla_recurrent_state() && has_cache_meta; + + infinicore::Tensor gla_out; + if (recurrent_gla && seq_len == 1 && total_seq_len > 1) { + auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); + auto q_new = q_bthd; + auto k_new = k_bthd->narrow({{1, total_seq_len - 1, 1}}); + auto v_new = v_bthd->narrow({{1, total_seq_len - 1, 1}}); + gla_out = infinicore::op::simple_gla_decode_step(q_new, k_new, v_new, S, g_gamma_, scaling_); + } else { + infinicore::Tensor q_full; + if (seq_len == total_seq_len) { + q_full = q_bthd; + } else { + // Decode: q has seq_len (e.g. 1), kv has total_seq_len; pad q to [B, total_seq_len, H, D]. + q_full = infinicore::Tensor::zeros( + {batch_size, total_seq_len, n_h, head_dim_}, q_bthd->dtype(), q_bthd->device()); + auto q_slot = q_full->narrow({{1, total_seq_len - seq_len, seq_len}}); + q_slot->copy_from(q_bthd); + } + // Fused prefill: naive kernel for head_dim<=64; chunked/tiled kernel for head_dim>64 (e.g. 128). + bool use_fused_prefill = (batch_size == 1) && (seq_len == total_seq_len); + if (use_fused_prefill) { + gla_out = infinicore::op::simple_gla_prefill(q_full, k_bthd, v_bthd, g_gamma_, scaling_); + } else { + gla_out = infinicore::op::simple_gla_attention(q_full, k_bthd, v_bthd, g_gamma_, scaling_); + } + + // Keep per-layer recurrent state aligned with simple_gla_attention / prefill outputs. + // Use batched GEMM (CUDA+ATen) instead of O(seq_len) decode_step launches; see + // simple_gla_recurrent_state_append_segment (closed form: S <- g^L S + Σ g^{L-1-j} outer(k,v)). 
+ if (recurrent_gla) { + auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); + if (cache_pos == 0) { + infinicore::op::zeros_(S); + } + auto k_seg = k_bthd->narrow({{1, cache_pos, seq_len}}); + auto v_seg = v_bthd->narrow({{1, cache_pos, seq_len}}); + infinicore::op::simple_gla_recurrent_state_append_segment(S, k_seg, v_seg, g_gamma_); + } + } + + infinicore::Tensor out_slice = (recurrent_gla && seq_len == 1 && total_seq_len > 1) + ? gla_out + : gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); + attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); + } else { + // minicpm4 layers must use InfLLM-v2 attention (hard error if not available). + // NOTE: Lightning layers keep Simple GLA for correctness; only minicpm4 routes here. + try { + if (!total_sequence_lengths.has_value()) { + throw std::runtime_error( + "MiniCPMSALAAttention(minicpm4): total_sequence_lengths is required for InfLLM-v2 path"); + } + // `infllmv2_kvcache` expects the number of valid K/V entries in the + // provided cache tensors. Since we already appended the current + // token via StaticKVCache::update, the valid length is the total + // KV length (past + current token). + const auto cache_lens = total_sequence_lengths.value(); + + // Prefill: InfLLM-v2 varlen (Q and K packed lengths match `seq_len == total_seq_len` here). + // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after StaticKVCache::update + // (valid KV length == `total_seq_len`). Using varlen for decode (1 query vs long K) hit NaNs + // in practice for modest sequence lengths; kvcache matches operator tests and Flash path. 
+ const bool force_varlen_decode = [&]() { + const char *env = std::getenv("INFINI_MINICPM4_DECODE_VARLEN"); + return env && env[0] != '\0' && env[0] != '0'; + }(); + + if (seq_len == total_seq_len || (force_varlen_decode && batch_size == 1)) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAAttention(minicpm4): varlen prefill path currently requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); // [B, S, n_h, D] + auto k_btkd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + auto v_btkd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + auto q_var = q_bshd->view({static_cast(seq_len), static_cast(num_attention_heads_), static_cast(head_dim_)}); + auto k_var = k_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + auto v_var = v_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + + auto cuq_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuq_cpu->data())[0] = 0; + reinterpret_cast(cuq_cpu->data())[1] = static_cast(seq_len); + infinicore::Tensor cu_q = cuq_cpu->to(q_var->device()); + // cu_k corresponds to the full KV length used by k_var/v_var. + auto cuk_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuk_cpu->data())[0] = 0; + reinterpret_cast(cuk_cpu->data())[1] = static_cast(total_seq_len); + infinicore::Tensor cu_k = cuk_cpu->to(q_var->device()); + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 
0 : -1; + + auto out_var = infinicore::op::infllmv2_varlen( + q_var, k_var, v_var, + cu_q, cu_k, + static_cast(seq_len), + static_cast(total_seq_len), + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else if (static_kv_cache) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAAttention(minicpm4): kvcache decode path currently requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); // [B, S_q, n_h, D] + auto k_bthd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + auto v_bthd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 0 : -1; + + auto out_bshd = infinicore::op::infllmv2_kvcache( + q_bshd, + k_bthd, + v_bthd, + cache_lens, + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_bshd->contiguous()->view( + {batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else { + throw std::runtime_error( + "MiniCPMSALAAttention(minicpm4): decode requires StaticKVCache (missing cache metadata or cache)"); + } + } catch (const std::exception &e) { + throw std::runtime_error( + std::string("MiniCPMSALAAttention(minicpm4): InfLLM-v2 attention failed. ") + + "This build must provide InfLLM-v2 (ENABLE_INFLLMV2+ENABLE_ATEN) and the infllmv2_cuda_impl .so " + + "must be available via LD_PRELOAD/LD_LIBRARY_PATH. 
Original error: " + e.what()); + } } + + // Output norm + gate variants if (use_output_gate_) { - INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, num_attention_heads * head_dim_, - model_config->get_quantization_method(), use_bias_, dtype, device); + if (o_gate_) { + // Sparse (minicpm4): y = sigmoid(o_gate(x)) * attn_output + auto gate_in = hidden_states; + auto gate = o_gate_->forward(gate_in); + infinicore::op::sigmoid_(gate, gate); + attn_output = infinicore::op::mul(attn_output, gate); + } else if (z_proj_) { + // Lightning: match HF LightningAttention: o_norm(o) then o * sigmoid(z_proj(x)). + auto z_in = hidden_states; + auto z = z_proj_->forward(z_in); + infinicore::op::sigmoid_(z, z); + if (use_output_norm_ && o_norm_) { + attn_output = o_norm_->forward(attn_output); + } + attn_output = infinicore::op::mul(attn_output, z); + } + } else if (use_output_norm_ && o_norm_) { + attn_output = o_norm_->forward(attn_output); } -} -infinicore::Tensor LightningAttention::forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const { - spdlog::error("LightningAttention is not implemented"); - return hidden_states; + auto attn_out_mut = attn_output; + auto out = o_proj_->forward(attn_out_mut); + + return out; } } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 81a032b6..3cd8f284 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -1,88 +1,102 @@ #pragma once -#include "../../layers/common_modules.hpp" +#include "../../backends/attention_backends.hpp" +#include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" +#include "../../engine/distributed/distributed.hpp" -namespace infinilm::layers::attention { -class AttentionLayer; -} +#include "infinicore/nn/linear.hpp" +#include "infinicore/nn/module.hpp" +#include 
"infinicore/nn/rmsnorm.hpp" +#include "infinicore/nn/rope.hpp" +#include "infinicore/tensor.hpp" -namespace infinilm::models::minicpm_sala { +#include +#include -class AttentionBase : public infinicore::nn::Module { -protected: - AttentionBase(std::shared_ptr model_config, - size_t num_attention_heads, - size_t num_key_value_heads, - size_t layer_idx, - const infinicore::Device &device); +namespace infinilm::models::minicpm_sala { +// Dense attention fallback implementation used for Milestone 1. +// Parameter names are aligned with HF MiniCPM-SALA safetensors keys: +// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,...} +// TODO(refactor): KV cache is currently per-layer dense; refactor to use engine paged KV pool +// and block_tables/slot_mapping to match SGLang minicpm-sala pattern (see minicpm_sala_attention.cpp). +class MiniCPMSALAAttention : public infinicore::nn::Module { public: - size_t layer_idx() const { return layer_idx_; } - size_t num_heads() const { return num_attention_heads_; } - size_t num_kv_heads() const { return num_key_value_heads_; } - size_t head_dim() const { return head_dim_; } - size_t hidden_size() const { return hidden_size_; } + MiniCPMSALAAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const; + + void set_rotary_emb(const std::shared_ptr &rotary_emb); + void reset_cache(); + +private: + infinicore::Tensor forward_dense_(const infinicore::Tensor 
&hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional cu_seqlens) const; protected: - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, q_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, k_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::ColumnParallelLinear, v_proj); - INFINICORE_NN_MODULE(infinilm::layers::linear::RowParallelLinear, o_proj); + // Projections (HF-aligned naming) + INFINICORE_NN_MODULE(infinicore::nn::Linear, q_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, k_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, v_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_proj); + + // Optional (Lightning layers): q_norm/k_norm/o_norm + z_proj + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); + INFINICORE_NN_MODULE(infinicore::nn::Linear, z_proj); + + // Optional (Sparse layers): o_gate + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); - std::shared_ptr attn_; - ::infinilm::backends::AttentionBackend attention_backend_; + std::shared_ptr model_config_; std::shared_ptr rotary_emb_; + engine::distributed::RankInfo rank_info_; size_t layer_idx_; + // Layer index remapped into the cache instance (minicpm4-cache vs lightning-cache). + // StaticKVCache allocates a compact [num_layers, ...] slab per cache type. 
+ size_t cache_layer_idx_ = 0; size_t hidden_size_; size_t num_attention_heads_; size_t num_key_value_heads_; size_t head_dim_; - bool use_bias_; - bool use_output_bias_; - - // For off-line kv cache quantization - INFINICORE_NN_PARAMETER(kv_cache_k_scale); - INFINICORE_NN_PARAMETER(kv_cache_v_scale); -}; + float scaling_; -/** - * @brief InfLLMv2 attention with optional output gate - */ -class InfLLMv2Attention : public AttentionBase { -public: - InfLLMv2Attention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device); - - infinicore::Tensor forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const; - -protected: - bool use_output_gate_; - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, o_gate); -}; + bool use_qk_norm_ = false; + bool use_output_gate_ = false; + bool use_output_norm_ = false; + bool use_rope_ = false; + bool is_sparse_layer_ = false; -/** - * @brief Lightning attention with optional output norm and gate - */ -class LightningAttention : public AttentionBase { -public: - LightningAttention(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device); + // InfLLM-v2 local-window masking plumbing for `mixer_type=="minicpm4"`. + // When enabled: causal=false + window_size_left=sparse_window_size + window_size_right=0. 
+ int infllmv2_window_left_ = -1; + int infllmv2_window_right_ = -1; + bool use_local_window_ = false; - infinicore::Tensor forward(const infinicore::Tensor &positions, - const infinicore::Tensor &hidden_states) const; + backends::AttentionBackend attention_backend_; -protected: - bool qk_norm_; - bool use_output_norm_; - bool use_output_gate_; - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, q_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, k_norm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, z_proj); + // Lightning layers only: per-head log-decay for Simple GLA (HF _build_slope_tensor * -1). + infinicore::Tensor g_gamma_; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp deleted file mode 100644 index ff3c113f..00000000 --- a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "minicpm_sala_decoderLayer.hpp" - -#include "infinicore/ops.hpp" -#include -#include -#include - -namespace infinilm::models::minicpm_sala { - -MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device) - : layer_idx_(layer_idx) { - const auto &dtype{model_config->get_dtype()}; - size_t hidden_size = model_config->get("hidden_size"); - double rms_norm_eps = model_config->get("rms_norm_eps"); - - INFINICORE_NN_MODULE_INIT(input_layernorm, hidden_size, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, hidden_size, rms_norm_eps, dtype, device); - INFINICORE_NN_MODULE_INIT(mlp, model_config, device); - - std::vector mixer_types = model_config->get>("mixer_types"); - std::string mixer_type = mixer_types[layer_idx]; - if ("minicpm4" == mixer_type) { - self_attn_ = std::make_shared(this->register_module("self_attn", model_config, layer_idx, 
device)); - } else if ("lightning" == mixer_type || "lightning_attn" == mixer_type || "lightning-attn" == mixer_type) { - self_attn_ = std::make_shared(this->register_module("self_attn", model_config, layer_idx, device)); - } else { - throw std::runtime_error("infinilm::models::minicpm_sala::MiniCPMSALADecoderLayer: unsupported mixer_type '" + mixer_type + "' for layer " + std::to_string(layer_idx)); - } -} - -std::tuple MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states, - infinicore::Tensor &residual) { - input_layernorm_->forward_inplace(hidden_states, residual); - hidden_states = std::visit( - [&](auto &attn_ptr) { return attn_ptr->forward(positions, hidden_states); }, *self_attn_); - - post_attention_layernorm_->forward_inplace(hidden_states, residual); - hidden_states = mlp_->forward(hidden_states); - return std::make_tuple(hidden_states, residual); -} - -infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states) { - auto residual = hidden_states; - hidden_states = input_layernorm_->forward(hidden_states); - hidden_states = std::visit( - [&](auto &attn_ptr) { return attn_ptr->forward(positions, hidden_states); }, *self_attn_); - - hidden_states = infinicore::op::add(residual, hidden_states); - - residual = hidden_states; - hidden_states = post_attention_layernorm_->forward(hidden_states); - hidden_states = mlp_->forward(hidden_states); - hidden_states = infinicore::op::add(residual, hidden_states); - return hidden_states; -} - -} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp deleted file mode 100644 index 5e8faafb..00000000 --- a/csrc/models/minicpm_sala/minicpm_sala_decoderLayer.hpp +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#include "../../layers/mlp/mlp.hpp" -#include "minicpm_sala_attention.hpp" -#include 
-#include - -namespace infinilm::models::minicpm_sala { -using MiniCPMMLP = infinilm::layers::MLP; -using MiniCPMSALAAttention = std::variant, std::shared_ptr>; - -class MiniCPMSALADecoderLayer : public infinicore::nn::Module { -public: - MiniCPMSALADecoderLayer(std::shared_ptr model_config, - size_t layer_idx, - const infinicore::Device &device); - - std::tuple forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states, - infinicore::Tensor &residual); - - infinicore::Tensor forward(const infinicore::Tensor &positions, - infinicore::Tensor &hidden_states); - -protected: - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); - INFINICORE_NN_MODULE(MiniCPMSALAAttention, self_attn); - INFINICORE_NN_MODULE(MiniCPMMLP, mlp); - - size_t layer_idx_; -}; - -} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp new file mode 100644 index 00000000..391b626b --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -0,0 +1,83 @@ +#include "minicpm_sala_decoder_layer.hpp" + +#include "infinicore/ops.hpp" +#include "infinicore/context/context.hpp" +#include +#include +#include +#include +#include +#include + +namespace infinilm::models::minicpm_sala { + + +MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) { + layer_idx_ = layer_idx; + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + const auto dtype = model_config->get_dtype(); + const double eps = model_config->get("rms_norm_eps"); + + // MuP residual scaling at forward (o_proj/down_proj not scaled in loader for minicpm_sala). 
+ const double scale_depth = model_config->get_or("scale_depth", 1.0); + const size_t num_layers = model_config->get("num_hidden_layers"); + residual_scale_ = scale_depth / std::sqrt(static_cast(num_layers)); + + INFINICORE_NN_MODULE_INIT(input_layernorm, model_config->get("hidden_size"), eps, dtype, device); + INFINICORE_NN_MODULE_INIT(self_attn, model_config, device, layer_idx, mixer_type, rank_info, attention_backend); + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config->get("hidden_size"), eps, dtype, device); + INFINICORE_NN_MODULE_INIT(mlp, model_config, device); +} + +void MiniCPMSALADecoderLayer::set_rotary_emb(const std::shared_ptr &rotary_emb) { + self_attn_->set_rotary_emb(rotary_emb); +} + +void MiniCPMSALADecoderLayer::reset_cache() { + self_attn_->reset_cache(); +} + +infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const { + // Pre-norm attention + auto hs1 = input_layernorm_->forward(hidden_states); + auto attn_out = self_attn_->forward( + hs1, + position_ids, + kv_cache, + past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + + // residual + scale_down * attn_out (MuP) + auto ones_attn = infinicore::Tensor::empty(attn_out->shape(), attn_out->dtype(), attn_out->device()); + infinicore::op::ones_(ones_attn); + auto out1 = infinicore::op::addcmul(hidden_states, attn_out, ones_attn, static_cast(residual_scale_)); + + // Pre-norm MLP + auto hs2 = post_attention_layernorm_->forward(out1); + auto mlp_out = mlp_->forward(hs2); + // residual + scale_down * mlp_out (MuP) + auto ones_mlp = infinicore::Tensor::empty(mlp_out->shape(), mlp_out->dtype(), mlp_out->device()); + 
infinicore::op::ones_(ones_mlp); + auto out2 = infinicore::op::addcmul(out1, mlp_out, ones_mlp, static_cast(residual_scale_)); + + return out2; +} + +} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp new file mode 100644 index 00000000..948e4d97 --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include "minicpm_sala_attention.hpp" +#include "minicpm_sala_mlp.hpp" + +#include "../../backends/attention_backends.hpp" +#include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" +#include "../../engine/distributed/distributed.hpp" + +#include "infinicore/nn/module.hpp" +#include "infinicore/nn/rmsnorm.hpp" +#include "infinicore/tensor.hpp" + +#include +#include + +namespace infinilm::models::minicpm_sala { + +class MiniCPMSALADecoderLayer : public infinicore::nn::Module { +public: + MiniCPMSALADecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + const std::string &mixer_type, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + const infinicore::Tensor &position_ids, + std::shared_ptr kv_cache, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const; + + void set_rotary_emb(const std::shared_ptr &rotary_emb); + void reset_cache(); + +private: + double residual_scale_ = 1.0; + size_t layer_idx_ = 0; + +protected: + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); + INFINICORE_NN_MODULE(MiniCPMSALAAttention, self_attn); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, 
post_attention_layernorm); + INFINICORE_NN_MODULE(MiniCPMSALAMLP, mlp); +}; + +} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index 793f86bd..ce2e9474 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -1,56 +1,63 @@ #include "minicpm_sala_for_causal_lm.hpp" -#include "../../global_state/global_state.hpp" -#include "../models_registry.hpp" + +#include "infinicore/ops.hpp" +#include #include -#include namespace infinilm::models::minicpm_sala { -MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device) { - model_config_ = model_config; - size_t hidden_size = model_config->get("hidden_size"); - size_t vocab_size = model_config->get("vocab_size"); - const auto &dtype{model_config->get_dtype()}; +MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( + std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) { + device_ = device; + + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
+ const auto dtype = model_config->get_dtype(); + INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info, attention_backend); + + const size_t hidden_size = model_config->get("hidden_size"); + const size_t vocab_size = model_config->get("vocab_size"); - INFINICORE_NN_MODULE_INIT(model, model_config, device); INFINICORE_NN_MODULE_INIT(lm_head, hidden_size, vocab_size, false, dtype, device); } -infinilm::InfinilmModel::Output MiniCPMSALAForCausalLM::forward(const infinilm::InfinilmModel::Input &input) const { - auto hidden_states = model_->forward(input); +MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( + const Input &input) const { + auto input_ids = input.input_ids.value(); + auto position_ids = input.position_ids.value(); + + auto past_sequence_lengths = input.past_sequence_lengths; + auto total_sequence_lengths = input.total_sequence_lengths; + auto input_offsets = input.input_offsets; + auto cu_seqlens = input.cu_seqlens; + auto block_tables = input.block_tables; + auto slot_mapping = input.slot_mapping; + + auto hidden_states = model_->forward( + input_ids, + position_ids, + past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + + // MuP lm_head scale baked into lm_head.weight at load time; no forward scaling here. 
auto logits = lm_head_->forward(hidden_states); return {logits}; } void MiniCPMSALAForCausalLM::reset_cache(const cache::CacheConfig *cache_config) { - if (nullptr == cache_config) { - InfinilmModel::reset_cache(nullptr); - return; - } cache_config_ = cache_config->unique_copy(); - - auto &kv_cache_vec = infinilm::global_state::get_forward_context().kv_cache_vec; - kv_cache_vec.clear(); - const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; - - auto new_kv_cache_vec = minicpm_sala_allocate_kv_cache_tensors(cache_config, model_config_, attention_backend); - kv_cache_vec = std::move(new_kv_cache_vec); + model_->reset_cache(cache_config_.get()); } -std::shared_ptr create_minicpm_sala_model_config(std::shared_ptr model_config) { - const std::string &model_type = model_config->get("model_type"); - if ("minicpm_sala" != model_type) { - throw std::runtime_error("infinilm::models::minicpm_sala::create_minicpm_sala_model_config: model_type is not minicpm_sala"); - } - return model_config; +const cache::CacheConfig *MiniCPMSALAForCausalLM::get_cache_config() const { + return cache_config_.get(); } } // namespace infinilm::models::minicpm_sala -namespace { -INFINILM_REGISTER_CAUSAL_LM_MODEL( - minicpm_sala, - infinilm::models::minicpm_sala::MiniCPMSALAForCausalLM, - infinilm::models::minicpm_sala::create_minicpm_sala_model_config); -} // namespace diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index f0d0aaae..9bb3ec2b 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -1,31 +1,38 @@ #pragma once -#include "minicpm_sala_decoderLayer.hpp" -#include -#include +#include "../infinilm_model.hpp" +#include "minicpm_sala_model.hpp" -namespace infinilm::models::minicpm_sala { +#include "../../config/model_config.hpp" +#include 
"../../engine/distributed/distributed.hpp" +#include "../../backends/attention_backends.hpp" + +#include "infinicore/device.hpp" +#include "infinicore/nn/linear.hpp" -using MiniCPMSALAModel = infinilm::layers::causal_lm_templates::TextModel; +namespace infinilm::models::minicpm_sala { +// Milestone-0 stub. Full implementation will follow the MiniCPM-SALA design: +// - Lightning Attention (Simple GLA) layers + InfLLM-V2 sparse layers in a 1:3 ratio +// - HyPE (RoPE on linear layers; NoPE on sparse layers) class MiniCPMSALAForCausalLM : public InfinilmModel { public: MiniCPMSALAForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device); + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); Output forward(const Input &input) const override; void reset_cache(const cache::CacheConfig *cache_config) override; -protected: + const cache::CacheConfig *get_cache_config() const override; + +private: INFINICORE_NN_MODULE(MiniCPMSALAModel, model); - INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); + INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head); + std::unique_ptr cache_config_; }; -std::shared_ptr create_minicpm_sala_model_config(std::shared_ptr model_config); - -/** Implemented in `minicpm_sala_allocate_kv_cache_tensors.cpp`. 
*/ -std::vector minicpm_sala_allocate_kv_cache_tensors(const cache::CacheConfig *cache_config, - const std::shared_ptr &text_config, - const backends::AttentionBackend &attention_backend); } // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp new file mode 100644 index 00000000..649c0095 --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp @@ -0,0 +1,32 @@ +#include "minicpm_sala_mlp.hpp" + +#include "infinicore/ops.hpp" + +namespace infinilm::models::minicpm_sala { + +MiniCPMSALAMLP::MiniCPMSALAMLP(std::shared_ptr model_config, + const infinicore::Device &device) { + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + const auto dtype = model_config->get_dtype(); + hidden_size_ = model_config->get("hidden_size"); + intermediate_size_ = model_config->get("intermediate_size"); + + INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size_, intermediate_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(up_proj, hidden_size_, intermediate_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, false, dtype, device); +} + +infinicore::Tensor MiniCPMSALAMLP::forward(const infinicore::Tensor &x) const { + auto x_mut = x; + auto gate = gate_proj_->forward(x_mut); + auto up = up_proj_->forward(x_mut); + + // SwiGLU: silu(gate) * up — fused single kernel (swiglu(a,b) = a*b*sigmoid(b) => swiglu(up,gate)) + auto act = infinicore::op::swiglu(up, gate); + + auto act_mut = act; + return down_proj_->forward(act_mut); +} + +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp new file mode 100644 index 00000000..9a90527a --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include "../../config/model_config.hpp" + +#include 
"infinicore/nn/linear.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/tensor.hpp" + +#include + +namespace infinilm::models::minicpm_sala { + +class MiniCPMSALAMLP : public infinicore::nn::Module { +public: + MiniCPMSALAMLP(std::shared_ptr model_config, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &x) const; + +protected: + INFINICORE_NN_MODULE(infinicore::nn::Linear, gate_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, up_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, down_proj); + +private: + size_t hidden_size_; + size_t intermediate_size_; +}; + +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp new file mode 100644 index 00000000..a415915f --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -0,0 +1,171 @@ +#include "minicpm_sala_model.hpp" + +#include "infinicore/context/context.hpp" +#include "infinicore/ops.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace infinilm::models::minicpm_sala { + +MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info, + backends::AttentionBackend attention_backend) + : model_config_(std::move(model_config)), + rank_info_(rank_info), + attention_backend_(attention_backend) { + + // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
+ const auto dtype = model_config_->get_dtype(); + compute_device_ = device; + + hidden_size_ = model_config_->get("hidden_size"); + dim_model_base_ = model_config_->get_or("dim_model_base", static_cast(hidden_size_)); + scale_emb_ = model_config_->get_or("scale_emb", 1.0); + + const size_t vocab_size = model_config_->get("vocab_size"); + const size_t num_layers = model_config_->get("num_hidden_layers"); + + INFINICORE_NN_MODULE_INIT(embed_tokens, vocab_size, hidden_size_, std::nullopt, dtype, device); + INFINICORE_NN_MODULE_INIT(norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); + + // Shared rotary embedding (used by lightning layers only) + INFINICORE_NN_MODULE_INIT(rotary_emb, + model_config_->get_head_dim(), + model_config_->get("max_position_embeddings"), + model_config_->get("rope_theta"), + infinicore::nn::RoPE::Algo::GPT_NEOX, + dtype, + device, + model_config_->get_rope_scaling()); + + // Mixer types per-layer decide attention flavor (minicpm4 vs lightning-attn). + std::vector mixer_types; + try { + mixer_types = model_config_->get>("mixer_types"); + } catch (...) { + mixer_types.assign(num_layers, "minicpm4"); + } + if (mixer_types.size() != num_layers) { + mixer_types.resize(num_layers, mixer_types.empty() ? "minicpm4" : mixer_types.back()); + } + mixer_types_ = mixer_types; + + layers_.reserve(num_layers); + for (size_t i = 0; i < num_layers; ++i) { + layers_.push_back(this->register_module( + "layers." + std::to_string(i), model_config_, device, i, mixer_types[i], rank_info_, attention_backend_)); + layers_.back()->set_rotary_emb(rotary_emb_); + } +} + +void MiniCPMSALAModel::reset_cache(const cache::CacheConfig *cache_config) { + if (cache_config == nullptr) { + kv_cache_minicpm4_ = nullptr; + kv_cache_lightning_ = nullptr; + for (auto &layer : layers_) { + layer->reset_cache(); + } + return; + } + + if (auto static_cfg = dynamic_cast(cache_config)) { + // Allocate separate caches by KV shape to avoid per-layer padding copies. 
+ const size_t num_hidden_layers = model_config_->get("num_hidden_layers"); + // mixer_types_ is filled in ctor from model_config_->get("mixer_types"). + const size_t minicpm4_layer_count = + !mixer_types_.empty() ? std::count(mixer_types_.begin(), mixer_types_.end(), "minicpm4") : num_hidden_layers; + const size_t lightning_layer_count = num_hidden_layers - minicpm4_layer_count; + + const size_t base_kv_heads = model_config_->get("num_key_value_heads"); + const size_t base_head_dim = model_config_->get("head_dim"); + const size_t lightning_kv_heads = model_config_->get_or("lightning_nkv", base_kv_heads); + const size_t lightning_head_dim = model_config_->get_or("lightning_head_dim", base_head_dim); + const size_t lightning_nh = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); + const int tp_sz = std::max(1, rank_info_.tp_size); + const size_t lightning_nh_rank = lightning_nh / static_cast(tp_sz); + + kv_cache_minicpm4_ = (minicpm4_layer_count > 0) + ? std::make_shared( + /*k_dim=*/base_head_dim, + /*v_dim=*/base_head_dim, + /*num_k_heads=*/base_kv_heads, + /*num_v_heads=*/base_kv_heads, + /*num_layers=*/minicpm4_layer_count, + /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), + /*dtype=*/model_config_->get_dtype(), + *static_cfg, + rank_info_) + : nullptr; + + kv_cache_lightning_ = (lightning_layer_count > 0) + ? std::make_shared( + /*k_dim=*/lightning_head_dim, + /*v_dim=*/lightning_head_dim, + /*num_k_heads=*/lightning_kv_heads, + /*num_v_heads=*/lightning_kv_heads, + /*num_layers=*/lightning_layer_count, + /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), + /*dtype=*/model_config_->get_dtype(), + *static_cfg, + rank_info_, + /*gla_recurrent_num_heads=*/lightning_nh_rank, + /*gla_recurrent_head_dim=*/lightning_head_dim) + : nullptr; + } else { + // This refactor implements HF-like dense caching only. 
+ throw std::runtime_error("MiniCPMSALAModel::reset_cache: Unsupported cache type (expected StaticKVCacheConfig)"); + } + + for (auto &layer : layers_) { + layer->reset_cache(); + } +} + +infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids, + const infinicore::Tensor &position_ids, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const { + // MuP scaling baked into weights at load time for minicpm_sala; no forward scaling here. + auto hs = embed_tokens_->forward(input_ids); + + for (size_t i = 0; i < layers_.size(); ++i) { + std::shared_ptr layer_cache; + if (!mixer_types_.empty() && mixer_types_[i] == "minicpm4") { + layer_cache = kv_cache_minicpm4_; + } else { + layer_cache = kv_cache_lightning_; + } + hs = layers_[i]->forward(hs, + position_ids, + layer_cache, + past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + if (const char *env = std::getenv("MINICPM_SALA_LAYER_TRACE")) { + if (env[0] != '\0' && env[0] != '0') { + fprintf(stderr, "[minicpm_sala][layer_trace] layer=%zu mixer=%s\n", + i, + mixer_types_.empty() ? 
"unknown" : mixer_types_[i].c_str()); + fflush(stderr); + } + } + } + + hs = norm_->forward(hs); + return hs; +} + +} // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp new file mode 100644 index 00000000..d360dd3e --- /dev/null +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include "minicpm_sala_decoder_layer.hpp" + +#include "../../backends/attention_backends.hpp" +#include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" +#include "../../engine/distributed/distributed.hpp" + +#include "infinicore/nn/embedding.hpp" +#include "infinicore/nn/module.hpp" +#include "infinicore/nn/rmsnorm.hpp" +#include "infinicore/nn/rope.hpp" +#include "infinicore/tensor.hpp" + +#include +#include +#include + +namespace infinilm::models::minicpm_sala { + +class MiniCPMSALAModel : public infinicore::nn::Module { +public: + MiniCPMSALAModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + + infinicore::Tensor forward(const infinicore::Tensor &input_ids, + const infinicore::Tensor &position_ids, + std::optional past_sequence_lengths, + std::optional total_sequence_lengths, + std::optional input_offsets, + std::optional cu_seqlens, + std::optional block_tables, + std::optional slot_mapping) const; + + void reset_cache(const cache::CacheConfig *cache_config); + + size_t hidden_size() const { return hidden_size_; } + double dim_model_base() const { return dim_model_base_; } + +protected: + INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); + INFINICORE_NN_MODULE_VEC(MiniCPMSALADecoderLayer, layers); + INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); + INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb); + +private: + 
std::shared_ptr model_config_; + engine::distributed::RankInfo rank_info_; + backends::AttentionBackend attention_backend_; + // MiniCPM-SALA is hybrid: minicpm4 vs lightning layers can have different KV shapes. + // Use two StaticKVCache instances to avoid per-layer padding/copies during long prefill. + std::shared_ptr kv_cache_minicpm4_; + std::shared_ptr kv_cache_lightning_; + std::vector mixer_types_; + infinicore::Device compute_device_; + + size_t hidden_size_; + double scale_emb_; + double dim_model_base_; +}; + +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 03734ac9..3c885fc5 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,6 +1,6 @@ #include "model_factory.hpp" -#include "llama/llama_for_causal_lm.hpp" -#include "models_registry.hpp" +#include "llama/llama.hpp" +#include "minicpm_sala/minicpm_sala_for_causal_lm.hpp" namespace infinilm { /** @@ -41,8 +41,13 @@ std::shared_ptr InfinilmModelFactory::createModel( engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache, backends::AttentionBackend attention_backend) { + std::shared_ptr model; - if (true) { + const auto model_type = model_config->get_or("model_type", "llama"); + if (model_type == "minicpm_sala") { + model = std::make_shared( + model_config, rank_info.device, rank_info, attention_backend); + } else if (true) { model = std::make_shared( model_config, rank_info.device, rank_info, attention_backend); } else { @@ -60,21 +65,8 @@ std::shared_ptr InfinilmModelFactory::createModel( std::shared_ptr model_config, const infinicore::Device &device, const cache::CacheConfig *cache) { - const std::string model_type = model_config->get("model_type"); - std::shared_ptr model; - const auto &model_map = models::get_causal_lm_model_map(); - auto it = model_map.find(model_type); - if (it != model_map.end()) { - // create model - auto &model_creator = it->second; - model = 
model_creator(model_config, device); - } else { - throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model_type"); - } - - if (cache) { - model->reset_cache(cache); - } - return model; + engine::distributed::RankInfo rank_info; + rank_info.device = device; + return createModel(model_config, rank_info, cache, backends::AttentionBackend::Default); } } // namespace infinilm diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index a784d69c..d27b9585 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -199,6 +199,6 @@ inline void bind_infer_engine(py::module &m) { py::class_(infer_engine, "Output") .def_readwrite("output_ids", &InferEngine::Output::output_ids, "Output tensor"); -} + } } // namespace infinilm::engine diff --git a/examples/collect_metrics_longtext_decode.py b/examples/collect_metrics_longtext_decode.py new file mode 100644 index 00000000..172b6f40 --- /dev/null +++ b/examples/collect_metrics_longtext_decode.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +""" +Collect long-context + decode metrics for metrics_longtext_mem.md. + +**OOM-safe workflow:** run each case in a **fresh Python process** so CUDA allocations +are released between runs: + + ./run_longtext_metrics_cases.sh + +Or manually: + + python3 collect_metrics_longtext_decode.py --case hf:16384 --append-jsonl profiling_runs/longtext_decode_rows.jsonl + +See also docstring at top of previous revisions for GPU selection (CUDA_VISIBLE_DEVICES + NVML_GPU_INDEX). 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +import threading +import time +from typing import Any, Callable, Dict, List, Optional, Tuple + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + + +def _poll_gpu_mem_mib(stop: threading.Event, gpu_index: int, out: List[int]) -> None: + while not stop.is_set(): + try: + r = subprocess.run( + [ + "nvidia-smi", + "-i", + str(gpu_index), + "--query-gpu=memory.used", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=5, + ) + if r.returncode == 0 and r.stdout.strip().isdigit(): + out.append(int(r.stdout.strip())) + except Exception: + pass + if stop.wait(timeout=1.0): + break + + +def _with_mem_poll(gpu_index: int, fn: Callable[[], Any]) -> Tuple[Any, Optional[int]]: + samples: List[int] = [] + stop = threading.Event() + th = threading.Thread(target=_poll_gpu_mem_mib, args=(stop, gpu_index, samples), daemon=True) + th.start() + err: Optional[BaseException] = None + result: Any = None + try: + result = fn() + except BaseException as e: + err = e + finally: + stop.set() + th.join(timeout=3.0) + peak = max(samples) if samples else None + if err is not None: + raise err + return result, peak + + +def _row_dict( + date: str, + backend: str, + target: int, + actual: int, + max_new: int, + peak: Optional[int], + gpu_smi: int, + r: Dict[str, Any], +) -> Dict[str, Any]: + return { + "date": date, + "backend": backend, + "target_input_tokens": target, + "actual_input_tokens": actual, + "max_new_tokens": max_new, + "peak_mem_mib": peak, + "gpu_smi_index": gpu_smi, + "total_time_ms": r.get("total_time_ms"), + "prefill_ttft_ms": r.get("prefill_ttft_ms"), + "prefill_throughput_tok_s": r.get("prefill_throughput_tok_s"), + "decode_itl_ms": r.get("decode_itl_ms"), + "decode_throughput_tok_s": r.get("decode_throughput_tok_s"), + "engine_reported_generation_ms": r.get("engine_reported_generation_ms"), + "error": 
r.get("error"), + } + + +def run_single_case( + case: str, + *, + model_path: str, + gpu_smi: int, + date: str, +) -> Dict[str, Any]: + """Run one measurement; returns a row dict (may contain error key).""" + examples_dir = os.path.dirname(os.path.abspath(__file__)) + sys.path.insert(0, examples_dir) + os.chdir(examples_dir) + + from transformers import AutoTokenizer + + from compare_inference_speed import ( + _make_prompt_with_target_tokens, + run_hf_decode_loop, + run_hf_forward_prefill, + run_infinilm_inprocess, + ) + + parts = case.strip().split(":") + kind = parts[0].lower() + if kind == "hf": + # Backward compatible: + # hf: -> max_new=1 (forward-prefill only) + # hf:: -> max_new= (decode-loop timing) + if len(parts) == 2: + target = int(parts[1]) + max_new = 1 + elif len(parts) == 3: + target = int(parts[1]) + max_new = int(parts[2]) + else: + raise ValueError("--case hf:[:] (e.g. hf:16384 or hf:16384:32)") + elif kind == "infinilm_rec": + if len(parts) != 3: + raise ValueError("--case infinilm_rec:: (e.g. infinilm_rec:32768:32)") + target = int(parts[1]) + max_new = int(parts[2]) + else: + raise ValueError( + f"Unknown case kind {kind!r}; use hf: or infinilm_rec:" + ) + + tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + prompt, actual = _make_prompt_with_target_tokens(tok, "How are you", target) + + if kind == "hf": + + def go() -> Dict[str, Any]: + # Always use hf decode-loop so total_time_ms can be end-to-end + # (prefill + decode), matching the InfiniLM generate semantics. 
+ return run_hf_decode_loop( + model_path, + prompt, + max_new, + device="cuda", + attn_implementation="flash_attention_2", + use_cache=True, + warmup=1, + iters=1, + ) + + try: + r, peak = _with_mem_poll(gpu_smi, go) + r = dict(r) + return _row_dict(date, "hf (decode_loop)", target, actual, max_new, peak, gpu_smi, r) + except Exception as e: + return _row_dict( + date, + "hf (decode_loop)", + target, + actual, + max_new, + None, + gpu_smi, + {"error": str(e)}, + ) + + recurrent = kind == "infinilm_rec" + if max_new == 1: + label = "infinilm (static_fit, recurrent GLA decode)" + else: + label = f"infinilm (static_fit, recurrent GLA, +{max_new} decode)" + + saved_lightning = os.environ.get("INFINI_LIGHTNING_GLA_RECURRENT_DECODE") + saved_skip = os.environ.get("INFINI_SKIP_LAST_LOGITS_CPU") + try: + if recurrent: + os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = "1" + else: + os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) + os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = "1" + + def go_inf() -> Dict[str, Any]: + return run_infinilm_inprocess( + model_path, + prompt, + max_new, + cache_mode="static_fit", + paged_block_size=256, + attn_backend="default", + ) + + r, peak = _with_mem_poll(gpu_smi, go_inf) + return _row_dict(date, label, target, actual, max_new, peak, gpu_smi, dict(r)) + except Exception as e: + return _row_dict(date, label, target, actual, max_new, None, gpu_smi, {"error": str(e)}) + finally: + if saved_lightning is None: + os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) + else: + os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = saved_lightning + if saved_skip is None: + os.environ.pop("INFINI_SKIP_LAST_LOGITS_CPU", None) + else: + os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = saved_skip + + +def print_markdown_table(rows: List[Dict[str, Any]]) -> None: + def fmt(x: Any) -> str: + if x is None: + return "—" + if isinstance(x, float): + s = f"{x:.2f}" + return s.rstrip("0").rstrip(".") + return str(x) + + gpu_smi = 
rows[0].get("gpu_smi_index", 0) if rows else 0 + print("\n### Markdown table (paste into metrics_longtext_mem.md)\n") + hdr = ( + "| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | " + "prefill_tok_s | decode_itl_ms | decode_tok_s | gpu |" + ) + sep = "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|" + print(hdr) + print(sep) + for row in rows: + if row.get("error"): + print( + f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | " + f"{row['max_new_tokens']} | {fmt(row.get('peak_mem_mib'))} | OOM/err | — | — | — | — | {gpu_smi} |" + ) + continue + dec_itl = fmt(row.get("decode_itl_ms")) if row["max_new_tokens"] > 1 else "—" + dec_tps = fmt(row.get("decode_throughput_tok_s")) if row["max_new_tokens"] > 1 else "—" + ptt = row.get("prefill_ttft_ms") + # Only forward-prefill runs use total_time_ms as a prefill-time proxy. + if ptt is None and row.get("backend") == "hf (forward_prefill)": + ptt = row.get("total_time_ms") + print( + f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | {row['max_new_tokens']} | " + f"{fmt(row.get('peak_mem_mib'))} | {fmt(row.get('total_time_ms'))} | {fmt(ptt)} | " + f"{fmt(row.get('prefill_throughput_tok_s'))} | {dec_itl} | {dec_tps} | {gpu_smi} |" + ) + + +def main() -> None: + ap = argparse.ArgumentParser(description="Long-context + decode metrics (OOM-safe --case mode)") + ap.add_argument( + "--case", + type=str, + default=None, + help="Single case: hf:16384 | infinilm_rec:32768:32", + ) + ap.add_argument( + "--append-jsonl", + type=str, + default=None, + help="Append one JSON line (--case mode only)", + ) + ap.add_argument( + "--from-jsonl", + type=str, + default=None, + help="Load rows from jsonl and print markdown table", + ) + ap.add_argument( + "--all-in-process", + action="store_true", + help="Run full matrix in one process (may OOM between cases)", + ) + args = ap.parse_args() + + model_path = os.environ.get( + "MODEL_PATH", 
"/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA" + ) + gpu_smi = int(os.environ.get("NVML_GPU_INDEX", os.environ.get("CUDA_VISIBLE_DEVICES", "0"))) + date = os.environ.get("METRICS_DATE", "2026-03-23") + decode_steps = int(os.environ.get("METRICS_DECODE_STEPS", "32")) + targets = [int(x) for x in os.environ.get("METRICS_TARGETS", "16384,32768,65536").split(",")] + + examples_dir = os.path.dirname(os.path.abspath(__file__)) + + if args.from_jsonl: + rows = [] + with open(args.from_jsonl) as f: + for line in f: + line = line.strip() + if line: + rows.append(json.loads(line)) + print_markdown_table(rows) + return + + if args.case: + row = run_single_case(args.case, model_path=model_path, gpu_smi=gpu_smi, date=date) + print(json.dumps(row, ensure_ascii=False)) + if args.append_jsonl: + ap = os.path.abspath(args.append_jsonl) + ad = os.path.dirname(ap) + if ad: + os.makedirs(ad, exist_ok=True) + with open(ap, "a") as f: + f.write(json.dumps(row, ensure_ascii=False) + "\n") + return + + if not args.all_in_process: + print( + "Specify --case CASE, --from-jsonl FILE, or --all-in-process.\n" + "For OOM safety use: ./run_longtext_metrics_cases.sh", + file=sys.stderr, + ) + sys.exit(2) + + # Legacy: all targets × all backends in one process + rows: List[Dict[str, Any]] = [] + for t in targets: + row = run_single_case(f"hf:{t}", model_path=model_path, gpu_smi=gpu_smi, date=date) + rows.append(row) + for t in targets: + rows.append( + run_single_case(f"infinilm_rec:{t}:1", model_path=model_path, gpu_smi=gpu_smi, date=date) + ) + for t in targets: + rows.append( + run_single_case( + f"infinilm_rec:{t}:{decode_steps}", + model_path=model_path, + gpu_smi=gpu_smi, + date=date, + ) + ) + + out_path = os.path.join(examples_dir, "profiling_runs", "longtext_decode_metrics.json") + os.makedirs(os.path.dirname(out_path), exist_ok=True) + with open(out_path, "w") as f: + json.dump({"gpu_smi_index": gpu_smi, "decode_steps": decode_steps, "rows": rows}, f, indent=2) + print(f"Wrote 
{out_path}") + print_markdown_table(rows) + + +if __name__ == "__main__": + main() diff --git a/examples/compare_inference_speed.py b/examples/compare_inference_speed.py new file mode 100644 index 00000000..06fad9a7 --- /dev/null +++ b/examples/compare_inference_speed.py @@ -0,0 +1,868 @@ +#!/usr/bin/env python3 +""" +Compare MiniCPM-SALA inference speed across HF, InfiniLM, and (optionally) SGLang. + +Usage: + # HF + InfiniLM only (InfiniLM runs in subprocess with same env as jiuge): + python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA [--prompt "How are you"] [--max_new_tokens 32] + + # Include SGLang (server must already be running with MiniCPM-SALA): + python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --sglang_url http://127.0.0.1:30000 + + # Optional: write JSON + python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --output results.json + +Requires: transformers, torch; for InfiniLM subprocess: PYTHONPATH and LD_LIBRARY_PATH as in jiuge. +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import time +from typing import Optional, Tuple, Literal + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) + +def _build_chat_input_ids(tokenizer, prompt: str): + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) + ids = tokenizer(text, add_special_tokens=True)["input_ids"] + return ids + + +def _make_prompt_with_target_tokens(tokenizer, base_prompt: str, target_input_tokens: int) -> Tuple[str, int]: + """ + Build a prompt (user content) such that the *chat-templated* input_ids length is >= target_input_tokens. + Returns (prompt, actual_input_tokens). + """ + if target_input_tokens <= 0: + raise ValueError("--target_input_tokens must be > 0") + + # Ensure boundaries don't merge tokens weirdly. 
+ chunk = (base_prompt.strip() + "\n") if base_prompt.strip() else "hello\n" + + # Exponential growth to find an upper bound. + rep = 1 + while True: + prompt = chunk * rep + ids = _build_chat_input_ids(tokenizer, prompt) + if len(ids) >= target_input_tokens: + break + rep *= 2 + if rep > 1_000_000: + raise RuntimeError("Failed to build prompt to target length (rep too large)") + + # Binary search for smallest rep that reaches target. + lo, hi = 1, rep + best_prompt = prompt + best_len = len(ids) + while lo <= hi: + mid = (lo + hi) // 2 + p = chunk * mid + l = len(_build_chat_input_ids(tokenizer, p)) + if l >= target_input_tokens: + best_prompt, best_len = p, l + hi = mid - 1 + else: + lo = mid + 1 + + return best_prompt, best_len + + +def run_hf( + model_path: str, + prompt: str, + max_new_tokens: int, + device: str = "cuda", + *, + attn_implementation: Optional[str] = None, +): + """Run HuggingFace generate and return metrics.""" + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model_kwargs = { + "torch_dtype": "auto", + "trust_remote_code": True, + } + # Prefer flash-attn when available; fall back silently if not supported. + if attn_implementation is not None: + model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] + try: + model = AutoModelForCausalLM.from_pretrained( + model_path, + **model_kwargs, + ).to(device) + except TypeError: + # Older transformers versions may not support attn_implementation kwarg. 
+ model_kwargs.pop("attn_implementation", None) + model = AutoModelForCausalLM.from_pretrained( + model_path, + **model_kwargs, + ).to(device) + model.eval() + + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) + inputs = tokenizer(text, return_tensors="pt").to(device) + input_len = inputs.input_ids.shape[1] + + start = time.perf_counter() + with torch.inference_mode(): + out = model.generate( + **inputs, + max_new_tokens=max_new_tokens, + do_sample=False, + pad_token_id=tokenizer.eos_token_id or 0, + ) + elapsed = time.perf_counter() - start + output_len = out.shape[1] - input_len + + return { + "backend": "hf", + "total_time_ms": round(elapsed * 1000, 2), + "input_tokens": input_len, + "output_tokens": output_len, + "prefill_ttft_ms": None, # HF generate() doesn't expose TTFT without streaming + "decode_throughput_tok_s": round(output_len / elapsed, 2) if elapsed > 0 else None, + "total_throughput_tok_s": round((input_len + output_len) / elapsed, 2) if elapsed > 0 else None, + } + + +def run_hf_forward_prefill( + model_path: str, + prompt: str, + device: str = "cuda", + *, + attn_implementation: Optional[str] = None, + use_cache: bool = True, + warmup: int = 1, + iters: int = 1, +): + """ + Run HuggingFace *forward-only* prefill (no decode loop). + Intended for kernel-level profiling to isolate prefill work. 
+ """ + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model_kwargs = { + "torch_dtype": "auto", + "trust_remote_code": True, + } + if attn_implementation is not None: + model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] + try: + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + except TypeError: + model_kwargs.pop("attn_implementation", None) + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + model.eval() + + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + inputs = tokenizer(text, return_tensors="pt").to(device) + input_len = inputs.input_ids.shape[1] + + # Warmup (reduces first-iter compilation / cache effects for profiling). + with torch.inference_mode(): + for _ in range(max(0, warmup)): + # Prefer last-token logits only (reduces memory at long context). + try: + _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) + except TypeError: + _ = model(**inputs, use_cache=use_cache) + torch.cuda.synchronize() + + # Timed iters. 
+ times = [] + with torch.inference_mode(): + for _ in range(max(1, iters)): + torch.cuda.synchronize() + try: + torch.cuda.nvtx.range_push("hf_forward_prefill") + except Exception: + pass + start = time.perf_counter() + try: + _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) + except TypeError: + _ = model(**inputs, use_cache=use_cache) + torch.cuda.synchronize() + elapsed = time.perf_counter() - start + try: + torch.cuda.nvtx.range_pop() + except Exception: + pass + times.append(elapsed) + + best = min(times) if times else 0.0 + return { + "backend": "hf_forward_prefill", + "total_time_ms": round(best * 1000, 2), + "input_tokens": int(input_len), + "output_tokens": 0, + "use_cache": bool(use_cache), + "warmup": int(warmup), + "iters": int(iters), + "prefill_throughput_tok_s": round(input_len / best, 2) if best > 0 else None, + } + + +def run_hf_decode_loop( + model_path: str, + prompt: str, + max_new_tokens: int, + device: str = "cuda", + *, + attn_implementation: Optional[str] = None, + use_cache: bool = True, + warmup: int = 8, + iters: int = 1, +): + """ + Measure HF *decode-only* per-token latency using a manual loop with past_key_values. + + Protocol: + - Prefill once on the full prompt (not included in decode timing). + - Then decode `max_new_tokens` tokens with 1-token steps, timing the whole decode loop + (optionally best-of `iters`). 
+ """ + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + + if max_new_tokens <= 0: + raise ValueError("--max_new_tokens must be > 0 for hf decode_loop") + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model_kwargs = { + "torch_dtype": "auto", + "trust_remote_code": True, + } + if attn_implementation is not None: + model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] + try: + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + except TypeError: + model_kwargs.pop("attn_implementation", None) + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) + model.eval() + + conversation = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + inputs = tokenizer(text, return_tensors="pt").to(device) + input_ids = inputs.input_ids + input_len = int(input_ids.shape[1]) + # Some decoder-only models require attention_mask even when no padding is used. + attention_mask = inputs.get("attention_mask", None) + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + attention_mask = attention_mask.to(device) + # Precompute full (input_len + max_new_tokens) causal attention mask for past-key decoding. + attention_mask_full = attention_mask.new_ones((attention_mask.shape[0], input_len + max_new_tokens)) + + # Prefill once to build cache. + with torch.inference_mode(): + try: + pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache, logits_to_keep=1) + except TypeError: + pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache) + past = getattr(pre, "past_key_values", None) + # Greedy next token from last logits. 
+ logits = pre.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + + # Warmup decode steps (not timed) to reduce first-step effects. + with torch.inference_mode(): + for warm_i in range(max(0, warmup)): + try: + # Attention mask must cover (past + current token). + attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + logits_to_keep=1, + ) + except TypeError: + attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + ) + past = getattr(out, "past_key_values", past) + logits = out.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + torch.cuda.synchronize() + + # Timed decode loops (best-of iters). + # We report total_time_ms as end-to-end (prefill + decode), but keep + # decode_itl_ms / decode_throughput_tok_s based on decode-only time. + total_times = [] + decode_times = [] + with torch.inference_mode(): + for _ in range(max(1, iters)): + # Re-prefill to avoid measuring a "warmed" cache from prior iteration. + # Time prefill separately so decode_itl_ms stays decode-only. + torch.cuda.synchronize() + prefill_start = time.perf_counter() + try: + pre = model( + input_ids=input_ids, + attention_mask=attention_mask, + use_cache=use_cache, + logits_to_keep=1, + ) + except TypeError: + # Some model/transformers combinations may not accept attention_mask. 
+ pre = model(input_ids=input_ids, use_cache=use_cache) + past = getattr(pre, "past_key_values", None) + logits = pre.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + + torch.cuda.synchronize() + prefill_elapsed = time.perf_counter() - prefill_start + + torch.cuda.synchronize() + start = time.perf_counter() # decode start + try: + torch.cuda.nvtx.range_push("hf_decode_loop") + except Exception: + pass + for t in range(max_new_tokens): + attn_mask_step = attention_mask_full[:, : input_len + t + 1] + try: + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + logits_to_keep=1, + ) + except TypeError: + out = model( + input_ids=next_token, + attention_mask=attn_mask_step, + use_cache=use_cache, + past_key_values=past, + ) + past = getattr(out, "past_key_values", past) + logits = out.logits[:, -1, :] + next_token = torch.argmax(logits, dim=-1, keepdim=True) + torch.cuda.synchronize() + decode_elapsed = time.perf_counter() - start + total_elapsed = prefill_elapsed + decode_elapsed + try: + torch.cuda.nvtx.range_pop() + except Exception: + pass + total_times.append(total_elapsed) + decode_times.append(decode_elapsed) + + # Pick the iteration with the best end-to-end time; compute decode metrics + # from the corresponding decode-only time. 
+ if total_times: + best_idx = min(range(len(total_times)), key=lambda i: total_times[i]) + best_total = total_times[best_idx] + best_decode = decode_times[best_idx] + else: + best_total = 0.0 + best_decode = 0.0 + + itl_ms = (best_decode * 1000.0 / max_new_tokens) if best_decode > 0 else None + thr = (max_new_tokens / best_decode) if best_decode > 0 else None + return { + "backend": "hf_decode_loop", + "total_time_ms": round(best_total * 1000, 2), + "input_tokens": int(input_len), + "output_tokens": int(max_new_tokens), + "decode_itl_ms": round(itl_ms, 4) if itl_ms is not None else None, + "decode_throughput_tok_s": round(thr, 2) if thr is not None else None, + "use_cache": bool(use_cache), + "warmup": int(warmup), + "iters": int(iters), + } + + +def run_infinilm_inprocess( + model_path: str, + prompt: str, + max_new_tokens: int, + *, + cache_mode: Literal["static_fit", "static_maxpos", "paged"] = "paged", + paged_block_size: int = 256, + attn_backend: str = "flash-attn", +): + """ + Run InfiniLM in-process (no 2048-token truncation). Parses InferEngine's timing prints. + This expects PYTHONPATH to include InfiniLM/InfiniCore python packages (container runner does this). + """ + import io + import torch + import contextlib + + import infinicore + from transformers import AutoTokenizer + + from infinilm.cache import PagedKVCacheConfig, StaticKVCacheConfig + from infinilm.distributed import DistConfig + from infinilm.infer_engine import GenerationConfig, InferEngine + from infinilm.modeling_utils import load_model_state_dict_by_file + + model_path = os.path.expanduser(model_path) + # Prefer flash-attn when available; fall back to default. + try: + model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + attention_backend=attn_backend, + ) + except TypeError: + # Older InferEngine builds may not accept attention_backend. 
+ model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + ) + except Exception: + try: + model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + attention_backend="default", + ) + except TypeError: + model = InferEngine( + model_path, + device=infinicore.device("cuda", 0), + distributed_config=DistConfig(1), + enable_graph_compiling=False, + ) + load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + input_ids = _build_chat_input_ids(tokenizer, prompt) + input_ids_infini = infinicore.from_list([input_ids]) + + initial_capacity = len(input_ids) + max_new_tokens + if cache_mode == "paged": + num_blocks = (initial_capacity + (paged_block_size - 1)) // paged_block_size + cache_config = PagedKVCacheConfig( + num_blocks=num_blocks, + block_size=paged_block_size, + ) + else: + if cache_mode == "static_maxpos": + max_pos = getattr(model.config, "max_position_embeddings", 4096) + max_cache_len = max(initial_capacity, max_pos) + else: + # Fit cache to what we actually need for this run. + max_cache_len = initial_capacity + cache_config = StaticKVCacheConfig(max_batch_size=1, max_cache_len=max_cache_len) + # Basic GPU memory stats around cache construction (CUDA device assumed to be index 0). 
+ mem_before_cache = torch.cuda.memory_allocated(0) + max_mem_before_cache = torch.cuda.max_memory_allocated(0) + + model.reset_cache(cache_config) + + mem_after_cache = torch.cuda.memory_allocated(0) + max_mem_after_cache = torch.cuda.max_memory_allocated(0) + + buf = io.StringIO() + start = time.perf_counter() + with contextlib.redirect_stdout(buf): + try: + torch.cuda.nvtx.range_push("infinilm_generate") + except Exception: + pass + try: + model.generate( + input_ids_infini, + GenerationConfig( + max_new_tokens=max_new_tokens, + temperature=1.0, + top_k=1, + top_p=1.0, + # Profiling: avoid per-step EOS checks + early stop variability. + stop_on_eos=False, + ), + _measure_and_log_time=True, + ) + finally: + try: + torch.cuda.nvtx.range_pop() + except Exception: + pass + elapsed = time.perf_counter() - start + stdout = buf.getvalue() + + prefill_ttft_ms = None + prefill_throughput = None + decode_itl_ms = None + decode_throughput = None + gen_completed_ms = None + for line in stdout.splitlines(): + if "Prefill TTFT:" in line: + m = re.search( + r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line + ) + if m: + prefill_ttft_ms = float(m.group(1)) + prefill_throughput = float(m.group(2)) + if "Decode" in line and "ITL:" in line: + m = re.search( + r"Decode\s+Avg ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line + ) + if m: + decode_itl_ms = float(m.group(1)) + decode_throughput = float(m.group(2)) + if "Generation completed in" in line: + m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) + if m: + gen_completed_ms = float(m.group(1)) + + return { + "backend": "infinilm", + "total_time_ms": round(elapsed * 1000, 2), + "input_tokens": len(input_ids), + "output_tokens": max_new_tokens, + "prefill_ttft_ms": prefill_ttft_ms, + "prefill_throughput_tok_s": prefill_throughput, + "decode_itl_ms": decode_itl_ms, + "decode_throughput_tok_s": decode_throughput, + "engine_reported_generation_ms": gen_completed_ms, + # Cache / attention 
configuration + "cache_mode": cache_mode, + "paged_block_size": paged_block_size if cache_mode == "paged" else None, + "enable_paged_attn": getattr(model, "enable_paged_attn", False), + "static_max_cache_len": max_cache_len if cache_mode != "paged" else None, + "paged_num_blocks": num_blocks if cache_mode == "paged" else None, + # Torch CUDA memory snapshots (bytes) + "torch_memory_allocated_before_cache": int(mem_before_cache), + "torch_memory_allocated_after_cache": int(mem_after_cache), + "torch_max_memory_allocated_before_cache": int(max_mem_before_cache), + "torch_max_memory_allocated_after_cache": int(max_mem_after_cache), + } + + +def run_infinilm(model_path: str, prompt: str, max_new_tokens: int, env=None): + """Run InfiniLM jiuge via subprocess and parse stdout for metrics.""" + run_env = {**os.environ, **(env or {})} + examples_dir = os.path.dirname(os.path.abspath(__file__)) + jiuge_py = os.path.join(examples_dir, "jiuge.py") + cmd = [ + sys.executable, + jiuge_py, + "--nvidia", + "--model_path", model_path, + "--prompt", prompt, + "--max_new_tokens", str(max_new_tokens), + ] + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, + env=run_env, + cwd=examples_dir, + ) + stdout = result.stdout or "" + if result.returncode != 0 and not stdout: + return {"backend": "infinilm", "error": (result.stderr or f"exit code {result.returncode}")[:500]} + except Exception as e: + return {"backend": "infinilm", "error": str(e)} + + # Parse jiuge / InferEngine output + prefill_ttft_ms = None + prefill_throughput = None + decode_itl_ms = None + decode_throughput = None + total_time_ms = None + for line in stdout.splitlines(): + if "Prefill TTFT:" in line: + m = re.search(r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) + if m: + prefill_ttft_ms = float(m.group(1)) + prefill_throughput = float(m.group(2)) + if "Decode" in line and "ITL:" in line: + m = re.search(r"Decode\s+Avg 
ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) + if m: + decode_itl_ms = float(m.group(1)) + decode_throughput = float(m.group(2)) + if "total_time:" in line: + m = re.search(r"total_time:\s*([\d.]+)\s*ms", line) + if m: + total_time_ms = float(m.group(1)) + if "Generation completed in" in line: + m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) + if m: + total_time_ms = float(m.group(1)) + + return { + "backend": "infinilm", + "total_time_ms": total_time_ms, + "prefill_ttft_ms": prefill_ttft_ms, + "prefill_throughput_tok_s": prefill_throughput, + "decode_itl_ms": decode_itl_ms, + "decode_throughput_tok_s": decode_throughput, + } + + +def run_sglang_client(sglang_url: str, prompt: str, max_new_tokens: int): + """Send one request to SGLang server and return metrics.""" + try: + import requests + except ImportError: + return {"backend": "sglang", "error": "requests not installed"} + + url = sglang_url.rstrip("/") + "/generate" + payload = { + "text": prompt, + "sampling_params": {"max_new_tokens": max_new_tokens, "temperature": 0}, + } + start = time.perf_counter() + try: + r = requests.post(url, json=payload, timeout=120) + r.raise_for_status() + data = r.json() + except Exception as e: + return {"backend": "sglang", "error": str(e)} + elapsed_ms = (time.perf_counter() - start) * 1000 + + # SGLang response may have "meta_info" with "completion_tokens" or we use prompt + output length + output_text = (data.get("text") or data.get("choices", [{}])[0].get("text") or "") + completion_tokens = data.get("meta_info", {}).get("completion_tokens") or data.get("usage", {}).get("completion_tokens") + if completion_tokens is None and "usage" in data: + completion_tokens = data["usage"].get("completion_tokens") + if completion_tokens is None: + completion_tokens = max_new_tokens # fallback + + return { + "backend": "sglang", + "total_time_ms": round(elapsed_ms, 2), + "output_tokens": completion_tokens, + "total_throughput_tok_s": 
round(completion_tokens / (elapsed_ms / 1000), 2) if elapsed_ms > 0 else None, + } + + +def main(): + parser = argparse.ArgumentParser(description="Compare MiniCPM-SALA inference speed: HF, InfiniLM, SGLang") + parser.add_argument("--model_path", required=True, help="Path to MiniCPM-SALA model dir") + parser.add_argument("--prompt", default="How are you", help="Prompt for generation") + parser.add_argument("--max_new_tokens", type=int, default=32, help="Max new tokens to generate") + parser.add_argument( + "--target_input_tokens", + type=int, + default=None, + help="If set, synthesize a long prompt so chat-templated input tokens >= this value (e.g. 65536).", + ) + parser.add_argument( + "--infinilm_cache_mode", + type=str, + default="paged", + choices=["paged", "static_fit", "static_maxpos"], + help="InfiniLM KV cache mode when running long prompts in-process.", + ) + parser.add_argument( + "--infinilm_paged_block_size", + type=int, + default=256, + help="Paged KV block size (tokens per block).", + ) + parser.add_argument( + "--infinilm_attn_backend", + type=str, + default="flash-attn", + help="InfiniLM attention backend (e.g. flash-attn or default).", + ) + parser.add_argument( + "--hf_attn_implementation", + type=str, + default="flash_attention_2", + help="HF attention implementation to request (e.g. 
flash_attention_2 or eager).", + ) + parser.add_argument( + "--hf_mode", + type=str, + default="generate", + choices=["generate", "forward_prefill", "decode_loop"], + help="HF run mode: generate() end-to-end, forward-only prefill, or manual decode_loop timing with KV cache.", + ) + parser.add_argument( + "--hf_forward_use_cache", + action="store_true", + help="In HF forward_prefill mode, pass use_cache=True (recommended).", + ) + parser.add_argument( + "--hf_forward_warmup", + type=int, + default=1, + help="Warmup iterations for HF forward_prefill.", + ) + parser.add_argument( + "--hf_forward_iters", + type=int, + default=1, + help="Measured iterations for HF forward_prefill (best-of).", + ) + parser.add_argument( + "--hf_decode_warmup", + type=int, + default=8, + help="Warmup steps for HF decode_loop (not timed).", + ) + parser.add_argument( + "--hf_decode_iters", + type=int, + default=1, + help="Measured iterations for HF decode_loop (best-of).", + ) + parser.add_argument("--sglang_url", default=None, help="SGLang server URL (e.g. http://127.0.0.1:30000); if set, query SGLang") + parser.add_argument("--backends", default="hf,infinilm", help="Comma-separated: hf,infinilm,sglang") + parser.add_argument("--output", default=None, help="Write JSON results to this path") + parser.add_argument("--no_hf", action="store_true", help="Skip HF (e.g. if no GPU memory for two models)") + parser.add_argument("--no_infinilm", action="store_true", help="Skip InfiniLM") + parser.add_argument( + "--prefill_16k", + action="store_true", + help="Convenience flag: set --target_input_tokens=16384 and --max_new_tokens=1 (prefill-dominated).", + ) + parser.add_argument( + "--infinilm_inprocess", + action="store_true", + help="Run InfiniLM in-process (no jiuge subprocess). 
Use when PYTHONPATH/LD_LIBRARY_PATH are set in this process.", + ) + args = parser.parse_args() + + backends = [b.strip() for b in args.backends.split(",")] + results = [] + + # Normalize convenience prefill-only configuration. + if args.prefill_16k: + if args.target_input_tokens is None: + args.target_input_tokens = 16384 + # For prefill-dominated comparisons, prefer HF forward-only by default. + if args.hf_mode == "generate": + args.hf_mode = "forward_prefill" + if args.max_new_tokens != 1: + args.max_new_tokens = 1 + + # If requested, build a long prompt once using HF tokenizer. + if args.target_input_tokens is not None: + try: + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) + long_prompt, actual = _make_prompt_with_target_tokens(tok, args.prompt, args.target_input_tokens) + args.prompt = long_prompt + print(f"[prompt] synthesized chat input tokens: {actual} (target >= {args.target_input_tokens})") + except Exception as e: + print(f"[prompt] failed to synthesize long prompt: {e}") + + if "hf" in backends and not args.no_hf: + try: + import torch + if args.hf_mode == "forward_prefill": + r = run_hf_forward_prefill( + args.model_path, + args.prompt, + attn_implementation=args.hf_attn_implementation, + use_cache=args.hf_forward_use_cache, + warmup=args.hf_forward_warmup, + iters=args.hf_forward_iters, + ) + elif args.hf_mode == "decode_loop": + r = run_hf_decode_loop( + args.model_path, + args.prompt, + args.max_new_tokens, + attn_implementation=args.hf_attn_implementation, + use_cache=True, + warmup=args.hf_decode_warmup, + iters=args.hf_decode_iters, + ) + else: + r = run_hf( + args.model_path, + args.prompt, + args.max_new_tokens, + attn_implementation=args.hf_attn_implementation, + ) + results.append(r) + except Exception as e: + results.append({"backend": "hf", "error": str(e)}) + + if "infinilm" in backends and not args.no_infinilm: + # In-process: when env is set in this process or 
--infinilm_inprocess, avoid jiuge subprocess. + # Also use in-process for long prompts (target_input_tokens) to avoid 2048-token truncation. + use_inprocess = args.infinilm_inprocess or args.target_input_tokens is not None + if use_inprocess: + try: + r = run_infinilm_inprocess( + args.model_path, + args.prompt, + args.max_new_tokens, + cache_mode=args.infinilm_cache_mode, # type: ignore[arg-type] + paged_block_size=args.infinilm_paged_block_size, + attn_backend=args.infinilm_attn_backend, + ) + except Exception as e: + r = {"backend": "infinilm", "error": str(e)} + else: + r = run_infinilm(args.model_path, args.prompt, args.max_new_tokens) + results.append(r) + + if "sglang" in backends and args.sglang_url: + r = run_sglang_client(args.sglang_url, args.prompt, args.max_new_tokens) + results.append(r) + elif "sglang" in backends and not args.sglang_url: + results.append({"backend": "sglang", "error": "No --sglang_url provided; start SGLang server with MiniCPM-SALA first"}) + + # Print table + print("\n" + "=" * 60) + print("MiniCPM-SALA inference speed comparison") + print("=" * 60) + print(f" prompt = {repr(args.prompt[:500])} max_new_tokens = {args.max_new_tokens}") + print() + for r in results: + if "error" in r: + print(f" {r['backend']}: ERROR {r['error']}") + continue + print(f" {r['backend']}:") + for k, v in r.items(): + if k == "backend" or v is None: + continue + if isinstance(v, float): + print(f" {k}: {v}") + else: + print(f" {k}: {v}") + print() + print("=" * 60) + + if args.output: + with open(args.output, "w") as f: + json.dump({"prompt": args.prompt, "max_new_tokens": args.max_new_tokens, "results": results}, f, indent=2) + print(f"Wrote {args.output}") + + +if __name__ == "__main__": + import os + main() diff --git a/examples/jiuge.py b/examples/jiuge.py index fa547435..1fcba6c4 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -252,9 +252,13 @@ def test( # ---------------------------------------------------------------------------- # # 
Create KVCache # ---------------------------------------------------------------------------- # + batch_size = 1 if isinstance(prompts, str) else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) + # MiniCPM-SALA uses per-layer dense KV cache in C++; engine cache_config drives + # scheduling only. Static cache is recommended (no paged bookkeeping) unless + # --enable-paged-attn is explicitly set. if enable_paged_attn: - batch_size = 1 if prompts is str else len(prompts) - max_total_tokens = max_new_tokens + len(input_ids_list[0]) + max_total_tokens = initial_capacity cache_config = PagedKVCacheConfig( num_blocks=( (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE @@ -263,10 +267,12 @@ def test( block_size=_PAGED_KV_BLOCK_SIZE, ) else: - batch_size = 1 if prompts is str else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) + max_cache_len = initial_capacity + if getattr(model.config, "model_type", None) == "minicpm_sala": + max_pos = getattr(model.config, "max_position_embeddings", 4096) + max_cache_len = max(initial_capacity, max_pos) cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=initial_capacity + max_batch_size=batch_size, max_cache_len=max_cache_len ) model.reset_cache(cache_config) diff --git a/examples/metrics_16k_prefill.md b/examples/metrics_16k_prefill.md new file mode 100644 index 00000000..2337fac0 --- /dev/null +++ b/examples/metrics_16k_prefill.md @@ -0,0 +1,152 @@ +### MiniCPM-SALA 16k long-prompt metrics (A/B cache modes) + +**Setup** + +- **Prompt construction**: `--target_input_tokens 16384` (actual synthesized **16386** chat-template tokens) +- **Workload**: `--max_new_tokens 1` (prefill-dominated) +- **Environment**: run via `scripts/run_compare_speed_in_container.sh` inside container `minicpm-sala` + +| backend | cache_mode | attn_backend | enable_paged_attn | cache sizing | prefill_ttft_ms | prefill_throughput_tok_s | total_time_ms | 
+|---|---|---|---:|---|---:|---:|---:| +| hf | — | — | — | — | — | 9325.01 | 1757.21 | +| infinilm | static_fit | default | False | static_max_cache_len=16387 | 33632.05 | 487.21 | 33632.29 | +| infinilm | static_maxpos | default | False | static_max_cache_len=524288 | 34067.49 | 480.99 | 34067.75 | +| infinilm | paged | default | True | paged_block_size=256, paged_num_blocks=65 | 35626.25 | 459.94 | 35627.10 | + +**Raw commands** + +```bash +./scripts/run_compare_speed_in_container.sh --backends hf --target_input_tokens 16384 --max_new_tokens 1 +./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_fit +./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_maxpos +./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode paged --infinilm_paged_block_size 256 +``` + +### Profiling methodology (nsys) for kernel attribution (HF vs InfiniLM prefill) + +**Goal**: attribute the 16k prefill gap to kernel families (attention vs GEMMs vs layout/copies/sync), using the same prompt and a prefill-dominated workload. + +**Environment**: all profiling commands in this section are run **inside the container `minicpm-sala`** (not on the host), so that PyTorch, InfiniCore, and the model path are available. Use `docker exec -it minicpm-sala bash` or the host script `./scripts/profile_prefill_torchprof_in_container.sh` to run in-container. + +**Workload** + +- HF: forward-only prefill (`--hf_mode forward_prefill`, `--max_new_tokens 1`) +- InfiniLM: prefill-dominated generation (`--target_input_tokens 16384 --max_new_tokens 1`) + +**Key requirements** + +- Use a free GPU to avoid allocator failures and noisy traces, e.g. `CUDA_VISIBLE_DEVICES=1`. 
+- Prefer `nsys stats` reports: + - `cuda_gpu_kern_sum` + - `cuda_gpu_mem_time_sum` + - `cuda_api_sum` + - `nvtx_sum` + +**Example (inside container `minicpm-sala`)** + +```bash +export CUDA_VISIBLE_DEVICES=1 +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +OUT=${REPO}/profiles +mkdir -p ${OUT} + +source /app/docker/nvidia/env-set.sh 2>/dev/null || true +export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} + +# HF forward-only prefill (single forward, best for kernel attribution) +nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ + -o ${OUT}/hf_forward_prefill_16k \ + python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ + --model_path "${MODEL}" --prefill_16k --backends hf \ + --hf_mode forward_prefill --hf_forward_use_cache \ + --hf_forward_warmup 1 --hf_forward_iters 1 \ + --hf_attn_implementation flash_attention_2 + +# InfiniLM prefill-dominated (max_new_tokens=1) +nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ + -o ${OUT}/infinilm_prefill_16k \ + python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ + --model_path "${MODEL}" --prefill_16k --backends infinilm \ + --infinilm_cache_mode static_fit --infinilm_attn_backend default + +# Summaries +nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_kern_sum.txt +nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_gpu_kern_sum.txt +nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_mem_time_sum.txt +nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_gpu_mem_time_sum.txt +nsys stats --report cuda_api_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > 
${OUT}/hf_forward_prefill_16k_cuda_api_sum.txt +nsys stats --report cuda_api_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_api_sum.txt +nsys stats --report nvtx_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_nvtx_sum.txt +nsys stats --report nvtx_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_nvtx_sum.txt +``` + +### Prefill kernel launch reduction: SiLU/SwiGLU evidence and change + +**Evidence that SiLU/SwiGLU contributed to launch count** + +- Prefill profiling (e.g. `profile_prefill_infinilm_torchprof.py` at seq_len=512) showed ~298k `cudaLaunchKernel` and many small **elementwise** kernels (~36k calls). The MLP path used two separate InfiniCore ops per layer for SwiGLU: + - `infinicore::op::silu_(gate, gate)` — one kernel per layer + - `infinicore::op::mul(gate, up)` — one kernel per layer +- With 32 layers that is **64 extra launches** from this pattern alone. InfiniCore provides a **fused** `swiglu(a, b)` (single kernel: `a * b * sigmoid(b)`), which matches SwiGLU as `silu(gate)*up` when called as `swiglu(up, gate)`. + +**Change applied** + +- **File**: `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp` +- **Before**: `silu_(gate, gate)` then `mul(gate, up)` (two kernel launches per layer). +- **After**: single `infinicore::op::swiglu(up, gate)` (one kernel per layer). +- **Effect**: 32 fewer kernel launches per prefill (one per layer). Re-run the same prefill profiler or nsys commands above and compare `cuda_api_sum` (e.g. `cudaLaunchKernel` count) and `cuda_gpu_kern_sum` to confirm. + +### Environment fix: run InfiniLM/InfiniCore with InfLLM-v2 without LD_PRELOAD (nsys-safe) + +When profiling with `nsys`, setting `LD_PRELOAD` to the `infllm_v2` extension can break `nsys` itself (loader errors from PyTorch's `libtorch_python.so`). To make `nsys profile ... 
python ...` work reliably, we preload the InfLLM-v2 `.so` **inside Python** (RTLD_GLOBAL) before importing `infinicore`, so that `libinfinicore_cpp_api.so` can resolve `mha_varlen_fwd` / `mha_fwd_kvcache` without using `LD_PRELOAD`. + +- **Note**: InfLLM-v2 is now linked normally via InfiniCore build; no Python-side preload helper is required. +- **Wired into scripts** (preload before `import infinicore`): + - `InfiniLM/examples/compare_inference_speed.py` + - `InfiniLM/examples/profile_prefill_infinilm_torchprof.py` + - `InfiniLM/examples/minicpm_sala_logits_sanity.py` + +This unblocks running both torchprof and `nsys profile` inside the `minicpm-sala` container with a consistent environment. + +### 16k prefill nsys numbers (post env-fix) + +**Workload:** `--prefill_16k` (prompt tokens 16386), `--max_new_tokens 1`, `--infinilm_cache_mode static_fit`, `--infinilm_attn_backend default` + +- **HF forward-only prefill** (from `compare_inference_speed.py`): `total_time_ms ≈ 1782.58` for 16386 tokens. +- **HF forward-only prefill (rerun)** (from `compare_inference_speed.py`): `total_time_ms = 1757.21`, `prefill_throughput_tok_s = 9325.01` for 16386 tokens. +- **InfiniLM prefill-dominated** (from `compare_inference_speed.py`): `prefill_ttft_ms ≈ 55646.11` (baseline run) and `prefill_ttft_ms ≈ 57623.64` (rerun after minor code changes). + +**InfiniLM 16k CUDA API summary** (nsys `cuda_api_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_api_sum.txt`): + +- `cudaLaunchKernel`: **3,147,266 calls** +- `cudaMemcpyAsync`: **394,155 calls** + +Top GPU kernels by time (nsys `cuda_gpu_kern_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_gpu_kern_sum.txt`) show very high call counts tied to the Lightning Simple GLA path: + +- Several `at::native::*elementwise_kernel*` entries at **393,264 instances each** (exactly `16386 * 24`), indicating a large per-token kernel launch budget in the current GLA implementation. 
+ +**Prefill profiling: run inside container `minicpm-sala`** + +All profiling commands below are intended to run **inside the container** (so PyTorch, InfiniCore, and the model are available). From the host you can either `docker exec -it minicpm-sala bash` and run the commands, or use the helper script that runs the torchprof prefill script in-container. + +- **Launch-count confirmation (torchprof, in-container)** + + From repo root on host: + + ```bash + ./scripts/profile_prefill_torchprof_in_container.sh + ``` + + Optional env: `SEQ_LEN=512` (default), `ACTIVE=1`, `MODEL_PATH`, `CUDA_VISIBLE_DEVICES`, `INFINILM_CUDA_INDEX`. The script prints `[launch_summary] cudaLaunchKernel_count=... cudaMemcpy_count=...` and the kernel table; compare after the SwiGLU fusion to confirm ~32 fewer launches per prefill. + + Or inside the container: + + ```bash + source /app/docker/nvidia/env-set.sh 2>/dev/null || true + export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} + cd ${REPO}/InfiniLM + INFINILM_CUDA_INDEX=0 python3 examples/profile_prefill_infinilm_torchprof.py --model_path "${MODEL}" --seq_len 512 --active 1 --out /tmp/torchprof_prefill_512.txt + ``` + +- **nsys prefill profiling** (see “Example (inside container minicpm-sala)” above) also runs in-container; use the same `REPO`, `MODEL`, `source env-set.sh`, and `PYTHONPATH` before `nsys profile` and `nsys stats`. diff --git a/examples/metrics_longtext_mem.md b/examples/metrics_longtext_mem.md new file mode 100644 index 00000000..28fe8f33 --- /dev/null +++ b/examples/metrics_longtext_mem.md @@ -0,0 +1,378 @@ +### MiniCPM-SALA long-context metrics + memory history + +**Goal**: record reproducible long-context runs with: + +- **time** (prefill TTFT / throughput) +- **peak GPU memory** (from 1s `nvidia-smi` polling) +- exact **command lines** and key env + +**Notes** + +- All commands are intended to run **inside** docker container `minicpm-sala`. 
+- Prefer an **idle** GPU (avoid indices that are already near full VRAM). Scan on the host (or `docker exec minicpm-sala nvidia-smi ...` if all GPUs are visible there): + `nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits` + Then set `export CUDA_VISIBLE_DEVICES=<idx>` for the run and, for scripts that poll VRAM (e.g. `collect_metrics_longtext_decode.py`), set `NVML_GPU_INDEX=<idx>` to the **same** index. Example when GPUs 2–4 are mostly free: `CUDA_VISIBLE_DEVICES=2` and `NVML_GPU_INDEX=2`. +- For InfiniLM + InfLLM-v2 builds, `libinfinicore_cpp_api.so` may require preloading `infllm_v2` with `RTLD_GLOBAL` before importing `infinicore`. + +### OOM-safe sweep: one case per process + +Running every long-context case in a **single** Python session can leave CUDA memory fragmented or peak across cases. Prefer **`run_longtext_metrics_cases.sh`**, which runs each `(backend × target × max_new)` as its **own** `python3 collect_metrics_longtext_decode.py --case ...` subprocess, appends one JSON line per row to `profiling_runs/longtext_decode_rows.jsonl`, then prints a markdown table via `--from-jsonl`. + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +export CUDA_VISIBLE_DEVICES=2 +export NVML_GPU_INDEX=2 +export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +export METRICS_DATE=2026-03-23 +cd $REPO/InfiniLM/examples +./run_longtext_metrics_cases.sh +``` + +Single case manually: `python3 collect_metrics_longtext_decode.py --case hf:16384` or `infinilm_rec:65536:1`. Monolithic (unsafe) full matrix: `python3 collect_metrics_longtext_decode.py --all-in-process`. 
+ +### Clean & Validate Status (post-cleanup: 2026-03-23) + +- Clean: removed unused debug helper `log_tensor_stats_to_file_if_enabled` and deprecated metrics padded-decode cases; `collect_metrics_longtext_decode.py` + `run_longtext_metrics_cases.sh` no longer sweep `infinilm_pad:*`. +- Validate: rebuilt `_infinilm`, ran `InfiniCore/test/infinicore/ops/test_simple_gla_decode_recurrent.py --nvidia`, `test_simple_gla_prefill.py --nvidia`, and `InfiniLM/examples/minicpm_sala_logits_sanity.py` in `prefill` + `decode1`; confirmed `collect_metrics_longtext_decode.py --case infinilm_pad:*` is rejected with an "Unknown case kind" error. + +--- + +## 2026-03-23 long-context + decode (`longtext_decode_rows.jsonl`) + +Subprocess sweep via `./run_longtext_metrics_cases.sh`. **GPU:** `CUDA_VISIBLE_DEVICES=0`, `NVML_GPU_INDEX=0`. **Targets:** `METRICS_TARGETS=16384,32768`. **Decode steps:** `METRICS_DECODE_STEPS=32`. Recurrent InfiniLM uses `INFINI_LIGHTNING_GLA_RECURRENT_DECODE=1` with batched GLA state sync (GEMM). HF `total_ms` is end-to-end (prefill + decode), matching InfiniLM. `hf (decode_loop)` rows for `max_new=32` are appended via `hf::32`. 
Regenerate this table: `python3 collect_metrics_longtext_decode.py --from-jsonl profiling_runs/longtext_decode_rows.jsonl` + + +| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | prefill_tok_s | decode_itl_ms | decode_tok_s | gpu | +| ---------- | ------------------------------------------------ | --------- | ------- | ------------ | -------- | --------------- | ------------- | ------------- | ------------ | --- | +| 2026-03-23 | hf (decode_loop) | 16384 | 1 | 38101 | 1821.53 | — | — | — | — | 0 | +| 2026-03-23 | hf (decode_loop) | 32768 | 1 | 51545 | 3711.99 | — | — | — | — | 0 | +| 2026-03-23 | hf (decode_loop) | 16384 | 32 | 38365 | 3435.09 | — | — | 52.24 | 19.14 | 0 | +| 2026-03-23 | hf (decode_loop) | 32768 | 32 | 41717 | 5247.77 | — | — | 52.90 | 18.90 | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 16384 | 1 | 33525 | 3162.11 | 3161.5 | 5182.98 | — | — | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 32768 | 1 | 44897 | 7139.12 | 7138.74 | 4590.45 | — | — | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 16384 | 32 | 33537 | 4111.32 | 3182.07 | 5149.48 | 29.94 | 33.4 | 0 | +| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 32768 | 32 | 44911 | 8357.39 | 7146.78 | 4585.28 | 39 | 25.64 | 0 | + + +--- + +## History table + + +| date | backend | target_input_tokens | max_new_tokens | cache_mode | peak_mem_mib | total_time_ms | prefill_ttft_ms | prefill_throughput_tok_s | gpu | +| ---------- | --------------------------------------------- | ------------------- | -------------- | ---------- | ------------ | ------------- | --------------- | ------------------------ | --- | +| 2026-03-18 | hf | 16384 | 1 | — | 38091 | 1757.21 | — | 9325.01 | 2 | +| 2026-03-19 | hf | 16384 | 1 | — | 38091 | 1760.08 | — | 9311.48 | 2 | +| 2026-03-18 | hf | 32768 | 1 | — | 41173 | 3537.65 | — | 9263.22 | 2 | +| 2026-03-19 | hf | 32768 | 1 | — | 41151 | 3516.06 | — | 9319.51 
| 2 | +| 2026-03-19 | infinilm(baseline) | 16384 | 1 | static_fit | 33570 | 2849.22 | 2849.03 | 5751.44 | 0 | +| 2026-03-19 | infinilm(baseline) | 32768 | 1 | static_fit | 44174 | 5960.41 | 5960.14 | 5498.19 | 0 | +| 2026-03-19 | infinilm(baseline) | 65536 | 1 | static_fit | 67195 | 13929.51 | 13929.12 | 4705.11 | 4 | +| 2026-03-19 | hf (consistent-batch) | 16384 | 1 | — | 38091 | 1782.63 | — | 9192.04 | 4 | +| 2026-03-19 | hf (consistent-batch) | 32768 | 1 | — | 41173 | 3585.96 | — | 9138.42 | 4 | +| 2026-03-19 | hf (consistent-batch) | 65536 | 1 | — | 47319 | 7426.98 | — | 8824.32 | 4 | +| 2026-03-19 | infinilm (consistent-batch) | 16384 | 1 | static_fit | 32605 | 2887.28 | 2887.06 | 5675.67 | 4 | +| 2026-03-19 | infinilm (consistent-batch) | 32768 | 1 | static_fit | 43209 | 6005.78 | 6005.57 | 5456.60 | 4 | +| 2026-03-19 | infinilm (consistent-batch) | 65536 | 1 | static_fit | 67195 | 13940.17 | 13939.90 | 4701.47 | 4 | +| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 32768 | 1 | static_fit | 38613 | 5993.70 | 5993.45 | 5467.64 | 4 | +| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 65536 | 1 | static_fit | 67195 | 13959.08 | 13958.78 | 4695.11 | 4 | +| 2026-03-19 | infinilm(baseline) | 131072 | 1 | static_fit | 79883 | OOM | — | — | 6 | +| 2026-03-18 | hf | 524288 | 1 | — | 59591 | OOM | — | — | 3 | +| 2026-03-18 | hf | 65536 | 1 | — | 47319 | 7340.99 | — | 8927.67 | 1 | +| 2026-03-18 | hf | 131072 | 1 | — | 61641 | 15290.39 | — | 8572.31 | 1 | +| 2026-03-18 | hf | 262144 | 1 | — | 80059 | OOM | — | — | 1 | + + +--- + +## 2026-03-19 consistent batch summary (GPU 4, 1s polling) + +Protocol used for both backends: + +- same physical GPU (`CUDA_VISIBLE_DEVICES=4`), same model, `max_new_tokens=1` +- same target lengths: 16k / 32k / 64k +- memory measured from 1s `nvidia-smi -i 4 --query-gpu=memory.used` polling +- HF path: `--hf_mode forward_prefill --hf_forward_use_cache --hf_forward_warmup 1 --hf_forward_iters 1` +- InfiniLM path: 
`--infinilm_inprocess --infinilm_cache_mode static_fit` + +### Growth deltas (16k->32k and 32k->64k) + +TTFT note: HF forward-prefill does not emit TTFT; `total_time_ms` is used as prefill-time proxy for HF deltas. + + +| backend | 16k->32k mem delta (MiB) | 32k->64k mem delta (MiB) | 16k->32k time delta (ms) | 32k->64k time delta (ms) | +| --------------------- | ------------------------ | ------------------------ | ------------------------ | ------------------------ | +| hf (forward-prefill) | +3082 | +6146 | +1803.33 | +3841.02 | +| infinilm (static_fit) | +10604 | +23986 | +3118.51 (TTFT) | +7934.33 (TTFT) | + + +### Attribution profiling (InfiniLM 32k / 64k) + +Artifacts are saved in `InfiniLM/examples/profiling_runs`: + +- allocator logs: `alloc_infinilm_32768_gpu4.log`, `alloc_infinilm_65536_gpu4.log` +- nsys logs: `nsys_infinilm_32768_gpu4.log`, `nsys_infinilm_65536_gpu4.log` + +Allocator observations (`INFINICORE_DEBUG_ALLOC=1`): + +- both runs show identical small/medium allocation patterns (e.g., many `32 MiB` and `128 MiB` class allocations), suggesting these are mostly fixed/runtime-structural. +- 64k introduces substantially larger "large" allocations than 32k (examples in logs include `12.0 GiB`, `9.0 GiB`, and `2.0 GiB`-class requests), consistent with context-length-driven persistent KV slab growth. +- 32k large allocations are present but markedly smaller (e.g., `~6.0 GiB`, `~4.5 GiB`, `~1.0 GiB`), aligning with lower persistent cache footprint. + +Nsight Systems observations (`nsys profile --trace=cuda,nvtx,osrt --stats=true`): + +- NVTX `infinilm_generate` range scales from `~6.18s` (32k) to `~14.17s` (64k), matching TTFT growth. 
+- CUDA API summary becomes more memcpy-dominated at 64k: + - 32k: `cudaMemcpy ~64.6%`, `cudaMemcpyAsync ~33.0%` + - 64k: `cudaMemcpy ~83.0%`, `cudaMemcpyAsync ~15.7%` +- GPU kernel summary shows both attention and GLA prefill kernels scaling up: + - `flash_fwd_kernel` total: `~1.03s` -> `~4.09s` + - `simple_gla_prefill_chunked_kernel` total: `~1.24s` -> `~2.45s` + +Attribution confidence: + +- **High**: persistent KV/cache-related allocations are the primary memory-growth driver from 32k to 64k. +- **Medium**: transient prefill compute/workspace growth contributes, but is secondary vs persistent slabs for memory. +- **Medium**: synchronization/memcpy behavior is a major TTFT growth contributor at 64k. + +### Short-context decode profiling (Nsight Systems, vs HF) + +**Artifacts** (under `InfiniLM/examples/profiling_runs/`): + +- HF manual decode: `nsys_decode_hf_tok256_gpu4.log` (`--hf_mode decode_loop`, short prompt, `max_new_tokens=256`). +- InfiniLM generate: `nsys_decode_infinilm_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_opt_tok256_gpu4.log` (same prompt / 256 new tokens; NVTX ranges from `infer_engine.generate`). +- Post–`write_i32`/`write_i64` rebuild (2026-03-20, GPU 4): `nsys_decode_infinilm_tok256_gpu4_pybind_run.log` (failed: stale `_infinicore` without `write_i32`), `nsys_decode_infinilm_tok256_gpu4_pybind_run2.log` + `decode_infinilm_tok256_gpu4_pybind_run2.nsys-rep` (**good** after `install.py` + `xmake build/install _infinicore` in container). Script `compare_inference_speed.py` preloads InfLLM-v2 (`RTLD_GLOBAL`) so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd`; bare `python -c import infinicore` without that preload can show an undefined-symbol error. + +**NVTX (InfiniLM)** — use these ranges in the Nsight UI / `nsys stats` to isolate prefill vs steady decode: + +- `infinilm_prefill_step` — first `generate` iteration. +- `infinilm_decode_total` — spans decode iterations 1..N-1 (opened on iter 1). 
+- `infinilm_decode_step` — one range per token step (high instance count). +- `infinilm_generate` — full `engine.generate()` call. + +**HF**: `hf_decode_loop` wraps the timed decode loop (prefill is outside this range). + +**Headline comparison** (same GPU, 256 decode steps, short prompt; numbers from the logs above): + + +| Metric (CUDA API sum) | HF `decode_loop` | InfiniLM `generate` | +| ------------------------ | ------------------- | ------------------- | +| `cudaLaunchKernel` calls | ~593k | ~7.44M | +| ~calls / decode step | ~2.3k | ~29k | +| `cudaMemcpyAsync` calls | lower than InfiniLM | ~988k | + + +**Memcpy time** (`cuda_gpu_mem_time_sum`): InfiniLM decode shows large **H2D** wall share (~63% of memcpy time in one run) with **many** small transfers; HF decode shows **fewer** H2D operations but they can dominate memcpy time when they occur. + +**Interpretation**: InfiniLM short decode is limited less by a single kernel and more by **per-step framework overhead** (launch count + small copies). Next wins are structural (fewer launches per token, true decode KV path, graph/capture where safe), not scalar metadata alone. + +**Continuing profiling — repro commands** (inside `minicpm-sala`, pick idle `GPU`; outputs go to `profiling_runs/`): + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +GPU=4 +export CUDA_VISIBLE_DEVICES=$GPU +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniLM/examples + +TAG=decode_infinilm_tok256_gpu${GPU} +nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ + python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --prompt "Write a short haiku about GPUs." 
\ + --max_new_tokens 256 \ + --backends infinilm \ + --no_hf \ + --infinilm_inprocess \ + --infinilm_cache_mode static_fit \ + 2>&1 | tee profiling_runs/nsys_${TAG}.log + +# Optional (InfiniLM decode): reduce D2H / Python overhead and A/B CPU metadata tensors +# export INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE=1 +# export INFINI_PROFILE_COLLECT_OUTPUT_IDS=0 +# export INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 # force per-step from_list() metadata vs reusable CPU+write_i* fast path + +TAG=decode_hf_tok256_gpu${GPU} +nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ + python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --prompt "Write a short haiku about GPUs." \ + --max_new_tokens 256 \ + --backends hf \ + --no_infinilm \ + --hf_mode decode_loop \ + --hf_decode_warmup 8 \ + --hf_decode_iters 1 \ + --hf_attn_implementation flash_attention_2 \ + 2>&1 | tee profiling_runs/nsys_${TAG}.log +``` + +**Long-context decode** (optional): add e.g. `--target_input_tokens 32768` to either command so NVTX still tags prefill vs decode; expect traces to be large. + +**Prefill-only nsys** (matches earlier 32k/64k attribution): + +```bash +TAG=infinilm_prefill_32768_gpu${GPU} +nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ + python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --target_input_tokens 32768 \ + --max_new_tokens 1 \ + --backends infinilm \ + --no_hf \ + --infinilm_inprocess \ + --infinilm_cache_mode static_fit \ + 2>&1 | tee profiling_runs/nsys_${TAG}.log +``` + +After code changes (e.g. pybind metadata path), re-run the **same** `TAG` with a suffix (`_run2`) and diff `cuda_api_sum` / `cuda_gpu_kern_sum` / NVTX tables. + +### Ranked next optimization experiments (minimal changes) + +1. **Constrain/reshape persistent KV growth first** +Expected impact: High memory reduction, likely best leverage on 32k->64k slope. 
+Minimal experiment: compare `static_fit` vs `paged` (small block sizes, e.g., 128/256) at 32k/64k and re-measure peaks + TTFT. +2. **Reduce transient prefill movement/workspace** +Expected impact: Medium TTFT gain, small-to-medium memory relief. +Minimal experiment: isolate `simple_gla_prefill` transform/workspace path and reduce extra copies/format conversions; confirm via reduced `cudaMemcpy` share in nsys. +3. **Trim synchronization/copy overhead around prefill** +Expected impact: Medium TTFT gain at long context. +Minimal experiment: profile before/after removing avoidable sync points or host-device transfers in attention/prefill orchestration; success criterion is lower `cudaMemcpy` wall share with unchanged logits. + +Applied (2026-03-19): removed `permute(...)->contiguous()` materialization for KV cache update and GLA prefill inputs in `minicpm_sala_attention.cpp` (pass strided views). +Result: 32k peak memory improved on GPU 4 (**43209 MiB → 38613 MiB**) with similar TTFT; 64k peak unchanged (dominated by persistent KV slabs). + +Validation gate for each experiment: + +- **Operator unit tests (CUDA) first** — InfLLM-v2 + Simple GLA prefill (see below). Failing ops almost always mean wasted time on full-model logits debugging. +- run `minicpm_sala_logits_sanity.py` (prefill mode) and compare ratio/max_diff/mean_diff against current baseline. +- run one prompt generation sanity and verify no functional regression. + +--- + +## Commands (repro) + +### InfiniCore operator tests (run before logits sanity) + +MiniCPM-SALA stack depends on `infllmv2_varlen` / `infllmv2_kvcache` and `simple_gla_prefill`. 
Run these inside `minicpm-sala` with `InfiniLM/python` on `PYTHONPATH` so InfLLM-v2 preloads before `import infinicore`: + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +export CUDA_VISIBLE_DEVICES=1 +export PYTHONPATH=$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniCore/test/infinicore/ops + +python3 test_infllmv2_attention.py --nvidia +python3 test_simple_gla_prefill.py --nvidia +``` + +One-liner wrapper (same env assumptions as the repo): + +```bash +bash $REPO/InfiniLM/examples/run_infinicore_ops_before_logits.sh +``` + +### Logits correctness gate (HF vs InfiniLM) + +Run (inside `minicpm-sala`) to sanity-check HF vs InfiniLM prefill logits on a short prompt: + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +export CUDA_VISIBLE_DEVICES=1 +export HF_CUDA_INDEX=0 +export INFINILM_CUDA_INDEX=0 +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniLM/examples + +python3 minicpm_sala_logits_sanity.py \ + --model_path "$MODEL" \ + --mode prefill \ + --prompt "How are you? Tell me a short joke." \ + --k 10 +``` + +Recorded output (2026-03-18, GPU=1): + +```text +SANITY_ONELINE ratio=0.9889 max_diff=0.1875 mean_diff=0.0682 +``` + +`--mode decode1` (prefill + one decode step): **prefill section** should match the prefill-only run. The **decode** section should now be finite (the previous `NaN` issue was traced to the CUDA embedding kernel leaving outputs uninitialized for out-of-range indices). Correctness can still diverge from HF for longer prompts due to decode/KV/attention parity work; treat **prefill** as the strongest HF parity gate for now. 
+ +### GPU scan + +```bash +nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits +``` + +### HF-only prefill (32k) with 1s memory polling + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +export CUDA_VISIBLE_DEVICES=2 +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +cd $REPO/InfiniLM/examples + +python3 compare_inference_speed.py \ + --model_path "$MODEL" \ + --target_input_tokens 32768 \ + --max_new_tokens 1 \ + --backends hf \ + --hf_mode forward_prefill \ + --hf_forward_use_cache \ + --hf_forward_warmup 1 \ + --hf_forward_iters 1 \ + --hf_attn_implementation flash_attention_2 \ + & pid=$! + +echo "[mem] polling physical GPU 2 while pid=$pid" +while kill -0 $pid 2>/dev/null; do + date +"%F %T" + nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits + sleep 1 +done +wait $pid +``` + +### InfiniLM-only (32k) with InfLLM-v2 preload + 1s memory polling + +```bash +REPO=/home/zenghua/workspace/minicpm-sala-support +export MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA +export CUDA_VISIBLE_DEVICES=2 +export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} +export LD_LIBRARY_PATH=/root/.infini/lib:$REPO/InfiniLM/build/linux/x86_64/release:${LD_LIBRARY_PATH:-} +cd $REPO/InfiniLM/examples + +python3 - <<'PY' & pid=$! 
+import ctypes, os, runpy, sys +ctypes.CDLL("/usr/local/lib/python3.12/dist-packages/infllm_v2/C.cpython-312-x86_64-linux-gnu.so", mode=ctypes.RTLD_GLOBAL) +sys.argv = [ + "compare_inference_speed.py", + "--model_path", os.environ["MODEL"], + "--target_input_tokens", "32768", + "--max_new_tokens", "1", + "--backends", "infinilm", + "--no_hf", + "--infinilm_inprocess", + "--infinilm_cache_mode", "static_fit", +] +runpy.run_path("compare_inference_speed.py", run_name="__main__") +PY + +echo "[mem] polling physical GPU 2 while pid=$pid" +while kill -0 $pid 2>/dev/null; do + date +"%F %T" + nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits + sleep 1 +done +wait $pid +``` + diff --git a/examples/run_infinicore_ops_before_logits.sh b/examples/run_infinicore_ops_before_logits.sh new file mode 100755 index 00000000..5a93fe11 --- /dev/null +++ b/examples/run_infinicore_ops_before_logits.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# InfiniCore CUDA operator smoke tests for MiniCPM-SALA-related ops. +# Run inside minicpm-sala docker before deeper HF-vs-InfiniLM alignment probes. 
+set -euo pipefail + +REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" +export PYTHONPATH="$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-}" +export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" + +OPS_DIR="$REPO/InfiniCore/test/infinicore/ops" +cd "$OPS_DIR" + +echo "[run_infinicore_ops] REPO=$REPO" +echo "[run_infinicore_ops] test_infllmv2_attention.py --nvidia" +python3 test_infllmv2_attention.py --nvidia +echo "[run_infinicore_ops] test_simple_gla_prefill.py --nvidia" +python3 test_simple_gla_prefill.py --nvidia +echo "[run_infinicore_ops] OK" diff --git a/examples/run_longtext_metrics_cases.sh b/examples/run_longtext_metrics_cases.sh new file mode 100755 index 00000000..dd595c7b --- /dev/null +++ b/examples/run_longtext_metrics_cases.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Run each longtext/decode metric case in a **separate** Python process to release CUDA +# memory between runs (reduces OOM when sweeping 16k/32k/64k × HF + InfiniLM). 
+# +# Usage (inside minicpm-sala, after picking an idle GPU): +# export CUDA_VISIBLE_DEVICES=2 +# export NVML_GPU_INDEX=2 +# export REPO=/home/zenghua/workspace/minicpm-sala-support +# export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python +# export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} +# export METRICS_DATE=2026-03-23 +# cd $REPO/InfiniLM/examples && ./run_longtext_metrics_cases.sh +# +# Optional: +# METRICS_TARGETS=16384,32768 METRICS_DECODE_STEPS=32 ./run_longtext_metrics_cases.sh +# SLEEP_BETWEEN_SEC=3 # extra pause between subprocesses + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" +export PYTHONPATH="${SCRIPT_DIR}:${REPO}/InfiniCore/python:${REPO}/InfiniLM/python:${PYTHONPATH:-}" +export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" + +: "${CUDA_VISIBLE_DEVICES:=0}" +: "${NVML_GPU_INDEX:=${CUDA_VISIBLE_DEVICES}}" +: "${METRICS_DATE:=2026-03-23}" +: "${METRICS_DECODE_STEPS:=32}" +: "${METRICS_TARGETS:=16384,32768,65536}" +: "${SLEEP_BETWEEN_SEC:=2}" + +OUT_JSONL="${OUT_JSONL:-${SCRIPT_DIR}/profiling_runs/longtext_decode_rows.jsonl}" +mkdir -p "$(dirname "$OUT_JSONL")" +rm -f "$OUT_JSONL" +echo "[run_longtext_metrics] jsonl -> $OUT_JSONL GPU smi index=$NVML_GPU_INDEX" + +IFS=',' read -r -a TARGETS <<< "$METRICS_TARGETS" + +run_one() { + local c="$1" + echo "[run_longtext_metrics] case=$c" + python3 collect_metrics_longtext_decode.py --case "$c" --append-jsonl "$OUT_JSONL" || true + sleep "${SLEEP_BETWEEN_SEC}" +} + +for t in "${TARGETS[@]}"; do + run_one "hf:${t}" +done +for t in "${TARGETS[@]}"; do + run_one "infinilm_rec:${t}:1" +done +for t in "${TARGETS[@]}"; do + run_one "infinilm_rec:${t}:${METRICS_DECODE_STEPS}" +done + +echo "[run_longtext_metrics] merged table:" +python3 collect_metrics_longtext_decode.py --from-jsonl "$OUT_JSONL" diff --git a/include/infinicore_infer/cache.h 
b/include/infinicore_infer/cache.h index 522f2235..5f691c64 100644 --- a/include/infinicore_infer/cache.h +++ b/include/infinicore_infer/cache.h @@ -3,6 +3,11 @@ #include +#ifndef __INFINI_C +// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. +#define __INFINI_C __C +#endif + __INFINI_C __export struct KVCache *createKVCache( size_t nlayers, size_t max_len, diff --git a/include/infinicore_infer/weights_loader.h b/include/infinicore_infer/weights_loader.h index 82eafe59..057c3a1b 100644 --- a/include/infinicore_infer/weights_loader.h +++ b/include/infinicore_infer/weights_loader.h @@ -3,6 +3,11 @@ #include +#ifndef __INFINI_C +// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. +#define __INFINI_C __C +#endif + struct ModelWeights; __INFINI_C __export void diff --git a/python/infinilm/auto_config.py b/python/infinilm/auto_config.py index 7e2d4afd..b6f96ff5 100644 --- a/python/infinilm/auto_config.py +++ b/python/infinilm/auto_config.py @@ -27,6 +27,8 @@ def from_pretrained(model_path): return LlamaConfig(**config_dict) elif config_dict["model_type"] == "minicpm": return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "minicpm_sala": + return LlamaConfig(**config_dict) elif config_dict["model_type"] == "fm9g": return LlamaConfig(**config_dict) elif config_dict["model_type"] == "fm9g7b": diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index a67add6f..6552227c 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -1,4 +1,5 @@ import time +import os from dataclasses import dataclass import infinicore @@ -78,9 +79,7 @@ def forward( try: # TODO: Remove `_underlying` and simplify the corresponding code. 
input_ids = input_ids._underlying if input_ids is not None else None - position_ids = ( - position_ids._underlying if position_ids is not None else None - ) + position_ids = position_ids._underlying if position_ids is not None else None past_kv_lengths = ( past_kv_lengths._underlying if past_kv_lengths is not None else None ) @@ -134,6 +133,7 @@ def generate( eos_token_id = generation_config.eos_token_id past_seq_len = 0 + output_ids = [] initial_batch_size, initial_seqlen = input_ids.shape[:2] seq_len = initial_seqlen @@ -164,6 +164,42 @@ def generate( dtype=infinicore.int32, ) + # Decode metadata fast path (batch=1, static cache): + # avoid per-step from_list()/numpy allocations for tiny scalar tensors. + # Those tensors live on CPU and are H2D-copied each forward; for profiling + # comparisons vs `from_list` device metadata, set: + # INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 + disable_fast_decode_meta = os.environ.get( + "INFINI_PROFILE_DISABLE_FAST_DECODE_META", "0" + ) not in ("", "0", "false", "False") + fast_decode_meta = ( + (not self.enable_paged_attn) + and (initial_batch_size == 1) + and not disable_fast_decode_meta + ) + if fast_decode_meta: + cpu = infinicore.device("cpu", 0) + + # Reusable metadata tensors; values updated via pybind write_i32/write_i64. 
+ position_ids_decode = infinicore.empty( + [1, 1], dtype=infinicore.int64, device=cpu + ) + past_kv_lengths_decode = infinicore.empty( + [1], dtype=infinicore.int32, device=cpu + ) + total_kv_lengths_decode = infinicore.empty( + [1], dtype=infinicore.int32, device=cpu + ) + cu_seqlens_decode = infinicore.empty( + [2], dtype=infinicore.int32, device=cpu + ) + input_offsets_decode = infinicore.empty( + [2], dtype=infinicore.int32, device=cpu + ) + input_offsets_decode.write_i32(0, 0) + input_offsets_decode.write_i32(1, 1) + + decode_total_open = False for iter in range(0, generation_config.max_new_tokens): if _measure_and_log_time: start_time = time.perf_counter() @@ -203,29 +239,54 @@ def generate( dtype=infinicore.int64, ) else: - position_ids = infinicore.from_list( - [ - list(range(past_seq_len, past_seq_len + seq_len)) - for _ in range(batch_size) - ], - dtype=infinicore.int64, - ) + if fast_decode_meta and iter > 0 and batch_size == 1 and seq_len == 1: + position_ids_decode.write_i64(0, int(past_seq_len)) + past_kv_lengths_decode.write_i32(0, int(past_seq_len)) + total_kv_lengths_decode.write_i32(0, int(past_seq_len + seq_len)) + cu_seqlens_decode.write_i32(0, 0) + cu_seqlens_decode.write_i32(1, int(past_seq_len + seq_len)) + position_ids = position_ids_decode + past_kv_lengths = past_kv_lengths_decode + total_kv_lengths = total_kv_lengths_decode + cu_seqlens = cu_seqlens_decode + input_offsets = input_offsets_decode + else: + position_ids = infinicore.from_list( + [ + list(range(past_seq_len, past_seq_len + seq_len)) + for _ in range(batch_size) + ], + dtype=infinicore.int64, + ) + past_kv_lengths = infinicore.from_list( + [past_seq_len] * batch_size, dtype=infinicore.int32 + ) + total_kv_lengths = infinicore.from_list( + [past_seq_len + seq_len] * batch_size, dtype=infinicore.int32 + ) + cu_seqlens = infinicore.from_list( + [(past_seq_len + seq_len) * i for i in range(batch_size + 1)], + dtype=infinicore.int32, + ) + input_offsets = infinicore.from_list( + 
[seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int32 + ) slot_mapping = None - - past_kv_lengths = infinicore.from_list( - [past_seq_len] * batch_size, dtype=infinicore.int32 - ) - total_kv_lengths = infinicore.from_list( - [past_seq_len + seq_len] * batch_size, dtype=infinicore.int32 - ) - cu_seqlens = infinicore.from_list( - [(past_seq_len + seq_len) * i for i in range(batch_size + 1)], - dtype=infinicore.int32, - ) - input_offsets = infinicore.from_list( - [seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int32 - ) + if self.enable_paged_attn: + past_kv_lengths = infinicore.from_list( + [past_seq_len] * batch_size, dtype=infinicore.int32 + ) + total_kv_lengths = infinicore.from_list( + [past_seq_len + seq_len] * batch_size, dtype=infinicore.int32 + ) + cu_seqlens = infinicore.from_list( + [(past_seq_len + seq_len) * i for i in range(batch_size + 1)], + dtype=infinicore.int32, + ) + input_offsets = infinicore.from_list( + [seq_len * i for i in range(batch_size + 1)], dtype=infinicore.int32 + ) output_id = self( input_ids=input_ids, @@ -240,7 +301,6 @@ def generate( top_k=generation_config.top_k, top_p=generation_config.top_p, ) - output_ids.append(output_id) if ( diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 7b6ceea4..e07e2155 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -95,8 +95,15 @@ def __init__(self, config: EngineConfig): ) # Load model weights + dtype_map = { + "float16": infinicore.float16, + "bfloat16": infinicore.bfloat16, + "float32": infinicore.float32, + } load_model_state_dict_by_file( - self.model_engine, config.model_path, dtype=self.model_engine.config.dtype + self.model_engine, + config.model_path, + dtype=dtype_map.get(config.dtype, self.model_engine.config.dtype), ) # Initialize tokenizer @@ -371,6 +378,7 @@ def apply_chat_template( conversation=messages, add_generation_prompt=add_generation_prompt, tokenize=False, + continue_final_message=not 
add_generation_prompt, **chat_template_kwargs, ) diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py index de4d9d35..25d64ae5 100644 --- a/python/infinilm/llm/static_scheduler.py +++ b/python/infinilm/llm/static_scheduler.py @@ -4,6 +4,7 @@ import logging import queue +import os import janus from typing import List, Optional @@ -60,9 +61,17 @@ def build_model_inputs( tokens = req.get_input_tokens() prefix_hit_len = self.prefix_hit_len input_tokens = tokens[prefix_hit_len:] - input_ids = [input_tokens] - position_ids = [list(range(prefix_hit_len, len(tokens)))] - past_kv_len = prefix_hit_len + if len(input_tokens) == 0: + # Full prefix hit: avoid empty tensor conversion in model input path. + # Recompute the last prompt token as a one-token prefill step. + input_tokens = [tokens[-1]] + input_ids = [input_tokens] + position_ids = [[len(tokens) - 1]] + past_kv_len = len(tokens) - 1 + else: + input_ids = [input_tokens] + position_ids = [list(range(prefix_hit_len, len(tokens)))] + past_kv_len = prefix_hit_len total_kv_len = len(tokens) input_offsets = [0, len(input_tokens)] else: @@ -106,6 +115,11 @@ def __init__(self, max_cache_len: int = 4096): self.max_cache_len = max_cache_len self.cached_block_hashes: List[int] = [] self.pending_block_hashes: List[int] = [] + # Safety switch: disable cross-request prefix reuse when investigating + # corrupted/contaminated generations. 
+ self.disable_prefix_reuse = os.getenv( + "INFINILM_STATIC_DISABLE_PREFIX_REUSE", "0" + ) in ("1", "true", "True", "yes", "on") def add_request(self, request: InferenceRequest): if request is not None: @@ -205,6 +219,8 @@ def schedule(self) -> Optional[StaticSchedulerOutput]: num_full_blocks = prompt_len // _BLOCK_SIZE matched = 0 + if self.disable_prefix_reuse and self.cached_block_hashes: + self.cached_block_hashes.clear() self.pending_block_hashes.clear() for i in range(num_full_blocks): diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 1d21f2d9..17a5fe58 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -1,4 +1,6 @@ import os +import json +import math from typing import Dict, Union import time import torch @@ -93,7 +95,8 @@ def load_state_dict( ) for k in f.keys(): - state_dict[k] = f.get_tensor(k).to(device=device) + # Explicitly cast dtype: some ops (e.g. embedding) may not support BF16 on all backends. + state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) return state_dict @@ -152,6 +155,35 @@ def load_model_state_dict_by_file( torch_dtype = infinicore.utils.to_torch_dtype(dtype) model_keys = model.state_dict_keyname() + # MiniCPM-style scaling (used by MiniCPM / FM9G; also applies to MiniCPM-SALA checkpoints). + # This matches `InfiniLM/scripts/jiuge.py` weight scaling behavior. 
+ scale_input = 1.0 + scale_output = 1.0 + scale_o = 1.0 + scale_down = 1.0 + scale_lm_head = 1.0 + try: + with open(os.path.join(model_path, "config.json")) as f: + cfg = json.load(f) + if ( + cfg.get("model_type") in ["fm9g", "minicpm", "minicpm_sala"] + and "scale_emb" in cfg + and "scale_depth" in cfg + ): + scale_input = float(cfg["scale_emb"]) + scale_o = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) + scale_down = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) + if cfg.get("model_type") in ["fm9g", "minicpm"] and "dim_model_base" in cfg: + scale_output = float(int(cfg["hidden_size"]) // int(cfg["dim_model_base"])) + if cfg.get("model_type") == "minicpm_sala" and "dim_model_base" in cfg and "hidden_size" in cfg: + scale_lm_head = float(cfg["dim_model_base"]) / float(cfg["hidden_size"]) + # minicpm_sala: only bake embed and lm_head; residual scaling done at forward in C++ + if cfg.get("model_type") == "minicpm_sala": + scale_o = 1.0 + scale_down = 1.0 + except Exception: + pass + already_loaded_keys = [] file_list = glob.glob(os.path.join(model_path, "*.safetensors")) @@ -167,6 +199,24 @@ def load_model_state_dict_by_file( ) already_loaded_keys.extend(model_param.keys()) + # Apply MiniCPM scaling to loaded tensors (in torch space). 
+ if scale_input != 1.0 and "model.embed_tokens.weight" in model_param: + model_param["model.embed_tokens.weight"] = ( + model_param["model.embed_tokens.weight"] * scale_input + ) + if scale_output != 1.0 and "model.norm.weight" in model_param: + model_param["model.norm.weight"] = ( + model_param["model.norm.weight"] * scale_output + ) + if scale_o != 1.0 or scale_down != 1.0: + for k, v in list(model_param.items()): + if scale_o != 1.0 and k.endswith(".self_attn.o_proj.weight"): + model_param[k] = v * scale_o + elif scale_down != 1.0 and k.endswith(".mlp.down_proj.weight"): + model_param[k] = v * scale_down + if scale_lm_head != 1.0 and "lm_head.weight" in model_param: + model_param["lm_head.weight"] = model_param["lm_head.weight"] * scale_lm_head + # --------------------------------------------------------- # # model_param_infini references torch.Tensor # --------------------------------------------------------- # @@ -180,6 +230,19 @@ def load_model_state_dict_by_file( file_path = os.path.join(model_path, "pytorch_model.bin") model_params = torch.load(file_path, weights_only=True, map_location="cpu") + if scale_input != 1.0 and "model.embed_tokens.weight" in model_params: + model_params["model.embed_tokens.weight"] = model_params["model.embed_tokens.weight"] * scale_input + if scale_output != 1.0 and "model.norm.weight" in model_params: + model_params["model.norm.weight"] = model_params["model.norm.weight"] * scale_output + if scale_o != 1.0 or scale_down != 1.0: + for k, v in list(model_params.items()): + if scale_o != 1.0 and k.endswith(".self_attn.o_proj.weight"): + model_params[k] = v * scale_o + elif scale_down != 1.0 and k.endswith(".mlp.down_proj.weight"): + model_params[k] = v * scale_down + if scale_lm_head != 1.0 and "lm_head.weight" in model_params: + model_params["lm_head.weight"] = model_params["lm_head.weight"] * scale_lm_head + model_param_infini = {} for key in model_params.keys(): model_param_infini[key] = infinicore.from_torch( diff --git 
a/python/infinilm/server/chat_message_normalize.py b/python/infinilm/server/chat_message_normalize.py new file mode 100644 index 00000000..04afe176 --- /dev/null +++ b/python/infinilm/server/chat_message_normalize.py @@ -0,0 +1,76 @@ +"""Normalize OpenAI-style chat messages before HuggingFace chat_template. + +Kept separate from ``inference_server`` so this logic can be smoke-tested without +loading InfiniCore / CUDA (see ``__main__`` block). +""" + + +def normalize_openai_messages_for_hf_template(messages: list) -> list: + """Strip lm-eval ``type: text`` wrappers; flatten multimodal text parts. + + lm-eval ``local-chat-completions`` with ``tokenized_requests=False`` JSON-encodes + each turn with an extra top-level ``"type": "text"`` (see ``TemplateAPI.apply_chat_template`` + in lm-eval). HuggingFace ``--model hf`` passes plain ``{role, content}`` dicts into + ``apply_chat_template``. Stripping unknown keys keeps server templating aligned with + the HF harness for text-only tasks. + """ + normalized: list = [] + for msg in messages: + if not isinstance(msg, dict): + normalized.append(msg) + continue + + role = msg.get("role") + if role is None: + normalized.append(msg) + continue + + content = msg.get("content") + if isinstance(content, list): + text_parts: list[str] = [] + for part in content: + if isinstance(part, dict): + if part.get("type") == "text" and "text" in part: + text_parts.append(part["text"]) + elif isinstance(part, str): + text_parts.append(part) + elif isinstance(part, str): + text_parts.append(part) + merged = "".join(text_parts) if text_parts else "" + core = {"role": role, "content": merged} + if msg.get("name") is not None: + core["name"] = msg["name"] + normalized.append(core) + elif isinstance(content, str): + core = {"role": role, "content": content} + if msg.get("name") is not None: + core["name"] = msg["name"] + normalized.append(core) + else: + normalized.append(msg) + + return normalized + + +if __name__ == "__main__": + # Smoke test (no 
InfiniCore): run as + # python3 -m infinilm.server.chat_message_normalize + lm_eval_style = [ + {"role": "system", "content": "sys", "type": "text"}, + {"role": "user", "content": "hi", "type": "text"}, + ] + out = normalize_openai_messages_for_hf_template(lm_eval_style) + assert out == [{"role": "system", "content": "sys"}, {"role": "user", "content": "hi"}], out + mm = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "a"}, + {"type": "text", "text": "b"}, + ], + } + ] + assert normalize_openai_messages_for_hf_template(mm) == [ + {"role": "user", "content": "ab"} + ] + print("chat_message_normalize: ok") diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index b5c49247..8c361c4e 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -17,6 +17,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from infinilm.llm import AsyncLLMEngine, SamplingParams, FinishReason +from infinilm.server.chat_message_normalize import normalize_openai_messages_for_hf_template logger = logging.getLogger(__name__) @@ -266,37 +267,8 @@ async def list_models_legacy(): return _models_payload() def _normalize_messages(self, messages: list) -> list: - """Normalize messages to handle multimodal content (list format). - - Converts content from list format [{"type": "text", "text": "..."}] - to string format for chat template compatibility. 
- """ - normalized = [] - for msg in messages: - if not isinstance(msg, dict): - normalized.append(msg) - continue - - content = msg.get("content") - if isinstance(content, list): - # Extract text from multimodal content list - text_parts = [] - for part in content: - if isinstance(part, dict): - if part.get("type") == "text" and "text" in part: - text_parts.append(part["text"]) - elif isinstance(part, str): - text_parts.append(part) - elif isinstance(part, str): - text_parts.append(part) - # Join all text parts - normalized_msg = msg.copy() - normalized_msg["content"] = "".join(text_parts) if text_parts else "" - normalized.append(normalized_msg) - else: - normalized.append(msg) - - return normalized + """Delegate to :func:`normalize_openai_messages_for_hf_template`.""" + return normalize_openai_messages_for_hf_template(messages) def _build_sampling_params(self, data: dict) -> SamplingParams: """Build SamplingParams from request data.""" diff --git a/xmake.lua b/xmake.lua index 2b1b51d3..5282f6a7 100644 --- a/xmake.lua +++ b/xmake.lua @@ -56,7 +56,7 @@ target_end() target("_infinilm") add_packages("pybind11") set_default(false) - add_rules("python.module", {soabi = true}) + add_rules("python.library", {soabi = true}) set_languages("cxx17") set_kind("shared") @@ -70,6 +70,7 @@ target("_infinilm") add_linkdirs(INFINI_ROOT.."/lib") add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl") + add_rpathdirs(INFINI_ROOT.."/lib") -- Add src files add_files("csrc/**.cpp") From d037eecc8b3f163e52272cf98ac776b08515ffd8 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:38:05 +0000 Subject: [PATCH 02/11] refactor minicpm-sala Signed-off-by: Ceng23333 <441651826@qq.com> --- csrc/cache/kv_cache.cpp | 67 ++----------- csrc/cache/kv_cache.hpp | 25 +---- csrc/config/config_factory.cpp | 2 +- csrc/engine/infer_engine.cpp | 10 +- csrc/engine/rank_worker.cpp | 3 +- .../minicpm_sala/minicpm_sala_attention.cpp | 95 +++++++++++++------ 
.../minicpm_sala/minicpm_sala_attention.hpp | 6 ++ .../minicpm_sala_for_causal_lm.cpp | 18 ++++ .../minicpm_sala_for_causal_lm.hpp | 7 ++ .../minicpm_sala/minicpm_sala_model.cpp | 7 +- csrc/models/model_factory.cpp | 30 +++--- csrc/pybind11/engine/engine.hpp | 2 +- 12 files changed, 129 insertions(+), 143 deletions(-) diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index a7220773..4c97edfa 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -4,7 +4,6 @@ #include "../utils.hpp" #include "infinicore/ops.hpp" #include -#include namespace infinilm::cache { // ========================== @@ -46,9 +45,7 @@ StaticKVCache::StaticKVCache( infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info, - infinicore::Size gla_recurrent_num_heads, - infinicore::Size gla_recurrent_head_dim) + const engine::distributed::RankInfo &rank_info) : Cache(), k_dim_(k_dim), v_dim_(v_dim), @@ -57,9 +54,7 @@ StaticKVCache::StaticKVCache( rank_batch_size_(config.max_batch_size()), cache_len_(config.max_cache_len() == std::numeric_limits::max() || config.max_cache_len() == 0 ? 
max_positional_embedding : config.max_cache_len()), rank_num_layers_(num_layers), - dtype_(dtype), - gla_recurrent_num_heads_(gla_recurrent_num_heads), - gla_recurrent_head_dim_(gla_recurrent_head_dim) { + dtype_(dtype) { // Allocate K cache k_caches_ = infinicore::Tensor::empty( @@ -80,17 +75,6 @@ StaticKVCache::StaticKVCache( v_dim_}, dtype_, rank_info.device); - - if (gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0) { - gla_state_ = infinicore::Tensor::zeros( - {rank_num_layers_, - rank_batch_size_, - gla_recurrent_num_heads_, - gla_recurrent_head_dim_, - gla_recurrent_head_dim_}, - infinicore::DataType::F32, - rank_info.device); - } } infinicore::Tensor StaticKVCache::create_layer_kv_cache( @@ -141,27 +125,12 @@ StaticKVCache::update(size_t layer_idx, auto device = k_cache_layer->device(); #ifdef ENABLE_KV_CACHING - // Some debug builds have shown incremental decode (update_len=1) may diverge - // from full-sequence recompute when using the optimized kv_caching_ kernel. - // Provide an env override to fall back to the simple (and slower) copy update. 
- const char *disable_kv_caching = std::getenv("INFINI_DISABLE_KV_CACHING"); - const bool force_copy_update = disable_kv_caching && disable_kv_caching[0] != '\0' && disable_kv_caching[0] != '0'; - if (force_copy_update) { - size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; - auto result_len = cache_pos + update_len; - ASSERT(result_len <= cache_len_); - auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}}); - auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}}); - k_cache_update->copy_from(k); - v_cache_update->copy_from(v); - } else { - infinicore::op::kv_caching_( - k_cache_layer, - v_cache_layer, - k, - v, - past_sequence_lengths); - } + infinicore::op::kv_caching_( + k_cache_layer, + v_cache_layer, + k, + v, + past_sequence_lengths); #else size_t cache_pos = reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]; auto result_len = cache_pos + update_len; @@ -177,26 +146,6 @@ StaticKVCache::update(size_t layer_idx, return {k_cache_layer, v_cache_layer}; } -std::tuple -StaticKVCache::get_layer_kv(size_t layer_idx) { - ASSERT(layer_idx < rank_num_layers_); - auto k_cache_layer = k_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - auto v_cache_layer = v_caches_->narrow({{0, layer_idx, 1}})->squeeze(0); - return {k_cache_layer, v_cache_layer}; -} - -bool -StaticKVCache::has_gla_recurrent_state() const { - return gla_recurrent_num_heads_ > 0 && gla_recurrent_head_dim_ > 0 && static_cast(gla_state_); -} - -infinicore::Tensor -StaticKVCache::gla_recurrent_state_for_layer(size_t layer_idx) { - ASSERT(layer_idx < rank_num_layers_); - ASSERT(has_gla_recurrent_state()); - return gla_state_->narrow({{0, layer_idx, 1}})->squeeze(0); -} - // ========================== // PagedKVCacheConfig // ========================== diff --git a/csrc/cache/kv_cache.hpp b/csrc/cache/kv_cache.hpp index cbef0722..e6e640df 100644 --- a/csrc/cache/kv_cache.hpp +++ 
b/csrc/cache/kv_cache.hpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -44,9 +43,7 @@ class StaticKVCache final : public Cache { infinicore::Size max_positional_embedding, infinicore::DataType dtype, const StaticKVCacheConfig &config, - const engine::distributed::RankInfo &rank_info, - infinicore::Size gla_recurrent_num_heads = 0, - infinicore::Size gla_recurrent_head_dim = 0); + const engine::distributed::RankInfo &rank_info); static infinicore::Tensor create_layer_kv_cache( const infinicore::Size k_dim, @@ -75,20 +72,6 @@ class StaticKVCache final : public Cache { const infinicore::Tensor &v, const infinicore::Tensor &past_sequence_lengths); - /** - * @brief Get KV cache tensors for a layer (views). - * - * @return (k_cache_layer, v_cache_layer) - * k_cache_layer: [batch, num_rank_k_heads, max_cache_len, k_dim] - * v_cache_layer: [batch, num_rank_v_heads, max_cache_len, v_dim] - */ - std::tuple - get_layer_kv(size_t layer_idx); - - /** Per-layer Simple GLA recurrent state for lightning decode: [batch, H, D, D] float32 (in-place for decode_step). 
*/ - bool has_gla_recurrent_state() const; - infinicore::Tensor gla_recurrent_state_for_layer(size_t layer_idx); - ~StaticKVCache() override = default; private: @@ -106,12 +89,6 @@ class StaticKVCache final : public Cache { // [num_layers, max_batch, num_rank_v_heads, max_cache_len, v_dim] infinicore::Tensor v_caches_; - - infinicore::Size gla_recurrent_num_heads_{0}; - infinicore::Size gla_recurrent_head_dim_{0}; - // [num_layers, max_batch, gla_recurrent_num_heads, D, D], F32; empty when heads==0 - infinicore::Tensor gla_state_; - }; class PagedKVCacheConfig final : public CacheConfig { diff --git a/csrc/config/config_factory.cpp b/csrc/config/config_factory.cpp index aff8b986..c822983e 100644 --- a/csrc/config/config_factory.cpp +++ b/csrc/config/config_factory.cpp @@ -16,7 +16,7 @@ std::shared_ptr ConfigFactory::createConfig(const if (it != config_map.end()) { it->second(model_config); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; const std::string &model_type = model_config->get("model_type"); if (std::find(classic_models.begin(), classic_models.end(), model_type) == classic_models.end()) { throw std::invalid_argument("infinilm::config::ConfigFactory::createConfig: Unsupported model config type: " + model_type); diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 2a5c5ff4..f1afd84b 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -121,15 +121,7 @@ InferEngine::Input::to_model_input(infinicore::Device device) const { auto to_device = [&](const std::optional &t) -> std::optional { - if (!t.has_value()) { - return t; - } - auto ten = t.value(); - // Avoid redundant copies when the tensor is already on the target device. - if (ten->device() == device) { - return ten; - } - return ten->to(device); + return t.has_value() ? 
t.value()->to(device) : t; }; infinilm::InfinilmModel::Input input = { diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 1ba89ca1..1542c1e0 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -5,7 +5,6 @@ #include "../models/models_registry.hpp" #include "infinicore/ops.hpp" #include -#include #include #include @@ -262,7 +261,7 @@ void RankWorker::thread_loop() { rank_info_.device, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); } else { - std::vector classic_models = {"llama", "qwen2", "minicpm", "minicpm_sala", "fm9g", "fm9g7b"}; + std::vector classic_models = {"llama", "qwen2", "minicpm", "fm9g", "fm9g7b"}; if ((std::find(classic_models.begin(), classic_models.end(), model_type) != classic_models.end())) { model_ = InfinilmModelFactory::createModel( model_config_, diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index c426ec1c..001122e4 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -3,7 +3,9 @@ #include "infinicore/ops.hpp" #include "infinicore/ops/infllmv2_attention.hpp" #include "infinicore/ops/simple_gla_attention.hpp" +#include "infinicore/ops/simple_gla_decode_step.hpp" #include "infinicore/ops/simple_gla_prefill.hpp" +#include "infinicore/ops/simple_gla_recurrent_state_append.hpp" #include "infinicore/context/context.hpp" #include "../debug_utils/tensor_utils.hpp" @@ -45,6 +47,19 @@ std::vector build_slope_tensor(size_t n) { } // namespace +namespace { +void ensure_gla_state_allocated(infinicore::Tensor &state, + const infinicore::Device &device, + size_t batch_size, + size_t n_h, + size_t head_dim) { + const std::vector want = {batch_size, n_h, head_dim, head_dim}; + if (!state || state->shape() != want || state->dtype() != infinicore::DataType::F32 || state->device() != device) { + state = infinicore::Tensor::zeros(want, 
infinicore::DataType::F32, device); + } +} +} // namespace + MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, @@ -162,7 +177,11 @@ void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptrpermute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] auto v_bthd = v_use->permute({0, 2, 1, 3})->contiguous(); // [B, S_kv, H, D] - // Lightning GLA decode must use recurrent state (StaticKVCache) whenever available. - const bool is_lightning_decode = has_cache_meta && static_kv_cache && (seq_len < total_seq_len); - if (is_lightning_decode && !static_kv_cache->has_gla_recurrent_state()) { - throw std::runtime_error( - "MiniCPMSALAAttention(lightning): Lightning decode requires StaticKVCache gla_recurrent_state " - "(missing recurrent buffer in StaticKVCache)."); - } - - const bool recurrent_gla = static_kv_cache && static_kv_cache->has_gla_recurrent_state() && has_cache_meta; + // Lightning fast decode: maintain recurrent state locally (do NOT depend on StaticKVCache extensions). + // We rebuild state on-demand if it is out-of-sync with cache_pos. + const bool is_decode = has_cache_meta && static_kv_cache && (seq_len == 1) && (total_seq_len > 1); + if (is_decode) { + ensure_gla_state_allocated(gla_state_, q_bthd->device(), batch_size, n_h, head_dim_); + + // Ensure `state` corresponds to exactly `cache_pos` cached tokens (excluding current token). + if (!gla_state_valid_ || gla_state_cached_len_ != cache_pos) { + // Rebuild from available KV. This is O(T) once after reset / mismatch. 
+ infinicore::op::zeros_(gla_state_); + if (cache_pos > 0) { + auto k_prev = k_bthd->narrow({{1, 0, cache_pos}}); + auto v_prev = v_bthd->narrow({{1, 0, cache_pos}}); + infinicore::op::simple_gla_recurrent_state_append_segment(gla_state_, k_prev, v_prev, g_gamma_); + } + gla_state_cached_len_ = cache_pos; + gla_state_valid_ = true; + } - infinicore::Tensor gla_out; - if (recurrent_gla && seq_len == 1 && total_seq_len > 1) { - auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); - auto q_new = q_bthd; + // Decode-step uses only the newest KV at position (total_seq_len - 1). + auto q_new = q_bthd; // [B,1,H,D] auto k_new = k_bthd->narrow({{1, total_seq_len - 1, 1}}); auto v_new = v_bthd->narrow({{1, total_seq_len - 1, 1}}); - gla_out = infinicore::op::simple_gla_decode_step(q_new, k_new, v_new, S, g_gamma_, scaling_); + auto out_b1hd = infinicore::op::simple_gla_decode_step(q_new, k_new, v_new, gla_state_, g_gamma_, scaling_); + gla_state_cached_len_ = cache_pos + 1; + attn_output = out_b1hd->view({batch_size, seq_len, n_h * head_dim_}); + // Fall through to output norm/gate + o_proj below (do not run full-sequence GLA again). } else { + // Prefill / non-decode batching: non-recurrent kernels, then update local recurrent state. infinicore::Tensor q_full; if (seq_len == total_seq_len) { q_full = q_bthd; } else { - // Decode: q has seq_len (e.g. 1), kv has total_seq_len; pad q to [B, total_seq_len, H, D]. + // q shorter than KV: pad q to [B, total_seq_len, H, D]. q_full = infinicore::Tensor::zeros( {batch_size, total_seq_len, n_h, head_dim_}, q_bthd->dtype(), q_bthd->device()); auto q_slot = q_full->narrow({{1, total_seq_len - seq_len, seq_len}}); q_slot->copy_from(q_bthd); } + + infinicore::Tensor gla_out; // Fused prefill: naive kernel for head_dim<=64; chunked/tiled kernel for head_dim>64 (e.g. 128). 
bool use_fused_prefill = (batch_size == 1) && (seq_len == total_seq_len); if (use_fused_prefill) { @@ -354,24 +386,27 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor gla_out = infinicore::op::simple_gla_attention(q_full, k_bthd, v_bthd, g_gamma_, scaling_); } - // Keep per-layer recurrent state aligned with simple_gla_attention / prefill outputs. - // Use batched GEMM (CUDA+ATen) instead of O(seq_len) decode_step launches; see - // simple_gla_recurrent_state_append_segment (closed form: S <- g^L S + Σ g^{L-1-j} outer(k,v)). - if (recurrent_gla) { - auto S = static_kv_cache->gla_recurrent_state_for_layer(cache_layer_idx_); - if (cache_pos == 0) { - infinicore::op::zeros_(S); - } + // Keep local recurrent state in sync for subsequent decode steps. + ensure_gla_state_allocated(gla_state_, q_bthd->device(), batch_size, n_h, head_dim_); + if (cache_pos == 0) { + infinicore::op::zeros_(gla_state_); + gla_state_cached_len_ = 0; + gla_state_valid_ = true; + } + // Append the segment we just wrote: [cache_pos, cache_pos + seq_len) + if (gla_state_valid_ && gla_state_cached_len_ == cache_pos) { auto k_seg = k_bthd->narrow({{1, cache_pos, seq_len}}); auto v_seg = v_bthd->narrow({{1, cache_pos, seq_len}}); - infinicore::op::simple_gla_recurrent_state_append_segment(S, k_seg, v_seg, g_gamma_); + infinicore::op::simple_gla_recurrent_state_append_segment(gla_state_, k_seg, v_seg, g_gamma_); + gla_state_cached_len_ = cache_pos + seq_len; + } else { + // Out-of-sync; force rebuild next time we need recurrent decode. + gla_state_valid_ = false; } - } - infinicore::Tensor out_slice = (recurrent_gla && seq_len == 1 && total_seq_len > 1) - ? 
gla_out - : gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); - attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); + infinicore::Tensor out_slice = gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); + attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); + } } else { // minicpm4 layers must use InfLLM-v2 attention (hard error if not available). // NOTE: Lightning layers keep Simple GLA for correctness; only minicpm4 routes here. diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 3cd8f284..37dab7ec 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -97,6 +97,12 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { // Lightning layers only: per-head log-decay for Simple GLA (HF _build_slope_tensor * -1). infinicore::Tensor g_gamma_; + + // Lightning layers only: recurrent state for fast decode. + // Shape: [B, H, D, D] float32. Tracks how many KV tokens are folded into the state. 
+ mutable infinicore::Tensor gla_state_; + mutable size_t gla_state_cached_len_ = 0; + mutable bool gla_state_valid_ = false; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index ce2e9474..74ea4f9a 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -1,11 +1,22 @@ #include "minicpm_sala_for_causal_lm.hpp" +#include "../models_registry.hpp" #include "infinicore/ops.hpp" #include #include +#include namespace infinilm::models::minicpm_sala { +std::shared_ptr create_minicpm_sala_model_config( + std::shared_ptr model_config) { + const std::string &model_type = model_config->get("model_type"); + if ("minicpm_sala" != model_type) { + throw std::runtime_error("infinilm::models::minicpm_sala::create_minicpm_sala_model_config: model_type is not minicpm_sala"); + } + return model_config; +} + MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( std::shared_ptr model_config, const infinicore::Device &device, @@ -61,3 +72,10 @@ const cache::CacheConfig *MiniCPMSALAForCausalLM::get_cache_config() const { } // namespace infinilm::models::minicpm_sala +namespace { +INFINILM_REGISTER_CAUSAL_LM_MODEL( + minicpm_sala, + infinilm::models::minicpm_sala::MiniCPMSALAForCausalLM, + infinilm::models::minicpm_sala::create_minicpm_sala_model_config); +} // namespace + diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index 9bb3ec2b..33305b23 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -36,3 +36,10 @@ class MiniCPMSALAForCausalLM : public InfinilmModel { } // namespace infinilm::models::minicpm_sala +namespace infinilm::models::minicpm_sala { + +std::shared_ptr create_minicpm_sala_model_config( + std::shared_ptr model_config); 
+ +} // namespace infinilm::models::minicpm_sala + diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index a415915f..6fd00bfe 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -86,9 +86,6 @@ void MiniCPMSALAModel::reset_cache(const cache::CacheConfig *cache_config) { const size_t base_head_dim = model_config_->get("head_dim"); const size_t lightning_kv_heads = model_config_->get_or("lightning_nkv", base_kv_heads); const size_t lightning_head_dim = model_config_->get_or("lightning_head_dim", base_head_dim); - const size_t lightning_nh = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); - const int tp_sz = std::max(1, rank_info_.tp_size); - const size_t lightning_nh_rank = lightning_nh / static_cast(tp_sz); kv_cache_minicpm4_ = (minicpm4_layer_count > 0) ? std::make_shared( @@ -113,9 +110,7 @@ void MiniCPMSALAModel::reset_cache(const cache::CacheConfig *cache_config) { /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), /*dtype=*/model_config_->get_dtype(), *static_cfg, - rank_info_, - /*gla_recurrent_num_heads=*/lightning_nh_rank, - /*gla_recurrent_head_dim=*/lightning_head_dim) + rank_info_) : nullptr; } else { // This refactor implements HF-like dense caching only. 
diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 3c885fc5..03734ac9 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,6 +1,6 @@ #include "model_factory.hpp" -#include "llama/llama.hpp" -#include "minicpm_sala/minicpm_sala_for_causal_lm.hpp" +#include "llama/llama_for_causal_lm.hpp" +#include "models_registry.hpp" namespace infinilm { /** @@ -41,13 +41,8 @@ std::shared_ptr InfinilmModelFactory::createModel( engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache, backends::AttentionBackend attention_backend) { - std::shared_ptr model; - const auto model_type = model_config->get_or("model_type", "llama"); - if (model_type == "minicpm_sala") { - model = std::make_shared( - model_config, rank_info.device, rank_info, attention_backend); - } else if (true) { + if (true) { model = std::make_shared( model_config, rank_info.device, rank_info, attention_backend); } else { @@ -65,8 +60,21 @@ std::shared_ptr InfinilmModelFactory::createModel( std::shared_ptr model_config, const infinicore::Device &device, const cache::CacheConfig *cache) { - engine::distributed::RankInfo rank_info; - rank_info.device = device; - return createModel(model_config, rank_info, cache, backends::AttentionBackend::Default); + const std::string model_type = model_config->get("model_type"); + std::shared_ptr model; + const auto &model_map = models::get_causal_lm_model_map(); + auto it = model_map.find(model_type); + if (it != model_map.end()) { + // create model + auto &model_creator = it->second; + model = model_creator(model_config, device); + } else { + throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model_type"); + } + + if (cache) { + model->reset_cache(cache); + } + return model; } } // namespace infinilm diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index d27b9585..a784d69c 100644 --- a/csrc/pybind11/engine/engine.hpp +++ 
b/csrc/pybind11/engine/engine.hpp @@ -199,6 +199,6 @@ inline void bind_infer_engine(py::module &m) { py::class_(infer_engine, "Output") .def_readwrite("output_ids", &InferEngine::Output::output_ids, "Output tensor"); - } +} } // namespace infinilm::engine From b93614901039c5f1dbecd81feaccdecd838b2aaf Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:40:55 +0000 Subject: [PATCH 03/11] cleanup code Signed-off-by: Ceng23333 <441651826@qq.com> --- examples/collect_metrics_longtext_decode.py | 355 -------- examples/compare_inference_speed.py | 868 ------------------- examples/metrics_16k_prefill.md | 152 ---- examples/metrics_longtext_mem.md | 378 -------- examples/run_infinicore_ops_before_logits.sh | 18 - examples/run_longtext_metrics_cases.sh | 59 -- 6 files changed, 1830 deletions(-) delete mode 100644 examples/collect_metrics_longtext_decode.py delete mode 100644 examples/compare_inference_speed.py delete mode 100644 examples/metrics_16k_prefill.md delete mode 100644 examples/metrics_longtext_mem.md delete mode 100755 examples/run_infinicore_ops_before_logits.sh delete mode 100755 examples/run_longtext_metrics_cases.sh diff --git a/examples/collect_metrics_longtext_decode.py b/examples/collect_metrics_longtext_decode.py deleted file mode 100644 index 172b6f40..00000000 --- a/examples/collect_metrics_longtext_decode.py +++ /dev/null @@ -1,355 +0,0 @@ -#!/usr/bin/env python3 -""" -Collect long-context + decode metrics for metrics_longtext_mem.md. - -**OOM-safe workflow:** run each case in a **fresh Python process** so CUDA allocations -are released between runs: - - ./run_longtext_metrics_cases.sh - -Or manually: - - python3 collect_metrics_longtext_decode.py --case hf:16384 --append-jsonl profiling_runs/longtext_decode_rows.jsonl - -See also docstring at top of previous revisions for GPU selection (CUDA_VISIBLE_DEVICES + NVML_GPU_INDEX). 
-""" - -from __future__ import annotations - -import argparse -import json -import os -import subprocess -import sys -import threading -import time -from typing import Any, Callable, Dict, List, Optional, Tuple - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - - -def _poll_gpu_mem_mib(stop: threading.Event, gpu_index: int, out: List[int]) -> None: - while not stop.is_set(): - try: - r = subprocess.run( - [ - "nvidia-smi", - "-i", - str(gpu_index), - "--query-gpu=memory.used", - "--format=csv,noheader,nounits", - ], - capture_output=True, - text=True, - timeout=5, - ) - if r.returncode == 0 and r.stdout.strip().isdigit(): - out.append(int(r.stdout.strip())) - except Exception: - pass - if stop.wait(timeout=1.0): - break - - -def _with_mem_poll(gpu_index: int, fn: Callable[[], Any]) -> Tuple[Any, Optional[int]]: - samples: List[int] = [] - stop = threading.Event() - th = threading.Thread(target=_poll_gpu_mem_mib, args=(stop, gpu_index, samples), daemon=True) - th.start() - err: Optional[BaseException] = None - result: Any = None - try: - result = fn() - except BaseException as e: - err = e - finally: - stop.set() - th.join(timeout=3.0) - peak = max(samples) if samples else None - if err is not None: - raise err - return result, peak - - -def _row_dict( - date: str, - backend: str, - target: int, - actual: int, - max_new: int, - peak: Optional[int], - gpu_smi: int, - r: Dict[str, Any], -) -> Dict[str, Any]: - return { - "date": date, - "backend": backend, - "target_input_tokens": target, - "actual_input_tokens": actual, - "max_new_tokens": max_new, - "peak_mem_mib": peak, - "gpu_smi_index": gpu_smi, - "total_time_ms": r.get("total_time_ms"), - "prefill_ttft_ms": r.get("prefill_ttft_ms"), - "prefill_throughput_tok_s": r.get("prefill_throughput_tok_s"), - "decode_itl_ms": r.get("decode_itl_ms"), - "decode_throughput_tok_s": r.get("decode_throughput_tok_s"), - "engine_reported_generation_ms": r.get("engine_reported_generation_ms"), - "error": 
r.get("error"), - } - - -def run_single_case( - case: str, - *, - model_path: str, - gpu_smi: int, - date: str, -) -> Dict[str, Any]: - """Run one measurement; returns a row dict (may contain error key).""" - examples_dir = os.path.dirname(os.path.abspath(__file__)) - sys.path.insert(0, examples_dir) - os.chdir(examples_dir) - - from transformers import AutoTokenizer - - from compare_inference_speed import ( - _make_prompt_with_target_tokens, - run_hf_decode_loop, - run_hf_forward_prefill, - run_infinilm_inprocess, - ) - - parts = case.strip().split(":") - kind = parts[0].lower() - if kind == "hf": - # Backward compatible: - # hf: -> max_new=1 (forward-prefill only) - # hf:: -> max_new= (decode-loop timing) - if len(parts) == 2: - target = int(parts[1]) - max_new = 1 - elif len(parts) == 3: - target = int(parts[1]) - max_new = int(parts[2]) - else: - raise ValueError("--case hf:[:] (e.g. hf:16384 or hf:16384:32)") - elif kind == "infinilm_rec": - if len(parts) != 3: - raise ValueError("--case infinilm_rec:: (e.g. infinilm_rec:32768:32)") - target = int(parts[1]) - max_new = int(parts[2]) - else: - raise ValueError( - f"Unknown case kind {kind!r}; use hf: or infinilm_rec:" - ) - - tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - prompt, actual = _make_prompt_with_target_tokens(tok, "How are you", target) - - if kind == "hf": - - def go() -> Dict[str, Any]: - # Always use hf decode-loop so total_time_ms can be end-to-end - # (prefill + decode), matching the InfiniLM generate semantics. 
- return run_hf_decode_loop( - model_path, - prompt, - max_new, - device="cuda", - attn_implementation="flash_attention_2", - use_cache=True, - warmup=1, - iters=1, - ) - - try: - r, peak = _with_mem_poll(gpu_smi, go) - r = dict(r) - return _row_dict(date, "hf (decode_loop)", target, actual, max_new, peak, gpu_smi, r) - except Exception as e: - return _row_dict( - date, - "hf (decode_loop)", - target, - actual, - max_new, - None, - gpu_smi, - {"error": str(e)}, - ) - - recurrent = kind == "infinilm_rec" - if max_new == 1: - label = "infinilm (static_fit, recurrent GLA decode)" - else: - label = f"infinilm (static_fit, recurrent GLA, +{max_new} decode)" - - saved_lightning = os.environ.get("INFINI_LIGHTNING_GLA_RECURRENT_DECODE") - saved_skip = os.environ.get("INFINI_SKIP_LAST_LOGITS_CPU") - try: - if recurrent: - os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = "1" - else: - os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) - os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = "1" - - def go_inf() -> Dict[str, Any]: - return run_infinilm_inprocess( - model_path, - prompt, - max_new, - cache_mode="static_fit", - paged_block_size=256, - attn_backend="default", - ) - - r, peak = _with_mem_poll(gpu_smi, go_inf) - return _row_dict(date, label, target, actual, max_new, peak, gpu_smi, dict(r)) - except Exception as e: - return _row_dict(date, label, target, actual, max_new, None, gpu_smi, {"error": str(e)}) - finally: - if saved_lightning is None: - os.environ.pop("INFINI_LIGHTNING_GLA_RECURRENT_DECODE", None) - else: - os.environ["INFINI_LIGHTNING_GLA_RECURRENT_DECODE"] = saved_lightning - if saved_skip is None: - os.environ.pop("INFINI_SKIP_LAST_LOGITS_CPU", None) - else: - os.environ["INFINI_SKIP_LAST_LOGITS_CPU"] = saved_skip - - -def print_markdown_table(rows: List[Dict[str, Any]]) -> None: - def fmt(x: Any) -> str: - if x is None: - return "—" - if isinstance(x, float): - s = f"{x:.2f}" - return s.rstrip("0").rstrip(".") - return str(x) - - gpu_smi = 
rows[0].get("gpu_smi_index", 0) if rows else 0 - print("\n### Markdown table (paste into metrics_longtext_mem.md)\n") - hdr = ( - "| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | " - "prefill_tok_s | decode_itl_ms | decode_tok_s | gpu |" - ) - sep = "|---|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|" - print(hdr) - print(sep) - for row in rows: - if row.get("error"): - print( - f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | " - f"{row['max_new_tokens']} | {fmt(row.get('peak_mem_mib'))} | OOM/err | — | — | — | — | {gpu_smi} |" - ) - continue - dec_itl = fmt(row.get("decode_itl_ms")) if row["max_new_tokens"] > 1 else "—" - dec_tps = fmt(row.get("decode_throughput_tok_s")) if row["max_new_tokens"] > 1 else "—" - ptt = row.get("prefill_ttft_ms") - # Only forward-prefill runs use total_time_ms as a prefill-time proxy. - if ptt is None and row.get("backend") == "hf (forward_prefill)": - ptt = row.get("total_time_ms") - print( - f"| {row['date']} | {row['backend']} | {row['target_input_tokens']} | {row['max_new_tokens']} | " - f"{fmt(row.get('peak_mem_mib'))} | {fmt(row.get('total_time_ms'))} | {fmt(ptt)} | " - f"{fmt(row.get('prefill_throughput_tok_s'))} | {dec_itl} | {dec_tps} | {gpu_smi} |" - ) - - -def main() -> None: - ap = argparse.ArgumentParser(description="Long-context + decode metrics (OOM-safe --case mode)") - ap.add_argument( - "--case", - type=str, - default=None, - help="Single case: hf:16384 | infinilm_rec:32768:32", - ) - ap.add_argument( - "--append-jsonl", - type=str, - default=None, - help="Append one JSON line (--case mode only)", - ) - ap.add_argument( - "--from-jsonl", - type=str, - default=None, - help="Load rows from jsonl and print markdown table", - ) - ap.add_argument( - "--all-in-process", - action="store_true", - help="Run full matrix in one process (may OOM between cases)", - ) - args = ap.parse_args() - - model_path = os.environ.get( - "MODEL_PATH", 
"/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA" - ) - gpu_smi = int(os.environ.get("NVML_GPU_INDEX", os.environ.get("CUDA_VISIBLE_DEVICES", "0"))) - date = os.environ.get("METRICS_DATE", "2026-03-23") - decode_steps = int(os.environ.get("METRICS_DECODE_STEPS", "32")) - targets = [int(x) for x in os.environ.get("METRICS_TARGETS", "16384,32768,65536").split(",")] - - examples_dir = os.path.dirname(os.path.abspath(__file__)) - - if args.from_jsonl: - rows = [] - with open(args.from_jsonl) as f: - for line in f: - line = line.strip() - if line: - rows.append(json.loads(line)) - print_markdown_table(rows) - return - - if args.case: - row = run_single_case(args.case, model_path=model_path, gpu_smi=gpu_smi, date=date) - print(json.dumps(row, ensure_ascii=False)) - if args.append_jsonl: - ap = os.path.abspath(args.append_jsonl) - ad = os.path.dirname(ap) - if ad: - os.makedirs(ad, exist_ok=True) - with open(ap, "a") as f: - f.write(json.dumps(row, ensure_ascii=False) + "\n") - return - - if not args.all_in_process: - print( - "Specify --case CASE, --from-jsonl FILE, or --all-in-process.\n" - "For OOM safety use: ./run_longtext_metrics_cases.sh", - file=sys.stderr, - ) - sys.exit(2) - - # Legacy: all targets × all backends in one process - rows: List[Dict[str, Any]] = [] - for t in targets: - row = run_single_case(f"hf:{t}", model_path=model_path, gpu_smi=gpu_smi, date=date) - rows.append(row) - for t in targets: - rows.append( - run_single_case(f"infinilm_rec:{t}:1", model_path=model_path, gpu_smi=gpu_smi, date=date) - ) - for t in targets: - rows.append( - run_single_case( - f"infinilm_rec:{t}:{decode_steps}", - model_path=model_path, - gpu_smi=gpu_smi, - date=date, - ) - ) - - out_path = os.path.join(examples_dir, "profiling_runs", "longtext_decode_metrics.json") - os.makedirs(os.path.dirname(out_path), exist_ok=True) - with open(out_path, "w") as f: - json.dump({"gpu_smi_index": gpu_smi, "decode_steps": decode_steps, "rows": rows}, f, indent=2) - print(f"Wrote 
{out_path}") - print_markdown_table(rows) - - -if __name__ == "__main__": - main() diff --git a/examples/compare_inference_speed.py b/examples/compare_inference_speed.py deleted file mode 100644 index 06fad9a7..00000000 --- a/examples/compare_inference_speed.py +++ /dev/null @@ -1,868 +0,0 @@ -#!/usr/bin/env python3 -""" -Compare MiniCPM-SALA inference speed across HF, InfiniLM, and (optionally) SGLang. - -Usage: - # HF + InfiniLM only (InfiniLM runs in subprocess with same env as jiuge): - python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA [--prompt "How are you"] [--max_new_tokens 32] - - # Include SGLang (server must already be running with MiniCPM-SALA): - python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --sglang_url http://127.0.0.1:30000 - - # Optional: write JSON - python compare_inference_speed.py --model_path /path/to/MiniCPM-SALA --output results.json - -Requires: transformers, torch; for InfiniLM subprocess: PYTHONPATH and LD_LIBRARY_PATH as in jiuge. -""" - -import argparse -import json -import os -import re -import subprocess -import sys -import time -from typing import Optional, Tuple, Literal - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python")) - -def _build_chat_input_ids(tokenizer, prompt: str): - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False - ) - ids = tokenizer(text, add_special_tokens=True)["input_ids"] - return ids - - -def _make_prompt_with_target_tokens(tokenizer, base_prompt: str, target_input_tokens: int) -> Tuple[str, int]: - """ - Build a prompt (user content) such that the *chat-templated* input_ids length is >= target_input_tokens. - Returns (prompt, actual_input_tokens). - """ - if target_input_tokens <= 0: - raise ValueError("--target_input_tokens must be > 0") - - # Ensure boundaries don't merge tokens weirdly. 
- chunk = (base_prompt.strip() + "\n") if base_prompt.strip() else "hello\n" - - # Exponential growth to find an upper bound. - rep = 1 - while True: - prompt = chunk * rep - ids = _build_chat_input_ids(tokenizer, prompt) - if len(ids) >= target_input_tokens: - break - rep *= 2 - if rep > 1_000_000: - raise RuntimeError("Failed to build prompt to target length (rep too large)") - - # Binary search for smallest rep that reaches target. - lo, hi = 1, rep - best_prompt = prompt - best_len = len(ids) - while lo <= hi: - mid = (lo + hi) // 2 - p = chunk * mid - l = len(_build_chat_input_ids(tokenizer, p)) - if l >= target_input_tokens: - best_prompt, best_len = p, l - hi = mid - 1 - else: - lo = mid + 1 - - return best_prompt, best_len - - -def run_hf( - model_path: str, - prompt: str, - max_new_tokens: int, - device: str = "cuda", - *, - attn_implementation: Optional[str] = None, -): - """Run HuggingFace generate and return metrics.""" - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model_kwargs = { - "torch_dtype": "auto", - "trust_remote_code": True, - } - # Prefer flash-attn when available; fall back silently if not supported. - if attn_implementation is not None: - model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] - try: - model = AutoModelForCausalLM.from_pretrained( - model_path, - **model_kwargs, - ).to(device) - except TypeError: - # Older transformers versions may not support attn_implementation kwarg. 
- model_kwargs.pop("attn_implementation", None) - model = AutoModelForCausalLM.from_pretrained( - model_path, - **model_kwargs, - ).to(device) - model.eval() - - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False - ) - inputs = tokenizer(text, return_tensors="pt").to(device) - input_len = inputs.input_ids.shape[1] - - start = time.perf_counter() - with torch.inference_mode(): - out = model.generate( - **inputs, - max_new_tokens=max_new_tokens, - do_sample=False, - pad_token_id=tokenizer.eos_token_id or 0, - ) - elapsed = time.perf_counter() - start - output_len = out.shape[1] - input_len - - return { - "backend": "hf", - "total_time_ms": round(elapsed * 1000, 2), - "input_tokens": input_len, - "output_tokens": output_len, - "prefill_ttft_ms": None, # HF generate() doesn't expose TTFT without streaming - "decode_throughput_tok_s": round(output_len / elapsed, 2) if elapsed > 0 else None, - "total_throughput_tok_s": round((input_len + output_len) / elapsed, 2) if elapsed > 0 else None, - } - - -def run_hf_forward_prefill( - model_path: str, - prompt: str, - device: str = "cuda", - *, - attn_implementation: Optional[str] = None, - use_cache: bool = True, - warmup: int = 1, - iters: int = 1, -): - """ - Run HuggingFace *forward-only* prefill (no decode loop). - Intended for kernel-level profiling to isolate prefill work. 
- """ - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model_kwargs = { - "torch_dtype": "auto", - "trust_remote_code": True, - } - if attn_implementation is not None: - model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] - try: - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - except TypeError: - model_kwargs.pop("attn_implementation", None) - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - model.eval() - - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - inputs = tokenizer(text, return_tensors="pt").to(device) - input_len = inputs.input_ids.shape[1] - - # Warmup (reduces first-iter compilation / cache effects for profiling). - with torch.inference_mode(): - for _ in range(max(0, warmup)): - # Prefer last-token logits only (reduces memory at long context). - try: - _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) - except TypeError: - _ = model(**inputs, use_cache=use_cache) - torch.cuda.synchronize() - - # Timed iters. 
- times = [] - with torch.inference_mode(): - for _ in range(max(1, iters)): - torch.cuda.synchronize() - try: - torch.cuda.nvtx.range_push("hf_forward_prefill") - except Exception: - pass - start = time.perf_counter() - try: - _ = model(**inputs, use_cache=use_cache, logits_to_keep=1) - except TypeError: - _ = model(**inputs, use_cache=use_cache) - torch.cuda.synchronize() - elapsed = time.perf_counter() - start - try: - torch.cuda.nvtx.range_pop() - except Exception: - pass - times.append(elapsed) - - best = min(times) if times else 0.0 - return { - "backend": "hf_forward_prefill", - "total_time_ms": round(best * 1000, 2), - "input_tokens": int(input_len), - "output_tokens": 0, - "use_cache": bool(use_cache), - "warmup": int(warmup), - "iters": int(iters), - "prefill_throughput_tok_s": round(input_len / best, 2) if best > 0 else None, - } - - -def run_hf_decode_loop( - model_path: str, - prompt: str, - max_new_tokens: int, - device: str = "cuda", - *, - attn_implementation: Optional[str] = None, - use_cache: bool = True, - warmup: int = 8, - iters: int = 1, -): - """ - Measure HF *decode-only* per-token latency using a manual loop with past_key_values. - - Protocol: - - Prefill once on the full prompt (not included in decode timing). - - Then decode `max_new_tokens` tokens with 1-token steps, timing the whole decode loop - (optionally best-of `iters`). 
- """ - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer - - if max_new_tokens <= 0: - raise ValueError("--max_new_tokens must be > 0 for hf decode_loop") - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - model_kwargs = { - "torch_dtype": "auto", - "trust_remote_code": True, - } - if attn_implementation is not None: - model_kwargs["attn_implementation"] = attn_implementation # type: ignore[assignment] - try: - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - except TypeError: - model_kwargs.pop("attn_implementation", None) - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs).to(device) - model.eval() - - conversation = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - inputs = tokenizer(text, return_tensors="pt").to(device) - input_ids = inputs.input_ids - input_len = int(input_ids.shape[1]) - # Some decoder-only models require attention_mask even when no padding is used. - attention_mask = inputs.get("attention_mask", None) - if attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - attention_mask = attention_mask.to(device) - # Precompute full (input_len + max_new_tokens) causal attention mask for past-key decoding. - attention_mask_full = attention_mask.new_ones((attention_mask.shape[0], input_len + max_new_tokens)) - - # Prefill once to build cache. - with torch.inference_mode(): - try: - pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache, logits_to_keep=1) - except TypeError: - pre = model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache) - past = getattr(pre, "past_key_values", None) - # Greedy next token from last logits. 
- logits = pre.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - - # Warmup decode steps (not timed) to reduce first-step effects. - with torch.inference_mode(): - for warm_i in range(max(0, warmup)): - try: - # Attention mask must cover (past + current token). - attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - logits_to_keep=1, - ) - except TypeError: - attn_mask_step = attention_mask_full[:, : input_len + warm_i + 1] - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - ) - past = getattr(out, "past_key_values", past) - logits = out.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - torch.cuda.synchronize() - - # Timed decode loops (best-of iters). - # We report total_time_ms as end-to-end (prefill + decode), but keep - # decode_itl_ms / decode_throughput_tok_s based on decode-only time. - total_times = [] - decode_times = [] - with torch.inference_mode(): - for _ in range(max(1, iters)): - # Re-prefill to avoid measuring a "warmed" cache from prior iteration. - # Time prefill separately so decode_itl_ms stays decode-only. - torch.cuda.synchronize() - prefill_start = time.perf_counter() - try: - pre = model( - input_ids=input_ids, - attention_mask=attention_mask, - use_cache=use_cache, - logits_to_keep=1, - ) - except TypeError: - # Some model/transformers combinations may not accept attention_mask. 
- pre = model(input_ids=input_ids, use_cache=use_cache) - past = getattr(pre, "past_key_values", None) - logits = pre.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - - torch.cuda.synchronize() - prefill_elapsed = time.perf_counter() - prefill_start - - torch.cuda.synchronize() - start = time.perf_counter() # decode start - try: - torch.cuda.nvtx.range_push("hf_decode_loop") - except Exception: - pass - for t in range(max_new_tokens): - attn_mask_step = attention_mask_full[:, : input_len + t + 1] - try: - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - logits_to_keep=1, - ) - except TypeError: - out = model( - input_ids=next_token, - attention_mask=attn_mask_step, - use_cache=use_cache, - past_key_values=past, - ) - past = getattr(out, "past_key_values", past) - logits = out.logits[:, -1, :] - next_token = torch.argmax(logits, dim=-1, keepdim=True) - torch.cuda.synchronize() - decode_elapsed = time.perf_counter() - start - total_elapsed = prefill_elapsed + decode_elapsed - try: - torch.cuda.nvtx.range_pop() - except Exception: - pass - total_times.append(total_elapsed) - decode_times.append(decode_elapsed) - - # Pick the iteration with the best end-to-end time; compute decode metrics - # from the corresponding decode-only time. 
- if total_times: - best_idx = min(range(len(total_times)), key=lambda i: total_times[i]) - best_total = total_times[best_idx] - best_decode = decode_times[best_idx] - else: - best_total = 0.0 - best_decode = 0.0 - - itl_ms = (best_decode * 1000.0 / max_new_tokens) if best_decode > 0 else None - thr = (max_new_tokens / best_decode) if best_decode > 0 else None - return { - "backend": "hf_decode_loop", - "total_time_ms": round(best_total * 1000, 2), - "input_tokens": int(input_len), - "output_tokens": int(max_new_tokens), - "decode_itl_ms": round(itl_ms, 4) if itl_ms is not None else None, - "decode_throughput_tok_s": round(thr, 2) if thr is not None else None, - "use_cache": bool(use_cache), - "warmup": int(warmup), - "iters": int(iters), - } - - -def run_infinilm_inprocess( - model_path: str, - prompt: str, - max_new_tokens: int, - *, - cache_mode: Literal["static_fit", "static_maxpos", "paged"] = "paged", - paged_block_size: int = 256, - attn_backend: str = "flash-attn", -): - """ - Run InfiniLM in-process (no 2048-token truncation). Parses InferEngine's timing prints. - This expects PYTHONPATH to include InfiniLM/InfiniCore python packages (container runner does this). - """ - import io - import torch - import contextlib - - import infinicore - from transformers import AutoTokenizer - - from infinilm.cache import PagedKVCacheConfig, StaticKVCacheConfig - from infinilm.distributed import DistConfig - from infinilm.infer_engine import GenerationConfig, InferEngine - from infinilm.modeling_utils import load_model_state_dict_by_file - - model_path = os.path.expanduser(model_path) - # Prefer flash-attn when available; fall back to default. - try: - model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - attention_backend=attn_backend, - ) - except TypeError: - # Older InferEngine builds may not accept attention_backend. 
- model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - ) - except Exception: - try: - model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - attention_backend="default", - ) - except TypeError: - model = InferEngine( - model_path, - device=infinicore.device("cuda", 0), - distributed_config=DistConfig(1), - enable_graph_compiling=False, - ) - load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype) - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - input_ids = _build_chat_input_ids(tokenizer, prompt) - input_ids_infini = infinicore.from_list([input_ids]) - - initial_capacity = len(input_ids) + max_new_tokens - if cache_mode == "paged": - num_blocks = (initial_capacity + (paged_block_size - 1)) // paged_block_size - cache_config = PagedKVCacheConfig( - num_blocks=num_blocks, - block_size=paged_block_size, - ) - else: - if cache_mode == "static_maxpos": - max_pos = getattr(model.config, "max_position_embeddings", 4096) - max_cache_len = max(initial_capacity, max_pos) - else: - # Fit cache to what we actually need for this run. - max_cache_len = initial_capacity - cache_config = StaticKVCacheConfig(max_batch_size=1, max_cache_len=max_cache_len) - # Basic GPU memory stats around cache construction (CUDA device assumed to be index 0). 
- mem_before_cache = torch.cuda.memory_allocated(0) - max_mem_before_cache = torch.cuda.max_memory_allocated(0) - - model.reset_cache(cache_config) - - mem_after_cache = torch.cuda.memory_allocated(0) - max_mem_after_cache = torch.cuda.max_memory_allocated(0) - - buf = io.StringIO() - start = time.perf_counter() - with contextlib.redirect_stdout(buf): - try: - torch.cuda.nvtx.range_push("infinilm_generate") - except Exception: - pass - try: - model.generate( - input_ids_infini, - GenerationConfig( - max_new_tokens=max_new_tokens, - temperature=1.0, - top_k=1, - top_p=1.0, - # Profiling: avoid per-step EOS checks + early stop variability. - stop_on_eos=False, - ), - _measure_and_log_time=True, - ) - finally: - try: - torch.cuda.nvtx.range_pop() - except Exception: - pass - elapsed = time.perf_counter() - start - stdout = buf.getvalue() - - prefill_ttft_ms = None - prefill_throughput = None - decode_itl_ms = None - decode_throughput = None - gen_completed_ms = None - for line in stdout.splitlines(): - if "Prefill TTFT:" in line: - m = re.search( - r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line - ) - if m: - prefill_ttft_ms = float(m.group(1)) - prefill_throughput = float(m.group(2)) - if "Decode" in line and "ITL:" in line: - m = re.search( - r"Decode\s+Avg ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line - ) - if m: - decode_itl_ms = float(m.group(1)) - decode_throughput = float(m.group(2)) - if "Generation completed in" in line: - m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) - if m: - gen_completed_ms = float(m.group(1)) - - return { - "backend": "infinilm", - "total_time_ms": round(elapsed * 1000, 2), - "input_tokens": len(input_ids), - "output_tokens": max_new_tokens, - "prefill_ttft_ms": prefill_ttft_ms, - "prefill_throughput_tok_s": prefill_throughput, - "decode_itl_ms": decode_itl_ms, - "decode_throughput_tok_s": decode_throughput, - "engine_reported_generation_ms": gen_completed_ms, - # Cache / attention 
configuration - "cache_mode": cache_mode, - "paged_block_size": paged_block_size if cache_mode == "paged" else None, - "enable_paged_attn": getattr(model, "enable_paged_attn", False), - "static_max_cache_len": max_cache_len if cache_mode != "paged" else None, - "paged_num_blocks": num_blocks if cache_mode == "paged" else None, - # Torch CUDA memory snapshots (bytes) - "torch_memory_allocated_before_cache": int(mem_before_cache), - "torch_memory_allocated_after_cache": int(mem_after_cache), - "torch_max_memory_allocated_before_cache": int(max_mem_before_cache), - "torch_max_memory_allocated_after_cache": int(max_mem_after_cache), - } - - -def run_infinilm(model_path: str, prompt: str, max_new_tokens: int, env=None): - """Run InfiniLM jiuge via subprocess and parse stdout for metrics.""" - run_env = {**os.environ, **(env or {})} - examples_dir = os.path.dirname(os.path.abspath(__file__)) - jiuge_py = os.path.join(examples_dir, "jiuge.py") - cmd = [ - sys.executable, - jiuge_py, - "--nvidia", - "--model_path", model_path, - "--prompt", prompt, - "--max_new_tokens", str(max_new_tokens), - ] - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300, - env=run_env, - cwd=examples_dir, - ) - stdout = result.stdout or "" - if result.returncode != 0 and not stdout: - return {"backend": "infinilm", "error": (result.stderr or f"exit code {result.returncode}")[:500]} - except Exception as e: - return {"backend": "infinilm", "error": str(e)} - - # Parse jiuge / InferEngine output - prefill_ttft_ms = None - prefill_throughput = None - decode_itl_ms = None - decode_throughput = None - total_time_ms = None - for line in stdout.splitlines(): - if "Prefill TTFT:" in line: - m = re.search(r"Prefill TTFT:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) - if m: - prefill_ttft_ms = float(m.group(1)) - prefill_throughput = float(m.group(2)) - if "Decode" in line and "ITL:" in line: - m = re.search(r"Decode\s+Avg 
ITL:\s*([\d.]+)\s*ms.*Throughput:\s*([\d.]+)\s*tok/s", line) - if m: - decode_itl_ms = float(m.group(1)) - decode_throughput = float(m.group(2)) - if "total_time:" in line: - m = re.search(r"total_time:\s*([\d.]+)\s*ms", line) - if m: - total_time_ms = float(m.group(1)) - if "Generation completed in" in line: - m = re.search(r"Generation completed in\s*([\d.]+)\s*ms", line) - if m: - total_time_ms = float(m.group(1)) - - return { - "backend": "infinilm", - "total_time_ms": total_time_ms, - "prefill_ttft_ms": prefill_ttft_ms, - "prefill_throughput_tok_s": prefill_throughput, - "decode_itl_ms": decode_itl_ms, - "decode_throughput_tok_s": decode_throughput, - } - - -def run_sglang_client(sglang_url: str, prompt: str, max_new_tokens: int): - """Send one request to SGLang server and return metrics.""" - try: - import requests - except ImportError: - return {"backend": "sglang", "error": "requests not installed"} - - url = sglang_url.rstrip("/") + "/generate" - payload = { - "text": prompt, - "sampling_params": {"max_new_tokens": max_new_tokens, "temperature": 0}, - } - start = time.perf_counter() - try: - r = requests.post(url, json=payload, timeout=120) - r.raise_for_status() - data = r.json() - except Exception as e: - return {"backend": "sglang", "error": str(e)} - elapsed_ms = (time.perf_counter() - start) * 1000 - - # SGLang response may have "meta_info" with "completion_tokens" or we use prompt + output length - output_text = (data.get("text") or data.get("choices", [{}])[0].get("text") or "") - completion_tokens = data.get("meta_info", {}).get("completion_tokens") or data.get("usage", {}).get("completion_tokens") - if completion_tokens is None and "usage" in data: - completion_tokens = data["usage"].get("completion_tokens") - if completion_tokens is None: - completion_tokens = max_new_tokens # fallback - - return { - "backend": "sglang", - "total_time_ms": round(elapsed_ms, 2), - "output_tokens": completion_tokens, - "total_throughput_tok_s": 
round(completion_tokens / (elapsed_ms / 1000), 2) if elapsed_ms > 0 else None, - } - - -def main(): - parser = argparse.ArgumentParser(description="Compare MiniCPM-SALA inference speed: HF, InfiniLM, SGLang") - parser.add_argument("--model_path", required=True, help="Path to MiniCPM-SALA model dir") - parser.add_argument("--prompt", default="How are you", help="Prompt for generation") - parser.add_argument("--max_new_tokens", type=int, default=32, help="Max new tokens to generate") - parser.add_argument( - "--target_input_tokens", - type=int, - default=None, - help="If set, synthesize a long prompt so chat-templated input tokens >= this value (e.g. 65536).", - ) - parser.add_argument( - "--infinilm_cache_mode", - type=str, - default="paged", - choices=["paged", "static_fit", "static_maxpos"], - help="InfiniLM KV cache mode when running long prompts in-process.", - ) - parser.add_argument( - "--infinilm_paged_block_size", - type=int, - default=256, - help="Paged KV block size (tokens per block).", - ) - parser.add_argument( - "--infinilm_attn_backend", - type=str, - default="flash-attn", - help="InfiniLM attention backend (e.g. flash-attn or default).", - ) - parser.add_argument( - "--hf_attn_implementation", - type=str, - default="flash_attention_2", - help="HF attention implementation to request (e.g. 
flash_attention_2 or eager).", - ) - parser.add_argument( - "--hf_mode", - type=str, - default="generate", - choices=["generate", "forward_prefill", "decode_loop"], - help="HF run mode: generate() end-to-end, forward-only prefill, or manual decode_loop timing with KV cache.", - ) - parser.add_argument( - "--hf_forward_use_cache", - action="store_true", - help="In HF forward_prefill mode, pass use_cache=True (recommended).", - ) - parser.add_argument( - "--hf_forward_warmup", - type=int, - default=1, - help="Warmup iterations for HF forward_prefill.", - ) - parser.add_argument( - "--hf_forward_iters", - type=int, - default=1, - help="Measured iterations for HF forward_prefill (best-of).", - ) - parser.add_argument( - "--hf_decode_warmup", - type=int, - default=8, - help="Warmup steps for HF decode_loop (not timed).", - ) - parser.add_argument( - "--hf_decode_iters", - type=int, - default=1, - help="Measured iterations for HF decode_loop (best-of).", - ) - parser.add_argument("--sglang_url", default=None, help="SGLang server URL (e.g. http://127.0.0.1:30000); if set, query SGLang") - parser.add_argument("--backends", default="hf,infinilm", help="Comma-separated: hf,infinilm,sglang") - parser.add_argument("--output", default=None, help="Write JSON results to this path") - parser.add_argument("--no_hf", action="store_true", help="Skip HF (e.g. if no GPU memory for two models)") - parser.add_argument("--no_infinilm", action="store_true", help="Skip InfiniLM") - parser.add_argument( - "--prefill_16k", - action="store_true", - help="Convenience flag: set --target_input_tokens=16384 and --max_new_tokens=1 (prefill-dominated).", - ) - parser.add_argument( - "--infinilm_inprocess", - action="store_true", - help="Run InfiniLM in-process (no jiuge subprocess). 
Use when PYTHONPATH/LD_LIBRARY_PATH are set in this process.", - ) - args = parser.parse_args() - - backends = [b.strip() for b in args.backends.split(",")] - results = [] - - # Normalize convenience prefill-only configuration. - if args.prefill_16k: - if args.target_input_tokens is None: - args.target_input_tokens = 16384 - # For prefill-dominated comparisons, prefer HF forward-only by default. - if args.hf_mode == "generate": - args.hf_mode = "forward_prefill" - if args.max_new_tokens != 1: - args.max_new_tokens = 1 - - # If requested, build a long prompt once using HF tokenizer. - if args.target_input_tokens is not None: - try: - from transformers import AutoTokenizer - - tok = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) - long_prompt, actual = _make_prompt_with_target_tokens(tok, args.prompt, args.target_input_tokens) - args.prompt = long_prompt - print(f"[prompt] synthesized chat input tokens: {actual} (target >= {args.target_input_tokens})") - except Exception as e: - print(f"[prompt] failed to synthesize long prompt: {e}") - - if "hf" in backends and not args.no_hf: - try: - import torch - if args.hf_mode == "forward_prefill": - r = run_hf_forward_prefill( - args.model_path, - args.prompt, - attn_implementation=args.hf_attn_implementation, - use_cache=args.hf_forward_use_cache, - warmup=args.hf_forward_warmup, - iters=args.hf_forward_iters, - ) - elif args.hf_mode == "decode_loop": - r = run_hf_decode_loop( - args.model_path, - args.prompt, - args.max_new_tokens, - attn_implementation=args.hf_attn_implementation, - use_cache=True, - warmup=args.hf_decode_warmup, - iters=args.hf_decode_iters, - ) - else: - r = run_hf( - args.model_path, - args.prompt, - args.max_new_tokens, - attn_implementation=args.hf_attn_implementation, - ) - results.append(r) - except Exception as e: - results.append({"backend": "hf", "error": str(e)}) - - if "infinilm" in backends and not args.no_infinilm: - # In-process: when env is set in this process or 
--infinilm_inprocess, avoid jiuge subprocess. - # Also use in-process for long prompts (target_input_tokens) to avoid 2048-token truncation. - use_inprocess = args.infinilm_inprocess or args.target_input_tokens is not None - if use_inprocess: - try: - r = run_infinilm_inprocess( - args.model_path, - args.prompt, - args.max_new_tokens, - cache_mode=args.infinilm_cache_mode, # type: ignore[arg-type] - paged_block_size=args.infinilm_paged_block_size, - attn_backend=args.infinilm_attn_backend, - ) - except Exception as e: - r = {"backend": "infinilm", "error": str(e)} - else: - r = run_infinilm(args.model_path, args.prompt, args.max_new_tokens) - results.append(r) - - if "sglang" in backends and args.sglang_url: - r = run_sglang_client(args.sglang_url, args.prompt, args.max_new_tokens) - results.append(r) - elif "sglang" in backends and not args.sglang_url: - results.append({"backend": "sglang", "error": "No --sglang_url provided; start SGLang server with MiniCPM-SALA first"}) - - # Print table - print("\n" + "=" * 60) - print("MiniCPM-SALA inference speed comparison") - print("=" * 60) - print(f" prompt = {repr(args.prompt[:500])} max_new_tokens = {args.max_new_tokens}") - print() - for r in results: - if "error" in r: - print(f" {r['backend']}: ERROR {r['error']}") - continue - print(f" {r['backend']}:") - for k, v in r.items(): - if k == "backend" or v is None: - continue - if isinstance(v, float): - print(f" {k}: {v}") - else: - print(f" {k}: {v}") - print() - print("=" * 60) - - if args.output: - with open(args.output, "w") as f: - json.dump({"prompt": args.prompt, "max_new_tokens": args.max_new_tokens, "results": results}, f, indent=2) - print(f"Wrote {args.output}") - - -if __name__ == "__main__": - import os - main() diff --git a/examples/metrics_16k_prefill.md b/examples/metrics_16k_prefill.md deleted file mode 100644 index 2337fac0..00000000 --- a/examples/metrics_16k_prefill.md +++ /dev/null @@ -1,152 +0,0 @@ -### MiniCPM-SALA 16k long-prompt metrics (A/B 
cache modes) - -**Setup** - -- **Prompt construction**: `--target_input_tokens 16384` (actual synthesized **16386** chat-template tokens) -- **Workload**: `--max_new_tokens 1` (prefill-dominated) -- **Environment**: run via `scripts/run_compare_speed_in_container.sh` inside container `minicpm-sala` - -| backend | cache_mode | attn_backend | enable_paged_attn | cache sizing | prefill_ttft_ms | prefill_throughput_tok_s | total_time_ms | -|---|---|---|---:|---|---:|---:|---:| -| hf | — | — | — | — | — | 9325.01 | 1757.21 | -| infinilm | static_fit | default | False | static_max_cache_len=16387 | 33632.05 | 487.21 | 33632.29 | -| infinilm | static_maxpos | default | False | static_max_cache_len=524288 | 34067.49 | 480.99 | 34067.75 | -| infinilm | paged | default | True | paged_block_size=256, paged_num_blocks=65 | 35626.25 | 459.94 | 35627.10 | - -**Raw commands** - -```bash -./scripts/run_compare_speed_in_container.sh --backends hf --target_input_tokens 16384 --max_new_tokens 1 -./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_fit -./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode static_maxpos -./scripts/run_compare_speed_in_container.sh --backends infinilm --target_input_tokens 16384 --max_new_tokens 1 --infinilm_attn_backend default --infinilm_cache_mode paged --infinilm_paged_block_size 256 -``` - -### Profiling methodology (nsys) for kernel attribution (HF vs InfiniLM prefill) - -**Goal**: attribute the 16k prefill gap to kernel families (attention vs GEMMs vs layout/copies/sync), using the same prompt and a prefill-dominated workload. - -**Environment**: all profiling commands in this section are run **inside the container `minicpm-sala`** (not on the host), so that PyTorch, InfiniCore, and the model path are available. 
Use `docker exec -it minicpm-sala bash` or the host script `./scripts/profile_prefill_torchprof_in_container.sh` to run in-container. - -**Workload** - -- HF: forward-only prefill (`--hf_mode forward_prefill`, `--max_new_tokens 1`) -- InfiniLM: prefill-dominated generation (`--target_input_tokens 16384 --max_new_tokens 1`) - -**Key requirements** - -- Use a free GPU to avoid allocator failures and noisy traces, e.g. `CUDA_VISIBLE_DEVICES=1`. -- Prefer `nsys stats` reports: - - `cuda_gpu_kern_sum` - - `cuda_gpu_mem_time_sum` - - `cuda_api_sum` - - `nvtx_sum` - -**Example (inside container `minicpm-sala`)** - -```bash -export CUDA_VISIBLE_DEVICES=1 -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -OUT=${REPO}/profiles -mkdir -p ${OUT} - -source /app/docker/nvidia/env-set.sh 2>/dev/null || true -export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} - -# HF forward-only prefill (single forward, best for kernel attribution) -nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ - -o ${OUT}/hf_forward_prefill_16k \ - python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ - --model_path "${MODEL}" --prefill_16k --backends hf \ - --hf_mode forward_prefill --hf_forward_use_cache \ - --hf_forward_warmup 1 --hf_forward_iters 1 \ - --hf_attn_implementation flash_attention_2 - -# InfiniLM prefill-dominated (max_new_tokens=1) -nsys profile --force-overwrite=true --trace=cuda,nvtx,osrt \ - -o ${OUT}/infinilm_prefill_16k \ - python3 ${REPO}/InfiniLM/examples/compare_inference_speed.py \ - --model_path "${MODEL}" --prefill_16k --backends infinilm \ - --infinilm_cache_mode static_fit --infinilm_attn_backend default - -# Summaries -nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_kern_sum.txt -nsys stats --report cuda_gpu_kern_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > 
${OUT}/infinilm_prefill_16k_cuda_gpu_kern_sum.txt -nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_gpu_mem_time_sum.txt -nsys stats --report cuda_gpu_mem_time_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_gpu_mem_time_sum.txt -nsys stats --report cuda_api_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_cuda_api_sum.txt -nsys stats --report cuda_api_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_cuda_api_sum.txt -nsys stats --report nvtx_sum --format table ${OUT}/hf_forward_prefill_16k.nsys-rep > ${OUT}/hf_forward_prefill_16k_nvtx_sum.txt -nsys stats --report nvtx_sum --format table ${OUT}/infinilm_prefill_16k.nsys-rep > ${OUT}/infinilm_prefill_16k_nvtx_sum.txt -``` - -### Prefill kernel launch reduction: SiLU/SwiGLU evidence and change - -**Evidence that SiLU/SwiGLU contributed to launch count** - -- Prefill profiling (e.g. `profile_prefill_infinilm_torchprof.py` at seq_len=512) showed ~298k `cudaLaunchKernel` and many small **elementwise** kernels (~36k calls). The MLP path used two separate InfiniCore ops per layer for SwiGLU: - - `infinicore::op::silu_(gate, gate)` — one kernel per layer - - `infinicore::op::mul(gate, up)` — one kernel per layer -- With 32 layers that is **64 extra launches** from this pattern alone. InfiniCore provides a **fused** `swiglu(a, b)` (single kernel: `a * b * sigmoid(b)`), which matches SwiGLU as `silu(gate)*up` when called as `swiglu(up, gate)`. - -**Change applied** - -- **File**: `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp` -- **Before**: `silu_(gate, gate)` then `mul(gate, up)` (two kernel launches per layer). -- **After**: single `infinicore::op::swiglu(up, gate)` (one kernel per layer). -- **Effect**: 32 fewer kernel launches per prefill (one per layer). 
Re-run the same prefill profiler or nsys commands above and compare `cuda_api_sum` (e.g. `cudaLaunchKernel` count) and `cuda_gpu_kern_sum` to confirm. - -### Environment fix: run InfiniLM/InfiniCore with InfLLM-v2 without LD_PRELOAD (nsys-safe) - -When profiling with `nsys`, setting `LD_PRELOAD` to the `infllm_v2` extension can break `nsys` itself (loader errors from PyTorch's `libtorch_python.so`). To make `nsys profile ... python ...` work reliably, we preload the InfLLM-v2 `.so` **inside Python** (RTLD_GLOBAL) before importing `infinicore`, so that `libinfinicore_cpp_api.so` can resolve `mha_varlen_fwd` / `mha_fwd_kvcache` without using `LD_PRELOAD`. - -- **Note**: InfLLM-v2 is now linked normally via InfiniCore build; no Python-side preload helper is required. -- **Wired into scripts** (preload before `import infinicore`): - - `InfiniLM/examples/compare_inference_speed.py` - - `InfiniLM/examples/profile_prefill_infinilm_torchprof.py` - - `InfiniLM/examples/minicpm_sala_logits_sanity.py` - -This unblocks running both torchprof and `nsys profile` inside the `minicpm-sala` container with a consistent environment. - -### 16k prefill nsys numbers (post env-fix) - -**Workload:** `--prefill_16k` (prompt tokens 16386), `--max_new_tokens 1`, `--infinilm_cache_mode static_fit`, `--infinilm_attn_backend default` - -- **HF forward-only prefill** (from `compare_inference_speed.py`): `total_time_ms ≈ 1782.58` for 16386 tokens. -- **HF forward-only prefill (rerun)** (from `compare_inference_speed.py`): `total_time_ms = 1757.21`, `prefill_throughput_tok_s = 9325.01` for 16386 tokens. -- **InfiniLM prefill-dominated** (from `compare_inference_speed.py`): `prefill_ttft_ms ≈ 55646.11` (baseline run) and `prefill_ttft_ms ≈ 57623.64` (rerun after minor code changes). 
- -**InfiniLM 16k CUDA API summary** (nsys `cuda_api_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_api_sum.txt`): - -- `cudaLaunchKernel`: **3,147,266 calls** -- `cudaMemcpyAsync`: **394,155 calls** - -Top GPU kernels by time (nsys `cuda_gpu_kern_sum`, baseline run `profiles/infinilm_prefill_16k_cuda_gpu_kern_sum.txt`) show very high call counts tied to the Lightning Simple GLA path: - -- Several `at::native::*elementwise_kernel*` entries at **393,264 instances each** (exactly `16386 * 24`), indicating a large per-token kernel launch budget in the current GLA implementation. - -**Prefill profiling: run inside container `minicpm-sala`** - -All profiling commands below are intended to run **inside the container** (so PyTorch, InfiniCore, and the model are available). From the host you can either `docker exec -it minicpm-sala bash` and run the commands, or use the helper script that runs the torchprof prefill script in-container. - -- **Launch-count confirmation (torchprof, in-container)** - - From repo root on host: - - ```bash - ./scripts/profile_prefill_torchprof_in_container.sh - ``` - - Optional env: `SEQ_LEN=512` (default), `ACTIVE=1`, `MODEL_PATH`, `CUDA_VISIBLE_DEVICES`, `INFINILM_CUDA_INDEX`. The script prints `[launch_summary] cudaLaunchKernel_count=... cudaMemcpy_count=...` and the kernel table; compare after the SwiGLU fusion to confirm ~32 fewer launches per prefill. 
- - Or inside the container: - - ```bash - source /app/docker/nvidia/env-set.sh 2>/dev/null || true - export PYTHONPATH=${REPO}/InfiniLM/python:${REPO}/InfiniCore/python:${PYTHONPATH} - cd ${REPO}/InfiniLM - INFINILM_CUDA_INDEX=0 python3 examples/profile_prefill_infinilm_torchprof.py --model_path "${MODEL}" --seq_len 512 --active 1 --out /tmp/torchprof_prefill_512.txt - ``` - -- **nsys prefill profiling** (see “Example (inside container minicpm-sala)” above) also runs in-container; use the same `REPO`, `MODEL`, `source env-set.sh`, and `PYTHONPATH` before `nsys profile` and `nsys stats`. diff --git a/examples/metrics_longtext_mem.md b/examples/metrics_longtext_mem.md deleted file mode 100644 index 28fe8f33..00000000 --- a/examples/metrics_longtext_mem.md +++ /dev/null @@ -1,378 +0,0 @@ -### MiniCPM-SALA long-context metrics + memory history - -**Goal**: record reproducible long-context runs with: - -- **time** (prefill TTFT / throughput) -- **peak GPU memory** (from 1s `nvidia-smi` polling) -- exact **command lines** and key env - -**Notes** - -- All commands are intended to run **inside** docker container `minicpm-sala`. -- Prefer an **idle** GPU (avoid indices that are already near full VRAM). Scan on the host (or `docker exec minicpm-sala nvidia-smi ...` if all GPUs are visible there): - `nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits` - Then set `export CUDA_VISIBLE_DEVICES=` for the run and, for scripts that poll VRAM (e.g. `collect_metrics_longtext_decode.py`), set `NVML_GPU_INDEX=` to the **same** index. Example when GPUs 2–4 are mostly free: `CUDA_VISIBLE_DEVICES=2` and `NVML_GPU_INDEX=2`. -- For InfiniLM + InfLLM-v2 builds, `libinfinicore_cpp_api.so` may require preloading `infllm_v2` with `RTLD_GLOBAL` before importing `infinicore`. - -### OOM-safe sweep: one case per process - -Running every long-context case in a **single** Python session can leave CUDA memory fragmented or peak across cases. 
Prefer `**run_longtext_metrics_cases.sh`**, which runs each `(backend × target × max_new)` as its **own** `python3 collect_metrics_longtext_decode.py --case ...` subprocess, appends one JSON line per row to `profiling_runs/longtext_decode_rows.jsonl`, then prints a markdown table via `--from-jsonl`. - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -export CUDA_VISIBLE_DEVICES=2 -export NVML_GPU_INDEX=2 -export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -export METRICS_DATE=2026-03-23 -cd $REPO/InfiniLM/examples -./run_longtext_metrics_cases.sh -``` - -Single case manually: `python3 collect_metrics_longtext_decode.py --case hf:16384` or `infinilm_rec:65536:1`. Monolithic (unsafe) full matrix: `python3 collect_metrics_longtext_decode.py --all-in-process`. - -### Clean & Validate Status (post-cleanup: 2026-03-23) - -- Clean: removed unused debug helper `log_tensor_stats_to_file_if_enabled` and deprecated metrics padded-decode cases; `collect_metrics_longtext_decode.py` + `run_longtext_metrics_cases.sh` no longer sweep `infinilm_pad:*`. -- Validate: rebuilt `_infinilm`, ran `InfiniCore/test/infinicore/ops/test_simple_gla_decode_recurrent.py --nvidia`, `test_simple_gla_prefill.py --nvidia`, and `InfiniLM/examples/minicpm_sala_logits_sanity.py` in `prefill` + `decode1`; confirmed `collect_metrics_longtext_decode.py --case infinilm_pad:*` is rejected with an "Unknown case kind" error. - ---- - -## 2026-03-23 long-context + decode (`longtext_decode_rows.jsonl`) - -Subprocess sweep via `./run_longtext_metrics_cases.sh`. **GPU:** `CUDA_VISIBLE_DEVICES=0`, `NVML_GPU_INDEX=0`. **Targets:** `METRICS_TARGETS=16384,32768`. **Decode steps:** `METRICS_DECODE_STEPS=32`. Recurrent InfiniLM uses `INFINI_LIGHTNING_GLA_RECURRENT_DECODE=1` with batched GLA state sync (GEMM). HF `total_ms` is end-to-end (prefill + decode), matching InfiniLM. 
`hf (decode_loop)` rows for `max_new=32` are appended via `hf::32`. Regenerate this table: `python3 collect_metrics_longtext_decode.py --from-jsonl profiling_runs/longtext_decode_rows.jsonl` - - -| date | backend | target_in | max_new | peak_mem_mib | total_ms | prefill_ttft_ms | prefill_tok_s | decode_itl_ms | decode_tok_s | gpu | -| ---------- | ------------------------------------------------ | --------- | ------- | ------------ | -------- | --------------- | ------------- | ------------- | ------------ | --- | -| 2026-03-23 | hf (decode_loop) | 16384 | 1 | 38101 | 1821.53 | — | — | — | — | 0 | -| 2026-03-23 | hf (decode_loop) | 32768 | 1 | 51545 | 3711.99 | — | — | — | — | 0 | -| 2026-03-23 | hf (decode_loop) | 16384 | 32 | 38365 | 3435.09 | — | — | 52.24 | 19.14 | 0 | -| 2026-03-23 | hf (decode_loop) | 32768 | 32 | 41717 | 5247.77 | — | — | 52.90 | 18.90 | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 16384 | 1 | 33525 | 3162.11 | 3161.5 | 5182.98 | — | — | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA decode) | 32768 | 1 | 44897 | 7139.12 | 7138.74 | 4590.45 | — | — | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 16384 | 32 | 33537 | 4111.32 | 3182.07 | 5149.48 | 29.94 | 33.4 | 0 | -| 2026-03-23 | infinilm (static_fit, recurrent GLA, +32 decode) | 32768 | 32 | 44911 | 8357.39 | 7146.78 | 4585.28 | 39 | 25.64 | 0 | - - ---- - -## History table - - -| date | backend | target_input_tokens | max_new_tokens | cache_mode | peak_mem_mib | total_time_ms | prefill_ttft_ms | prefill_throughput_tok_s | gpu | -| ---------- | --------------------------------------------- | ------------------- | -------------- | ---------- | ------------ | ------------- | --------------- | ------------------------ | --- | -| 2026-03-18 | hf | 16384 | 1 | — | 38091 | 1757.21 | — | 9325.01 | 2 | -| 2026-03-19 | hf | 16384 | 1 | — | 38091 | 1760.08 | — | 9311.48 | 2 | -| 2026-03-18 | hf | 32768 | 1 | — | 41173 | 3537.65 | — | 9263.22 | 2 
| -| 2026-03-19 | hf | 32768 | 1 | — | 41151 | 3516.06 | — | 9319.51 | 2 | -| 2026-03-19 | infinilm(baseline) | 16384 | 1 | static_fit | 33570 | 2849.22 | 2849.03 | 5751.44 | 0 | -| 2026-03-19 | infinilm(baseline) | 32768 | 1 | static_fit | 44174 | 5960.41 | 5960.14 | 5498.19 | 0 | -| 2026-03-19 | infinilm(baseline) | 65536 | 1 | static_fit | 67195 | 13929.51 | 13929.12 | 4705.11 | 4 | -| 2026-03-19 | hf (consistent-batch) | 16384 | 1 | — | 38091 | 1782.63 | — | 9192.04 | 4 | -| 2026-03-19 | hf (consistent-batch) | 32768 | 1 | — | 41173 | 3585.96 | — | 9138.42 | 4 | -| 2026-03-19 | hf (consistent-batch) | 65536 | 1 | — | 47319 | 7426.98 | — | 8824.32 | 4 | -| 2026-03-19 | infinilm (consistent-batch) | 16384 | 1 | static_fit | 32605 | 2887.28 | 2887.06 | 5675.67 | 4 | -| 2026-03-19 | infinilm (consistent-batch) | 32768 | 1 | static_fit | 43209 | 6005.78 | 6005.57 | 5456.60 | 4 | -| 2026-03-19 | infinilm (consistent-batch) | 65536 | 1 | static_fit | 67195 | 13940.17 | 13939.90 | 4701.47 | 4 | -| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 32768 | 1 | static_fit | 38613 | 5993.70 | 5993.45 | 5467.64 | 4 | -| 2026-03-19 | infinilm (exp2/3 opt: strided KV + GLA views) | 65536 | 1 | static_fit | 67195 | 13959.08 | 13958.78 | 4695.11 | 4 | -| 2026-03-19 | infinilm(baseline) | 131072 | 1 | static_fit | 79883 | OOM | — | — | 6 | -| 2026-03-18 | hf | 524288 | 1 | — | 59591 | OOM | — | — | 3 | -| 2026-03-18 | hf | 65536 | 1 | — | 47319 | 7340.99 | — | 8927.67 | 1 | -| 2026-03-18 | hf | 131072 | 1 | — | 61641 | 15290.39 | — | 8572.31 | 1 | -| 2026-03-18 | hf | 262144 | 1 | — | 80059 | OOM | — | — | 1 | - - ---- - -## 2026-03-19 consistent batch summary (GPU 4, 1s polling) - -Protocol used for both backends: - -- same physical GPU (`CUDA_VISIBLE_DEVICES=4`), same model, `max_new_tokens=1` -- same target lengths: 16k / 32k / 64k -- memory measured from 1s `nvidia-smi -i 4 --query-gpu=memory.used` polling -- HF path: `--hf_mode forward_prefill 
--hf_forward_use_cache --hf_forward_warmup 1 --hf_forward_iters 1` -- InfiniLM path: `--infinilm_inprocess --infinilm_cache_mode static_fit` - -### Growth deltas (16k->32k and 32k->64k) - -TTFT note: HF forward-prefill does not emit TTFT; `total_time_ms` is used as prefill-time proxy for HF deltas. - - -| backend | 16k->32k mem delta (MiB) | 32k->64k mem delta (MiB) | 16k->32k time delta (ms) | 32k->64k time delta (ms) | -| --------------------- | ------------------------ | ------------------------ | ------------------------ | ------------------------ | -| hf (forward-prefill) | +3082 | +6146 | +1803.33 | +3841.02 | -| infinilm (static_fit) | +10604 | +23986 | +3118.51 (TTFT) | +7934.33 (TTFT) | - - -### Attribution profiling (InfiniLM 32k / 64k) - -Artifacts are saved in `InfiniLM/examples/profiling_runs`: - -- allocator logs: `alloc_infinilm_32768_gpu4.log`, `alloc_infinilm_65536_gpu4.log` -- nsys logs: `nsys_infinilm_32768_gpu4.log`, `nsys_infinilm_65536_gpu4.log` - -Allocator observations (`INFINICORE_DEBUG_ALLOC=1`): - -- both runs show identical small/medium allocation patterns (e.g., many `32 MiB` and `128 MiB` class allocations), suggesting these are mostly fixed/runtime-structural. -- 64k introduces substantially larger "large" allocations than 32k (examples in logs include `12.0 GiB`, `9.0 GiB`, and `2.0 GiB`-class requests), consistent with context-length-driven persistent KV slab growth. -- 32k large allocations are present but markedly smaller (e.g., `~6.0 GiB`, `~4.5 GiB`, `~1.0 GiB`), aligning with lower persistent cache footprint. - -Nsight Systems observations (`nsys profile --trace=cuda,nvtx,osrt --stats=true`): - -- NVTX `infinilm_generate` range scales from `~6.18s` (32k) to `~14.17s` (64k), matching TTFT growth. 
-- CUDA API summary becomes more memcpy-dominated at 64k: - - 32k: `cudaMemcpy ~64.6%`, `cudaMemcpyAsync ~33.0%` - - 64k: `cudaMemcpy ~83.0%`, `cudaMemcpyAsync ~15.7%` -- GPU kernel summary shows both attention and GLA prefill kernels scaling up: - - `flash_fwd_kernel` total: `~1.03s` -> `~4.09s` - - `simple_gla_prefill_chunked_kernel` total: `~1.24s` -> `~2.45s` - -Attribution confidence: - -- **High**: persistent KV/cache-related allocations are the primary memory-growth driver from 32k to 64k. -- **Medium**: transient prefill compute/workspace growth contributes, but is secondary vs persistent slabs for memory. -- **Medium**: synchronization/memcpy behavior is a major TTFT growth contributor at 64k. - -### Short-context decode profiling (Nsight Systems, vs HF) - -**Artifacts** (under `InfiniLM/examples/profiling_runs/`): - -- HF manual decode: `nsys_decode_hf_tok256_gpu4.log` (`--hf_mode decode_loop`, short prompt, `max_new_tokens=256`). -- InfiniLM generate: `nsys_decode_infinilm_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_tok256_gpu4.log`, `nsys_decode_infinilm_nvtx_opt_tok256_gpu4.log` (same prompt / 256 new tokens; NVTX ranges from `infer_engine.generate`). -- Post–`write_i32`/`write_i64` rebuild (2026-03-20, GPU 4): `nsys_decode_infinilm_tok256_gpu4_pybind_run.log` (failed: stale `_infinicore` without `write_i32`), `nsys_decode_infinilm_tok256_gpu4_pybind_run2.log` + `decode_infinilm_tok256_gpu4_pybind_run2.nsys-rep` (**good** after `install.py` + `xmake build/install _infinicore` in container). Script `compare_inference_speed.py` preloads InfLLM-v2 (`RTLD_GLOBAL`) so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd`; bare `python -c import infinicore` without that preload can show an undefined-symbol error. - -**NVTX (InfiniLM)** — use these ranges in the Nsight UI / `nsys stats` to isolate prefill vs steady decode: - -- `infinilm_prefill_step` — first `generate` iteration. -- `infinilm_decode_total` — spans decode iterations 1..N-1 (opened on iter 1). 
-- `infinilm_decode_step` — one range per token step (high instance count). -- `infinilm_generate` — full `engine.generate()` call. - -**HF**: `hf_decode_loop` wraps the timed decode loop (prefill is outside this range). - -**Headline comparison** (same GPU, 256 decode steps, short prompt; numbers from the logs above): - - -| Metric (CUDA API sum) | HF `decode_loop` | InfiniLM `generate` | -| ------------------------ | ------------------- | ------------------- | -| `cudaLaunchKernel` calls | ~593k | ~7.44M | -| ~calls / decode step | ~2.3k | ~29k | -| `cudaMemcpyAsync` calls | lower than InfiniLM | ~988k | - - -**Memcpy time** (`cuda_gpu_mem_time_sum`): InfiniLM decode shows large **H2D** wall share (~63% of memcpy time in one run) with **many** small transfers; HF decode shows **fewer** H2D operations but they can dominate memcpy time when they occur. - -**Interpretation**: InfiniLM short decode is limited less by a single kernel and more by **per-step framework overhead** (launch count + small copies). Next wins are structural (fewer launches per token, true decode KV path, graph/capture where safe), not scalar metadata alone. - -**Continuing profiling — repro commands** (inside `minicpm-sala`, pick idle `GPU`; outputs go to `profiling_runs/`): - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -GPU=4 -export CUDA_VISIBLE_DEVICES=$GPU -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniLM/examples - -TAG=decode_infinilm_tok256_gpu${GPU} -nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ - python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --prompt "Write a short haiku about GPUs." 
\ - --max_new_tokens 256 \ - --backends infinilm \ - --no_hf \ - --infinilm_inprocess \ - --infinilm_cache_mode static_fit \ - 2>&1 | tee profiling_runs/nsys_${TAG}.log - -# Optional (InfiniLM decode): reduce D2H / Python overhead and A/B CPU metadata tensors -# export INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE=1 -# export INFINI_PROFILE_COLLECT_OUTPUT_IDS=0 -# export INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 # force per-step from_list() metadata vs reusable CPU+write_i* fast path - -TAG=decode_hf_tok256_gpu${GPU} -nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ - python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --prompt "Write a short haiku about GPUs." \ - --max_new_tokens 256 \ - --backends hf \ - --no_infinilm \ - --hf_mode decode_loop \ - --hf_decode_warmup 8 \ - --hf_decode_iters 1 \ - --hf_attn_implementation flash_attention_2 \ - 2>&1 | tee profiling_runs/nsys_${TAG}.log -``` - -**Long-context decode** (optional): add e.g. `--target_input_tokens 32768` to either command so NVTX still tags prefill vs decode; expect traces to be large. - -**Prefill-only nsys** (matches earlier 32k/64k attribution): - -```bash -TAG=infinilm_prefill_32768_gpu${GPU} -nsys profile --trace=cuda,nvtx,osrt --stats=true -o profiling_runs/${TAG} --force-overwrite true \ - python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --target_input_tokens 32768 \ - --max_new_tokens 1 \ - --backends infinilm \ - --no_hf \ - --infinilm_inprocess \ - --infinilm_cache_mode static_fit \ - 2>&1 | tee profiling_runs/nsys_${TAG}.log -``` - -After code changes (e.g. pybind metadata path), re-run the **same** `TAG` with a suffix (`_run2`) and diff `cuda_api_sum` / `cuda_gpu_kern_sum` / NVTX tables. - -### Ranked next optimization experiments (minimal changes) - -1. **Constrain/reshape persistent KV growth first** -Expected impact: High memory reduction, likely best leverage on 32k->64k slope. 
-Minimal experiment: compare `static_fit` vs `paged` (small block sizes, e.g., 128/256) at 32k/64k and re-measure peaks + TTFT. -2. **Reduce transient prefill movement/workspace** -Expected impact: Medium TTFT gain, small-to-medium memory relief. -Minimal experiment: isolate `simple_gla_prefill` transform/workspace path and reduce extra copies/format conversions; confirm via reduced `cudaMemcpy` share in nsys. -3. **Trim synchronization/copy overhead around prefill** -Expected impact: Medium TTFT gain at long context. -Minimal experiment: profile before/after removing avoidable sync points or host-device transfers in attention/prefill orchestration; success criterion is lower `cudaMemcpy` wall share with unchanged logits. - -Applied (2026-03-19): removed `permute(...)->contiguous()` materialization for KV cache update and GLA prefill inputs in `minicpm_sala_attention.cpp` (pass strided views). -Result: 32k peak memory improved on GPU 4 (**43209 MiB → 38613 MiB**) with similar TTFT; 64k peak unchanged (dominated by persistent KV slabs). - -Validation gate for each experiment: - -- **Operator unit tests (CUDA) first** — InfLLM-v2 + Simple GLA prefill (see below). Failing ops almost always mean wasted time on full-model logits debugging. -- run `minicpm_sala_logits_sanity.py` (prefill mode) and compare ratio/max_diff/mean_diff against current baseline. -- run one prompt generation sanity and verify no functional regression. - ---- - -## Commands (repro) - -### InfiniCore operator tests (run before logits sanity) - -MiniCPM-SALA stack depends on `infllmv2_varlen` / `infllmv2_kvcache` and `simple_gla_prefill`. 
Run these inside `minicpm-sala` with `InfiniLM/python` on `PYTHONPATH` so InfLLM-v2 preloads before `import infinicore`: - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -export CUDA_VISIBLE_DEVICES=1 -export PYTHONPATH=$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniCore/test/infinicore/ops - -python3 test_infllmv2_attention.py --nvidia -python3 test_simple_gla_prefill.py --nvidia -``` - -One-liner wrapper (same env assumptions as the repo): - -```bash -bash $REPO/InfiniLM/examples/run_infinicore_ops_before_logits.sh -``` - -### Logits correctness gate (HF vs InfiniLM) - -Run (inside `minicpm-sala`) to sanity-check HF vs InfiniLM prefill logits on a short prompt: - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -export CUDA_VISIBLE_DEVICES=1 -export HF_CUDA_INDEX=0 -export INFINILM_CUDA_INDEX=0 -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniLM/examples - -python3 minicpm_sala_logits_sanity.py \ - --model_path "$MODEL" \ - --mode prefill \ - --prompt "How are you? Tell me a short joke." \ - --k 10 -``` - -Recorded output (2026-03-18, GPU=1): - -```text -SANITY_ONELINE ratio=0.9889 max_diff=0.1875 mean_diff=0.0682 -``` - -`--mode decode1` (prefill + one decode step): **prefill section** should match the prefill-only run. The **decode** section should now be finite (the previous `NaN` issue was traced to the CUDA embedding kernel leaving outputs uninitialized for out-of-range indices). Correctness can still diverge from HF for longer prompts due to decode/KV/attention parity work; treat **prefill** as the strongest HF parity gate for now. 
- -### GPU scan - -```bash -nvidia-smi --query-gpu=index,name,memory.used,memory.total --format=csv,noheader,nounits -``` - -### HF-only prefill (32k) with 1s memory polling - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -export CUDA_VISIBLE_DEVICES=2 -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -cd $REPO/InfiniLM/examples - -python3 compare_inference_speed.py \ - --model_path "$MODEL" \ - --target_input_tokens 32768 \ - --max_new_tokens 1 \ - --backends hf \ - --hf_mode forward_prefill \ - --hf_forward_use_cache \ - --hf_forward_warmup 1 \ - --hf_forward_iters 1 \ - --hf_attn_implementation flash_attention_2 \ - & pid=$! - -echo "[mem] polling physical GPU 2 while pid=$pid" -while kill -0 $pid 2>/dev/null; do - date +"%F %T" - nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits - sleep 1 -done -wait $pid -``` - -### InfiniLM-only (32k) with InfLLM-v2 preload + 1s memory polling - -```bash -REPO=/home/zenghua/workspace/minicpm-sala-support -MODEL=/data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA -export CUDA_VISIBLE_DEVICES=2 -export PYTHONPATH=$REPO/InfiniLM/python:$REPO/InfiniCore/python:${PYTHONPATH:-} -export LD_LIBRARY_PATH=/root/.infini/lib:$REPO/InfiniLM/build/linux/x86_64/release:${LD_LIBRARY_PATH:-} -cd $REPO/InfiniLM/examples - -python3 - <<'PY' & pid=$! 
-import ctypes, os, runpy, sys -ctypes.CDLL("/usr/local/lib/python3.12/dist-packages/infllm_v2/C.cpython-312-x86_64-linux-gnu.so", mode=ctypes.RTLD_GLOBAL) -sys.argv = [ - "compare_inference_speed.py", - "--model_path", os.environ["MODEL"], - "--target_input_tokens", "32768", - "--max_new_tokens", "1", - "--backends", "infinilm", - "--no_hf", - "--infinilm_inprocess", - "--infinilm_cache_mode", "static_fit", -] -runpy.run_path("compare_inference_speed.py", run_name="__main__") -PY - -echo "[mem] polling physical GPU 2 while pid=$pid" -while kill -0 $pid 2>/dev/null; do - date +"%F %T" - nvidia-smi -i 2 --query-gpu=memory.used,memory.total --format=csv,noheader,nounits - sleep 1 -done -wait $pid -``` - diff --git a/examples/run_infinicore_ops_before_logits.sh b/examples/run_infinicore_ops_before_logits.sh deleted file mode 100755 index 5a93fe11..00000000 --- a/examples/run_infinicore_ops_before_logits.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash -# InfiniCore CUDA operator smoke tests for MiniCPM-SALA-related ops. -# Run inside minicpm-sala docker before deeper HF-vs-InfiniLM alignment probes. 
-set -euo pipefail - -REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" -export PYTHONPATH="$REPO/InfiniCore/test/infinicore:$REPO/InfiniCore/python:$REPO/InfiniLM/python:${PYTHONPATH:-}" -export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" - -OPS_DIR="$REPO/InfiniCore/test/infinicore/ops" -cd "$OPS_DIR" - -echo "[run_infinicore_ops] REPO=$REPO" -echo "[run_infinicore_ops] test_infllmv2_attention.py --nvidia" -python3 test_infllmv2_attention.py --nvidia -echo "[run_infinicore_ops] test_simple_gla_prefill.py --nvidia" -python3 test_simple_gla_prefill.py --nvidia -echo "[run_infinicore_ops] OK" diff --git a/examples/run_longtext_metrics_cases.sh b/examples/run_longtext_metrics_cases.sh deleted file mode 100755 index dd595c7b..00000000 --- a/examples/run_longtext_metrics_cases.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash -# Run each longtext/decode metric case in a **separate** Python process to release CUDA -# memory between runs (reduces OOM when sweeping 16k/32k/64k × HF + InfiniLM). 
-# -# Usage (inside minicpm-sala, after picking an idle GPU): -# export CUDA_VISIBLE_DEVICES=2 -# export NVML_GPU_INDEX=2 -# export REPO=/home/zenghua/workspace/minicpm-sala-support -# export PYTHONPATH=$REPO/InfiniLM/examples:$REPO/InfiniCore/python:$REPO/InfiniLM/python -# export LD_LIBRARY_PATH=/root/.infini/lib:${LD_LIBRARY_PATH:-} -# export METRICS_DATE=2026-03-23 -# cd $REPO/InfiniLM/examples && ./run_longtext_metrics_cases.sh -# -# Optional: -# METRICS_TARGETS=16384,32768 METRICS_DECODE_STEPS=32 ./run_longtext_metrics_cases.sh -# SLEEP_BETWEEN_SEC=3 # extra pause between subprocesses - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -REPO="${REPO:-/home/zenghua/workspace/minicpm-sala-support}" -export PYTHONPATH="${SCRIPT_DIR}:${REPO}/InfiniCore/python:${REPO}/InfiniLM/python:${PYTHONPATH:-}" -export LD_LIBRARY_PATH="/root/.infini/lib:${LD_LIBRARY_PATH:-}" - -: "${CUDA_VISIBLE_DEVICES:=0}" -: "${NVML_GPU_INDEX:=${CUDA_VISIBLE_DEVICES}}" -: "${METRICS_DATE:=2026-03-23}" -: "${METRICS_DECODE_STEPS:=32}" -: "${METRICS_TARGETS:=16384,32768,65536}" -: "${SLEEP_BETWEEN_SEC:=2}" - -OUT_JSONL="${OUT_JSONL:-${SCRIPT_DIR}/profiling_runs/longtext_decode_rows.jsonl}" -mkdir -p "$(dirname "$OUT_JSONL")" -rm -f "$OUT_JSONL" -echo "[run_longtext_metrics] jsonl -> $OUT_JSONL GPU smi index=$NVML_GPU_INDEX" - -IFS=',' read -r -a TARGETS <<< "$METRICS_TARGETS" - -run_one() { - local c="$1" - echo "[run_longtext_metrics] case=$c" - python3 collect_metrics_longtext_decode.py --case "$c" --append-jsonl "$OUT_JSONL" || true - sleep "${SLEEP_BETWEEN_SEC}" -} - -for t in "${TARGETS[@]}"; do - run_one "hf:${t}" -done -for t in "${TARGETS[@]}"; do - run_one "infinilm_rec:${t}:1" -done -for t in "${TARGETS[@]}"; do - run_one "infinilm_rec:${t}:${METRICS_DECODE_STEPS}" -done - -echo "[run_longtext_metrics] merged table:" -python3 collect_metrics_longtext_decode.py --from-jsonl "$OUT_JSONL" From 8f85cb726cdd99423a07a416059aac7f619de00b 
Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:45:25 +0000 Subject: [PATCH 04/11] revert server Signed-off-by: Ceng23333 <441651826@qq.com> --- .../infinilm/server/chat_message_normalize.py | 76 ------------------- python/infinilm/server/inference_server.py | 34 ++++++++- 2 files changed, 31 insertions(+), 79 deletions(-) delete mode 100644 python/infinilm/server/chat_message_normalize.py diff --git a/python/infinilm/server/chat_message_normalize.py b/python/infinilm/server/chat_message_normalize.py deleted file mode 100644 index 04afe176..00000000 --- a/python/infinilm/server/chat_message_normalize.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Normalize OpenAI-style chat messages before HuggingFace chat_template. - -Kept separate from ``inference_server`` so this logic can be smoke-tested without -loading InfiniCore / CUDA (see ``__main__`` block). -""" - - -def normalize_openai_messages_for_hf_template(messages: list) -> list: - """Strip lm-eval ``type: text`` wrappers; flatten multimodal text parts. - - lm-eval ``local-chat-completions`` with ``tokenized_requests=False`` JSON-encodes - each turn with an extra top-level ``"type": "text"`` (see ``TemplateAPI.apply_chat_template`` - in lm-eval). HuggingFace ``--model hf`` passes plain ``{role, content}`` dicts into - ``apply_chat_template``. Stripping unknown keys keeps server templating aligned with - the HF harness for text-only tasks. 
- """ - normalized: list = [] - for msg in messages: - if not isinstance(msg, dict): - normalized.append(msg) - continue - - role = msg.get("role") - if role is None: - normalized.append(msg) - continue - - content = msg.get("content") - if isinstance(content, list): - text_parts: list[str] = [] - for part in content: - if isinstance(part, dict): - if part.get("type") == "text" and "text" in part: - text_parts.append(part["text"]) - elif isinstance(part, str): - text_parts.append(part) - elif isinstance(part, str): - text_parts.append(part) - merged = "".join(text_parts) if text_parts else "" - core = {"role": role, "content": merged} - if msg.get("name") is not None: - core["name"] = msg["name"] - normalized.append(core) - elif isinstance(content, str): - core = {"role": role, "content": content} - if msg.get("name") is not None: - core["name"] = msg["name"] - normalized.append(core) - else: - normalized.append(msg) - - return normalized - - -if __name__ == "__main__": - # Smoke test (no InfiniCore): run as - # python3 -m infinilm.server.chat_message_normalize - lm_eval_style = [ - {"role": "system", "content": "sys", "type": "text"}, - {"role": "user", "content": "hi", "type": "text"}, - ] - out = normalize_openai_messages_for_hf_template(lm_eval_style) - assert out == [{"role": "system", "content": "sys"}, {"role": "user", "content": "hi"}], out - mm = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "a"}, - {"type": "text", "text": "b"}, - ], - } - ] - assert normalize_openai_messages_for_hf_template(mm) == [ - {"role": "user", "content": "ab"} - ] - print("chat_message_normalize: ok") diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 8c361c4e..b5c49247 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -17,7 +17,6 @@ from fastapi.responses import JSONResponse, StreamingResponse from infinilm.llm import AsyncLLMEngine, 
SamplingParams, FinishReason -from infinilm.server.chat_message_normalize import normalize_openai_messages_for_hf_template logger = logging.getLogger(__name__) @@ -267,8 +266,37 @@ async def list_models_legacy(): return _models_payload() def _normalize_messages(self, messages: list) -> list: - """Delegate to :func:`normalize_openai_messages_for_hf_template`.""" - return normalize_openai_messages_for_hf_template(messages) + """Normalize messages to handle multimodal content (list format). + + Converts content from list format [{"type": "text", "text": "..."}] + to string format for chat template compatibility. + """ + normalized = [] + for msg in messages: + if not isinstance(msg, dict): + normalized.append(msg) + continue + + content = msg.get("content") + if isinstance(content, list): + # Extract text from multimodal content list + text_parts = [] + for part in content: + if isinstance(part, dict): + if part.get("type") == "text" and "text" in part: + text_parts.append(part["text"]) + elif isinstance(part, str): + text_parts.append(part) + elif isinstance(part, str): + text_parts.append(part) + # Join all text parts + normalized_msg = msg.copy() + normalized_msg["content"] = "".join(text_parts) if text_parts else "" + normalized.append(normalized_msg) + else: + normalized.append(msg) + + return normalized def _build_sampling_params(self, data: dict) -> SamplingParams: """Build SamplingParams from request data.""" From 0d98e759b69e884e1dec99ed00d8829bf79b1980 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 01:47:42 +0000 Subject: [PATCH 05/11] revert some code Signed-off-by: Ceng23333 <441651826@qq.com> --- include/infinicore_infer/cache.h | 5 ----- include/infinicore_infer/weights_loader.h | 5 ----- 2 files changed, 10 deletions(-) diff --git a/include/infinicore_infer/cache.h b/include/infinicore_infer/cache.h index 5f691c64..522f2235 100644 --- a/include/infinicore_infer/cache.h +++ b/include/infinicore_infer/cache.h @@ -3,11 
+3,6 @@ #include -#ifndef __INFINI_C -// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. -#define __INFINI_C __C -#endif - __INFINI_C __export struct KVCache *createKVCache( size_t nlayers, size_t max_len, diff --git a/include/infinicore_infer/weights_loader.h b/include/infinicore_infer/weights_loader.h index 057c3a1b..82eafe59 100644 --- a/include/infinicore_infer/weights_loader.h +++ b/include/infinicore_infer/weights_loader.h @@ -3,11 +3,6 @@ #include -#ifndef __INFINI_C -// Compat: older InfiniCore headers use `__C` instead of `__INFINI_C`. -#define __INFINI_C __C -#endif - struct ModelWeights; __INFINI_C __export void From 0583ab5be7b134a1aa0c708c0c1266b5b5928f24 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 03:21:44 +0000 Subject: [PATCH 06/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- ...minicpm_sala_allocate_kv_cache_tensors.cpp | 4 +- .../minicpm_sala/minicpm_sala_attention.cpp | 133 +++++++++--------- .../minicpm_sala/minicpm_sala_attention.hpp | 26 +--- .../minicpm_sala_decoder_layer.cpp | 26 ++-- .../minicpm_sala_decoder_layer.hpp | 5 +- .../minicpm_sala_for_causal_lm.cpp | 28 +++- .../minicpm_sala_for_causal_lm.hpp | 8 +- .../minicpm_sala/minicpm_sala_model.cpp | 85 +---------- .../minicpm_sala/minicpm_sala_model.hpp | 12 +- 9 files changed, 120 insertions(+), 207 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp b/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp index f4cb3b55..3ad0b506 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_allocate_kv_cache_tensors.cpp @@ -32,7 +32,7 @@ std::vector minicpm_sala_allocate_kv_cache_tensors(const cac const size_t num_key_value_heads = text_config->get("num_key_value_heads"); const size_t max_position_embeddings = text_config->get("max_position_embeddings"); - const auto &dtype{text_config->get_dtype()}; 
+ const auto &dtype{text_config->get_kv_cache_dtype()}; std::vector mixer_types = text_config->get>("mixer_types"); size_t current_layer_head_dim, current_layer_num_key_value_heads; for (size_t layer_idx = 0; layer_idx < num_hidden_layers; ++layer_idx) { @@ -70,7 +70,7 @@ std::vector minicpm_sala_allocate_kv_cache_tensors(const cac const size_t head_dim = text_config->get("head_dim"); const size_t num_key_value_heads = text_config->get("num_key_value_heads"); - const auto &dtype{text_config->get_dtype()}; + const auto &dtype{text_config->get_kv_cache_dtype()}; std::vector mixer_types = text_config->get>("mixer_types"); size_t current_layer_head_dim, current_layer_num_key_value_heads; for (size_t layer_idx = 0; layer_idx < num_hidden_layers; ++layer_idx) { diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index 001122e4..f437f9e9 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -7,6 +7,7 @@ #include "infinicore/ops/simple_gla_prefill.hpp" #include "infinicore/ops/simple_gla_recurrent_state_append.hpp" #include "infinicore/context/context.hpp" +#include "../../global_state/global_state.hpp" #include "../debug_utils/tensor_utils.hpp" #include @@ -18,6 +19,35 @@ namespace infinilm::models::minicpm_sala { namespace { + +// Per-layer KV tensor layout from `StaticKVCache::create_layer_kv_cache`: [2, B, n_kv, max_len, D]. 
+void minicpm_sala_update_layer_kv_tensor(infinicore::Tensor &kv_bundle, + const infinicore::Tensor &k_permuted, + const infinicore::Tensor &v_permuted, + const infinicore::Tensor &past_sequence_lengths) { + auto k_cache_layer = kv_bundle->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_bundle->narrow({{0, 1, 1}})->squeeze(0); + +#ifdef ENABLE_KV_CACHING + infinicore::op::kv_caching_( + k_cache_layer, + v_cache_layer, + k_permuted, + v_permuted, + past_sequence_lengths); +#else + const size_t cache_pos = static_cast( + reinterpret_cast(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0]); + const size_t update_len = k_permuted->size(2); + const size_t result_len = cache_pos + update_len; + if (result_len > k_cache_layer->size(2)) { + throw std::runtime_error("MiniCPMSALAAttention: KV cache length exceeded"); + } + k_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(k_permuted); + v_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(v_permuted); +#endif +} + // Same as HF MiniCPM-SALA _build_slope_tensor (used for Simple GLA decay). std::vector build_slope_tensor(size_t n) { auto get_slopes_power_of_2 = [](size_t n) -> std::vector { @@ -105,35 +135,6 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr(1.0 / std::sqrt(static_cast(head_dim_))); - // StaticKVCache is allocated as a compact slab per cache type: - // - minicpm4-cache stores only layers where mixer_types[i] == "minicpm4" - // - lightning-cache stores only layers where mixer_types[i] != "minicpm4" - // - // Compute this attention instance's local cache index (0-based) from its - // absolute layer_idx_. - { - bool this_is_minicpm4_cache = (mixer_type == "minicpm4"); - std::vector mixer_types; - try { - mixer_types = model_config_->get>("mixer_types"); - } catch (...) { - mixer_types.assign(model_config_->get("num_hidden_layers"), "minicpm4"); - } - // Be defensive if mixer_types size mismatches. 
- if (mixer_types.size() != model_config_->get("num_hidden_layers")) { - mixer_types.resize(model_config_->get("num_hidden_layers"), "minicpm4"); - } - size_t count = 0; - for (size_t i = 0; i <= layer_idx_ && i < mixer_types.size(); ++i) { - const bool is_minicpm4_layer = (mixer_types[i] == "minicpm4"); - if (is_minicpm4_layer == this_is_minicpm4_cache) { - ++count; - } - } - // layer_idx_ is always a valid layer, so count should be >= 1. - cache_layer_idx_ = count > 0 ? (count - 1) : 0; - } - // HyPE: RoPE in lightning layers, NoPE in sparse (minicpm4) layers. // We treat all non-minicpm4 as "linear" (lightning-attn) for M1 dense fallback. use_rope_ = (mixer_type != "minicpm4") && model_config_->get_or("lightning_use_rope", true); @@ -176,7 +177,7 @@ void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - (void)input_offsets; - (void)block_tables; - (void)slot_mapping; - return forward_dense_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, cu_seqlens); -} - -infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional cu_seqlens) const { +infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const { + const auto &attn_meta = infinilm::global_state::get_forward_context().attn_metadata; + auto past_sequence_lengths = attn_meta.past_sequence_lengths; + auto total_sequence_lengths = attn_meta.total_sequence_lengths; + auto cu_seqlens = attn_meta.cu_seqlens; + // input_offsets/block_tables/slot_mapping are not used in this 
dense/per-layer-kv implementation yet. + (void)cu_seqlens; // Input: [B, S, H] auto shape = hidden_states->shape(); const size_t batch_size = shape[0]; @@ -277,22 +265,28 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor auto k_permuted = k_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] auto v_permuted = v_reshaped->permute({0, 2, 1, 3})->contiguous(); // [B, n_kv, S, D] - // HF-like dense KV caching using the engine-provided StaticKVCache. + // Per-layer KV tensors in `global_state::get_forward_context().kv_cache_vec` (same pattern as + // `InfinilmModel::reset_cache` / `StaticAttentionImpl`). infinicore::Tensor k_total = k_permuted; infinicore::Tensor v_total = v_permuted; - std::shared_ptr static_kv_cache = nullptr; - if (kv_cache != nullptr && has_cache_meta) { - static_kv_cache = std::dynamic_pointer_cast(kv_cache); - if (!static_kv_cache) { - throw std::runtime_error("MiniCPMSALAAttention: Unsupported cache type (expected StaticKVCache)"); + bool use_forward_kv = false; + if (has_cache_meta) { + auto &kv_vec = infinilm::global_state::get_forward_context().kv_cache_vec; + if (layer_idx_ >= kv_vec.size()) { + throw std::runtime_error( + "MiniCPMSALAAttention: forward_context.kv_cache_vec is unset or too small (call reset_cache / align layer count)"); } - // Default behavior: update cache here. For minicpm4 decode we may override and let InfLLM-v2 update. 
- auto [k_cached, v_cached] = static_kv_cache->update( - cache_layer_idx_, k_permuted, v_permuted, past_sequence_lengths.value()); - k_total = k_cached; - v_total = v_cached; + use_forward_kv = true; + minicpm_sala_update_layer_kv_tensor( + kv_vec[layer_idx_], + k_permuted, + v_permuted, + past_sequence_lengths.value()); + auto k_cache_layer = kv_vec[layer_idx_]->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_vec[layer_idx_]->narrow({{0, 1, 1}})->squeeze(0); + k_total = k_cache_layer; + v_total = v_cache_layer; } else { - // No cache metadata => treat as prefill-only. total_seq_len = seq_len; } @@ -339,7 +333,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor // Lightning fast decode: maintain recurrent state locally (do NOT depend on StaticKVCache extensions). // We rebuild state on-demand if it is out-of-sync with cache_pos. - const bool is_decode = has_cache_meta && static_kv_cache && (seq_len == 1) && (total_seq_len > 1); + const bool is_decode = has_cache_meta && use_forward_kv && (seq_len == 1) && (total_seq_len > 1); if (is_decode) { ensure_gla_state_allocated(gla_state_, q_bthd->device(), batch_size, n_h, head_dim_); @@ -416,13 +410,12 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor "MiniCPMSALAAttention(minicpm4): total_sequence_lengths is required for InfLLM-v2 path"); } // `infllmv2_kvcache` expects the number of valid K/V entries in the - // provided cache tensors. Since we already appended the current - // token via StaticKVCache::update, the valid length is the total - // KV length (past + current token). + // provided cache tensors. After per-layer KV update, valid length is + // total KV length (past + current token). const auto cache_lens = total_sequence_lengths.value(); // Prefill: InfLLM-v2 varlen (Q and K packed lengths match `seq_len == total_seq_len` here). 
- // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after StaticKVCache::update + // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after KV tensor update // (valid KV length == `total_seq_len`). Using varlen for decode (1 query vs long K) hit NaNs // in practice for modest sequence lengths; kvcache matches operator tests and Flash path. const bool force_varlen_decode = [&]() { @@ -465,7 +458,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor /*window_size_left=*/window_left, /*window_size_right=*/window_right); attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); - } else if (static_kv_cache) { + } else if (use_forward_kv) { if (batch_size != 1) { throw std::runtime_error("MiniCPMSALAAttention(minicpm4): kvcache decode path currently requires batch_size=1"); } @@ -490,7 +483,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor {batch_size, seq_len, num_attention_heads_ * head_dim_}); } else { throw std::runtime_error( - "MiniCPMSALAAttention(minicpm4): decode requires StaticKVCache (missing cache metadata or cache)"); + "MiniCPMSALAAttention(minicpm4): decode requires KV cache (missing cache metadata or kv_cache_vec)"); } } catch (const std::exception &e) { throw std::runtime_error( diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 37dab7ec..d11a6037 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -30,26 +30,13 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); - infinicore::Tensor forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional 
past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; + // Match `infinilm::layers::attention::Attention` API: metadata is pulled from + // `global_state::get_forward_context().attn_metadata`. + infinicore::Tensor forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const; void set_rotary_emb(const std::shared_ptr &rotary_emb); - void reset_cache(); - -private: - infinicore::Tensor forward_dense_(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional cu_seqlens) const; + void reset_state(); protected: // Projections (HF-aligned naming) @@ -72,9 +59,6 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { engine::distributed::RankInfo rank_info_; size_t layer_idx_; - // Layer index remapped into the cache instance (minicpm4-cache vs lightning-cache). - // StaticKVCache allocates a compact [num_layers, ...] slab per cache type. 
- size_t cache_layer_idx_ = 0; size_t hidden_size_; size_t num_attention_heads_; size_t num_key_value_heads_; diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index 391b626b..feacb3d3 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,5 +1,6 @@ #include "minicpm_sala_decoder_layer.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -38,10 +39,6 @@ void MiniCPMSALADecoderLayer::set_rotary_emb(const std::shared_ptrset_rotary_emb(rotary_emb); } -void MiniCPMSALADecoderLayer::reset_cache() { - self_attn_->reset_cache(); -} - infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids, std::shared_ptr kv_cache, @@ -51,18 +48,19 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi std::optional cu_seqlens, std::optional block_tables, std::optional slot_mapping) const { + // Match `layers/attention/Attention`: stash attention metadata in global forward context. 
+ infinilm::global_state::get_forward_context().attn_metadata = + infinilm::global_state::AttentionMetadata(past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + // Pre-norm attention auto hs1 = input_layernorm_->forward(hidden_states); - auto attn_out = self_attn_->forward( - hs1, - position_ids, - kv_cache, - past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); + (void)kv_cache; + auto attn_out = self_attn_->forward(position_ids, hs1); // residual + scale_down * attn_out (MuP) auto ones_attn = infinicore::Tensor::empty(attn_out->shape(), attn_out->dtype(), attn_out->device()); diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 948e4d97..094e8650 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -17,6 +17,8 @@ namespace infinilm::models::minicpm_sala { +class MiniCPMSALAModel; + class MiniCPMSALADecoderLayer : public infinicore::nn::Module { public: MiniCPMSALADecoderLayer(std::shared_ptr model_config, @@ -37,9 +39,10 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { std::optional slot_mapping) const; void set_rotary_emb(const std::shared_ptr &rotary_emb); - void reset_cache(); private: + friend class MiniCPMSALAModel; + double residual_scale_ = 1.0; size_t layer_idx_ = 0; diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index 74ea4f9a..bcbb9f6f 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -1,6 +1,7 @@ #include "minicpm_sala_for_causal_lm.hpp" #include "../models_registry.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include #include @@ -8,6 +9,11 @@ namespace 
infinilm::models::minicpm_sala { +std::vector minicpm_sala_allocate_kv_cache_tensors( + const cache::CacheConfig *cache_config, + const std::shared_ptr &text_config, + const backends::AttentionBackend &attention_backend); + std::shared_ptr create_minicpm_sala_model_config( std::shared_ptr model_config) { const std::string &model_type = model_config->get("model_type"); @@ -23,6 +29,7 @@ MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( engine::distributed::RankInfo rank_info, backends::AttentionBackend attention_backend) { device_ = device; + model_config_ = model_config; // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config->get_dtype(); @@ -62,12 +69,23 @@ MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( } void MiniCPMSALAForCausalLM::reset_cache(const cache::CacheConfig *cache_config) { + // Match `InfinilmModel::reset_cache`: own `cache_config_` + `kv_cache_vec` here; inner model only + // resets per-layer attention state. MiniCPM uses `minicpm_sala_allocate_kv_cache_tensors` instead of + // `default_allocate_kv_cache_tensors`. 
+ if (cache_config == nullptr) { + cache_config_.reset(); + infinilm::global_state::get_forward_context().kv_cache_vec.clear(); + model_->reset_state(); + return; + } cache_config_ = cache_config->unique_copy(); - model_->reset_cache(cache_config_.get()); -} - -const cache::CacheConfig *MiniCPMSALAForCausalLM::get_cache_config() const { - return cache_config_.get(); + auto &kv_cache_vec = infinilm::global_state::get_forward_context().kv_cache_vec; + kv_cache_vec.clear(); + const backends::AttentionBackend attention_backend = + infinilm::global_state::get_infinilm_config().attention_backend; + kv_cache_vec = std::move( + minicpm_sala_allocate_kv_cache_tensors(cache_config, model_config_, attention_backend)); + model_->reset_state(); } } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index 33305b23..9344dfd3 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -6,9 +6,9 @@ #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" #include "../../backends/attention_backends.hpp" +#include "../../layers/linear/linear.hpp" #include "infinicore/device.hpp" -#include "infinicore/nn/linear.hpp" namespace infinilm::models::minicpm_sala { @@ -26,12 +26,9 @@ class MiniCPMSALAForCausalLM : public InfinilmModel { void reset_cache(const cache::CacheConfig *cache_config) override; - const cache::CacheConfig *get_cache_config() const override; - private: INFINICORE_NN_MODULE(MiniCPMSALAModel, model); - INFINICORE_NN_MODULE(infinicore::nn::Linear, lm_head); - std::unique_ptr cache_config_; + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, lm_head); }; } // namespace infinilm::models::minicpm_sala @@ -42,4 +39,3 @@ std::shared_ptr create_minicpm_sala_model_config( std::shared_ptr model_config); } // namespace 
infinilm::models::minicpm_sala - diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index 6fd00bfe..de63831a 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace infinilm::models::minicpm_sala { @@ -34,15 +33,8 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrget("rms_norm_eps"), dtype, device); - // Shared rotary embedding (used by lightning layers only) - INFINICORE_NN_MODULE_INIT(rotary_emb, - model_config_->get_head_dim(), - model_config_->get("max_position_embeddings"), - model_config_->get("rope_theta"), - infinicore::nn::RoPE::Algo::GPT_NEOX, - dtype, - device, - model_config_->get_rope_scaling()); + // Shared rotary embedding (used by lightning layers only) — match `get_rope` pattern. + rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); // Mixer types per-layer decide attention flavor (minicpm4 vs lightning-attn). std::vector mixer_types; @@ -54,7 +46,6 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrreset_cache(); - } - return; - } - - if (auto static_cfg = dynamic_cast(cache_config)) { - // Allocate separate caches by KV shape to avoid per-layer padding copies. - const size_t num_hidden_layers = model_config_->get("num_hidden_layers"); - // mixer_types_ is filled in ctor from model_config_->get("mixer_types"). - const size_t minicpm4_layer_count = - !mixer_types_.empty() ? 
std::count(mixer_types_.begin(), mixer_types_.end(), "minicpm4") : num_hidden_layers; - const size_t lightning_layer_count = num_hidden_layers - minicpm4_layer_count; - - const size_t base_kv_heads = model_config_->get("num_key_value_heads"); - const size_t base_head_dim = model_config_->get("head_dim"); - const size_t lightning_kv_heads = model_config_->get_or("lightning_nkv", base_kv_heads); - const size_t lightning_head_dim = model_config_->get_or("lightning_head_dim", base_head_dim); - - kv_cache_minicpm4_ = (minicpm4_layer_count > 0) - ? std::make_shared( - /*k_dim=*/base_head_dim, - /*v_dim=*/base_head_dim, - /*num_k_heads=*/base_kv_heads, - /*num_v_heads=*/base_kv_heads, - /*num_layers=*/minicpm4_layer_count, - /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), - /*dtype=*/model_config_->get_dtype(), - *static_cfg, - rank_info_) - : nullptr; - - kv_cache_lightning_ = (lightning_layer_count > 0) - ? std::make_shared( - /*k_dim=*/lightning_head_dim, - /*v_dim=*/lightning_head_dim, - /*num_k_heads=*/lightning_kv_heads, - /*num_v_heads=*/lightning_kv_heads, - /*num_layers=*/lightning_layer_count, - /*max_positional_embedding=*/model_config_->get("max_position_embeddings"), - /*dtype=*/model_config_->get_dtype(), - *static_cfg, - rank_info_) - : nullptr; - } else { - // This refactor implements HF-like dense caching only. 
- throw std::runtime_error("MiniCPMSALAModel::reset_cache: Unsupported cache type (expected StaticKVCacheConfig)"); - } - +void MiniCPMSALAModel::reset_state() { for (auto &layer : layers_) { - layer->reset_cache(); + layer->self_attn_->reset_state(); } } @@ -134,29 +73,15 @@ infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids auto hs = embed_tokens_->forward(input_ids); for (size_t i = 0; i < layers_.size(); ++i) { - std::shared_ptr layer_cache; - if (!mixer_types_.empty() && mixer_types_[i] == "minicpm4") { - layer_cache = kv_cache_minicpm4_; - } else { - layer_cache = kv_cache_lightning_; - } hs = layers_[i]->forward(hs, position_ids, - layer_cache, + nullptr, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping); - if (const char *env = std::getenv("MINICPM_SALA_LAYER_TRACE")) { - if (env[0] != '\0' && env[0] != '0') { - fprintf(stderr, "[minicpm_sala][layer_trace] layer=%zu mixer=%s\n", - i, - mixer_types_.empty() ? 
"unknown" : mixer_types_[i].c_str()); - fflush(stderr); - } - } } hs = norm_->forward(hs); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index d360dd3e..93f0a7e7 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -3,10 +3,11 @@ #include "minicpm_sala_decoder_layer.hpp" #include "../../backends/attention_backends.hpp" -#include "../../cache/kv_cache.hpp" +#include "../../cache/cache.hpp" #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" +#include "../../layers/rotary_embedding/rotary_embedding.hpp" #include "infinicore/nn/embedding.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" @@ -35,7 +36,7 @@ class MiniCPMSALAModel : public infinicore::nn::Module { std::optional block_tables, std::optional slot_mapping) const; - void reset_cache(const cache::CacheConfig *cache_config); + void reset_state(); size_t hidden_size() const { return hidden_size_; } double dim_model_base() const { return dim_model_base_; } @@ -44,17 +45,12 @@ class MiniCPMSALAModel : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); INFINICORE_NN_MODULE_VEC(MiniCPMSALADecoderLayer, layers); INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); - INFINICORE_NN_MODULE(infinicore::nn::RoPE, rotary_emb); private: std::shared_ptr model_config_; + std::shared_ptr rotary_emb_; engine::distributed::RankInfo rank_info_; backends::AttentionBackend attention_backend_; - // MiniCPM-SALA is hybrid: minicpm4 vs lightning layers can have different KV shapes. - // Use two StaticKVCache instances to avoid per-layer padding/copies during long prefill. 
- std::shared_ptr kv_cache_minicpm4_; - std::shared_ptr kv_cache_lightning_; - std::vector mixer_types_; infinicore::Device compute_device_; size_t hidden_size_; From e11223f1a7c4deaae1e3fcbc12ab1c6a5da2fb2e Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 06:12:39 +0000 Subject: [PATCH 07/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- .../minicpm_sala/minicpm_sala_attention.cpp | 5 +-- .../minicpm_sala/minicpm_sala_attention.hpp | 2 +- .../minicpm_sala_decoder_layer.cpp | 24 +------------- .../minicpm_sala_decoder_layer.hpp | 12 +------ .../minicpm_sala_for_causal_lm.cpp | 4 ++- .../minicpm_sala/minicpm_sala_model.cpp | 31 +++++++++---------- .../minicpm_sala/minicpm_sala_model.hpp | 7 ++--- python/infinilm/infer_engine.py | 16 ++-------- python/infinilm/llm/llm.py | 9 +----- python/infinilm/llm/static_scheduler.py | 7 ----- 10 files changed, 27 insertions(+), 90 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index f437f9e9..f36b84c5 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -138,6 +138,7 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptrget_or("lightning_use_rope", true); + rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); // MiniCPM-SALA uses QK-norm and output gates by default. use_qk_norm_ = model_config_->get_or("qk_norm", true) && (mixer_type != "minicpm4"); @@ -173,10 +174,6 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptrto(device); } -void MiniCPMSALAAttention::set_rotary_emb(const std::shared_ptr &rotary_emb) { - rotary_emb_ = rotary_emb; -} - void MiniCPMSALAAttention::reset_state() { // KV tensors are maintained by the shared engine cache (StaticKVCache). // Lightning decode recurrent state is maintained locally for performance. 
diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index d11a6037..2013d678 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -4,6 +4,7 @@ #include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" +#include "../../layers/rotary_embedding/rotary_embedding.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -35,7 +36,6 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { infinicore::Tensor forward(const infinicore::Tensor &position_ids, const infinicore::Tensor &hidden_states) const; - void set_rotary_emb(const std::shared_ptr &rotary_emb); void reset_state(); protected: diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index feacb3d3..7a44704e 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,6 +1,5 @@ #include "minicpm_sala_decoder_layer.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -35,31 +34,10 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr &rotary_emb) { - self_attn_->set_rotary_emb(rotary_emb); -} - infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - // Match `layers/attention/Attention`: stash attention metadata in global forward context. 
- infinilm::global_state::get_forward_context().attn_metadata = - infinilm::global_state::AttentionMetadata(past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); - + const infinicore::Tensor &position_ids) const { // Pre-norm attention auto hs1 = input_layernorm_->forward(hidden_states); - (void)kv_cache; auto attn_out = self_attn_->forward(position_ids, hs1); // residual + scale_down * attn_out (MuP) diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 094e8650..44d320c9 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -4,7 +4,6 @@ #include "minicpm_sala_mlp.hpp" #include "../../backends/attention_backends.hpp" -#include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" @@ -29,16 +28,7 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); infinicore::Tensor forward(const infinicore::Tensor &hidden_states, - const infinicore::Tensor &position_ids, - std::shared_ptr kv_cache, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; - - void set_rotary_emb(const std::shared_ptr &rotary_emb); + const infinicore::Tensor &position_ids) const; private: friend class MiniCPMSALAModel; diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index bcbb9f6f..fb55556f 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -33,7 +33,9 @@ MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( // Match 
parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config->get_dtype(); - INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info, attention_backend); + (void)rank_info; + (void)attention_backend; + INFINICORE_NN_MODULE_INIT(model, model_config, device); const size_t hidden_size = model_config->get("hidden_size"); const size_t vocab_size = model_config->get("vocab_size"); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index de63831a..f6d9bb4d 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -12,16 +12,14 @@ namespace infinilm::models::minicpm_sala { MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) - : model_config_(std::move(model_config)), - rank_info_(rank_info), - attention_backend_(attention_backend) { + const infinicore::Device &device) + : model_config_(std::move(model_config)) { // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config_->get_dtype(); compute_device_ = device; + const engine::distributed::RankInfo &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info(); + const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; hidden_size_ = model_config_->get("hidden_size"); dim_model_base_ = model_config_->get_or("dim_model_base", static_cast(hidden_size_)); @@ -50,8 +48,7 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrregister_module( - "layers." + std::to_string(i), model_config_, device, i, mixer_types[i], rank_info_, attention_backend_)); - layers_.back()->set_rotary_emb(rotary_emb_); + "layers." 
+ std::to_string(i), model_config_, device, i, mixer_types[i], rank_info, attention_backend)); } } @@ -69,19 +66,19 @@ infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids std::optional cu_seqlens, std::optional block_tables, std::optional slot_mapping) const { + infinilm::global_state::get_forward_context().attn_metadata = + infinilm::global_state::AttentionMetadata(past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + // MuP scaling baked into weights at load time for minicpm_sala; no forward scaling here. auto hs = embed_tokens_->forward(input_ids); for (size_t i = 0; i < layers_.size(); ++i) { - hs = layers_[i]->forward(hs, - position_ids, - nullptr, - past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); + hs = layers_[i]->forward(hs, position_ids); } hs = norm_->forward(hs); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index 93f0a7e7..9b4a81c2 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -8,6 +8,7 @@ #include "../../engine/distributed/distributed.hpp" #include "../../layers/rotary_embedding/rotary_embedding.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/nn/embedding.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" @@ -23,9 +24,7 @@ namespace infinilm::models::minicpm_sala { class MiniCPMSALAModel : public infinicore::nn::Module { public: MiniCPMSALAModel(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + const infinicore::Device &device); infinicore::Tensor forward(const infinicore::Tensor &input_ids, const infinicore::Tensor &position_ids, @@ 
-49,8 +48,6 @@ class MiniCPMSALAModel : public infinicore::nn::Module { private: std::shared_ptr model_config_; std::shared_ptr rotary_emb_; - engine::distributed::RankInfo rank_info_; - backends::AttentionBackend attention_backend_; infinicore::Device compute_device_; size_t hidden_size_; diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index 6552227c..9046b790 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -164,19 +164,9 @@ def generate( dtype=infinicore.int32, ) - # Decode metadata fast path (batch=1, static cache): - # avoid per-step from_list()/numpy allocations for tiny scalar tensors. - # Those tensors live on CPU and are H2D-copied each forward; for profiling - # comparisons vs `from_list` device metadata, set: - # INFINI_PROFILE_DISABLE_FAST_DECODE_META=1 - disable_fast_decode_meta = os.environ.get( - "INFINI_PROFILE_DISABLE_FAST_DECODE_META", "0" - ) not in ("", "0", "false", "False") - fast_decode_meta = ( - (not self.enable_paged_attn) - and (initial_batch_size == 1) - and not disable_fast_decode_meta - ) + # Decode metadata fast path (batch=1, static cache): avoid per-step from_list() allocations + # for tiny scalar tensors (these live on CPU and are H2D-copied each forward). 
+ fast_decode_meta = (not self.enable_paged_attn) and (initial_batch_size == 1) if fast_decode_meta: cpu = infinicore.device("cpu", 0) diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index e07e2155..07ada981 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -95,15 +95,8 @@ def __init__(self, config: EngineConfig): ) # Load model weights - dtype_map = { - "float16": infinicore.float16, - "bfloat16": infinicore.bfloat16, - "float32": infinicore.float32, - } load_model_state_dict_by_file( - self.model_engine, - config.model_path, - dtype=dtype_map.get(config.dtype, self.model_engine.config.dtype), + self.model_engine, config.model_path, dtype=self.model_engine.config.dtype ) # Initialize tokenizer diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py index 25d64ae5..860bf7b9 100644 --- a/python/infinilm/llm/static_scheduler.py +++ b/python/infinilm/llm/static_scheduler.py @@ -115,11 +115,6 @@ def __init__(self, max_cache_len: int = 4096): self.max_cache_len = max_cache_len self.cached_block_hashes: List[int] = [] self.pending_block_hashes: List[int] = [] - # Safety switch: disable cross-request prefix reuse when investigating - # corrupted/contaminated generations. 
- self.disable_prefix_reuse = os.getenv( - "INFINILM_STATIC_DISABLE_PREFIX_REUSE", "0" - ) in ("1", "true", "True", "yes", "on") def add_request(self, request: InferenceRequest): if request is not None: @@ -219,8 +214,6 @@ def schedule(self) -> Optional[StaticSchedulerOutput]: num_full_blocks = prompt_len // _BLOCK_SIZE matched = 0 - if self.disable_prefix_reuse and self.cached_block_hashes: - self.cached_block_hashes.clear() self.pending_block_hashes.clear() for i in range(num_full_blocks): From 33eb78dd781bd4e3ae50ae83131c64bdf482879e Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 06:27:15 +0000 Subject: [PATCH 08/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- MINICPM_SALA_BUILD_AND_CHANGES.md | 244 ---------------- MiniCPM_SALA_alignment_progress.md | 359 ------------------------ examples/jiuge.py | 16 +- python/infinilm/llm/llm.py | 1 - python/infinilm/llm/static_scheduler.py | 1 - python/infinilm/modeling_utils.py | 20 +- 6 files changed, 12 insertions(+), 629 deletions(-) delete mode 100644 MINICPM_SALA_BUILD_AND_CHANGES.md delete mode 100644 MiniCPM_SALA_alignment_progress.md diff --git a/MINICPM_SALA_BUILD_AND_CHANGES.md b/MINICPM_SALA_BUILD_AND_CHANGES.md deleted file mode 100644 index 1ec53fad..00000000 --- a/MINICPM_SALA_BUILD_AND_CHANGES.md +++ /dev/null @@ -1,244 +0,0 @@ -# MiniCPM-SALA on InfiniLM: Build Guide and Change Summary - -This document describes the changes in **InfiniCore** and **InfiniLM** from their baseline commits to support MiniCPM-SALA with InfLLM-v2, the **prerequisites**, and a **step-by-step build and run guide**. With these changes, `InfiniLM/examples/jiuge.py` produces **reasonable MiniCPM-SALA generation output** when run with the correct environment. 
- -**Baseline commits (for reference):** - -- **InfiniLM:** `main` -- **InfiniCore:** `5fc85c8b1e6728839993f1b743a525a066da585f` - -To see the exact diff from baseline: -`git diff 5fc85c8b1e6728839993f1b743a525a066da585f -- InfiniCore` and -`git diff main -- InfiniLM`. - ---- - -## 1. Changes in InfiniCore (from `5fc85c8b1e6728839993f1b743a525a066da585f`) - -InfiniCore was extended to **wire InfLLM-v2** (Stage-2 sparse attention) so that when built with `--infllmv2=y`, the C++ API calls `mha_varlen_fwd` and `mha_fwd_kvcache` from the infllmv2_cuda_impl .so. - -### 1.1 New or modified files (summary) - -| Area | Path | Purpose | -|------|------|--------| -| API (decl) | `include/infinicore/ops/infllmv2_api.hpp` | Declares `mha_varlen_fwd`, `mha_fwd_kvcache` (must be provided by infllmv2 .so at link/runtime). | -| API (decl) | `include/infinicore/ops/infllmv2_attention.hpp` | Public op header for infllmv2 attention. | -| Ops impl | `src/infinicore/ops/infllmv2_attention/infllmv2_attention.cc` | Implements `infllmv2_varlen` and `infllmv2_kvcache` by calling the above APIs when `ENABLE_INFLLMV2` and `ENABLE_ATEN` are set. | -| Pybind | `src/infinicore/pybind11/ops/infllmv2_attention.hpp` | Exposes infllmv2 ops to Python. | -| Pybind | `src/infinicore/pybind11/ops.hpp` | Includes infllmv2 op bindings. | -| Python | `python/infinicore/ops/infllmv2_attention.py` | Python wrapper for `infllmv2_varlen` / `infllmv2_kvcache`. | -| Python | `python/infinicore/__init__.py` | Exports `infllmv2_varlen`, `infllmv2_kvcache`. | -| Build | `xmake.lua` | New option `--infllmv2=y`; when set with `--aten=y`, defines `ENABLE_INFLLMV2` and links/rpath to the auto-detected .so. | -| Test | `test/infinicore/ops/test_infllmv2_attention.py` | Unit tests for infllmv2 varlen/kvcache (skipped if not built or no CUDA). | -| Example | `examples/infllmv2_sanity.py` | Sanity script for InfLLM-v2 (skips if .so absent or no CUDA). 
| - -### 1.2 Build option - -- **Option:** `infllmv2` (enable InfLLM-v2; xmake auto-detects `infllm_v2/*.so` under `InfiniCore/third_party/infllmv2_cuda_impl/build/...`). -- **Requires:** `aten=y` (InfiniCore must be built with PyTorch/ATen). -- **Effect:** Defines `ENABLE_INFLLMV2`, adds link and rpath to the auto-detected infllmv2 .so. At runtime, `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` / `mha_fwd_kvcache` from that .so (via `LD_LIBRARY_PATH` or `LD_PRELOAD`). - ---- - -## 2. Changes in InfiniLM (from `main`) - -InfiniLM was extended to support the **MiniCPM-SALA** model (embedding, layers, attention, MLP, LM head) and to use InfiniCore (including InfLLM-v2 when available) for inference. - -### 2.1 New or modified files (summary) - -| Area | Path | Purpose | -|------|------|--------| -| C++ model | `csrc/models/minicpm_sala/*.cpp`, `*.hpp` | MiniCPM-SALA model: `minicpm_sala_attention`, `minicpm_sala_decoder_layer`, `minicpm_sala_model`, `minicpm_sala_for_causal_lm`, `minicpm_sala_mlp`. Per-layer dense KV cache; lightning (GLA) and optional InfLLM-v2 (minicpm4) attention paths. | -| C++ factory | `csrc/models/model_factory.cpp` | Registers MiniCPM-SALA model type. | -| Config | `python/infinilm/auto_config.py` | MiniCPM-SALA config handling. | -| Weights | `python/infinilm/modeling_utils.py` | MiniCPM-SALA weight loading (MuP scaling, etc.). | -| Examples | `examples/jiuge.py` | Generic InferEngine generation script; docstring updated with env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) for MiniCPM-SALA. | -| Examples | `examples/minicpm_sala_logits_sanity.py` | HF vs InfiniLM logits sanity (prefill/decode1/decodeN); single-token decode for correct KV cache; one-prompt output comparison. | -| Examples | `examples/modeling_minicpm_sala.py` | HF-side MiniCPM-SALA modeling (reference). | -| Docs | `MiniCPM_SALA_alignment_progress.md` | Alignment and debugging notes. 
| - -### 2.2 Behaviour notes - -- **Attention:** Layer 0 (minicpm4) can use compiled InfLLM-v2 when InfiniCore is built with `--infllmv2=y` and the .so is preloaded; other layers use lightning (GLA) path. -- **Attention overhead optimizations:** In `minicpm_sala_attention.cpp`: (1) sequence lengths are read in one place when both `past_sequence_lengths` and `total_sequence_lengths` are present (`has_cache_meta`), avoiding duplicate logic; (2) Q/K/V use a single `contiguous()->view` chain after projections; (3) lightning path builds `q_bthd` via one `permute->contiguous` from `q_perm`; (4) sparse path uses `q_perm` directly (already contiguous) and only calls `contiguous()` on K/V when repeating heads. Semantics and logits are unchanged. -- **KV cache:** Decode must use **single-token input** per step; passing the full sequence each step would misalign the per-layer KV cache (see sanity script). -- **Engine / KV cache config:** MiniCPM-SALA uses per-layer dense KV cache in C++; the engine’s `cache_config` is used only for scheduling (e.g. `past_sequence_lengths` / `total_sequence_lengths`). **Static cache** is recommended (default in `jiuge.py` when not passing `--enable-paged-attn`). For static, `jiuge.py` sets `max_cache_len = max(initial_capacity, max_position_embeddings)` when `model_type == "minicpm_sala"` so long contexts are supported without re-alloc. - ---- - -## 3. Prerequisites - -### 3.1 System and toolchain - -- **OS:** Linux. -- **Python:** 3.12 recommended (match the infllmv2 .so and InfiniCore pybind ABI). -- **CUDA:** 11.6+ (e.g. 12.x); `nvcc` in `PATH` (e.g. via `CUDA_HOME=/usr/local/cuda` and `PATH=$CUDA_HOME/bin:$PATH`). -- **C++:** GCC (e.g. `CC=gcc CXX=g++`) for infllmv2_cuda_impl and InfiniCore. -- **xmake:** For building InfiniCore (install from https://xmake.io or use a project-provided path). 
-- **PyTorch:** Installed in the same Python env used to build infllmv2 and to run InfiniLM (InfiniCore with `aten=y` links against this PyTorch’s libs). - -### 3.2 Python environment - -Use a **single venv** (or env) that has: - -- `torch` -- `transformers` -- `triton` (e.g. 3.2.0; for MiniCPM-SALA HF path; if CUDA 12.8, a small patch may be needed for Triton’s `ptx_get_version` or use a Triton version that supports 12.8) -- `flash-linear-attention` (or HF deps for MiniCPM-SALA) -- Other InfiniLM/InfiniCore runtime deps - -Build **infllmv2_cuda_impl** and **InfiniCore** with this same Python (and thus same PyTorch ABI). - -### 3.3 Repo layout - -- **minicpm-sala-support** (repo root) contains: - - **InfiniCore/** — InfiniCore with InfLLM-v2 wiring. - - **InfiniLM/** — InfiniLM with MiniCPM-SALA. - - **InfiniCore/third_party/infllmv2_cuda_impl/** — InfLLM-v2 CUDA kernel implementation (provides `mha_varlen_fwd`, `mha_fwd_kvcache`). - ---- - -## 4. Build Guide - -### 4.1 Build InfLLM-v2 (infllmv2_cuda_impl) - -This produces the `.so` that provides `mha_varlen_fwd` and `mha_fwd_kvcache`. InfiniCore must be built with a PyTorch/ABI-compatible env (same Python/torch as here). - -1. **From repo root:** - ```bash - cd InfiniCore/third_party/infllmv2_cuda_impl - ``` -2. **Submodules:** - ```bash - git submodule update --init --recursive - ``` -3. **Env (recommended):** - ```bash - export CC=gcc CXX=g++ - export CUDA_HOME=/usr/local/cuda # or your CUDA path - export PATH=$CUDA_HOME/bin:$PATH - ``` -4. **Build/install** (use the Python that has torch and that you will use for InfiniLM): - ```bash - python setup.py install - ``` - Or: `pip install -e .` -5. **Locate the .so:** - Typically under `build/lib.linux-x86_64-cpython-312/infllm_v2/` (name like `C.cpython-312-x86_64-linux-gnu.so`). 
Set: - ```bash - INFLLMV2_SO_DIR="/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" - ``` - -### 4.2 Build InfiniCore (with InfLLM-v2) - -InfiniCore must be built with **aten** and, for MiniCPM-SALA with InfLLM-v2, with **infllmv2=y** enabled (xmake auto-detects the .so). - -1. **Install Infini dependencies** (if not already): - Build and install Infini libs so they are under `$INFINI_ROOT` (default `~/.infini`). InfiniCore’s xmake expects `include/` and `lib/` there (e.g. `libinfinicore_cpp_api.so`, `libinfiniop.so`, etc.). - -2. **From repo root:** - ```bash - cd InfiniCore - ``` -3. **Configure** (use the same Python/torch as infllmv2): - ```bash - xmake config -y --root --nv-gpu=y --aten=y --infllmv2=y - ``` - Omit `--infllmv2=y` for a build without InfLLM-v2 (then no MiniCPM-SALA layer0 infllmv2 path). -4. **Build the Python extension:** - ```bash - xmake --root _infinicore - ``` -5. **Optional – install to ~/.infini:** - ```bash - xmake install - ``` - The Python loadable is also copied under `InfiniCore/python/infinicore/lib/` by the build. - -### 4.3 Run jiuge.py (MiniCPM-SALA) - -Use the **same venv** that has `torch`, `transformers`, etc., and set env so InfiniCore and the infllmv2 .so are found and symbols resolve. - -**Required:** - -- `PYTHONPATH`: InfiniLM and InfiniCore Python packages. -- `LD_LIBRARY_PATH`: Torch lib, Infini lib (`/root/.infini/lib` or your `INFINI_ROOT/lib`), and optionally `INFLLMV2_SO_DIR` (if not using `LD_PRELOAD`). -- If InfiniCore was built with InfLLM-v2: **`LD_PRELOAD`** the infllmv2 .so so `libinfinicore_cpp_api.so` resolves `mha_varlen_fwd` (and `mha_fwd_kvcache`). 
- -**Example (from repo root):** - -```bash -INFLLMV2_SO_DIR="$(pwd)/InfiniCore/third_party/infllmv2_cuda_impl/build/lib.linux-x86_64-cpython-312/infllm_v2" - -PYTHONPATH="$(pwd)/InfiniLM/python:$(pwd)/InfiniCore/python:$PYTHONPATH" \ -LD_LIBRARY_PATH="$(python -c 'import torch; print(torch.__path__[0])')/lib:/root/.infini/lib:${INFLLMV2_SO_DIR}:$LD_LIBRARY_PATH" \ -LD_PRELOAD="${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so" \ -python InfiniLM/examples/jiuge.py --nvidia --model_path /root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA -``` - -Use the **venv** Python explicitly if needed, e.g.: - -```bash -/path/to/venv/bin/python InfiniLM/examples/jiuge.py ... -``` - -For Triton (HF path) on CUDA 12.8 you may need: - -```bash -TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas -``` - ---- - -## 5. Verification - -- **InfiniCore InfLLM-v2 ops:** - `PYTHONPATH=InfiniCore/python:InfiniCore/test/infinicore LD_LIBRARY_PATH=:${INFLLMV2_SO_DIR}:/root/.infini/lib LD_PRELOAD=${INFLLMV2_SO_DIR}/C.cpython-312-x86_64-linux-gnu.so python InfiniCore/test/infinicore/ops/test_infllmv2_attention.py --nvidia` - -- **HF vs InfiniLM logits (one-prompt decode):** - Same env + `LD_PRELOAD` and (if needed) `TRITON_PTXAS_PATH`: - `python InfiniLM/examples/minicpm_sala_logits_sanity.py --model_path --mode decodeN --decode_steps 64` - -- **Generation:** - `jiuge.py` with the same env should produce **reasonable MiniCPM-SALA output** (e.g. for prompt "How are you"). - ---- - -## 6. Related docs - -- **CURRENT_PROGRESS.md** — Local progress, InfLLM-v2 plan, and run commands. -- **InfiniLM/MiniCPM_SALA_alignment_progress.md** — Alignment and debugging details. -- **InfiniCore/third_party/infllmv2_cuda_impl/README.md** — InfLLM-v2 kernel design and install. -- **InfiniLM/examples/jiuge.py** — Docstring at top with env summary. - ---- - -## 7. TODO - -- **Remove temporal log and dump code** — Strip or gate debug logging, `INFINI_DEBUG_*`, and temporary dump paths (e.g. 
`/tmp/` tensor dumps, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`) from InfiniLM/InfiniCore once alignment and bring-up are stable. -- **Adapt inference_server.py** — Wire MiniCPM-SALA (and InfiniLM InferEngine) into the inference server (e.g. `inference_server.py` or equivalent in the workspace) so that the server can load and serve MiniCPM-SALA with the same env (PYTHONPATH, LD_LIBRARY_PATH, LD_PRELOAD) and run generation endpoints. - -### 7.1 Debug and sanity env and code (for future erasing) - -When removing temporal log and dump code, use this as the reference for **env parsing** and **locations to erase or gate**. - -**Environment variables (debug / sanity):** - -| Env var | Parsing / behavior | Purpose | -|---------|---------------------|--------| -| `INFINI_DEBUG_LOG` | Set to a file path (e.g. `/tmp/minicpm_sala_sanity_debug.log`). When set, C++ and Python append JSON/text lines to this file. | Text log for alignment debugging. | -| `INFINI_DEBUG_ATTN_DUMP` | Presence = enable (e.g. `"1"` or any). When set, tensors are written to fixed `/tmp/` paths below. | Enable binary tensor dumps and per-layer stats. 
| - -**Where they are read:** - -- **InfiniLM C++:** `std::getenv("INFINI_DEBUG_LOG")`, `std::getenv("INFINI_DEBUG_ATTN_DUMP")` in: - - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_attention.cpp` (dump_tensor_f32, layer q/k/v/g_gamma and attn out dumps) - - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp` (log_tensor_stats_if_enabled, tensor_to_f32_and_dump, layer input/out dumps) - - `InfiniLM/csrc/models/minicpm_sala/minicpm_sala_model.cpp` (dump_tensor_to_bin_if_enabled, log_tensor_stats_if_enabled; embed and final hidden dumps) -- **InfiniLM Python (sanity script):** `os.environ["INFINI_DEBUG_LOG"]`, `os.environ["INFINI_DEBUG_ATTN_DUMP"]` set in `InfiniLM/examples/minicpm_sala_logits_sanity.py` before runs; `os.getenv("INFINI_DEBUG_*")` in `InfiniLM/examples/modeling_minicpm_sala.py` (HF-side hooks that write `/tmp/hf_*.pt` and log to `INFINI_DEBUG_LOG`). - -**Temporary paths to remove or stop writing:** - -- **C++ dumps (binary):** `/tmp/inf_embed_out.bin`, `/tmp/inf_final_hidden.bin`, `/tmp/inf_layer0_q.bin`, `/tmp/inf_layer0_k.bin`, `/tmp/inf_layer0_v.bin`, `/tmp/inf_layer0_g_gamma.bin`, `/tmp/inf_layer1_q.bin`, `/tmp/inf_layer1_k.bin`, `/tmp/inf_layer1_v.bin`, `/tmp/inf_layer1_g_gamma.bin`, `/tmp/inf_layer0_attn_input.bin`, `/tmp/inf_attn_out_layer0.bin`, `/tmp/inf_attn_out_layer1.bin`, `/tmp/inf_layer_out_.bin`. -- **Python (sanity) writes:** `DEBUG_LOG_PATH` (e.g. `/tmp/minicpm_sala_sanity_debug.log`); `/tmp/hf_embed_out.pt`, `/tmp/hf_final_hidden.pt`, `/tmp/hf_layer0_attn_input.pt`, `/tmp/hf_layer_out_.pt`, `/tmp/hf_layer0_q.pt`, `/tmp/hf_layer0_k.pt`, `/tmp/hf_layer0_v.pt`, `/tmp/hf_attn_out_layer0.pt`, `/tmp/hf_layer1_q.pt`, `/tmp/hf_layer1_k.pt`, `/tmp/hf_layer1_v.pt`, `/tmp/hf_attn_out_layer1.pt`. 
-- **Helpers to remove or gate:** `dump_tensor_f32`, `dump_tensor_to_bin_if_enabled`, `log_tensor_stats_if_enabled`, `tensor_to_f32_and_dump`; sanity script’s `_append_debug_log`, and all `torch.save(..., "/tmp/...")` / `np.fromfile("/tmp/...")` / `os.path.isfile("/tmp/...")` blocks that exist only for alignment comparison. diff --git a/MiniCPM_SALA_alignment_progress.md b/MiniCPM_SALA_alignment_progress.md deleted file mode 100644 index 538208c9..00000000 --- a/MiniCPM_SALA_alignment_progress.md +++ /dev/null @@ -1,359 +0,0 @@ -### MiniCPM‑SALA sanity alignment – current status - -### Scope - -- **Goal**: Align InfiniLM MiniCPM‑SALA logits with HF reference on the dense/GLA (non‑sparse) path, using the `examples/minicpm_sala_logits_sanity.py` script running inside the `minicpm-sala` container. - ---- - -### Instrumentation and plumbing - -- **Sanity script (`minicpm_sala_logits_sanity.py`)** - - **Backend lock**: All InfiniLM `InferEngine` paths now use `attention_backend="default"` so they hit the dense/GLA fallback. - - **Debug log target**: The script sets `INFINI_DEBUG_LOG=/home/zenghua/repos/.cursor/debug-9146ea.log` and `INFINI_DEBUG_ATTN_DUMP=1` so both Python and C++ write to the same NDJSON file. - - **HF per-layer hooks**: - - `_register_hf_layer_hooks` walks the model (`hf.transformer.layers`, `hf.model.layers`, or `hf.layers`) and registers forward hooks on the first 3 layers. - - For each layer \(i\), it logs: - - `min`, `max`, `mean`, `l2` of the layer output, as `hypothesisId="HF_L"`, `data.layer = i`. - - Hooks are installed for `run_prefill_only` and removed after the forward pass. - -- **InfiniLM attention (`minicpm_sala_attention.cpp`)** - - Existing **layer‑0** diagnostics: - - At entry to `forward_dense_`: `forward_dense_entry` logs env/config, including `INFINI_DEBUG_ATTN_DUMP`, `use_rope`, `use_qk_norm`, `use_output_gate`, `use_output_norm`, `is_sparse_layer`, and shapes. 
- - For layer 0, logs stats for: - - Pre‑gate attention output (`attn_pre_gate`): full tensor min/max/mean, `l2`, shape and scaling. - - Post‑gate/norm (`attn_post_gate`), and post‑`o_proj` (`attn_post_oproj`). - - **Planned / partially implemented**: extended logging for `layer_idx_ < 2` (layers 0 and 1) with: - - `attn_pre_gate_l0` / `attn_pre_gate_l1`. - - `attn_post_gate_l0` / `attn_post_gate_l1`. - - `attn_post_oproj_l0` / `attn_post_oproj_l1`. - - Current runs still only show layer‑0 entries; the `_infinilm` binary in use has not yet picked up the `_l1` variants (see below). - -- **InfiniLM decoder layer (`minicpm_sala_decoder_layer.cpp/.hpp`)** - - **MuP residual scaling**: - - `residual_scale_ = scale_depth / sqrt(num_hidden_layers)` using `scale_depth` from `ModelConfig` (matches HF path). - - `forward` applies: - - `out1 = hidden_states + residual_scale_ * attn_out`. - - `out2 = out1 + residual_scale_ * mlp_out`. - - **Per-layer Inf output stats**: - - New member `size_t layer_idx_` stored from constructor. - - For `layer_idx_ < 3`, after computing `out2`, it: - - Copies to CPU, converts BF16/F16/F32 to float, computes `min`, `max`, `mean`, `l2` and shape. - - Logs as `hypothesisId="INF_L"`, with `data.layer = layer_idx_`. - -- **Weight scaling / MuP configuration (`modeling_utils.py`)** - - Loader reads `config.json` and applies MiniCPM‑style scaling: - - `scale_input = scale_emb`, `scale_depth`, `num_hidden_layers`, `dim_model_base`, `hidden_size`. - - For `model_type == "minicpm_sala"`: - - `scale_o` and `scale_down` are reset to 1.0 (residual scaling is done at C++ forward time). - - `scale_lm_head = dim_model_base / hidden_size` is baked into `lm_head.weight`. - - Embedding and norm weights are scaled as in the MiniCPM scripts. - -- **Rebuild and install (`rebuild.sh`, xmake)** - - `rebuild.sh`: - - `InfiniCore`: `python scripts/install.py --nv-gpu=y --ccl=y --aten=y`, then `xmake build _infinicore` and `xmake install _infinicore`. 
- - `InfiniLM`: optional `xmake clean`, then `xmake build _infinilm` and `xmake install _infinilm`. - - Verified inside container: - - Shared libs in `/root/.infini/lib` are updated (e.g. `libinfiniop.so`, `libinfinicore_cpp_api.so` with current timestamps). - - Python sees `infinilm` from `/home/zenghua/repos/InfiniLM/python/infinilm`. - - The extension in use is `_infinilm` at: - - `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so`. - ---- - -### Sanity run behavior and current misalignment - -- **Command used (container, GPU 1)**: - ```bash - docker exec -e CUDA_VISIBLE_DEVICES=1 minicpm-sala bash -lc ' - source /app/docker/nvidia/env-set.sh - cd /home/zenghua/repos/InfiniLM - python3 examples/minicpm_sala_logits_sanity.py \ - --model_path /data-aisoft/zenghua/models/OpenBMB/MiniCPM-SALA \ - --mode prefill \ - --prompt "How are you" - ' - ``` -- **HF vs Inf logits (from `SANITY_ONELINE`)** - - `inf_norm ≈ 387.66` - - `hf_norm ≈ 1588.89` - - **ratio_inf_hf ≈ 0.244** - - `max_diff ≈ 12.77`, `mean_diff ≈ 4.64` - - Top‑1 token IDs differ (HF: 74, Inf: 59358). - -- **HF early layers (from `HF_L` logs)** - - Using the HF hooks in the sanity script: - - Layer 0: `l2 ≈ 59.49` - - Layer 1: `l2 ≈ 73.91` (first GLA layer) - - Layer 2: `l2 ≈ 87.38` - - Norms grow smoothly with depth; nothing obviously pathological on HF side. - -- **Inf attention layer‑0 vs HF** - - HF layer‑0 pre‑gate attention (`modeling_minicpm_sala.py:attn_pre_gate`): - - Shape `[1, 4, 4096]`, `min=-8.375`, `max=9.0`, `mean≈-0.1273`. - - Inf layer‑0: - - **Pre‑gate (`attn_pre_gate`)**: - - `l2 ≈ 105.50`, `min=-8.375`, `max=9.0`. - - Python’s comparison (`compare_attn`) reports `norm_ratio_inf_hf ≈ 0.4487`, i.e. Inf pre‑gate norm ≈ 0.45× HF’s. - - **Post‑gate/norm (`attn_post_gate`)**: - - `l2 ≈ 60.38`, very close to HF layer‑0 output `l2 ≈ 59.49`. - - **Post‑o_proj (`attn_post_oproj`)**: - - `l2 ≈ 98.66` (used as input to the decoder’s residual path). 
- - Interpretation: - - By the end of the **layer‑0 attention block**, Inf and HF are roughly matched in scale at the decoder output (norms ≈ 60). - - The severe **0.244 logits norm ratio** is therefore not due to an immediate blow‑up/vanish at layer‑0 attention output; it accumulates later (likely starting at the first GLA layer and/or via MuP/residual/MLP scaling). - ---- - -### Binary / build state - -- **Extension module mapping** - - In container, importing `infinilm` shows: - - `infinilm.__file__` → `/home/zenghua/repos/InfiniLM/python/infinilm/__init__.py` - - `_infinilm` (top‑level) → `/home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so` - - That is the `.so` used by the sanity script. - -- **Why new attention logs for layer 1 don’t appear yet** - - `strings _infinilm.cpython-312-...so | grep 'attn_pre_gate_l1'` currently returns **no matches**: - - This confirms the loaded `_infinilm` was built **before** we added the `_l1` logging strings. - - We attempted a fresh `_infinilm` build and initially hit: - - C++ error in `MiniCPMSALADecoderLayer::forward`: `layer_idx_` not declared. - - That prevented `_infinilm` from rebuilding/overwriting the old `.so`, so your layer‑1 logging changes never reached runtime. - -- **Decoder fix applied to unblock rebuild** - - Added `size_t layer_idx_ = 0;` as a private member in `minicpm_sala_decoder_layer.hpp`. - - Set `layer_idx_ = layer_idx;` in the decoder layer constructor. - - After this fix, `_infinilm` can compile; `rebuild.sh` now proceeds past the decoder layer and updates the core libraries (and should be able to update `_infinilm` when the entire build/install completes successfully). - ---- - -### Open issues / next steps - -- **1. Get the new `_infinilm` into use** - - Ensure `rebuild.sh` completes the `_infinilm` build + install step successfully (no early termination due to missing libffi/openssl/ca‑certificates link checks). 
- - Confirm via: - ```bash - strings /home/zenghua/repos/InfiniLM/python/infinilm/lib/_infinilm.cpython-312-x86_64-linux-gnu.so \ - | grep -E 'attn_pre_gate_l1|attn_post_gate_l1|attn_post_oproj_l1' - ``` - If this prints the `_l1` labels, the new binary is in place. - -- **2. Re‑run sanity and capture layer‑1 attention logs** - - With the updated `_infinilm`, re‑run the prefill sanity script and inspect `debug-9146ea.log` for: - - `minicpm_sala_attention.cpp:attn_pre_gate_l1` - - `minicpm_sala_attention.cpp:attn_post_gate_l1` - - `minicpm_sala_attention.cpp:attn_post_oproj_l1` - - Compare their `l2` to HF layer‑1 (`HF_L` `l2 ≈ 73.9`). - - This will tell us whether the **first GLA layer** is where Inf starts to diverge in norm, or whether norms remain close through layer 1 and drift later. - -- **3. Use decoder `INF_L` logs to see per‑layer drift** - - Once `_infinilm` is rebuilt, `MiniCPMSALADecoderLayer`’s per‑layer `INF_L` logs for `layer_idx_ < 3` should appear in `debug-9146ea.log`. - - By comparing HF (`HF_L`) vs Inf (`INF_L`) for layers 0/1/2, we can see exactly where norm ratios deviate from ~1 and head toward ~0.244 at the logits. - - That will guide targeted fixes in: - - GLA gating / normalization (in `minicpm_sala_attention.cpp`), and/or - - MuP residual & MLP scaling (still matching HF in formula, but potentially interacting differently with the SALA configuration). - ---- - -### Summary - -- **Plumbing**: Shared log path and HF/Inf instrumentation are in place; per‑layer HF stats and layer‑0 Inf attention stats work and confirm that **layer‑0 attention output scale is roughly aligned**. -- **Mismatch**: Final logits norm is still **Inf/HF ≈ 0.244**, so the discrepancy is accumulating across layers, likely starting at or after the first GLA layer. -- **Blocking issue**: The `_infinilm` C++ extension in use predates the layer‑1 logging changes; an earlier C++ compile error prevented a fresh install. 
That decode‑layer bug has been fixed so we can now rebuild and get the new diagnostics into the runtime. -- **Next milestone**: Successfully rebuild `_infinilm`, confirm the `_l1` log strings are present, rerun sanity, and use the new layer‑1 and decoder `INF_L` stats to precisely locate where Inf’s norms start drifting away from HF. - ---- - -### Host follow-up (2026-03-14) - -- Ran `examples/minicpm_sala_logits_sanity.py --mode prefill --prompt "How are you"` directly on the host using the local venv and the same base env as the documented `jiuge.py` run. -- Extra host-only prep required for the HF reference path: - - installed `flash-linear-attention` to provide the `fla` module - - installed `triton==3.2.0` to avoid the Triton `STAGE` autotune import failure - - created `/home/zenghua/repos/.cursor/` because the script hardcodes `DEBUG_LOG_PATH` there -- Result on host: - - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` - - HF top-1 token id `74`, Inf top-1 token id `23917` -- Interpretation: - - The host environment now reproduces the alignment issue without Docker. - - The ratio is better than the older container snapshot (`~0.244`) but still far from aligned, so the poor generation quality remains consistent with a real logits mismatch. -- Full reproducibility details for this host run were appended to `CURRENT_PROGRESS.md`. - ---- - -### HF MiniCPM4 dense-fallback experiment (2026-03-14) - -- Goal: - - Test whether the remaining mismatch is coming from the HF `minicpm4` sparse-vs-dense code path by forcing `minicpm4` layers onto the standard dense attention implementation. -- HF model-file change: - - Patched both cached copies of `modeling_minicpm_sala.py` so `MiniCPMSALADecoderLayer` uses `MINICPM_ATTENTION_CLASSES[config._attn_implementation]` for `mixer_type == "minicpm4"` instead of `MiniCPMInfLLMv2Attention`. 
- - Backups: - - `/root/.cache/modelscope/hub/models/OpenBMB/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210428` - - `/root/.cache/huggingface/modules/transformers_modules/MiniCPM-SALA/modeling_minicpm_sala.py.bak-20260314-210619` -- Rerun result: - - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` - - HF top-1 token id `74`, Inf top-1 token id `23917` - - These numbers are unchanged from the earlier host run. -- Fresh per-layer log from `debug-9146ea.log`: - - HF decoder output `l2`: - - layer 0: `59.49` - - layer 1: `73.91` - - layer 2: `87.38` - - Inf decoder output `l2`: - - layer 0: `35.08` - - layer 1: `295.86` - - layer 2: `531.38` - - Inf layer-1 attention stats: - - pre-gate `l2 ~= 749.58` - - post-gate `l2 ~= 745.29` - - post-`o_proj` `l2 ~= 1112.6` -- Interpretation: - - For this short prefill case, forcing HF `minicpm4` to the dense fallback path does not move the mismatch at all. - - The strongest current evidence is that the large norm drift starts in the InfiniLM implementation at or immediately after the first `lightning-attn` layer, not in the HF `minicpm4` branch. - ---- - -### InfiniLM MiniCPM4 HF-math experiment (2026-03-14) - -- Goal: - - Make the InfiniLM `minicpm4` layer compute the same dense attention math as the HF reference path and see whether layer 0 aligns at the start of sanity. -- C++ change: - - In `csrc/models/minicpm_sala/minicpm_sala_attention.cpp`, replaced the `minicpm4` sparse/varlen/grouped fallback branch with an explicit HF-style dense path: - - repeat KV heads to `num_attention_heads` - - compute per-head dense causal attention - - keep the same sigmoid output gate and `o_proj` -- Rebuild: - - Rebuilt and reinstalled `_infinilm` successfully using the local `xmake` toolchain. -- Rerun result: - - `SANITY_ONELINE ratio=0.6215 max_diff=11.5391 mean_diff=2.5607` - - HF top-1 token id `74`, Inf top-1 token id `23917` - - These numbers are unchanged. 
-- Fresh layer stats after the InfiniLM-side change: - - HF decoder output `l2`: `59.49 -> 73.91 -> 87.38` - - Inf decoder output `l2`: `35.08 -> 295.86 -> 531.38` - - Inf layer-0 attention: - - pre-gate `142.87` - - post-gate `80.43` - - post-`o_proj` `135.39` -- Interpretation: - - Even after making the InfiniLM `minicpm4` branch follow the HF dense attention structure, layer 0 does not move toward HF. - - This strongly suggests the remaining mismatch is not in the `minicpm4` attention branch itself; attention should shift to other decoder-path components and especially the first `lightning-attn` layer. - ---- - -### Temporary all-lightning experiment (2026-03-14) - -- Goal: - - Force both HF and InfiniLM to use lightning-style attention math for former `minicpm4` layers as a temporary precision-alignment probe, without changing checkpoint tensor shapes. -- Why not use `config.json` only: - - A direct `mixer_types -> all lightning-attn` config edit failed during HF weight load because former `minicpm4` layers have incompatible checkpoint shapes for the stock `LightningAttention` module (e.g. `256 x 4096` vs `4096 x 4096`). - - The original `mixer_types` config was restored. -- Temporary override implementation: - - Added env flag `MINICPM_SALA_FORCE_ALL_LIGHTNING=1`. 
- - HF side: - - former `minicpm4` layers instantiate `MiniCPMAttention` under the flag - - `MiniCPMAttention.forward()` switches to lightning-style GLA computation under the flag, while keeping original q/k/v/o_proj/o_gate weights - - InfiniLM side: - - `minicpm_sala_attention.cpp` routes sparse layers through `gla_attention` under the same flag - - Sanity script: - - `examples/minicpm_sala_logits_sanity.py` now sets `MINICPM_SALA_FORCE_ALL_LIGHTNING=1` for this experiment -- Result: - - `SANITY_ONELINE ratio=0.4728 max_diff=12.1406 mean_diff=1.9942` - - HF top-1 token id `59375`, Inf top-1 token id `59358` -- Fresh per-layer stats under the override: - - HF decoder output `l2`: - - layer 0: `385.10` - - layer 1: `374.87` - - layer 2: `426.87` - - Inf decoder output `l2`: - - layer 0: `26.23` - - layer 1: `208.72` - - layer 2: `403.90` - - Inf layer-0 attention: - - pre-gate `105.50` - - post-gate `60.38` - - post-`o_proj` `98.66` - - Inf layer-1 attention: - - pre-gate `672.74` - - post-gate `459.67` - - post-`o_proj` `737.03` -- Interpretation: - - The override is definitely active on both sides, because HF logits/top-1 and HF early-layer norms changed substantially. - - However, the former `minicpm4` layers still do not align numerically with InfiniLM under lightning-style attention. - - This points to a mismatch in the lightning formulation itself (decay/slopes, layout, gating, norm/casting, or related details), not just in the original mixed `mixer_types` layout. - ---- - -### Layer-0 narrowing after matched temporary semantics (2026-03-14) - -- Change: - - Updated the temporary HF override so its former `minicpm4` path uses the same grouped causal-softmax math as `InfiniCore` `gla_attention`, instead of `simple_gla` with decay. 
- - Added layer-0 sub-stage logging on both sides: - - HF: `inputs_embeds`, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` - - Inf: embedding output, `input_layernorm`, `attn_pre_gate`, `attn_post_oproj` -- Result: - - Layer-0 pre-gate attention still mismatches strongly: - - HF `attn_pre_gate l2 ~= 235.11` - - Inf `attn_pre_gate l2 ~= 105.50` - - `Inf/HF ~= 0.4487` - - But this is no longer the earliest divergence. -- New root-cause evidence: - - Embedding output already differs: - - HF `inputs_embeds l2 ~= 44.09` - - Inf embed output `l2 ~= 25.51` - - First decoder layer pre-norm output also differs: - - HF layer0 `input_layernorm l2 ~= 95.88` - - Inf layer0 `input_layernorm l2 ~= 70.94` -- Interpretation: - - The mismatch starts before layer-0 attention. - - Attention, gating, and `o_proj` are downstream amplifiers, but not the first source. - - The next priority should be MiniCPM-SALA embedding behavior in InfiniLM: - - verify `model.embed_tokens.weight` load/scaling, - - verify runtime embedding lookup output against HF for the same token ids, - - then re-check whether layer-0 attention comes into line automatically. - ---- - -### Multi-layer alignment after embed fix (2026-03-14) - -- Instrumentation added: - - InfiniLM dumps decoder layer outputs (out2) for layers 0–2 to `/tmp/inf_layer_out_{0,1,2}.bin` and final hidden (after norm) to `/tmp/inf_final_hidden.bin` when `INFINI_DEBUG_ATTN_DUMP=1`. - - HF hooks save layer outputs to `/tmp/hf_layer_out_{0,1,2}.pt` and final hidden to `/tmp/hf_final_hidden.pt`. - - Sanity script prints per-layer and final-hidden norm_ratio and max/mean diff. -- Result (prefill "How are you", int32 input_ids workaround): - - **Layer 0**: norm_ratio ≈ 1.0002, max_diff ≈ 0.0625 → aligned. - - **Layer 1**: norm_ratio ≈ 3.24, max_diff ≈ 28.4 → large divergence. - - **Layer 2**: norm_ratio ≈ 5.73 → further drift. -- Root cause for layer 1+: - - Config: layer 0 = `minicpm4` (sparse/dense), layer 1+ = `lightning-attn`. 
- - HF `LightningAttention` uses **Simple GLA** (`chunk_simple_gla` / `fused_recurrent_simple_gla`): linear/recurrent attention with decay (g_gamma), not causal softmax. - - InfiniLM now routes lightning layers through **Simple GLA** (InfiniCore `simple_gla_*` ops), matching HF’s formulation (recurrent with decay). -- Next step to align after layer 0: - - Implement Simple GLA (chunk or fused_recurrent) in InfiniCore and route lightning layers through it, matching HF’s `attn_fn` (decay, scale=1/sqrt(d), layout). - ---- - -### MMLU-Pro validation mismatches vs logit work (2026-03-24) - -Paired lm-eval `--log_samples` runs (HF vs local chat / Infini server) often disagree for **heterogeneous** reasons. Treat them differently before spending time on logits: - -| Heuristic tag (export script) | Meaning | Use logits / greedy trace? | -|------------------------------|---------|----------------------------| -| `model_disagreement` | Both sides return a valid letter choice but disagree; text is on-topic. | **Yes** — same `input_ids` + `run_prefill_and_greedy_trace` localizes numerical / decode divergence. | -| `parse_or_format` | One side `[invalid]` or regex extraction differs though the model may agree. | **No** (first fix template, stops, or metric extraction). | -| `garbage` | Off-topic or corrupted completion (e.g. wrong language / spam). | **No** — serving hygiene, batching, or cache contamination. | - -**Repo tooling** - -- `InfiniLM/examples/eval_tasks/mmlu_pro_val/export_mismatch_subset.py` — join two `samples_*.jsonl` dirs on `doc_hash`, optional filters, heuristic tag, write `mismatch_subset.json` + `.md` (includes `arguments_a` / `arguments_b` for replay). -- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_prompt.py` — rebuild `input_ids` from logged rows (rendered string vs JsonChat message list) like lm-eval. 
-- `InfiniLM/examples/eval_tasks/mmlu_pro_val/mmlu_pro_val_logit_probe.py` — drive `minicpm_sala_logits_sanity.run_prefill_and_greedy_trace` on subset rows (in-process HF + `InferEngine` only; HTTP cannot return logits). -- `InfiniLM/examples/minicpm_sala_logits_sanity.py` — `--mode greedy_trace` for ad-hoc prompts; shared `run_prefill_and_greedy_trace()` for subset probes. - -If greedy trace matches HF on a row but the API eval still differs, diff **chat template**, **stop sequences**, **max_tokens**, or server batching — not the GLA kernel alone. - -**HF vs `local-chat-completions` harness (practical parity)** - -- For the same `doc_hash`, the **rendered prompt string** from `--model hf` can match **byte-for-byte** re-templating the JSON messages the API path logs (verified on a biology mismatch example). -- Differences that still moved scores: **regex extraction** used the *first* `answer is (X)` in long CoT while the model’s final line said another letter; `_default_template_yaml` now uses `group_select: -1` (last match) and case-insensitive pattern. -- **Server**: strip lm-eval’s per-message `type: text` wrapper to `{role, content}` before `apply_chat_template`, and set `continue_final_message=not add_generation_prompt` like lm-eval’s HF model class (`inference_server.py`, `llm.py`). diff --git a/examples/jiuge.py b/examples/jiuge.py index 1fcba6c4..fa547435 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -252,13 +252,9 @@ def test( # ---------------------------------------------------------------------------- # # Create KVCache # ---------------------------------------------------------------------------- # - batch_size = 1 if isinstance(prompts, str) else len(prompts) - initial_capacity = max_new_tokens + len(input_ids_list[0]) - # MiniCPM-SALA uses per-layer dense KV cache in C++; engine cache_config drives - # scheduling only. Static cache is recommended (no paged bookkeeping) unless - # --enable-paged-attn is explicitly set. 
if enable_paged_attn: - max_total_tokens = initial_capacity + batch_size = 1 if prompts is str else len(prompts) + max_total_tokens = max_new_tokens + len(input_ids_list[0]) cache_config = PagedKVCacheConfig( num_blocks=( (max_total_tokens + (_PAGED_KV_BLOCK_SIZE - 1)) // _PAGED_KV_BLOCK_SIZE @@ -267,12 +263,10 @@ def test( block_size=_PAGED_KV_BLOCK_SIZE, ) else: - max_cache_len = initial_capacity - if getattr(model.config, "model_type", None) == "minicpm_sala": - max_pos = getattr(model.config, "max_position_embeddings", 4096) - max_cache_len = max(initial_capacity, max_pos) + batch_size = 1 if prompts is str else len(prompts) + initial_capacity = max_new_tokens + len(input_ids_list[0]) cache_config = StaticKVCacheConfig( - max_batch_size=batch_size, max_cache_len=max_cache_len + max_batch_size=batch_size, max_cache_len=initial_capacity ) model.reset_cache(cache_config) diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 07ada981..7b6ceea4 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -371,7 +371,6 @@ def apply_chat_template( conversation=messages, add_generation_prompt=add_generation_prompt, tokenize=False, - continue_final_message=not add_generation_prompt, **chat_template_kwargs, ) diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py index 860bf7b9..c9b0cb30 100644 --- a/python/infinilm/llm/static_scheduler.py +++ b/python/infinilm/llm/static_scheduler.py @@ -4,7 +4,6 @@ import logging import queue -import os import janus from typing import List, Optional diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 17a5fe58..ec045185 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -155,32 +155,26 @@ def load_model_state_dict_by_file( torch_dtype = infinicore.utils.to_torch_dtype(dtype) model_keys = model.state_dict_keyname() - # MiniCPM-style scaling (used by MiniCPM / FM9G; also applies to 
MiniCPM-SALA checkpoints). - # This matches `InfiniLM/scripts/jiuge.py` weight scaling behavior. + # MiniCPM-SALA scaling (bake selected MuP scales into weights). + # This matches `InfiniLM/scripts/jiuge.py` weight scaling behavior for `model_type=="minicpm_sala"`. scale_input = 1.0 scale_output = 1.0 scale_o = 1.0 scale_down = 1.0 scale_lm_head = 1.0 try: + # TODO: fetch config from model rather than file directly with open(os.path.join(model_path, "config.json")) as f: cfg = json.load(f) - if ( - cfg.get("model_type") in ["fm9g", "minicpm", "minicpm_sala"] - and "scale_emb" in cfg - and "scale_depth" in cfg - ): + if cfg.get("model_type") == "minicpm_sala" and "scale_emb" in cfg and "scale_depth" in cfg: scale_input = float(cfg["scale_emb"]) scale_o = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) scale_down = float(cfg["scale_depth"]) / math.sqrt(float(cfg["num_hidden_layers"])) - if cfg.get("model_type") in ["fm9g", "minicpm"] and "dim_model_base" in cfg: - scale_output = float(int(cfg["hidden_size"]) // int(cfg["dim_model_base"])) - if cfg.get("model_type") == "minicpm_sala" and "dim_model_base" in cfg and "hidden_size" in cfg: + if "dim_model_base" in cfg and "hidden_size" in cfg: scale_lm_head = float(cfg["dim_model_base"]) / float(cfg["hidden_size"]) # minicpm_sala: only bake embed and lm_head; residual scaling done at forward in C++ - if cfg.get("model_type") == "minicpm_sala": - scale_o = 1.0 - scale_down = 1.0 + scale_o = 1.0 + scale_down = 1.0 except Exception: pass From 54a07dd6e7c742ba0198daa82f1bd6d956bd5396 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 06:34:31 +0000 Subject: [PATCH 09/11] refactor Signed-off-by: Ceng23333 <441651826@qq.com> --- python/infinilm/infer_engine.py | 7 ++++--- python/infinilm/modeling_utils.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index 9046b790..f25b97b9 100644 --- 
a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -1,5 +1,4 @@ import time -import os from dataclasses import dataclass import infinicore @@ -79,7 +78,9 @@ def forward( try: # TODO: Remove `_underlying` and simplify the corresponding code. input_ids = input_ids._underlying if input_ids is not None else None - position_ids = position_ids._underlying if position_ids is not None else None + position_ids = ( + position_ids._underlying if position_ids is not None else None + ) past_kv_lengths = ( past_kv_lengths._underlying if past_kv_lengths is not None else None ) @@ -133,7 +134,6 @@ def generate( eos_token_id = generation_config.eos_token_id past_seq_len = 0 - output_ids = [] initial_batch_size, initial_seqlen = input_ids.shape[:2] seq_len = initial_seqlen @@ -291,6 +291,7 @@ def generate( top_k=generation_config.top_k, top_p=generation_config.top_p, ) + output_ids.append(output_id) if ( diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index ec045185..03d3c062 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -95,8 +95,7 @@ def load_state_dict( ) for k in f.keys(): - # Explicitly cast dtype: some ops (e.g. embedding) may not support BF16 on all backends. 
- state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) + state_dict[k] = f.get_tensor(k).to(device=device) return state_dict From f9f6a120412f4e8654f20e246aaed8673b65f83e Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Thu, 9 Apr 2026 07:11:28 +0000 Subject: [PATCH 10/11] seperate 2 attn Signed-off-by: Ceng23333 <441651826@qq.com> --- .../minicpm_sala/minicpm_sala_attention.cpp | 398 ++++++++++-------- .../minicpm_sala/minicpm_sala_attention.hpp | 80 ++-- .../minicpm_sala_decoder_layer.cpp | 13 +- .../minicpm_sala_decoder_layer.hpp | 7 +- .../minicpm_sala_for_causal_lm.cpp | 18 +- .../minicpm_sala/minicpm_sala_model.cpp | 18 +- .../minicpm_sala/minicpm_sala_model.hpp | 8 +- 7 files changed, 297 insertions(+), 245 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index f36b84c5..af346445 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -41,7 +41,7 @@ void minicpm_sala_update_layer_kv_tensor(infinicore::Tensor &kv_bundle, const size_t update_len = k_permuted->size(2); const size_t result_len = cache_pos + update_len; if (result_len > k_cache_layer->size(2)) { - throw std::runtime_error("MiniCPMSALAAttention: KV cache length exceeded"); + throw std::runtime_error("MiniCPMSALAAttention(KV update): KV cache length exceeded"); } k_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(k_permuted); v_cache_layer->narrow({{2, cache_pos, update_len}})->copy_from(v_permuted); @@ -90,81 +90,39 @@ void ensure_gla_state_allocated(infinicore::Tensor &state, } } // namespace -MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) 
+MiniCPMSALALightningAttention::MiniCPMSALALightningAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx) : model_config_(std::move(model_config)), - rank_info_(rank_info), - layer_idx_(layer_idx), - attention_backend_(attention_backend) { - - // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). + layer_idx_(layer_idx) { const auto dtype = model_config_->get_dtype(); + attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; hidden_size_ = model_config_->get("hidden_size"); - if (mixer_type == "minicpm4") { - is_sparse_layer_ = true; - num_attention_heads_ = model_config_->get("num_attention_heads"); - num_key_value_heads_ = model_config_->get("num_key_value_heads"); - head_dim_ = model_config_->get("head_dim"); - - // InfLLM-v2 local-window masking (causal-local semantics) for minicpm4. - // Prefer `sparse_window_size`, but fall back to `window_size` if needed. - int sparse_window_size = model_config_->get_or("sparse_window_size", -1); - if (sparse_window_size <= 0) { - // Some HF configs store this under `sparse_config.window_size`. - auto sparse_cfg = model_config_->get_or("sparse_config", nlohmann::json{}); - if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { - sparse_window_size = sparse_cfg["window_size"].get(); - } else { - sparse_window_size = model_config_->get_or("window_size", -1); - } - } - if (sparse_window_size > 0) { - infllmv2_window_left_ = sparse_window_size; - infllmv2_window_right_ = 0; - use_local_window_ = true; - } - } else { - // Lightning layers have their own head config. 
- num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); - num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); - head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); - } + + num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); + num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); + head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); - // HyPE: RoPE in lightning layers, NoPE in sparse (minicpm4) layers. - // We treat all non-minicpm4 as "linear" (lightning-attn) for M1 dense fallback. - use_rope_ = (mixer_type != "minicpm4") && model_config_->get_or("lightning_use_rope", true); + use_rope_ = model_config_->get_or("lightning_use_rope", true); rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); - // MiniCPM-SALA uses QK-norm and output gates by default. 
- use_qk_norm_ = model_config_->get_or("qk_norm", true) && (mixer_type != "minicpm4"); + use_qk_norm_ = model_config_->get_or("qk_norm", true); use_output_gate_ = model_config_->get_or("use_output_gate", true); - // Projections INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); - if (mixer_type == "minicpm4") { - // Sparse layers use o_gate (sigmoid gate on attention output) - INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); - } else { - // Lightning layers use q/k norm + output norm and z-projection gate - if (use_qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - } - use_output_norm_ = true; - // Checkpoint uses o_norm over hidden_size (shape [hidden_size]). - INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, dtype, device); + if (use_qk_norm_) { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); } - // Simple GLA decay for lightning path: g_gamma = _build_slope_tensor * -1. 
+ use_output_norm_ = true; + INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, dtype, device); + std::vector slopes = build_slope_tensor(num_attention_heads_); auto g_cpu = infinicore::Tensor::empty( {num_attention_heads_}, infinicore::DataType::F32, infinicore::Device::cpu()); @@ -174,17 +132,14 @@ MiniCPMSALAAttention::MiniCPMSALAAttention(std::shared_ptrto(device); } -void MiniCPMSALAAttention::reset_state() { - // KV tensors are maintained by the shared engine cache (StaticKVCache). - // Lightning decode recurrent state is maintained locally for performance. +void MiniCPMSALALightningAttention::reset_state() { gla_state_valid_ = false; gla_state_cached_len_ = 0; gla_state_ = {}; } - -infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &position_ids, - const infinicore::Tensor &hidden_states) const { +infinicore::Tensor MiniCPMSALALightningAttention::forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const { const auto &attn_meta = infinilm::global_state::get_forward_context().attn_metadata; auto past_sequence_lengths = attn_meta.past_sequence_lengths; auto total_sequence_lengths = attn_meta.total_sequence_lengths; @@ -218,7 +173,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit // RoPE only for lightning layers (HyPE) if (use_rope_) { if (!rotary_emb_) { - throw std::runtime_error("MiniCPMSALAAttention: rotary_emb is not set but use_rope=true"); + throw std::runtime_error("MiniCPMSALALightningAttention: rotary_emb is not set but use_rope=true"); } // position_ids can be [B,S] or [S]; follow LlamaAttention behavior. 
auto pos_shape = position_ids->shape(); @@ -229,7 +184,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit } else if (pos_shape.size() == 1) { pos_ids_for_rope = position_ids->contiguous(); } else { - throw std::runtime_error("MiniCPMSALAAttention: Unexpected position_ids shape"); + throw std::runtime_error("MiniCPMSALALightningAttention: Unexpected position_ids shape"); } rotary_emb_->forward(q_reshaped, pos_ids_for_rope, true); @@ -271,7 +226,7 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit auto &kv_vec = infinilm::global_state::get_forward_context().kv_cache_vec; if (layer_idx_ >= kv_vec.size()) { throw std::runtime_error( - "MiniCPMSALAAttention: forward_context.kv_cache_vec is unset or too small (call reset_cache / align layer count)"); + "MiniCPMSALALightningAttention: forward_context.kv_cache_vec is unset or too small (call reset_cache / align layer count)"); } use_forward_kv = true; minicpm_sala_update_layer_kv_tensor( @@ -289,14 +244,14 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit // Slice to total_seq_len (decode-only / cont-batch) if (total_seq_len > k_total->shape()[2]) { - throw std::runtime_error("MiniCPMSALAAttention: total_seq_len exceeds available KV length (cache not correctly updated)"); + throw std::runtime_error("MiniCPMSALALightningAttention: total_seq_len exceeds available KV length (cache not correctly updated)"); } k_total = k_total->narrow({{2, 0, total_seq_len}}); v_total = v_total->narrow({{2, 0, total_seq_len}}); infinicore::Tensor attn_output; - if (!is_sparse_layer_) { - // Lightning-attn: Simple GLA (HF-aligned). + { + // Lightning-attn only: Simple GLA (HF-aligned). // simple_gla_attention(q,k,v,g_gamma,scale) expects [B, T, H, D]; g_gamma [H]. 
const size_t n_h = num_attention_heads_; const size_t n_kv = num_key_value_heads_; @@ -398,116 +353,17 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit infinicore::Tensor out_slice = gla_out->narrow({{1, total_seq_len - seq_len, seq_len}}); attn_output = out_slice->view({batch_size, seq_len, n_h * head_dim_}); } - } else { - // minicpm4 layers must use InfLLM-v2 attention (hard error if not available). - // NOTE: Lightning layers keep Simple GLA for correctness; only minicpm4 routes here. - try { - if (!total_sequence_lengths.has_value()) { - throw std::runtime_error( - "MiniCPMSALAAttention(minicpm4): total_sequence_lengths is required for InfLLM-v2 path"); - } - // `infllmv2_kvcache` expects the number of valid K/V entries in the - // provided cache tensors. After per-layer KV update, valid length is - // total KV length (past + current token). - const auto cache_lens = total_sequence_lengths.value(); - - // Prefill: InfLLM-v2 varlen (Q and K packed lengths match `seq_len == total_seq_len` here). - // Decode: `seq_len < total_seq_len` — use `infllmv2_kvcache` after KV tensor update - // (valid KV length == `total_seq_len`). Using varlen for decode (1 query vs long K) hit NaNs - // in practice for modest sequence lengths; kvcache matches operator tests and Flash path. 
- const bool force_varlen_decode = [&]() { - const char *env = std::getenv("INFINI_MINICPM4_DECODE_VARLEN"); - return env && env[0] != '\0' && env[0] != '0'; - }(); - - if (seq_len == total_seq_len || (force_varlen_decode && batch_size == 1)) { - if (batch_size != 1) { - throw std::runtime_error("MiniCPMSALAAttention(minicpm4): varlen prefill path currently requires batch_size=1"); - } - auto q_bshd = q_reshaped->contiguous(); // [B, S, n_h, D] - auto k_btkd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - auto v_btkd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - auto q_var = q_bshd->view({static_cast(seq_len), static_cast(num_attention_heads_), static_cast(head_dim_)}); - auto k_var = k_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); - auto v_var = v_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); - - auto cuq_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); - reinterpret_cast(cuq_cpu->data())[0] = 0; - reinterpret_cast(cuq_cpu->data())[1] = static_cast(seq_len); - infinicore::Tensor cu_q = cuq_cpu->to(q_var->device()); - // cu_k corresponds to the full KV length used by k_var/v_var. - auto cuk_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); - reinterpret_cast(cuk_cpu->data())[0] = 0; - reinterpret_cast(cuk_cpu->data())[1] = static_cast(total_seq_len); - infinicore::Tensor cu_k = cuk_cpu->to(q_var->device()); - - const bool infllmv2_causal = !use_local_window_; - const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; - const int window_right = use_local_window_ ? 
0 : -1; - - auto out_var = infinicore::op::infllmv2_varlen( - q_var, k_var, v_var, - cu_q, cu_k, - static_cast(seq_len), - static_cast(total_seq_len), - scaling_, - /*causal=*/infllmv2_causal, - /*window_size_left=*/window_left, - /*window_size_right=*/window_right); - attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); - } else if (use_forward_kv) { - if (batch_size != 1) { - throw std::runtime_error("MiniCPMSALAAttention(minicpm4): kvcache decode path currently requires batch_size=1"); - } - auto q_bshd = q_reshaped->contiguous(); // [B, S_q, n_h, D] - auto k_bthd = k_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - auto v_bthd = v_total->permute({0, 2, 1, 3})->contiguous(); // [B, T, n_kv, D] - - const bool infllmv2_causal = !use_local_window_; - const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; - const int window_right = use_local_window_ ? 0 : -1; - - auto out_bshd = infinicore::op::infllmv2_kvcache( - q_bshd, - k_bthd, - v_bthd, - cache_lens, - scaling_, - /*causal=*/infllmv2_causal, - /*window_size_left=*/window_left, - /*window_size_right=*/window_right); - attn_output = out_bshd->contiguous()->view( - {batch_size, seq_len, num_attention_heads_ * head_dim_}); - } else { - throw std::runtime_error( - "MiniCPMSALAAttention(minicpm4): decode requires KV cache (missing cache metadata or kv_cache_vec)"); - } - } catch (const std::exception &e) { - throw std::runtime_error( - std::string("MiniCPMSALAAttention(minicpm4): InfLLM-v2 attention failed. ") - + "This build must provide InfLLM-v2 (ENABLE_INFLLMV2+ENABLE_ATEN) and the infllmv2_cuda_impl .so " - + "must be available via LD_PRELOAD/LD_LIBRARY_PATH. 
Original error: " + e.what()); - } } - // Output norm + gate variants + // Lightning output gate/norm if (use_output_gate_) { - if (o_gate_) { - // Sparse (minicpm4): y = sigmoid(o_gate(x)) * attn_output - auto gate_in = hidden_states; - auto gate = o_gate_->forward(gate_in); - infinicore::op::sigmoid_(gate, gate); - attn_output = infinicore::op::mul(attn_output, gate); - } else if (z_proj_) { - // Lightning: match HF LightningAttention: o_norm(o) then o * sigmoid(z_proj(x)). - auto z_in = hidden_states; - auto z = z_proj_->forward(z_in); - infinicore::op::sigmoid_(z, z); - if (use_output_norm_ && o_norm_) { - attn_output = o_norm_->forward(attn_output); - } - attn_output = infinicore::op::mul(attn_output, z); + auto z_in = hidden_states; + auto z = z_proj_->forward(z_in); + infinicore::op::sigmoid_(z, z); + if (use_output_norm_ && o_norm_) { + attn_output = o_norm_->forward(attn_output); } + attn_output = infinicore::op::mul(attn_output, z); } else if (use_output_norm_ && o_norm_) { attn_output = o_norm_->forward(attn_output); } @@ -518,4 +374,188 @@ infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &posit return out; } +MiniCPMSALAMinicpm4Attention::MiniCPMSALAMinicpm4Attention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx) + : model_config_(std::move(model_config)), + layer_idx_(layer_idx) { + (void)device; + const auto dtype = model_config_->get_dtype(); + attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; + hidden_size_ = model_config_->get("hidden_size"); + num_attention_heads_ = model_config_->get("num_attention_heads"); + num_key_value_heads_ = model_config_->get("num_key_value_heads"); + head_dim_ = model_config_->get("head_dim"); + scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); + + int sparse_window_size = model_config_->get_or("sparse_window_size", -1); + if (sparse_window_size <= 0) { + auto sparse_cfg = 
model_config_->get_or("sparse_config", nlohmann::json{}); + if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { + sparse_window_size = sparse_cfg["window_size"].get(); + } else { + sparse_window_size = model_config_->get_or("window_size", -1); + } + } + if (sparse_window_size > 0) { + infllmv2_window_left_ = sparse_window_size; + infllmv2_window_right_ = 0; + use_local_window_ = true; + } + + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); +} + +void MiniCPMSALAMinicpm4Attention::reset_state() { + // no local recurrent state +} + +infinicore::Tensor MiniCPMSALAMinicpm4Attention::forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const { + (void)position_ids; + const auto &attn_meta = infinilm::global_state::get_forward_context().attn_metadata; + auto past_sequence_lengths = attn_meta.past_sequence_lengths; + auto total_sequence_lengths = attn_meta.total_sequence_lengths; + + auto shape = hidden_states->shape(); + const size_t batch_size = shape[0]; + const size_t seq_len = shape[1]; + + auto hs_mut = hidden_states; + auto q = q_proj_->forward(hs_mut); + auto k = k_proj_->forward(hs_mut); + auto v = v_proj_->forward(hs_mut); + auto q_reshaped = q->contiguous()->view({batch_size, seq_len, num_attention_heads_, head_dim_}); + auto k_reshaped = k->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + auto v_reshaped = v->contiguous()->view({batch_size, seq_len, num_key_value_heads_, head_dim_}); + + // KV update via per-layer 
kv_cache_vec when metadata present + size_t total_seq_len = seq_len; + size_t cache_pos = 0; + const bool has_cache_meta = past_sequence_lengths.has_value() && total_sequence_lengths.has_value(); + if (has_cache_meta) { + auto past_cpu = past_sequence_lengths.value()->to(infinicore::Device::cpu()); + cache_pos = reinterpret_cast(past_cpu->data())[0]; + total_seq_len = cache_pos + seq_len; + } + auto k_permuted = k_reshaped->permute({0, 2, 1, 3})->contiguous(); + auto v_permuted = v_reshaped->permute({0, 2, 1, 3})->contiguous(); + + infinicore::Tensor k_total = k_permuted; + infinicore::Tensor v_total = v_permuted; + bool use_forward_kv = false; + if (has_cache_meta) { + auto &kv_vec = infinilm::global_state::get_forward_context().kv_cache_vec; + if (layer_idx_ >= kv_vec.size()) { + throw std::runtime_error( + "MiniCPMSALAMinicpm4Attention: forward_context.kv_cache_vec is unset or too small"); + } + use_forward_kv = true; + minicpm_sala_update_layer_kv_tensor( + kv_vec[layer_idx_], + k_permuted, + v_permuted, + past_sequence_lengths.value()); + auto k_cache_layer = kv_vec[layer_idx_]->narrow({{0, 0, 1}})->squeeze(0); + auto v_cache_layer = kv_vec[layer_idx_]->narrow({{0, 1, 1}})->squeeze(0); + k_total = k_cache_layer; + v_total = v_cache_layer; + } else { + total_seq_len = seq_len; + } + + if (total_seq_len > k_total->shape()[2]) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: total_seq_len exceeds available KV length"); + } + k_total = k_total->narrow({{2, 0, total_seq_len}}); + v_total = v_total->narrow({{2, 0, total_seq_len}}); + + try { + if (!total_sequence_lengths.has_value()) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: total_sequence_lengths is required for InfLLM-v2 path"); + } + const auto cache_lens = total_sequence_lengths.value(); + const bool force_varlen_decode = [&]() { + const char *env = std::getenv("INFINI_MINICPM4_DECODE_VARLEN"); + return env && env[0] != '\0' && env[0] != '0'; + }(); + + infinicore::Tensor 
attn_output; + if (seq_len == total_seq_len || (force_varlen_decode && batch_size == 1)) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: varlen path requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); + auto k_btkd = k_total->permute({0, 2, 1, 3})->contiguous(); + auto v_btkd = v_total->permute({0, 2, 1, 3})->contiguous(); + auto q_var = q_bshd->view({static_cast(seq_len), static_cast(num_attention_heads_), static_cast(head_dim_)}); + auto k_var = k_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + auto v_var = v_btkd->view({static_cast(total_seq_len), static_cast(num_key_value_heads_), static_cast(head_dim_)}); + + auto cuq_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuq_cpu->data())[0] = 0; + reinterpret_cast(cuq_cpu->data())[1] = static_cast(seq_len); + infinicore::Tensor cu_q = cuq_cpu->to(q_var->device()); + auto cuk_cpu = infinicore::Tensor::empty({2}, infinicore::DataType::I32, infinicore::Device::cpu()); + reinterpret_cast(cuk_cpu->data())[0] = 0; + reinterpret_cast(cuk_cpu->data())[1] = static_cast(total_seq_len); + infinicore::Tensor cu_k = cuk_cpu->to(q_var->device()); + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 
0 : -1; + + auto out_var = infinicore::op::infllmv2_varlen( + q_var, k_var, v_var, + cu_q, cu_k, + static_cast(seq_len), + static_cast(total_seq_len), + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_var->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else if (use_forward_kv) { + if (batch_size != 1) { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: kvcache decode requires batch_size=1"); + } + auto q_bshd = q_reshaped->contiguous(); + auto k_bthd = k_total->permute({0, 2, 1, 3})->contiguous(); + auto v_bthd = v_total->permute({0, 2, 1, 3})->contiguous(); + + const bool infllmv2_causal = !use_local_window_; + const int window_left = use_local_window_ ? infllmv2_window_left_ : -1; + const int window_right = use_local_window_ ? 0 : -1; + + auto out_bshd = infinicore::op::infllmv2_kvcache( + q_bshd, + k_bthd, + v_bthd, + cache_lens, + scaling_, + /*causal=*/infllmv2_causal, + /*window_size_left=*/window_left, + /*window_size_right=*/window_right); + attn_output = out_bshd->contiguous()->view({batch_size, seq_len, num_attention_heads_ * head_dim_}); + } else { + throw std::runtime_error("MiniCPMSALAMinicpm4Attention: decode requires KV cache"); + } + + // Sparse gate + o_proj + auto gate = o_gate_->forward(hs_mut); + infinicore::op::sigmoid_(gate, gate); + attn_output = infinicore::op::mul(attn_output, gate); + auto out = o_proj_->forward(attn_output); + return out; + } catch (const std::exception &e) { + throw std::runtime_error( + std::string("MiniCPMSALAMinicpm4Attention: InfLLM-v2 attention failed. 
") + + "Original error: " + e.what()); + } +} + } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 2013d678..43784627 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -3,8 +3,8 @@ #include "../../backends/attention_backends.hpp" #include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" #include "../../layers/rotary_embedding/rotary_embedding.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -17,26 +17,28 @@ namespace infinilm::models::minicpm_sala { -// Dense attention fallback implementation used for Milestone 1. -// Parameter names are aligned with HF MiniCPM-SALA safetensors keys: -// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,...} -// TODO(refactor): KV cache is currently per-layer dense; refactor to use engine paged KV pool -// and block_tables/slot_mapping to match SGLang minicpm-sala pattern (see minicpm_sala_attention.cpp). -class MiniCPMSALAAttention : public infinicore::nn::Module { +class MiniCPMSALAAttentionBase : public infinicore::nn::Module { public: - MiniCPMSALAAttention(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + virtual infinicore::Tensor forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const = 0; + virtual void reset_state() = 0; + virtual ~MiniCPMSALAAttentionBase() = default; +}; + +// Lightning attention path (Simple GLA). 
Parameter names align with HF: +// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,q_norm,k_norm,o_norm,z_proj,...} +class MiniCPMSALALightningAttention : public MiniCPMSALAAttentionBase { +public: + MiniCPMSALALightningAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx); // Match `infinilm::layers::attention::Attention` API: metadata is pulled from // `global_state::get_forward_context().attn_metadata`. infinicore::Tensor forward(const infinicore::Tensor &position_ids, - const infinicore::Tensor &hidden_states) const; + const infinicore::Tensor &hidden_states) const override; - void reset_state(); + void reset_state() override; protected: // Projections (HF-aligned naming) @@ -51,12 +53,8 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); INFINICORE_NN_MODULE(infinicore::nn::Linear, z_proj); - // Optional (Sparse layers): o_gate - INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); - std::shared_ptr model_config_; std::shared_ptr rotary_emb_; - engine::distributed::RankInfo rank_info_; size_t layer_idx_; size_t hidden_size_; @@ -69,13 +67,6 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { bool use_output_gate_ = false; bool use_output_norm_ = false; bool use_rope_ = false; - bool is_sparse_layer_ = false; - - // InfLLM-v2 local-window masking plumbing for `mixer_type=="minicpm4"`. - // When enabled: causal=false + window_size_left=sparse_window_size + window_size_right=0. - int infllmv2_window_left_ = -1; - int infllmv2_window_right_ = -1; - bool use_local_window_ = false; backends::AttentionBackend attention_backend_; @@ -89,4 +80,41 @@ class MiniCPMSALAAttention : public infinicore::nn::Module { mutable bool gla_state_valid_ = false; }; +// Sparse attention path (`mixer_type=="minicpm4"`) using InfLLM-v2 operators. 
+// Parameter names align with HF: +// model.layers.N.self_attn.{q_proj,k_proj,v_proj,o_proj,o_gate,...} +class MiniCPMSALAMinicpm4Attention : public MiniCPMSALAAttentionBase { +public: + MiniCPMSALAMinicpm4Attention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx); + + infinicore::Tensor forward(const infinicore::Tensor &position_ids, + const infinicore::Tensor &hidden_states) const override; + + void reset_state() override; + +protected: + INFINICORE_NN_MODULE(infinicore::nn::Linear, q_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, k_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, v_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_proj); + INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); + + std::shared_ptr model_config_; + size_t layer_idx_; + size_t hidden_size_; + size_t num_attention_heads_; + size_t num_key_value_heads_; + size_t head_dim_; + float scaling_; + + // InfLLM-v2 local-window masking plumbing. + int infllmv2_window_left_ = -1; + int infllmv2_window_right_ = -1; + bool use_local_window_ = false; + + backends::AttentionBackend attention_backend_; +}; + } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index 7a44704e..b565cf47 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,5 +1,6 @@ #include "minicpm_sala_decoder_layer.hpp" +#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -15,9 +16,7 @@ namespace infinilm::models::minicpm_sala { MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) { + const 
std::string &mixer_type) { layer_idx_ = layer_idx; // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). const auto dtype = model_config->get_dtype(); @@ -29,7 +28,13 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr(num_layers)); INFINICORE_NN_MODULE_INIT(input_layernorm, model_config->get("hidden_size"), eps, dtype, device); - INFINICORE_NN_MODULE_INIT(self_attn, model_config, device, layer_idx, mixer_type, rank_info, attention_backend); + if (mixer_type == "minicpm4") { + self_attn_ = this->register_module( + "self_attn", model_config, device, layer_idx); + } else { + self_attn_ = this->register_module( + "self_attn", model_config, device, layer_idx); + } INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config->get("hidden_size"), eps, dtype, device); INFINICORE_NN_MODULE_INIT(mlp, model_config, device); } diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 44d320c9..52f31000 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -23,9 +23,7 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { MiniCPMSALADecoderLayer(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx, - const std::string &mixer_type, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + const std::string &mixer_type); infinicore::Tensor forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids) const; @@ -38,7 +36,8 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { protected: INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); - INFINICORE_NN_MODULE(MiniCPMSALAAttention, self_attn); + // Registered under the HF-compatible name "self_attn" in ctor. 
+ std::shared_ptr self_attn_; INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, post_attention_layernorm); INFINICORE_NN_MODULE(MiniCPMSALAMLP, mlp); }; diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index fb55556f..791a7832 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -55,15 +55,15 @@ MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( auto block_tables = input.block_tables; auto slot_mapping = input.slot_mapping; - auto hidden_states = model_->forward( - input_ids, - position_ids, - past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); + infinilm::global_state::get_forward_context().attn_metadata = + infinilm::global_state::AttentionMetadata(past_sequence_lengths, + total_sequence_lengths, + input_offsets, + cu_seqlens, + block_tables, + slot_mapping); + + auto hidden_states = model_->forward(input_ids, position_ids); // MuP lm_head scale baked into lm_head.weight at load time; no forward scaling here. auto logits = lm_head_->forward(hidden_states); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index f6d9bb4d..f665ce0a 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -48,7 +48,7 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrregister_module( - "layers." + std::to_string(i), model_config_, device, i, mixer_types[i], rank_info, attention_backend)); + "layers." 
+ std::to_string(i), model_config_, device, i, mixer_types[i])); } } @@ -59,21 +59,7 @@ void MiniCPMSALAModel::reset_state() { } infinicore::Tensor MiniCPMSALAModel::forward(const infinicore::Tensor &input_ids, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const { - infinilm::global_state::get_forward_context().attn_metadata = - infinilm::global_state::AttentionMetadata(past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); - + const infinicore::Tensor &position_ids) const { // MuP scaling baked into weights at load time for minicpm_sala; no forward scaling here. auto hs = embed_tokens_->forward(input_ids); diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index 9b4a81c2..ed79cd76 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -27,13 +27,7 @@ class MiniCPMSALAModel : public infinicore::nn::Module { const infinicore::Device &device); infinicore::Tensor forward(const infinicore::Tensor &input_ids, - const infinicore::Tensor &position_ids, - std::optional past_sequence_lengths, - std::optional total_sequence_lengths, - std::optional input_offsets, - std::optional cu_seqlens, - std::optional block_tables, - std::optional slot_mapping) const; + const infinicore::Tensor &position_ids) const; void reset_state(); From fe79f914fa2b89629840c88ce2a7d49cf8e918b4 Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Fri, 10 Apr 2026 03:07:00 +0000 Subject: [PATCH 11/11] cleanup code Signed-off-by: Ceng23333 <441651826@qq.com> --- .../minicpm_sala/minicpm_sala_attention.cpp | 79 ++++++++----------- .../minicpm_sala/minicpm_sala_attention.hpp | 12 --- .../minicpm_sala_decoder_layer.cpp | 6 
+- .../minicpm_sala_decoder_layer.hpp | 9 +-- .../minicpm_sala_for_causal_lm.cpp | 21 +---- .../minicpm_sala_for_causal_lm.hpp | 6 +- csrc/models/minicpm_sala/minicpm_sala_mlp.cpp | 10 +-- csrc/models/minicpm_sala/minicpm_sala_mlp.hpp | 4 - .../minicpm_sala/minicpm_sala_model.cpp | 27 +++---- .../minicpm_sala/minicpm_sala_model.hpp | 14 ---- 10 files changed, 56 insertions(+), 132 deletions(-) diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp index af346445..27cb2275 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp @@ -93,35 +93,33 @@ void ensure_gla_state_allocated(infinicore::Tensor &state, MiniCPMSALALightningAttention::MiniCPMSALALightningAttention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx) - : model_config_(std::move(model_config)), - layer_idx_(layer_idx) { - const auto dtype = model_config_->get_dtype(); - attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; - hidden_size_ = model_config_->get("hidden_size"); - - num_attention_heads_ = model_config_->get_or("lightning_nh", model_config_->get("num_attention_heads")); - num_key_value_heads_ = model_config_->get_or("lightning_nkv", model_config_->get("num_key_value_heads")); - head_dim_ = model_config_->get_or("lightning_head_dim", model_config_->get("head_dim")); + : layer_idx_(layer_idx) { + const auto dtype = model_config->get_dtype(); + const size_t hidden_size = model_config->get("hidden_size"); + + num_attention_heads_ = model_config->get_or("lightning_nh", model_config->get("num_attention_heads")); + num_key_value_heads_ = model_config->get_or("lightning_nkv", model_config->get("num_key_value_heads")); + head_dim_ = model_config->get_or("lightning_head_dim", model_config->get("head_dim")); scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); - use_rope_ = 
model_config_->get_or("lightning_use_rope", true); - rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); + use_rope_ = model_config->get_or("lightning_use_rope", true); + rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config, device); - use_qk_norm_ = model_config_->get_or("qk_norm", true); - use_output_gate_ = model_config_->get_or("use_output_gate", true); + use_qk_norm_ = model_config->get_or("qk_norm", true); + use_output_gate_ = model_config->get_or("use_output_gate", true); - INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size, false, dtype, device); if (use_qk_norm_) { - INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config->get("rms_norm_eps"), dtype, device); } use_output_norm_ = true; - INFINICORE_NN_MODULE_INIT(o_norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); - INFINICORE_NN_MODULE_INIT(z_proj, hidden_size_, hidden_size_, false, 
dtype, device); + INFINICORE_NN_MODULE_INIT(o_norm, hidden_size, model_config->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(z_proj, hidden_size, hidden_size, false, dtype, device); std::vector slopes = build_slope_tensor(num_attention_heads_); auto g_cpu = infinicore::Tensor::empty( @@ -196,15 +194,9 @@ infinicore::Tensor MiniCPMSALALightningAttention::forward(const infinicore::Tens size_t cache_pos = 0; const bool has_cache_meta = past_sequence_lengths.has_value() && total_sequence_lengths.has_value(); if (has_cache_meta) { - // Single device-to-host sync: read both scalars (engine could pass these as scalars later). auto past_cpu = past_sequence_lengths.value()->to(infinicore::Device::cpu()); - auto total_cpu = total_sequence_lengths.value()->to(infinicore::Device::cpu()); cache_pos = reinterpret_cast(past_cpu->data())[0]; - size_t total_seq_len_raw = reinterpret_cast(total_cpu->data())[0]; - total_seq_len = total_seq_len_raw; - // Some engine call sites pass `total_sequence_lengths` as the *input* length (e.g. 1 for decode), - // while `past_sequence_lengths` is the cached KV length. Attention needs total KV length. - // Use KV semantics: total_kv_len = cache_pos + current seq_len. + // `total_sequence_lengths` may be input length (e.g. 1 on decode); KV length is cache_pos + seq_len. 
total_seq_len = cache_pos + seq_len; } else if (total_sequence_lengths.has_value()) { total_seq_len = reinterpret_cast(total_sequence_lengths.value()->to(infinicore::Device::cpu())->data())[0]; @@ -377,37 +369,34 @@ infinicore::Tensor MiniCPMSALALightningAttention::forward(const infinicore::Tens MiniCPMSALAMinicpm4Attention::MiniCPMSALAMinicpm4Attention(std::shared_ptr model_config, const infinicore::Device &device, size_t layer_idx) - : model_config_(std::move(model_config)), - layer_idx_(layer_idx) { + : layer_idx_(layer_idx) { (void)device; - const auto dtype = model_config_->get_dtype(); - attention_backend_ = infinilm::global_state::get_infinilm_config().attention_backend; - hidden_size_ = model_config_->get("hidden_size"); - num_attention_heads_ = model_config_->get("num_attention_heads"); - num_key_value_heads_ = model_config_->get("num_key_value_heads"); - head_dim_ = model_config_->get("head_dim"); + const auto dtype = model_config->get_dtype(); + const size_t hidden_size = model_config->get("hidden_size"); + num_attention_heads_ = model_config->get("num_attention_heads"); + num_key_value_heads_ = model_config->get("num_key_value_heads"); + head_dim_ = model_config->get("head_dim"); scaling_ = static_cast(1.0 / std::sqrt(static_cast(head_dim_))); - int sparse_window_size = model_config_->get_or("sparse_window_size", -1); + int sparse_window_size = model_config->get_or("sparse_window_size", -1); if (sparse_window_size <= 0) { - auto sparse_cfg = model_config_->get_or("sparse_config", nlohmann::json{}); + auto sparse_cfg = model_config->get_or("sparse_config", nlohmann::json{}); if (!sparse_cfg.is_null() && sparse_cfg.contains("window_size")) { sparse_window_size = sparse_cfg["window_size"].get(); } else { - sparse_window_size = model_config_->get_or("window_size", -1); + sparse_window_size = model_config->get_or("window_size", -1); } } if (sparse_window_size > 0) { infllmv2_window_left_ = sparse_window_size; - infllmv2_window_right_ = 0; use_local_window_ = 
true; } - INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, num_attention_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, num_key_value_heads_ * head_dim_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(o_gate, hidden_size_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(q_proj, hidden_size, num_attention_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(k_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(v_proj, hidden_size, num_key_value_heads_ * head_dim_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_proj, num_attention_heads_ * head_dim_, hidden_size, false, dtype, device); + INFINICORE_NN_MODULE_INIT(o_gate, hidden_size, hidden_size, false, dtype, device); } void MiniCPMSALAMinicpm4Attention::reset_state() { diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp index 43784627..9af665aa 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_attention.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_attention.hpp @@ -1,10 +1,7 @@ #pragma once -#include "../../backends/attention_backends.hpp" -#include "../../cache/kv_cache.hpp" #include "../../config/model_config.hpp" #include "../../layers/rotary_embedding/rotary_embedding.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -53,11 +50,9 @@ class MiniCPMSALALightningAttention : public MiniCPMSALAAttentionBase { INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, o_norm); INFINICORE_NN_MODULE(infinicore::nn::Linear, z_proj); - std::shared_ptr model_config_; std::shared_ptr rotary_emb_; size_t layer_idx_; - size_t hidden_size_; 
size_t num_attention_heads_; size_t num_key_value_heads_; size_t head_dim_; @@ -68,8 +63,6 @@ class MiniCPMSALALightningAttention : public MiniCPMSALAAttentionBase { bool use_output_norm_ = false; bool use_rope_ = false; - backends::AttentionBackend attention_backend_; - // Lightning layers only: per-head log-decay for Simple GLA (HF _build_slope_tensor * -1). infinicore::Tensor g_gamma_; @@ -101,9 +94,7 @@ class MiniCPMSALAMinicpm4Attention : public MiniCPMSALAAttentionBase { INFINICORE_NN_MODULE(infinicore::nn::Linear, o_proj); INFINICORE_NN_MODULE(infinicore::nn::Linear, o_gate); - std::shared_ptr model_config_; size_t layer_idx_; - size_t hidden_size_; size_t num_attention_heads_; size_t num_key_value_heads_; size_t head_dim_; @@ -111,10 +102,7 @@ class MiniCPMSALAMinicpm4Attention : public MiniCPMSALAAttentionBase { // InfLLM-v2 local-window masking plumbing. int infllmv2_window_left_ = -1; - int infllmv2_window_right_ = -1; bool use_local_window_ = false; - - backends::AttentionBackend attention_backend_; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp index b565cf47..6c04b480 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp @@ -1,6 +1,5 @@ #include "minicpm_sala_decoder_layer.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/ops.hpp" #include "infinicore/context/context.hpp" #include @@ -17,7 +16,6 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptrget_dtype(); const double eps = model_config->get("rms_norm_eps"); @@ -39,6 +37,10 @@ MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptrreset_state(); +} + infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids) const { // Pre-norm attention diff --git 
a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp index 52f31000..305ab967 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.hpp @@ -3,9 +3,7 @@ #include "minicpm_sala_attention.hpp" #include "minicpm_sala_mlp.hpp" -#include "../../backends/attention_backends.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" @@ -16,8 +14,6 @@ namespace infinilm::models::minicpm_sala { -class MiniCPMSALAModel; - class MiniCPMSALADecoderLayer : public infinicore::nn::Module { public: MiniCPMSALADecoderLayer(std::shared_ptr model_config, @@ -28,11 +24,10 @@ class MiniCPMSALADecoderLayer : public infinicore::nn::Module { infinicore::Tensor forward(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids) const; -private: - friend class MiniCPMSALAModel; + void reset_attn_state(); +private: double residual_scale_ = 1.0; - size_t layer_idx_ = 0; protected: INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, input_layernorm); diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp index 791a7832..de6f34e1 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.cpp @@ -25,16 +25,12 @@ std::shared_ptr create_minicpm_sala_model_config( MiniCPMSALAForCausalLM::MiniCPMSALAForCausalLM( std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info, - backends::AttentionBackend attention_backend) { + const infinicore::Device &device) { device_ = device; model_config_ = model_config; // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
const auto dtype = model_config->get_dtype(); - (void)rank_info; - (void)attention_backend; INFINICORE_NN_MODULE_INIT(model, model_config, device); const size_t hidden_size = model_config->get("hidden_size"); @@ -48,21 +44,6 @@ MiniCPMSALAForCausalLM::Output MiniCPMSALAForCausalLM::forward( auto input_ids = input.input_ids.value(); auto position_ids = input.position_ids.value(); - auto past_sequence_lengths = input.past_sequence_lengths; - auto total_sequence_lengths = input.total_sequence_lengths; - auto input_offsets = input.input_offsets; - auto cu_seqlens = input.cu_seqlens; - auto block_tables = input.block_tables; - auto slot_mapping = input.slot_mapping; - - infinilm::global_state::get_forward_context().attn_metadata = - infinilm::global_state::AttentionMetadata(past_sequence_lengths, - total_sequence_lengths, - input_offsets, - cu_seqlens, - block_tables, - slot_mapping); - auto hidden_states = model_->forward(input_ids, position_ids); // MuP lm_head scale baked into lm_head.weight at load time; no forward scaling here. 
diff --git a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp index 9344dfd3..0a53e101 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_for_causal_lm.hpp @@ -4,8 +4,6 @@ #include "minicpm_sala_model.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" -#include "../../backends/attention_backends.hpp" #include "../../layers/linear/linear.hpp" #include "infinicore/device.hpp" @@ -18,9 +16,7 @@ namespace infinilm::models::minicpm_sala { class MiniCPMSALAForCausalLM : public InfinilmModel { public: MiniCPMSALAForCausalLM(std::shared_ptr model_config, - const infinicore::Device &device, - engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), - backends::AttentionBackend attention_backend = backends::AttentionBackend::Default); + const infinicore::Device &device); Output forward(const Input &input) const override; diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp index 649c0095..b9ebd3c6 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.cpp @@ -8,12 +8,12 @@ MiniCPMSALAMLP::MiniCPMSALAMLP(std::shared_ptr mo const infinicore::Device &device) { // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). 
const auto dtype = model_config->get_dtype(); - hidden_size_ = model_config->get("hidden_size"); - intermediate_size_ = model_config->get("intermediate_size"); + const size_t hidden_size = model_config->get("hidden_size"); + const size_t intermediate_size = model_config->get("intermediate_size"); - INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size_, intermediate_size_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(up_proj, hidden_size_, intermediate_size_, false, dtype, device); - INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, false, dtype, device); + INFINICORE_NN_MODULE_INIT(gate_proj, hidden_size, intermediate_size, false, dtype, device); + INFINICORE_NN_MODULE_INIT(up_proj, hidden_size, intermediate_size, false, dtype, device); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size, hidden_size, false, dtype, device); } infinicore::Tensor MiniCPMSALAMLP::forward(const infinicore::Tensor &x) const { diff --git a/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp index 9a90527a..3150670b 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_mlp.hpp @@ -21,10 +21,6 @@ class MiniCPMSALAMLP : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::Linear, gate_proj); INFINICORE_NN_MODULE(infinicore::nn::Linear, up_proj); INFINICORE_NN_MODULE(infinicore::nn::Linear, down_proj); - -private: - size_t hidden_size_; - size_t intermediate_size_; }; } // namespace infinilm::models::minicpm_sala diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.cpp b/csrc/models/minicpm_sala/minicpm_sala_model.cpp index f665ce0a..20c6d420 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.cpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.cpp @@ -12,32 +12,23 @@ namespace infinilm::models::minicpm_sala { MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptr model_config, - const infinicore::Device &device) - : 
model_config_(std::move(model_config)) { + const infinicore::Device &device) { // Match parameter dtype with checkpoint `torch_dtype` (e.g. BF16 for MiniCPM-SALA). - const auto dtype = model_config_->get_dtype(); - compute_device_ = device; - const engine::distributed::RankInfo &rank_info = infinilm::global_state::get_tensor_model_parallel_rank_info(); - const backends::AttentionBackend attention_backend = infinilm::global_state::get_infinilm_config().attention_backend; + const auto dtype = model_config->get_dtype(); - hidden_size_ = model_config_->get("hidden_size"); - dim_model_base_ = model_config_->get_or("dim_model_base", static_cast(hidden_size_)); - scale_emb_ = model_config_->get_or("scale_emb", 1.0); + hidden_size_ = model_config->get("hidden_size"); - const size_t vocab_size = model_config_->get("vocab_size"); - const size_t num_layers = model_config_->get("num_hidden_layers"); + const size_t vocab_size = model_config->get("vocab_size"); + const size_t num_layers = model_config->get("num_hidden_layers"); INFINICORE_NN_MODULE_INIT(embed_tokens, vocab_size, hidden_size_, std::nullopt, dtype, device); - INFINICORE_NN_MODULE_INIT(norm, hidden_size_, model_config_->get("rms_norm_eps"), dtype, device); - - // Shared rotary embedding (used by lightning layers only) — match `get_rope` pattern. - rotary_emb_ = infinilm::layers::rotary_embedding::get_rope(model_config_, device); + INFINICORE_NN_MODULE_INIT(norm, hidden_size_, model_config->get("rms_norm_eps"), dtype, device); // Mixer types per-layer decide attention flavor (minicpm4 vs lightning-attn). std::vector mixer_types; try { - mixer_types = model_config_->get>("mixer_types"); + mixer_types = model_config->get>("mixer_types"); } catch (...) { mixer_types.assign(num_layers, "minicpm4"); } @@ -48,13 +39,13 @@ MiniCPMSALAModel::MiniCPMSALAModel(std::shared_ptrregister_module( - "layers." + std::to_string(i), model_config_, device, i, mixer_types[i])); + "layers." 
+ std::to_string(i), model_config, device, i, mixer_types[i])); } } void MiniCPMSALAModel::reset_state() { for (auto &layer : layers_) { - layer->self_attn_->reset_state(); + layer->reset_attn_state(); } } diff --git a/csrc/models/minicpm_sala/minicpm_sala_model.hpp b/csrc/models/minicpm_sala/minicpm_sala_model.hpp index ed79cd76..811ecbf7 100644 --- a/csrc/models/minicpm_sala/minicpm_sala_model.hpp +++ b/csrc/models/minicpm_sala/minicpm_sala_model.hpp @@ -2,17 +2,10 @@ #include "minicpm_sala_decoder_layer.hpp" -#include "../../backends/attention_backends.hpp" -#include "../../cache/cache.hpp" #include "../../config/model_config.hpp" -#include "../../engine/distributed/distributed.hpp" - -#include "../../layers/rotary_embedding/rotary_embedding.hpp" -#include "../../global_state/global_state.hpp" #include "infinicore/nn/embedding.hpp" #include "infinicore/nn/module.hpp" #include "infinicore/nn/rmsnorm.hpp" -#include "infinicore/nn/rope.hpp" #include "infinicore/tensor.hpp" #include @@ -32,7 +25,6 @@ class MiniCPMSALAModel : public infinicore::nn::Module { void reset_state(); size_t hidden_size() const { return hidden_size_; } - double dim_model_base() const { return dim_model_base_; } protected: INFINICORE_NN_MODULE(infinicore::nn::Embedding, embed_tokens); @@ -40,13 +32,7 @@ class MiniCPMSALAModel : public infinicore::nn::Module { INFINICORE_NN_MODULE(infinicore::nn::RMSNorm, norm); private: - std::shared_ptr model_config_; - std::shared_ptr rotary_emb_; - infinicore::Device compute_device_; - size_t hidden_size_; - double scale_emb_; - double dim_model_base_; }; } // namespace infinilm::models::minicpm_sala