Skip to content

Commit a505f8c

Browse files
author
钮圣虓
committed
perf
1 parent d150428 commit a505f8c

3 files changed

Lines changed: 56 additions & 7 deletions

File tree

lightllm/common/basemodel/attention/nsa/fp8_flashmla_sparse.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,20 @@ def _nsa_prefill_att(
7676
nsa_dict = att_control.nsa_prefill_dict
7777
layer_index = nsa_dict["layer_index"]
7878
topk_indices = nsa_dict["topk_indices"]
79+
topk_indices_local = nsa_dict["topk_indices_local"]
80+
prefill_cache_kv = nsa_dict["prefill_cache_kv"]
7981
softmax_scale = nsa_dict["softmax_scale"]
8082
kv_lora_rank = nsa_dict["kv_lora_rank"]
8183

82-
kv = self.infer_state.mem_manager.get_prefill_kv_cache(layer_index)
84+
if self.infer_state.prefix_total_token_num > 0:
85+
kv, topk_indices = self.infer_state.mem_manager.get_prefill_kv_cache_and_remap_indices(
86+
layer_index=layer_index,
87+
topk_indices=topk_indices,
88+
)
89+
else:
90+
kv = prefill_cache_kv
91+
topk_indices = topk_indices_local
92+
8393
if topk_indices.ndim == 2:
8494
topk_indices = topk_indices.unsqueeze(1)
8595

lightllm/common/kv_cache_mem_manager/fp8_per_token_group_quant_deepseek3_2mem_manager.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,7 @@ def get_att_input_params(self, layer_index: int) -> Any:
6363
def get_flashmla_kv_cache(self, layer_index: int) -> torch.Tensor:
    """Expose the layer's packed KV buffer in FlashMLA layout.

    Reshapes the raw per-layer buffer to (tokens, 1, 1, bytes_per_token)
    without copying; the total byte count must be divisible by
    `self.flashmla_bytes_per_token`.
    """
    layer_buffer = self.kv_buffer[layer_index]
    return layer_buffer.view(-1, 1, 1, self.flashmla_bytes_per_token)
6565

66-
def get_prefill_kv_cache(self, layer_index: int) -> torch.Tensor:
67-
packed_kv = self.kv_buffer[layer_index]
66+
def _dequantize_packed_kv(self, packed_kv: torch.Tensor) -> torch.Tensor:
6867
kv_nope = packed_kv[:, :, : self.kv_nope_dim].view(torch.float8_e4m3fn)
6968
kv_scale = packed_kv[:, :, self.kv_nope_dim : self.kv_nope_dim + self.quant_group_num * 4].view(torch.float32)
7069
kv_rope = packed_kv[:, :, self.kv_nope_dim + self.quant_group_num * 4 :].view(torch.bfloat16)
@@ -82,6 +81,39 @@ def get_prefill_kv_cache(self, layer_index: int) -> torch.Tensor:
8281
kv[:, :, self.kv_nope_dim :] = kv_rope.to(self.prefill_dtype)
8382
return kv
8483

84+
def get_prefill_kv_cache(self, layer_index: int) -> torch.Tensor:
    """Return the whole KV cache of `layer_index`, dequantized for prefill.

    Reads the layer's packed (quantized) buffer and runs it through the
    shared dequantization helper; no row selection is performed here.
    """
    packed = self.kv_buffer[layer_index]
    return self._dequantize_packed_kv(packed)
86+
87+
def get_prefill_kv_cache_and_remap_indices(self, layer_index: int, topk_indices: torch.Tensor):
    """Gather and dequantize only the KV rows referenced by `topk_indices`.

    Instead of dequantizing the entire layer cache, collect the unique
    memory indices that appear in `topk_indices` (entries equal to -1
    mark unused slots and are preserved), dequantize just those rows,
    and rewrite the indices to point into the compacted tensor.

    Args:
        layer_index: layer whose packed KV buffer is read.
        topk_indices: integer index tensor, 2-D (tokens, k) or 3-D with a
            middle head dim; -1 marks invalid slots.

    Returns:
        (compact_kv, remapped): dequantized rows for the unique valid
        indices, and `topk_indices` rewritten into positions within
        `compact_kv` (same shape as the input; -1 entries stay -1).
    """
    had_no_head_dim = topk_indices.ndim == 2
    if had_no_head_dim:
        topk_indices = topk_indices.unsqueeze(1)

    usable = topk_indices.ne(-1)
    flat_valid = topk_indices[usable]

    if flat_valid.numel() == 0:
        # Nothing to gather: hand back an empty cache and the indices untouched.
        no_rows = torch.empty(
            (0, 1, self.kv_head_dim),
            dtype=self.prefill_dtype,
            device=topk_indices.device,
        )
        passthrough = topk_indices.clone()
        return no_rows, passthrough.squeeze(1) if had_no_head_dim else passthrough

    # Deduplicate referenced rows; `positions` maps every valid entry to
    # its row inside the compacted gather below.
    unique_rows, positions = torch.unique(flat_valid, sorted=False, return_inverse=True)
    gathered = self.kv_buffer[layer_index].index_select(0, unique_rows.to(torch.int64))
    compact_kv = self._dequantize_packed_kv(gathered)

    remapped = torch.full_like(topk_indices, -1)
    remapped[usable] = positions.to(remapped.dtype)

    return compact_kv, remapped.squeeze(1) if had_no_head_dim else remapped
116+
85117
def get_indexer_k_buffer(self, layer_index: int) -> torch.Tensor:
    """Return the indexer K buffer tensor belonging to `layer_index`."""
    buffers = self.indexer_k_buffer
    return buffers[layer_index]
87119

lightllm/models/deepseek3_2/layer_infer/transformer_layer_infer.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,13 @@ def _context_attention_kernel(
7575

7676
# 计算 topk_indices
7777
att_state = infer_state.prefill_att_state
78-
topk_indices = self.indexer.get_indices(
78+
topk_indices_local, topk_indices = self.indexer.get_indices(
7979
hidden_states=infer_state.get_topk_indices_params["hidden_states"],
8080
q_lora=infer_state.get_topk_indices_params["q_lora"],
8181
infer_state=infer_state,
8282
att_state=att_state,
8383
layer_weight=layer_weight,
84+
return_local_index=True,
8485
)
8586
del infer_state.get_topk_indices_params
8687

@@ -90,6 +91,8 @@ def _context_attention_kernel(
9091
nsa_prefill_dict={
9192
"layer_index": self.layer_num_,
9293
"topk_indices": topk_indices,
94+
"topk_indices_local": topk_indices_local,
95+
"prefill_cache_kv": kv,
9396
"softmax_scale": self.softmax_scale,
9497
"kv_lora_rank": self.kv_lora_rank,
9598
},
@@ -171,6 +174,7 @@ def get_indices(
171174
infer_state: Deepseek2InferStateInfo,
172175
att_state: Any,
173176
layer_weight: Deepseek3_2TransformerLayerWeight,
177+
return_local_index: bool = False,
174178
) -> torch.Tensor:
175179

176180
q, k = self._get_q_k_bf16(hidden_states, q_lora, infer_state, layer_weight)
@@ -234,15 +238,18 @@ def get_indices(
234238
row_starts=ks,
235239
)
236240
b_topk_index = torch.where(b_topk_index != -1, b_topk_index + ks.view(-1, 1), -1)
241+
local_topk_index = b_topk_index
237242
# 将 topk index 转化为 mem index
238243
from ..triton_kernel.topk_index_to_mem_index import trans_topk_index_to_mem_index
239244

240-
b_topk_index = trans_topk_index_to_mem_index(
241-
topk_index=b_topk_index,
245+
b_topk_mem_index = trans_topk_index_to_mem_index(
246+
topk_index=local_topk_index,
242247
ragged_mem_index=att_state.ragged_mem_index,
243248
)
244249

245-
return b_topk_index
250+
if return_local_index:
251+
return local_topk_index, b_topk_mem_index
252+
return b_topk_mem_index
246253

247254
@staticmethod
248255
def _rotate_activation(x: torch.Tensor) -> torch.Tensor:

0 commit comments

Comments
 (0)