Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ def __init__(
self.enable_logprob = False
self.max_logprobs = 20
self.logprobs_mode = "raw_logprobs"
self.enable_keep_sampling_mask = False
self.redundant_experts_num = 0
self.seed = 0
self.quantization = None
Expand Down
20 changes: 20 additions & 0 deletions fastdeploy/engine/args_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,14 @@ class EngineArgs:
Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
"""

enable_keep_sampling_mask: bool = False
"""
When enabled, the server returns a sparse index list for each generated token, indicating
which vocabulary positions were retained after top_p/top_k sampling, and streams it to
the client. In MTP (multi-token prediction) scenarios this field is a List[List[int]],
where each inner list contains the retained vocabulary indices for a predicted token.
"""

max_logprobs: int = 20
"""
Maximum number of log probabilities to return when `enable_logprob` is True. The default value comes from the default for the
Expand Down Expand Up @@ -893,6 +901,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=EngineArgs.enable_logprob,
help="Enable output of token-level log probabilities.",
)
model_group.add_argument(
"--enable-keep-sampling-mask",
action="store_true",
default=EngineArgs.enable_keep_sampling_mask,
help=(
Comment on lines +904 to +908
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

按仓库 PR 规范,标题需要使用预置的 tag(例如 [Feature]/[APIServer]/[Engine]/[Speculative Decoding] 等)。当前标题使用的 [KSM] 不在模板列出的 tag 列表里且语义不够明确,建议改为更通用且可检索的 tag(如本 PR 涉及采样/接口输出,可考虑 [Feature] 或 [APIServer]/[Engine] 组合)。

Copilot generated this review using guidance from repository custom instructions.
"Enable output of sampling mask as a sparse index list over the vocabulary. "
Comment on lines +904 to +909
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR 标题目前使用了 [KSM] 标签,但仓库模板给出的 tag 列表里没有该项,可能影响自动检查/统计。建议将标题 tag 调整为列表内语义最接近的类型(例如 [Feature]、[APIServer] 或 [Engine]),并保持格式为“[TAG] Title”。

Copilot uses AI. Check for mistakes.
"For non-MTP decoding, this is a list[int] per token step indicating which "
"vocabulary indices were kept after top_p/top_k sampling. "
"For MTP decoding, this is a list[list[int]] per token step, where each inner "
"list corresponds to one MTP group."
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

开启 keep_sampling_mask 时,如果 top_p=1.0 且未设置有效 top_k,按当前实现会保留整个 vocab 的索引列表(每个 token 都可能返回接近 vocab_size 个整数),不仅计算端需要完整 argsort/拷贝,网络/序列化开销也会非常大。建议在 CLI help/文档里明确提示该开销风险,或在实现侧对 top_p>=1.0 时提供可选的上限/降采样策略(例如强制要求 top_k>0 或支持只返回 boundary 阈值而非全量索引)。

Suggested change
"list corresponds to one MTP group."
"list corresponds to one MTP group. Warning: when top_p >= 1.0 and top_k is "
"unset or non-positive, the returned index list may include nearly the entire "
"vocabulary for each token step, which can significantly increase compute, "
"memory, serialization, and network overhead. Prefer using this option with a "
"bounded top_k to avoid very large responses."

Copilot uses AI. Check for mistakes.
),
)
model_group.add_argument(
"--max-logprobs",
type=int,
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/engine/common_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2508,6 +2508,7 @@ def _start_worker_service(self):
"moe_gate_fp32": self.cfg.model_config.moe_gate_fp32,
"enable_entropy": self.cfg.model_config.enable_entropy,
"enable_overlap_schedule": self.cfg.scheduler_config.enable_overlap_schedule,
"enable_keep_sampling_mask": self.cfg.model_config.enable_keep_sampling_mask,
}
for worker_flag, value in worker_store_true_flag.items():
if value:
Expand Down
1 change: 1 addition & 0 deletions fastdeploy/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,7 @@ def _start_worker_service(self):
"enable_entropy": self.cfg.model_config.enable_entropy,
"ep_prefill_use_worst_num_tokens": self.cfg.parallel_config.ep_prefill_use_worst_num_tokens,
"enable_overlap_schedule": self.cfg.scheduler_config.enable_overlap_schedule,
"enable_keep_sampling_mask": self.cfg.model_config.enable_keep_sampling_mask,
}
for worker_flag, value in worker_store_true_flag.items():
if value:
Expand Down
5 changes: 5 additions & 0 deletions fastdeploy/engine/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,10 @@ class CompletionOutput:
delta_message: Optional[DeltaMessage] = None
multipart: Optional[list[Any]] = None
num_image_tokens: Optional[int] = None
# Sparse indices of retained vocab ids:
# - Non-MTP: list[int]
# - MTP: list[list[int]]
sampling_mask: Optional[Any] = None

def to_dict(self):
"""
Expand All @@ -745,6 +749,7 @@ def to_dict(self):
"text": self.text,
"reasoning_content": self.reasoning_content,
"reasoning_token_num": self.reasoning_token_num,
"sampling_mask": self.sampling_mask,
}

@classmethod
Expand Down
5 changes: 5 additions & 0 deletions fastdeploy/entrypoints/openai/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ class ChatCompletionResponseChoice(BaseModel):
prompt_logprobs: Optional[PromptLogprobs] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]]
speculate_metrics: Optional[SpeculateMetrics] = None
# Per-token retained vocab indices from top_p/top_k sampling: List[List[int]], one list of vocab indices per token
sampling_mask: Optional[List[List[int]]] = None


class ChatCompletionResponse(BaseModel):
Expand Down Expand Up @@ -333,6 +335,9 @@ class ChatCompletionResponseStreamChoice(BaseModel):
logprobs: Optional[LogProbs] = None
draft_logprobs: Optional[LogProbs] = None
prompt_logprobs: Optional[PromptLogprobs] = None
# Per-token index list of retained positions after top_p sampling.
# Non-MTP: [[idx, ...]] (1 token/step). MTP: [[idx, ...], ...] (N accepted tokens/step).
sampling_mask: Optional[List[List[int]]] = None
finish_reason: Optional[Literal["stop", "length", "tool_calls", "recover_stop"]] = None
arrival_time: Optional[float] = None
speculate_metrics: Optional[SpeculateMetrics] = None
Expand Down
32 changes: 32 additions & 0 deletions fastdeploy/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,11 @@ async def chat_completion_stream_generator(
delta=delta_message,
logprobs=logprobs_res,
draft_logprobs=draft_logprobs_res,
sampling_mask=(
self._make_sampling_mask_list(output["sampling_mask"])
if output.get("sampling_mask") is not None
else None
),
arrival_time=arrival_time,
speculate_metrics=output_speculate_metrics,
)
Expand Down Expand Up @@ -580,6 +585,7 @@ async def chat_completion_full_generator(
decoder_base_url=self.tokenizer_base_url,
)
prompt_logprobs_res_list = [[] for _ in range(num_choices)]
sampling_mask_list = [[] for _ in range(num_choices)]
speculate_metrics = [None for _ in range(num_choices)]
choices = []
while num_choices > 0:
Expand Down Expand Up @@ -660,6 +666,9 @@ async def chat_completion_full_generator(
)
if prompt_logprobs_res:
prompt_logprobs_res_list[idx].extend(clamp_prompt_logprobs(prompt_logprobs_res))
output_sampling_mask = output.get("sampling_mask", None)
if output_sampling_mask is not None:
sampling_mask_list[idx].append(self._make_sampling_mask_list(output_sampling_mask))
speculate_metrics[idx] = data["metrics"].get("speculate_metrics", None)
if data["finished"]:
trace_carrier = data.get("trace_carrier")
Expand Down Expand Up @@ -695,6 +704,7 @@ async def chat_completion_full_generator(
draft_logprob_contents=draft_logprob_contents,
response_processor=response_processor,
prompt_logprobs_res_list=prompt_logprobs_res_list,
sampling_mask_list=sampling_mask_list,
max_tokens=max_tokens,
speculate_metrics=speculate_metrics[idx],
)
Expand Down Expand Up @@ -749,6 +759,7 @@ async def _create_chat_completion_choice(
logprob_contents: list,
draft_logprob_contents: list,
prompt_logprobs_res_list: list,
sampling_mask_list: list,
response_processor: ChatResponseProcessor,
max_tokens: int,
speculate_metrics: SpeculateMetrics | None,
Expand Down Expand Up @@ -787,6 +798,11 @@ async def _create_chat_completion_choice(
if prompt_logprobs_res_list[idx]:
prompt_logprobs_full_res = prompt_logprobs_res_list[idx]

# Flatten per-step List[List[int]] into a single List[List[int]] over all tokens.
sampling_mask_full_res = None
if sampling_mask_list and sampling_mask_list[idx]:
sampling_mask_full_res = [mask for step in sampling_mask_list[idx] for mask in step]

num_cached_tokens[idx] = data.get("num_cached_tokens", 0)
num_input_image_tokens[idx] = data.get("num_input_image_tokens", 0)
num_input_video_tokens[idx] = data.get("num_input_video_tokens", 0)
Expand All @@ -810,6 +826,7 @@ async def _create_chat_completion_choice(
logprobs=logprobs_full_res,
draft_logprobs=draft_logprobs_full_res,
prompt_logprobs=prompt_logprobs_full_res,
sampling_mask=sampling_mask_full_res,
finish_reason=finish_reason,
speculate_metrics=speculate_metrics,
)
Expand Down Expand Up @@ -1000,3 +1017,18 @@ def _make_logprob_dict(
)
for token_id, logprob, rank, token in zip(logprob_token_ids, logprobs, ranks, decoded_tokens)
}

@staticmethod
def _make_sampling_mask_list(sampling_mask) -> List[List[int]]:
"""Wrap sampling_mask into a uniform List[List[int]] format.

sampling_mask is already in sparse-index form (no bool-to-index conversion needed):
Non-MTP: List[int] (indices for 1 token/step) → [[idx, ...]]
MTP: List[List[int]] (indices for N tokens/step) → [[idx, ...], ...]
"""
assert sampling_mask is not None
if sampling_mask and isinstance(sampling_mask[0], list):
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 _make_sampling_mask_list 方法在 sampling_mask 为空列表时会抛出 IndexError。

sampling_mask 为空列表 [] 时,isinstance(sampling_mask[0], list) 会抛出 IndexError: list index out of range

建议修复方式:

@staticmethod
def _make_sampling_mask_list(sampling_mask) -> List[List[int]]:
    assert sampling_mask is not None
    if sampling_mask and isinstance(sampling_mask[0], list):
        # MTP: already List[List[int]], return as-is
        return sampling_mask
    # Non-MTP: already List[int], wrap in outer list for uniform format
    return [sampling_mask]

或者在调用方增加空列表检查。

# MTP: already List[List[int]], return as-is
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 _make_sampling_mask_list 函数中的 isinstance(sampling_mask[0], list) 访问可能在 sampling_mask 为空列表时失败。

keep_sampling_mask=True 但某个请求的输出没有实际 token 时(如 accept_num=0),sampling_mask 可能为空列表 [],导致 sampling_mask[0] 访问越界。

建议添加空列表检查:

if sampling_mask and isinstance(sampling_mask[0], list):
    return sampling_mask

return sampling_mask
# Non-MTP: already List[int], wrap in outer list for uniform format
return [sampling_mask]
11 changes: 6 additions & 5 deletions fastdeploy/model_executor/layers/sample/logprobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,6 @@ def build_output_logprobs(
logprobs_tensors = None
cu_batch_token_offset = None

if num_logprobs is None:
return logprobs_tensors, cu_batch_token_offset

real_bsz = share_inputs["seq_lens_this_time"].shape[0]

if is_naive:
Expand Down Expand Up @@ -208,6 +205,10 @@ def build_output_logprobs(
mask = idx < share_inputs["accept_num"].unsqueeze(1)
token_ids = paddle.masked_select(share_inputs["accept_tokens"], mask)

# Adapt for sampling mask
if num_logprobs is None:
return None, None, output_logits

# Compute logprobs with temperature scaling and top_p normalization
Comment on lines 205 to 212
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

build_output_logprobs 现在在 num_logprobs is None 时返回 (None, None, output_logits),并且后续也返回 3 个值;但函数签名/类型标注/文档仍描述为返回 2-tuple。这会误导调用方并让类型检查失效。建议同步更新返回类型标注与 docstring,或拆分“提取 output_logits”逻辑到独立函数以保持接口清晰。

Copilot uses AI. Check for mistakes.
if logprobs_mode == "raw_logprobs":
raw_logprobs = compute_logprobs_fn(output_logits, sampling_metadata)
Expand All @@ -217,5 +218,5 @@ def build_output_logprobs(
raw_logprobs = F.log_softmax(output_logits, axis=-1)

logprobs_tensors = gather_logprobs(raw_logprobs, num_logprobs, token_ids=token_ids)

return logprobs_tensors, cu_batch_token_offset
# output_logits is used to compute sampling_mask
return logprobs_tensors, cu_batch_token_offset, output_logits
2 changes: 2 additions & 0 deletions fastdeploy/model_executor/layers/sample/meta_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,5 @@ class SamplingMetadata:
# Add for HPU post-processing
seq_lens_encoder: Optional[paddle.Tensor] = None
seq_lens_decoder: Optional[paddle.Tensor] = None
# Add for keep sampling mask
keep_sampling_mask: Optional[bool] = None
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ def top_k_top_p_sampling(
if topp_seed is not None:
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
topp_seed_device.copy_(topp_seed, False)
if top_k_list and any(x > 0 for x in top_k_list):
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 建议 请确认第 128-136 行新增的 top_k_renorm_probs 调用是否必要。

从代码结构看,第 87-97 行已经在 else 分支中对所有非 "rejection" 采样器调用了 top_k_renorm_probs(x, top_k),修改后的 probs 会被传递到后续的 if top_p_class == "air"elif top_p_class == "base_non_truncated" 分支。

第 128-136 行的新增代码位于内层 else 分支(即既不是 "air" 也不是 "base_non_truncated" 的情况),这可能导致 top_k_renorm_probs 被调用两次,或者调用逻辑与第 87-97 行重复。

建议检查代码缩进和逻辑分支,确保 top_k_renorm_probs 只在需要的位置被调用一次。

try:
from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs

x = top_k_renorm_probs(x, top_k)
except ImportError:
logger.warning(
"top_k_renorm_probs is not supported on current platform, skipping top_k_renorm_probs."
)

Comment on lines +128 to +137
Copy link

Copilot AI Apr 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里在进入 paddle.tensor.top_p_sampling 的分支里又重复执行了一次 top_k_renorm_probs(上面行 87-97 已经做过一次)。这会造成重复计算,并且在部分平台(例如 iluvatar)还会出现前后 import 路径不一致/重复 warning 的问题。建议删除这一段重复的 top_k_renorm_probs 调用,保留前面的统一处理即可。

Suggested change
if top_k_list and any(x > 0 for x in top_k_list):
try:
from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs
x = top_k_renorm_probs(x, top_k)
except ImportError:
logger.warning(
"top_k_renorm_probs is not supported on current platform, skipping top_k_renorm_probs."
)

Copilot uses AI. Check for mistakes.
_, ids = paddle.tensor.top_p_sampling(
x,
top_p,
Expand Down
Loading
Loading