diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc index 2a040a7e7b4..f72f3774107 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc @@ -36,8 +36,9 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens, int msg_queue_id, int save_each_rank, bool skip_prefill) { - // printf("enter save output"); - if (!save_each_rank && rank_id > 0) { + // NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling + // outputs, so only rank 0 needs to send results to the message queue. + if (rank_id > 0) { return; } diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc index 53e822e6223..3d75886bd25 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc @@ -53,7 +53,9 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids, int message_flag, // Target: 3, Draft: 4 int64_t rank_id, bool save_each_rank) { - if (!save_each_rank && rank_id > 0) { + // NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling + // outputs, so only rank 0 needs to send results to the message queue. + if (rank_id > 0) { return; } diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index c0e689735d4..bc315c3646b 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -345,9 +345,7 @@ def _predict_next_launch_token_num(self) -> int: is_block_step_cpu = self.share_inputs["is_block_step_cpu"].numpy() next_real_bsz = (seq_lens_this_time_cpu > 0).sum().item() + (is_block_step_cpu > 0).sum().item() token_num_one_step = (self.speculative_config.num_speculative_tokens + 1) if self.speculative_decoding else 1 - next_launch_token_num = ( - seq_lens_this_time_cpu.sum().item() + is_block_step_cpu.sum().item() * token_num_one_step - ) + next_launch_token_num = next_real_bsz * token_num_one_step return next_launch_token_num, next_real_bsz def only_prefill(self):