From c7b49c3b030617950c3b084cf9bea47852552658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cliuruian=E2=80=9D?= Date: Tue, 7 Apr 2026 16:52:56 +0800 Subject: [PATCH] Use triton qk_norm both in Prefill and Decode --- fastdeploy/model_executor/layers/normalization.py | 2 +- tests/e2e/test_Qwen3VL_serving.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/normalization.py b/fastdeploy/model_executor/layers/normalization.py index 14e248e0a72..4532b8d27af 100644 --- a/fastdeploy/model_executor/layers/normalization.py +++ b/fastdeploy/model_executor/layers/normalization.py @@ -341,7 +341,7 @@ def forward( forward_meta, proxy_rmsnorm=None, ) -> paddle.Tensor: - if proxy_rmsnorm is None and self.qk_norm_fused and forward_meta.step_use_cudagraph: + if proxy_rmsnorm is None and self.qk_norm_fused: qkv_out = qk_rmsnorm_fused( qkv_out, self.q_norm.weight, diff --git a/tests/e2e/test_Qwen3VL_serving.py b/tests/e2e/test_Qwen3VL_serving.py index 3872b4050ce..bbb053b13dd 100644 --- a/tests/e2e/test_Qwen3VL_serving.py +++ b/tests/e2e/test_Qwen3VL_serving.py @@ -173,7 +173,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): content1 = result1["choices"][0]["message"]["content"] # base result - content2 = "视频中手机支架的颜色是黑色的。" + content2 = "视频中手机支架的颜色是黑色。" # Verify that result is same as the base result assert content1.startswith(content2), content1