From 9d83df2717555bfcf5ce2f63ef85b87d97c38d98 Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Tue, 24 Mar 2026 14:20:31 +0800 Subject: [PATCH 01/11] merge matmul and add --- fastdeploy/model_executor/layers/linear.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 2bee885ff43..39591c9f656 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -82,11 +82,9 @@ def process_loaded_weights(self, layer, weights) -> None: layer.weight.set_value(weights) def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: - linear_out = paddle.matmul(x, layer.weight) if layer.with_bias: - linear_out = paddle.add(linear_out, layer.bias) - return linear_out - + return paddle._C_ops.linear(x, layer.weight, layer.bias) + return paddle.matmul(x, layer.weight) class LinearBase(nn.Layer): """ From 954645bdb7993421d0d1792ea2e687c7cfb7581e Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Tue, 24 Mar 2026 14:27:14 +0800 Subject: [PATCH 02/11] modify format --- fastdeploy/model_executor/layers/linear.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 39591c9f656..5a8b549bad3 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -86,6 +86,7 @@ def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: return paddle._C_ops.linear(x, layer.weight, layer.bias) return paddle.matmul(x, layer.weight) + class LinearBase(nn.Layer): """ LinearBase Layer. From 272272cf7dffba0c4504caae9bb37fc53df90e9e Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Tue, 24 Mar 2026 16:56:27 +0800 Subject: [PATCH 03/11] using paddle.nn.functional.linear --- fastdeploy/model_executor/layers/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 5a8b549bad3..c13f8d4dec2 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -83,7 +83,7 @@ def process_loaded_weights(self, layer, weights) -> None: def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: if layer.with_bias: - return paddle._C_ops.linear(x, layer.weight, layer.bias) + return paddle.nn.functional.linear(x, layer.weight, layer.bias) return paddle.matmul(x, layer.weight) From 70a0724c278fd253dc2a8e886f05986dc44d53fa Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Thu, 26 Mar 2026 22:07:46 +0800 Subject: [PATCH 04/11] using _C_ops.linear --- fastdeploy/model_executor/layers/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index c13f8d4dec2..5a8b549bad3 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -83,7 +83,7 @@ def process_loaded_weights(self, layer, weights) -> None: def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: if layer.with_bias: - return paddle.nn.functional.linear(x, layer.weight, layer.bias) + return paddle._C_ops.linear(x, layer.weight, layer.bias) return paddle.matmul(x, layer.weight) From 7ac24481f90e307c29276d5a6769c7b907ab3a49 Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Fri, 27 Mar 2026 11:48:28 +0800 Subject: [PATCH 05/11] using paddle.nn.functional.linear --- fastdeploy/model_executor/layers/linear.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 5a8b549bad3..468a608cd40 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -82,9 +82,8 @@ def process_loaded_weights(self, layer, weights) -> None: layer.weight.set_value(weights) def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: - if layer.with_bias: - return paddle._C_ops.linear(x, layer.weight, layer.bias) - return paddle.matmul(x, layer.weight) + out = paddle.nn.functional.linear(x, layer.weight, layer.bias if layer.with_bias else None) + return out class LinearBase(nn.Layer): From a8ed2a1050dbc2fe2b5efd2a862a53728c696d15 Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Sun, 29 Mar 2026 17:10:40 +0800 Subject: [PATCH 06/11] add FLAGS_use_legacy_linear env var in test case --- tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py | 1 + tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py | 1 + tests/e2e/test_EB_Lite_serving.py | 1 + tests/e2e/test_Qwen3VLMoe_serving.py | 1 + tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py | 1 + 5 files changed, 5 insertions(+) diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py index b3bd6d9b455..0280969216c 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py @@ -35,6 +35,7 @@ is_port_open, ) +os.environ["FLAGS_use_legacy_linear"] = "1" @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py index 9a46b9cd0a2..a797e0432b0 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py @@ -35,6 +35,7 @@ is_port_open, ) +os.environ["FLAGS_use_legacy_linear"] = "1" @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 6c84306927c..6e9ef2fc9d0 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -32,6 +32,7 @@ is_port_open, ) +os.environ["FLAGS_use_legacy_linear"] = "1" @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): diff --git a/tests/e2e/test_Qwen3VLMoe_serving.py b/tests/e2e/test_Qwen3VLMoe_serving.py index 01fee8bb4ed..e3d803a35a4 100644 --- a/tests/e2e/test_Qwen3VLMoe_serving.py +++ b/tests/e2e/test_Qwen3VLMoe_serving.py @@ -30,6 +30,7 @@ is_port_open, ) +os.environ["FLAGS_use_legacy_linear"] = "1" @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py index 71ee1607a21..f8dc390c466 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py @@ -38,6 +38,7 @@ is_port_open, ) +os.environ["FLAGS_use_legacy_linear"] = "1" # Read ports from environment variables; use default values if not set FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533)) From c420da7ef24819f5664f3a2de080e118da71a98e Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Sun, 29 Mar 2026 22:22:50 +0800 Subject: [PATCH 07/11] fix format --- tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py | 1 + tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py | 1 + tests/e2e/test_EB_Lite_serving.py | 1 + tests/e2e/test_Qwen3VLMoe_serving.py | 1 + 4 files changed, 4 insertions(+) diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py index 0280969216c..8b11d6ccd9f 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py @@ -37,6 +37,7 @@ os.environ["FLAGS_use_legacy_linear"] = "1" + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py index a797e0432b0..c31dec46832 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py @@ -37,6 +37,7 @@ os.environ["FLAGS_use_legacy_linear"] = "1" + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 6e9ef2fc9d0..26b2dbf1831 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -34,6 +34,7 @@ os.environ["FLAGS_use_legacy_linear"] = "1" + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ diff --git a/tests/e2e/test_Qwen3VLMoe_serving.py b/tests/e2e/test_Qwen3VLMoe_serving.py index e3d803a35a4..e61a8f7b356 100644 --- a/tests/e2e/test_Qwen3VLMoe_serving.py +++ b/tests/e2e/test_Qwen3VLMoe_serving.py @@ -32,6 +32,7 @@ os.environ["FLAGS_use_legacy_linear"] = "1" + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ From 0c421b7f6655782cd7598822146f9b219493860d Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Fri, 3 Apr 2026 11:28:35 +0800 Subject: [PATCH 08/11] add assert and remove env --- fastdeploy/model_executor/layers/linear.py | 10 +++++++++- tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py | 1 - tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py | 2 -- tests/e2e/test_EB_Lite_serving.py | 2 -- tests/e2e/test_Qwen3VLMoe_serving.py | 2 -- .../test_ernie_03b_pd_router_v1_rdma_global_cache.py | 1 - 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 468a608cd40..65183ed61af 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -82,7 +82,15 @@ def process_loaded_weights(self, layer, weights) -> None: layer.weight.set_value(weights) def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: - out = paddle.nn.functional.linear(x, layer.weight, layer.bias if layer.with_bias else None) + if layer.with_bias: + bias = layer.bias + assert bias.dim() == 1 and bias.shape[-1] == layer.weight.shape[-1], \ + f"bias must be 1D with size equal to the last dim of weight, " \ + f"but got bias.shape={bias.shape}, weight.shape[-1]={layer.weight.shape[-1]}" + else: + bias = None + + out = paddle.nn.functional.linear(x, layer.weight, bias) return out diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py index 8b11d6ccd9f..072c1186e83 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py @@ -35,7 +35,6 @@ is_port_open, ) -os.environ["FLAGS_use_legacy_linear"] = "1" @pytest.fixture(scope="session", autouse=True) diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py index c31dec46832..9a46b9cd0a2 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py @@ -35,8 +35,6 @@ is_port_open, ) -os.environ["FLAGS_use_legacy_linear"] = "1" - @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py index 26b2dbf1831..6c84306927c 100644 --- a/tests/e2e/test_EB_Lite_serving.py +++ b/tests/e2e/test_EB_Lite_serving.py @@ -32,8 +32,6 @@ is_port_open, ) -os.environ["FLAGS_use_legacy_linear"] = "1" - @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): diff --git a/tests/e2e/test_Qwen3VLMoe_serving.py b/tests/e2e/test_Qwen3VLMoe_serving.py index e61a8f7b356..01fee8bb4ed 100644 --- a/tests/e2e/test_Qwen3VLMoe_serving.py +++ b/tests/e2e/test_Qwen3VLMoe_serving.py @@ -30,8 +30,6 @@ is_port_open, ) -os.environ["FLAGS_use_legacy_linear"] = "1" - @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py index f8dc390c466..71ee1607a21 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py +++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py @@ -38,7 +38,6 @@ is_port_open, ) -os.environ["FLAGS_use_legacy_linear"] = "1" # Read ports from environment variables; use default values if not set FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433)) FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533)) From 251eb4ecccc10b6eff78b91c99e7c56bca35ca53 Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Fri, 3 Apr 2026 11:30:19 +0800 Subject: [PATCH 09/11] modify format --- fastdeploy/model_executor/layers/linear.py | 5 +++-- tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 65183ed61af..574a030848f 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -84,9 +84,10 @@ def process_loaded_weights(self, layer, weights) -> None: def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: if layer.with_bias: bias = layer.bias - assert bias.dim() == 1 and bias.shape[-1] == layer.weight.shape[-1], \ - f"bias must be 1D with size equal to the last dim of weight, " \ + assert bias.dim() == 1 and bias.shape[-1] == layer.weight.shape[-1], ( + f"bias must be 1D with size equal to the last dim of weight, " f"but got bias.shape={bias.shape}, weight.shape[-1]={layer.weight.shape[-1]}" + ) else: bias = None diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py index 072c1186e83..b3bd6d9b455 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py @@ -36,7 +36,6 @@ ) - @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ From 6acbde5cf8850032a2b77672cb4d50a5dd2e533b Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Fri, 3 Apr 2026 12:28:23 +0800 Subject: [PATCH 10/11] using matmul for no bias --- fastdeploy/model_executor/layers/linear.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 574a030848f..b35d97d7660 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -88,10 +88,10 @@ def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: f"bias must be 1D with size equal to the last dim of weight, " f"but got bias.shape={bias.shape}, weight.shape[-1]={layer.weight.shape[-1]}" ) + out = paddle.nn.functional.linear(x, layer.weight, bias) else: - bias = None + out = paddle.matmul(x, layer.weight) - out = paddle.nn.functional.linear(x, layer.weight, bias) return out From ddabdb17461933363fc6ab7cafd5794c01d6fd4a Mon Sep 17 00:00:00 2001 From: Bingoo <1575938147@qq.com> Date: Fri, 3 Apr 2026 15:28:19 +0800 Subject: [PATCH 11/11] modify accurate baseline --- tests/e2e/utils/rollout_routing_replay_test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py index 4186a71649a..74af852a292 100644 --- a/tests/e2e/utils/rollout_routing_replay_test_utils.py +++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py @@ -157,10 +157,10 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode model_path = os.getenv("MODEL_PATH") if model_path: baseline_path = os.path.join( - model_path, f"R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}" + model_path, f"R3_BaseLine_dev_uint8_0403/routing_replay_output_baseline_{model_name}" ) else: - baseline_path = f"./R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}" + baseline_path = f"./R3_BaseLine_dev_uint8_0403/routing_replay_output_baseline_{model_name}" stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")