From 9d83df2717555bfcf5ce2f63ef85b87d97c38d98 Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Tue, 24 Mar 2026 14:20:31 +0800
Subject: [PATCH 01/11] merge matmul and add

---
 fastdeploy/model_executor/layers/linear.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 2bee885ff43..39591c9f656 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -82,11 +82,9 @@ def process_loaded_weights(self, layer, weights) -> None:
         layer.weight.set_value(weights)
 
     def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
-        linear_out = paddle.matmul(x, layer.weight)
         if layer.with_bias:
-            linear_out = paddle.add(linear_out, layer.bias)
-        return linear_out
-
+            return paddle._C_ops.linear(x, layer.weight, layer.bias)
+        return paddle.matmul(x, layer.weight)
 
 class LinearBase(nn.Layer):
     """

From 954645bdb7993421d0d1792ea2e687c7cfb7581e Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Tue, 24 Mar 2026 14:27:14 +0800
Subject: [PATCH 02/11] modify format

---
 fastdeploy/model_executor/layers/linear.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 39591c9f656..5a8b549bad3 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -86,6 +86,7 @@ def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
             return paddle._C_ops.linear(x, layer.weight, layer.bias)
         return paddle.matmul(x, layer.weight)
 
+
 class LinearBase(nn.Layer):
     """
     LinearBase Layer.

From 272272cf7dffba0c4504caae9bb37fc53df90e9e Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Tue, 24 Mar 2026 16:56:27 +0800
Subject: [PATCH 03/11] using paddle.nn.functional.linear

---
 fastdeploy/model_executor/layers/linear.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 5a8b549bad3..c13f8d4dec2 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -83,7 +83,7 @@ def process_loaded_weights(self, layer, weights) -> None:
 
     def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
         if layer.with_bias:
-            return paddle._C_ops.linear(x, layer.weight, layer.bias)
+            return paddle.nn.functional.linear(x, layer.weight, layer.bias)
         return paddle.matmul(x, layer.weight)
 
 

From 70a0724c278fd253dc2a8e886f05986dc44d53fa Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Thu, 26 Mar 2026 22:07:46 +0800
Subject: [PATCH 04/11] using _C_ops.linear

---
 fastdeploy/model_executor/layers/linear.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index c13f8d4dec2..5a8b549bad3 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -83,7 +83,7 @@ def process_loaded_weights(self, layer, weights) -> None:
 
     def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
         if layer.with_bias:
-            return paddle.nn.functional.linear(x, layer.weight, layer.bias)
+            return paddle._C_ops.linear(x, layer.weight, layer.bias)
         return paddle.matmul(x, layer.weight)
 
 

From 7ac24481f90e307c29276d5a6769c7b907ab3a49 Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Fri, 27 Mar 2026 11:48:28 +0800
Subject: [PATCH 05/11] using paddle.nn.functional.linear

---
 fastdeploy/model_executor/layers/linear.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 5a8b549bad3..468a608cd40 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -82,9 +82,8 @@ def process_loaded_weights(self, layer, weights) -> None:
         layer.weight.set_value(weights)
 
     def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
-        if layer.with_bias:
-            return paddle._C_ops.linear(x, layer.weight, layer.bias)
-        return paddle.matmul(x, layer.weight)
+        out = paddle.nn.functional.linear(x, layer.weight, layer.bias if layer.with_bias else None)
+        return out
 
 
 class LinearBase(nn.Layer):

From a8ed2a1050dbc2fe2b5efd2a862a53728c696d15 Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Sun, 29 Mar 2026 17:10:40 +0800
Subject: [PATCH 06/11] add FLAGS_use_legacy_linear env var in test case

---
 tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py          | 1 +
 tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py              | 1 +
 tests/e2e/test_EB_Lite_serving.py                          | 1 +
 tests/e2e/test_Qwen3VLMoe_serving.py                       | 1 +
 tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py | 1 +
 5 files changed, 5 insertions(+)

diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
index b3bd6d9b455..0280969216c 100644
--- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
+++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
@@ -35,6 +35,7 @@
     is_port_open,
 )
 
+os.environ["FLAGS_use_legacy_linear"] = "1"
 
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
index 9a46b9cd0a2..a797e0432b0 100644
--- a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
+++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
@@ -35,6 +35,7 @@
     is_port_open,
 )
 
+os.environ["FLAGS_use_legacy_linear"] = "1"
 
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py
index 6c84306927c..6e9ef2fc9d0 100644
--- a/tests/e2e/test_EB_Lite_serving.py
+++ b/tests/e2e/test_EB_Lite_serving.py
@@ -32,6 +32,7 @@
     is_port_open,
 )
 
+os.environ["FLAGS_use_legacy_linear"] = "1"
 
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
diff --git a/tests/e2e/test_Qwen3VLMoe_serving.py b/tests/e2e/test_Qwen3VLMoe_serving.py
index 01fee8bb4ed..e3d803a35a4 100644
--- a/tests/e2e/test_Qwen3VLMoe_serving.py
+++ b/tests/e2e/test_Qwen3VLMoe_serving.py
@@ -30,6 +30,7 @@
     is_port_open,
 )
 
+os.environ["FLAGS_use_legacy_linear"] = "1"
 
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py
index 71ee1607a21..f8dc390c466 100644
--- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py
+++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py
@@ -38,6 +38,7 @@
     is_port_open,
 )
 
+os.environ["FLAGS_use_legacy_linear"] = "1"
 # Read ports from environment variables; use default values if not set
 FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433))
 FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533))

From c420da7ef24819f5664f3a2de080e118da71a98e Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Sun, 29 Mar 2026 22:22:50 +0800
Subject: [PATCH 07/11] fix format

---
 tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py | 1 +
 tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py     | 1 +
 tests/e2e/test_EB_Lite_serving.py                 | 1 +
 tests/e2e/test_Qwen3VLMoe_serving.py              | 1 +
 4 files changed, 4 insertions(+)

diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
index 0280969216c..8b11d6ccd9f 100644
--- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
+++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
@@ -37,6 +37,7 @@
 
 os.environ["FLAGS_use_legacy_linear"] = "1"
 
+
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
     """
diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
index a797e0432b0..c31dec46832 100644
--- a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
+++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
@@ -37,6 +37,7 @@
 
 os.environ["FLAGS_use_legacy_linear"] = "1"
 
+
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
     """
diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py
index 6e9ef2fc9d0..26b2dbf1831 100644
--- a/tests/e2e/test_EB_Lite_serving.py
+++ b/tests/e2e/test_EB_Lite_serving.py
@@ -34,6 +34,7 @@
 
 os.environ["FLAGS_use_legacy_linear"] = "1"
 
+
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
     """
diff --git a/tests/e2e/test_Qwen3VLMoe_serving.py b/tests/e2e/test_Qwen3VLMoe_serving.py
index e3d803a35a4..e61a8f7b356 100644
--- a/tests/e2e/test_Qwen3VLMoe_serving.py
+++ b/tests/e2e/test_Qwen3VLMoe_serving.py
@@ -32,6 +32,7 @@
 
 os.environ["FLAGS_use_legacy_linear"] = "1"
 
+
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
     """

From 0c421b7f6655782cd7598822146f9b219493860d Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Fri, 3 Apr 2026 11:28:35 +0800
Subject: [PATCH 08/11] add bias shape assert and remove FLAGS_use_legacy_linear env var from tests

---
 fastdeploy/model_executor/layers/linear.py             | 10 +++++++++-
 tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py      |  1 -
 tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py          |  2 --
 tests/e2e/test_EB_Lite_serving.py                      |  2 --
 tests/e2e/test_Qwen3VLMoe_serving.py                   |  2 --
 .../test_ernie_03b_pd_router_v1_rdma_global_cache.py   |  1 -
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 468a608cd40..65183ed61af 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -82,7 +82,15 @@ def process_loaded_weights(self, layer, weights) -> None:
         layer.weight.set_value(weights)
 
     def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
-        out = paddle.nn.functional.linear(x, layer.weight, layer.bias if layer.with_bias else None)
+        if layer.with_bias:
+            bias = layer.bias
+            assert bias.dim() == 1 and bias.shape[-1] == layer.weight.shape[-1], \
+                f"bias must be 1D with size equal to the last dim of weight, " \
+                f"but got bias.shape={bias.shape}, weight.shape[-1]={layer.weight.shape[-1]}"
+        else:
+            bias = None
+
+        out = paddle.nn.functional.linear(x, layer.weight, bias)
         return out
 
 
diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
index 8b11d6ccd9f..072c1186e83 100644
--- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
+++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
@@ -35,7 +35,6 @@
     is_port_open,
 )
 
-os.environ["FLAGS_use_legacy_linear"] = "1"
 
 
 @pytest.fixture(scope="session", autouse=True)
diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
index c31dec46832..9a46b9cd0a2 100644
--- a/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
+++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_tp4.py
@@ -35,8 +35,6 @@
     is_port_open,
 )
 
-os.environ["FLAGS_use_legacy_linear"] = "1"
-
 
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
diff --git a/tests/e2e/test_EB_Lite_serving.py b/tests/e2e/test_EB_Lite_serving.py
index 26b2dbf1831..6c84306927c 100644
--- a/tests/e2e/test_EB_Lite_serving.py
+++ b/tests/e2e/test_EB_Lite_serving.py
@@ -32,8 +32,6 @@
     is_port_open,
 )
 
-os.environ["FLAGS_use_legacy_linear"] = "1"
-
 
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
diff --git a/tests/e2e/test_Qwen3VLMoe_serving.py b/tests/e2e/test_Qwen3VLMoe_serving.py
index e61a8f7b356..01fee8bb4ed 100644
--- a/tests/e2e/test_Qwen3VLMoe_serving.py
+++ b/tests/e2e/test_Qwen3VLMoe_serving.py
@@ -30,8 +30,6 @@
     is_port_open,
 )
 
-os.environ["FLAGS_use_legacy_linear"] = "1"
-
 
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py
index f8dc390c466..71ee1607a21 100644
--- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py
+++ b/tests/e2e/test_ernie_03b_pd_router_v1_rdma_global_cache.py
@@ -38,7 +38,6 @@
     is_port_open,
 )
 
-os.environ["FLAGS_use_legacy_linear"] = "1"
 # Read ports from environment variables; use default values if not set
 FD_CONNECTOR_PORT = int(os.getenv("FD_CONNECTOR_PORT", 8433))
 FD_ROUTER_PORT = int(os.getenv("FD_ROUTER_PORT", 8533))

From 251eb4ecccc10b6eff78b91c99e7c56bca35ca53 Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Fri, 3 Apr 2026 11:30:19 +0800
Subject: [PATCH 09/11] modify format

---
 fastdeploy/model_executor/layers/linear.py        | 5 +++--
 tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 65183ed61af..574a030848f 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -84,9 +84,10 @@ def process_loaded_weights(self, layer, weights) -> None:
     def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
         if layer.with_bias:
             bias = layer.bias
-            assert bias.dim() == 1 and bias.shape[-1] == layer.weight.shape[-1], \
-                f"bias must be 1D with size equal to the last dim of weight, " \
+            assert bias.dim() == 1 and bias.shape[-1] == layer.weight.shape[-1], (
+                f"bias must be 1D with size equal to the last dim of weight, "
                 f"but got bias.shape={bias.shape}, weight.shape[-1]={layer.weight.shape[-1]}"
+            )
         else:
             bias = None
 
diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
index 072c1186e83..b3bd6d9b455 100644
--- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
+++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py
@@ -36,7 +36,6 @@
 )
 
 
-
 @pytest.fixture(scope="session", autouse=True)
 def setup_and_run_server():
     """

From 6acbde5cf8850032a2b77672cb4d50a5dd2e533b Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Fri, 3 Apr 2026 12:28:23 +0800
Subject: [PATCH 10/11] use matmul when the layer has no bias

---
 fastdeploy/model_executor/layers/linear.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index 574a030848f..b35d97d7660 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -88,10 +88,10 @@ def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor:
                 f"bias must be 1D with size equal to the last dim of weight, "
                 f"but got bias.shape={bias.shape}, weight.shape[-1]={layer.weight.shape[-1]}"
             )
+            out = paddle.nn.functional.linear(x, layer.weight, bias)
         else:
-            bias = None
+            out = paddle.matmul(x, layer.weight)
 
-        out = paddle.nn.functional.linear(x, layer.weight, bias)
         return out
 
 

From ddabdb17461933363fc6ab7cafd5794c01d6fd4a Mon Sep 17 00:00:00 2001
From: Bingoo <1575938147@qq.com>
Date: Fri, 3 Apr 2026 15:28:19 +0800
Subject: [PATCH 11/11] update accuracy baseline

---
 tests/e2e/utils/rollout_routing_replay_test_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py
index 4186a71649a..74af852a292 100644
--- a/tests/e2e/utils/rollout_routing_replay_test_utils.py
+++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py
@@ -157,10 +157,10 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode
     model_path = os.getenv("MODEL_PATH")
     if model_path:
         baseline_path = os.path.join(
-            model_path, f"R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}"
+            model_path, f"R3_BaseLine_dev_uint8_0403/routing_replay_output_baseline_{model_name}"
         )
     else:
-        baseline_path = f"./R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}"
+        baseline_path = f"./R3_BaseLine_dev_uint8_0403/routing_replay_output_baseline_{model_name}"
     stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream")
 
     nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")