From 88cda39e89edf98f86c79acfb08f6025888cbf58 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Mon, 23 Mar 2026 20:37:26 +0800 Subject: [PATCH 1/3] add skip_layer_mixed_quantization --- .../moe/fused_moe_cutlass_metax_backend.py | 27 ++-- .../moe/fused_moe_triton_metax_backend.py | 7 +- .../layers/backends/xpu/moe/fused_moe.py | 2 - .../backends/xpu/quantization/weight_only.py | 2 - .../layers/moe/fused_moe_cutlass_backend.py | 27 ++-- .../layers/moe/fused_moe_triton_backend.py | 26 ++-- fastdeploy/model_executor/layers/moe/moe.py | 7 +- .../layers/quantization/mix_quant.py | 8 +- .../layers/quantization/weight_only.py | 37 +++-- .../model_executor/models/deepseek_v3.py | 1 + .../model_executor/models/ernie4_5_moe.py | 1 + .../models/ernie4_5_vl/ernie4_5_vl_moe.py | 1 + fastdeploy/model_executor/models/glm4_moe.py | 1 + fastdeploy/model_executor/models/gpt_oss.py | 1 + fastdeploy/model_executor/models/qwen3moe.py | 1 + fastdeploy/model_executor/utils.py | 130 +++++++++++++++++- tests/layers/test_fusedmoe.py | 1 + tests/layers/test_w4a8_moe.py | 1 + tests/layers/test_w4afp8_moe.py | 1 + tests/model_loader/test_model/config.json | 48 +++++++ .../test_skip_layer_mixed_quantization.py | 116 ++++++++++++++++ 21 files changed, 361 insertions(+), 85 deletions(-) create mode 100644 tests/model_loader/test_model/config.json create mode 100644 tests/model_loader/test_skip_layer_mixed_quantization.py diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py index 742d6e60f8f..252b0b8ead0 100644 --- a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py @@ -240,7 +240,7 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod): def __init__(self, quant_config): super().__init__(quant_config) if 
quant_config is None: - self.quant_config = WeightOnlyConfig(algo="weight_only_int8", is_checkpoint_bf16=True) + self.quant_config = WeightOnlyConfig(algo="weight_only_int8") else: self.quant_config = quant_config self.moe_quant_type = self.quant_config.algo @@ -480,21 +480,18 @@ def _process_quantize(weight_idx): getattr(layer, weight_name).copy_(weight.transpose([0, 2, 1]), False) getattr(layer, scale_name).copy_(scale, False) - if self.quant_config.is_checkpoint_bf16: - weight_id_map = {"gate_up": 0, "down": 1} - if weight_fully_copied(layer.up_gate_proj_weight): - weight_type = "gate_up" - else: - weight_type = "down" - - if self.model_format == "torch": - unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( - "quant_weight", "weight" - ) - process_weight_transpose(layer, unquantized_weight_name) - _process_quantize(weight_id_map[weight_type]) + weight_id_map = {"gate_up": 0, "down": 1} + if weight_fully_copied(layer.up_gate_proj_weight): + weight_type = "gate_up" else: - return + weight_type = "down" + + if self.model_format == "torch": + unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( + "quant_weight", "weight" + ) + process_weight_transpose(layer, unquantized_weight_name) + _process_quantize(weight_id_map[weight_type]) def process_loaded_weights(self, layer: nn.Layer, state_dict): """ diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py index e2908b7c6e8..141ca8070e7 100644 --- a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py @@ -69,8 +69,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): layer.hidden_size, ] # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - is_checkpoint_bf16 = 
self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True - if is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": layer.up_gate_proj_weight = layer.create_parameter( shape=self.up_gate_proj_weight_shape, dtype=layer.weight_dtype, @@ -184,10 +183,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict): @paddle.no_grad() def process_weights_after_loading(self, layer): """ """ - is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True - if not is_checkpoint_bf16: - return - if self.quant_config is not None: algo = layer.quant_method.quant_config.name() assert algo == "wint8" diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index 085c202c9a2..6b683cc931a 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -616,8 +616,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict): def process_weights_after_loading(self, layer): """ """ - if not self.quant_config.is_checkpoint_bf16: - return weight_id_map = {"gate_up": 0, "down": 1} if ( hasattr(layer.up_gate_proj_weight, "tensor_track") diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py index 7ce51c06b63..26c6496ffa3 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py @@ -118,8 +118,6 @@ def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None layer.weight_scale.set_value(weight_scale) def process_weights_after_loading(self, layer) -> None: - if not self.quant_config.is_checkpoint_bf16: - return quanted_weight, 
weight_scale = self._quantize_weight_in_blocks(layer.weight) free_tensor(layer.weight) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 0c86270c630..862c5526a73 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -1468,7 +1468,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size] self.model_format = extra_weight_attrs.get("model_format") # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": if self.model_format != "torch": up_gate_proj_weight_shape = [ layer.num_local_experts, @@ -1649,21 +1649,18 @@ def _process_quantize(weight_idx): getattr(layer, weight_name).copy_(weight, False) getattr(layer, scale_name).copy_(scale, False) - if self.quant_config.is_checkpoint_bf16: - weight_id_map = {"gate_up": 0, "down": 1} - if weight_fully_copied(layer.up_gate_proj_weight): - weight_type = "gate_up" - else: - weight_type = "down" - - if self.model_format == "torch": - unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( - "quant_weight", "weight" - ) - process_weight_transpose(layer, unquantized_weight_name) - _process_quantize(weight_id_map[weight_type]) + weight_id_map = {"gate_up": 0, "down": 1} + if weight_fully_copied(layer.up_gate_proj_weight): + weight_type = "gate_up" else: - return + weight_type = "down" + + if self.model_format == "torch": + unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( + "quant_weight", "weight" + ) + process_weight_transpose(layer, unquantized_weight_name) + _process_quantize(weight_id_map[weight_type]) 
def process_loaded_weights(self, layer: nn.Layer, state_dict): """ diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index d1db43a3241..eea0dc9a0ca 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -84,7 +84,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): ] self.model_format = extra_weight_attrs.get("model_format") # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": if self.model_format != "torch": up_gate_proj_weight_shape = [ layer.num_local_experts, @@ -268,21 +268,17 @@ def _process_quantize(weight_idx): getattr(layer, weight_name).copy_(quanted_weight, False) getattr(layer, scale_name).copy_(quanted_weight_scale, False) - if self.quant_config.is_checkpoint_bf16: - weight_id_map = {"gate_up": 0, "down": 1} - if weight_fully_copied(layer.up_gate_proj_weight): - weight_type = "gate_up" - else: - weight_type = "down" - if self.model_format == "torch": - unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( - "quant_weight", "weight" - ) - process_weight_transpose(layer, unquantized_weight_name) - _process_quantize(weight_id_map[weight_type]) - + weight_id_map = {"gate_up": 0, "down": 1} + if weight_fully_copied(layer.up_gate_proj_weight): + weight_type = "gate_up" else: - return + weight_type = "down" + if self.model_format == "torch": + unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( + "quant_weight", "weight" + ) + process_weight_transpose(layer, unquantized_weight_name) + _process_quantize(weight_id_map[weight_type]) def apply( self, diff --git a/fastdeploy/model_executor/layers/moe/moe.py 
b/fastdeploy/model_executor/layers/moe/moe.py index 4e56c7485f9..2b02a576610 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -30,7 +30,7 @@ from fastdeploy.model_executor.layers.moe.routing_indices_cache import ( save_routing_to_buffer, ) -from fastdeploy.model_executor.layers.utils import get_tensor +from fastdeploy.model_executor.layers.utils import get_tensor, modules_to_convert from fastdeploy.model_executor.utils import h2d_copy, slice_fn from fastdeploy.platforms import current_platform from fastdeploy.worker.experts_manager import RedundantExpertManger @@ -152,6 +152,7 @@ def __init__( with_bias: bool = False, activation="swiglu", model_format: Optional[str] = None, + prefix: str = "", ): """ Initialize the Moe layer with given parameters. @@ -175,7 +176,7 @@ def __init__( if self.ep_size > 1: self.tp_size = 1 self.tp_rank = 0 - + self.prefix = prefix self.attn_tp_size = fd_config.parallel_config.tensor_parallel_size self.attn_tp_rank = fd_config.parallel_config.tensor_parallel_rank @@ -226,7 +227,7 @@ def __init__( moe_quant_config = fd_config.quant_config self.moe_quant_config = moe_quant_config self.moe_quant_type = None - if moe_quant_config and moe_quant_config.get_quant_method(self): + if moe_quant_config and moe_quant_config.get_quant_method(self) and modules_to_convert(prefix, self.fd_config): self.quant_method = moe_quant_config.get_quant_method(self) self.moe_quant_type = moe_quant_config.name() else: diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py index 2956d506306..b6a849d6192 100644 --- a/fastdeploy/model_executor/layers/quantization/mix_quant.py +++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py @@ -18,6 +18,7 @@ from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.moe.moe import FusedMoE +from fastdeploy.model_executor.utils 
import get_special_quant_config from . import get_quantization_config from .quant_base import QuantConfigBase, QuantMethodBase @@ -41,6 +42,7 @@ def __init__( hadamard_block_size: int = 128, moe_dynamic_quant: bool = False, is_moe_quantized: bool = False, + modules_to_quant: dict = {}, ) -> None: super().__init__() self.dense_quant_type = dense_quant_type @@ -61,6 +63,7 @@ def __init__( self.hadamard_block_size = hadamard_block_size self.moe_dynamic_quant = moe_dynamic_quant self.is_moe_quantized = is_moe_quantized + self.modules_to_quant = modules_to_quant def name(self) -> str: return "mix_quant" @@ -79,6 +82,7 @@ def from_config(cls, config: dict) -> "MixQuantConfig": config.get("hadamard_block_size", 128), config.get("moe_dynamic_quant", False), config.get("is_moe_quantized", False), + config.get("modules_to_quant", {}), ) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: @@ -86,7 +90,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if layer.moe_tag == "Image": if self.image_moe_quant_type is not None: return ( - get_quantization_config(self.image_moe_quant_type) + get_special_quant_config(layer, self.modules_to_quant, self.image_moe_quant_type) .from_config( { "is_permuted": self.is_permuted, @@ -101,7 +105,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: else: if self.moe_quant_type is not None: return ( - get_quantization_config(self.moe_quant_type) + get_special_quant_config(layer, self.modules_to_quant, self.moe_quant_type) .from_config( { "is_permuted": self.is_permuted, diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index 24fad6130c8..553f4e8ff61 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -70,7 +70,6 @@ class WeightOnlyConfig(QuantConfigBase): def __init__( self, algo: str, - is_checkpoint_bf16: bool = False, ) -> 
None: super().__init__() self.algo = algo @@ -82,7 +81,7 @@ def __init__( self.quant_max_bound = 0 self.quant_min_bound = 0 self.quant_round_type = 0 - self.is_checkpoint_bf16 = is_checkpoint_bf16 + self.is_checkpoint_bf16 = True # weight only linear support dynamic quantization only self.group_size = -1 def name(self) -> str: @@ -91,11 +90,12 @@ def name(self) -> str: @classmethod def from_config(cls, config: dict) -> "WeightOnlyConfig": algo = config["algo"] - is_checkpoint_bf16 = not config.get("is_quantized", False) - return cls(algo, is_checkpoint_bf16) + return cls(algo) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: + # 根据平台类型和层类型选择对应的量化方法 if current_platform.is_xpu(): + # XPU平台:区分MoE层和普通Linear层 if isinstance(layer, FusedMoE): from fastdeploy.model_executor.layers.backends import ( XPUWeightOnlyMoEMethod, @@ -109,6 +109,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: return XPUWeightOnlyLinearMethod(self) elif current_platform.is_gcu(): + # GCU平台:区分MoE层和普通Linear层 from fastdeploy.model_executor.layers.backends import ( GCUWeightOnlyLinearMethod, GCUWeightOnlyMoEMethod, @@ -119,6 +120,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: else: return GCUWeightOnlyLinearMethod(self) elif current_platform.is_dcu(): + # DCU平台:区分MoE层和普通Linear层 if isinstance(layer, FusedMoE): from fastdeploy.model_executor.layers.backends import ( DCUTritonWeightOnlyMoEMethod, @@ -132,6 +134,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: return DCUWeightOnlyLinearMethod(self) elif current_platform.is_maca(): + # MACA平台:MoE层支持cutlass和triton两种后端 if isinstance(layer, FusedMoE): from fastdeploy.model_executor.layers.backends import ( MetaxCutlassWeightOnlyMoEMethod, @@ -166,6 +169,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: return IluvatarWeightOnlyLinearMethod(self) else: + # GPU默认平台:MoE层支持cutlass/triton/marlin三种后端 if isinstance(layer, FusedMoE): if layer.use_method == "cutlass": from 
fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( @@ -188,6 +192,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: else: raise ValueError(f"Unsupported MOE backend {layer.use_method}") else: + # 普通Linear层:满足条件时使用Machete优化内核,否则使用默认GPU方法 if ( _ENABLE_MACHETE and envs.FD_USE_MACHETE == "1" @@ -206,13 +211,12 @@ class WINT8Config(WeightOnlyConfig): weight only int8 config """ - def __init__(self, is_checkpoint_bf16: bool = False) -> None: - super().__init__("weight_only_int8", is_checkpoint_bf16) + def __init__(self) -> None: + super().__init__("weight_only_int8") @classmethod def from_config(cls, config: dict) -> "WINT8Config": - is_checkpoint_bf16 = not config.get("is_quantized", False) - return cls(is_checkpoint_bf16) + return cls() def name(self) -> str: return "wint8" @@ -225,14 +229,12 @@ class WINT4Config(WeightOnlyConfig): def __init__( self, - is_checkpoint_bf16: bool = False, ) -> None: - super().__init__("weight_only_int4", is_checkpoint_bf16) + super().__init__("weight_only_int4") @classmethod def from_config(cls, config: dict) -> "WINT4Config": - is_checkpoint_bf16 = not config.get("is_quantized", False) - return cls(is_checkpoint_bf16) + return cls() def name(self) -> str: return "wint4" @@ -253,7 +255,7 @@ def __init__( def create_weights(self, layer, **extra_weight_attrs): # TODO(bukejiyu): remove v1 loader check when v0 loader is removed self.model_format = extra_weight_attrs.get("model_format") - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": weight_shape = layer.weight_shape[::-1] if self.model_format == "torch" else layer.weight_shape layer.weight = layer.create_parameter( shape=weight_shape, @@ -363,12 +365,9 @@ def _process_quantize(): layer.weight.copy_(quanted_weight_tensor, False) layer.weight_scale.copy_(weight_scale_tensor, False) - if self.quant_config.is_checkpoint_bf16: - if 
self.model_format == "torch": - process_weight_transpose(layer, "weight") - _process_quantize() - else: - return + if self.model_format == "torch": + process_weight_transpose(layer, "weight") + _process_quantize() @abstractmethod def process_loaded_weights(self, layer, weights) -> None: diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index f6b7c417089..e439c2740be 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -178,6 +178,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: layer_idx=layer_id, gate_correction_bias=self.gate.e_score_correction_bias, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) self.num_shared_experts = fd_config.model_config.n_shared_experts diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 4cc4306de5f..496fad15baa 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -217,6 +217,7 @@ def __init__( gate_correction_bias=None, redundant_table_manger=redundant_table_manger, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) if fd_config.model_config.moe_use_aux_free: diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index f4d70108e4b..e1d837f2b65 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -157,6 +157,7 @@ def __init__( moe_tag=moe_tag, weight_key_map=weight_key_map, gate_correction_bias=gate_correction_bias, + prefix=f"{prefix}.experts", ) self.gate = ReplicatedLinear( diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index c31928ebe31..4d423f6375b 100644 --- 
a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -180,6 +180,7 @@ def __init__( layer_idx=layer_id, gate_correction_bias=self.gate.e_score_correction_bias, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) if self.n_shared_experts > 0: diff --git a/fastdeploy/model_executor/models/gpt_oss.py b/fastdeploy/model_executor/models/gpt_oss.py index a6cf231ed24..9ea1c7a0582 100644 --- a/fastdeploy/model_executor/models/gpt_oss.py +++ b/fastdeploy/model_executor/models/gpt_oss.py @@ -122,6 +122,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = ""): with_bias=True, activation="swigluoai", model_format="", + prefix=f"{prefix}.experts", ) def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta): diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 6c443d68bcc..987e365b0ce 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -68,6 +68,7 @@ def __init__( top_k=fd_config.model_config.num_experts_per_tok, layer_idx=layer_id, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) self.gate = ReplicatedLinear( diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index e63603047be..a6e293e1a4b 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -14,19 +14,22 @@ # limitations under the License. 
""" +import fnmatch import os import re from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass, field -from functools import cache +from functools import cache, lru_cache from typing import Any, List, Optional, Union import paddle +from paddle import nn from paddleformers.utils.log import logger from fastdeploy import envs from fastdeploy.config import FDConfig +from fastdeploy.model_executor.layers.quantization import get_quantization_config from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.platforms import current_platform @@ -155,6 +158,72 @@ def process_weight_transpose(layer, weight_name): setattr(layer, weight_name, weight_tmp) +def _get_weight_only_method_cls_list(): + # 根据平台类型和层类型选择对应的量化方法 + weight_only_cls_list = [] + if current_platform.is_xpu(): + from fastdeploy.model_executor.layers.backends import ( + XPUWeightOnlyLinearMethod, + XPUWeightOnlyMoEMethod, + ) + + weight_only_cls_list = [XPUWeightOnlyMoEMethod, XPUWeightOnlyLinearMethod] + + elif current_platform.is_gcu(): + from fastdeploy.model_executor.layers.backends import ( + GCUWeightOnlyLinearMethod, + GCUWeightOnlyMoEMethod, + ) + + weight_only_cls_list = [GCUWeightOnlyMoEMethod, GCUWeightOnlyLinearMethod] + elif current_platform.is_dcu(): + from fastdeploy.model_executor.layers.backends import ( + DCUTritonWeightOnlyMoEMethod, + DCUWeightOnlyLinearMethod, + ) + + weight_only_cls_list = [DCUTritonWeightOnlyMoEMethod, DCUWeightOnlyLinearMethod] + + elif current_platform.is_maca(): + from fastdeploy.model_executor.layers.backends import ( + MetaxCutlassWeightOnlyMoEMethod, + MetaxTritonWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.quantization.weight_only import ( + GPUWeightOnlyLinearMethod, + ) + + weight_only_cls_list = [ + MetaxCutlassWeightOnlyMoEMethod, + MetaxTritonWeightOnlyMoEMethod, + GPUWeightOnlyLinearMethod, + ] + else: + # GPU默认平台:MoE层支持cutlass/triton/marlin三种后端 + from 
fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( + CutlassWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import ( + MarlinWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import ( + TritonWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.quantization.weight_only import ( + GPUWeightOnlyLinearMethod, + GPUWeightOnlyMoEMethod, + ) + + weight_only_cls_list = [ + CutlassWeightOnlyMoEMethod, + TritonWeightOnlyMoEMethod, + MarlinWeightOnlyMoEMethod, + GPUWeightOnlyLinearMethod, + GPUWeightOnlyMoEMethod, + ] + return weight_only_cls_list + + def process_weights_after_loading(sublayers_dict: dict, fd_config: FDConfig): """ process_weights_after_loading: @@ -172,9 +241,9 @@ def fn(model_sublayer_name: str, param=None): model_sublayer = sublayers_dict[model_sublayer_name] if isinstance(model_sublayer, KVBatchLinear): model_sublayer.process_weights_after_loading() - if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16: - # skip for offline quantization - return + # if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16: + # # skip for offline quantization + # return if hasattr(model_sublayer, "quant_method"): quant_method = getattr(model_sublayer, "quant_method", None) unquant_moe_layer = get_moe_method() @@ -185,6 +254,12 @@ def fn(model_sublayer_name: str, param=None): if type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls: # skip unquantized linear return + + if type(quant_method) not in _get_weight_only_method_cls_list(): + if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16: + # Skip offline quantization if quant_method is not "weight_only". 
+ return + if not hasattr(quant_method, "process_weights_after_loading"): return if param is not None and hasattr(param, "tensor_track") and param.tensor_track is None: @@ -250,7 +325,7 @@ def process_final_after_loading(model, fd_config: FDConfig): ) from fastdeploy.model_executor.layers.moe.moe import get_moe_method - for name, sublayer in model.named_sublayers(): + for _, sublayer in model.named_sublayers(): if isinstance(sublayer, KVBatchLinear): continue quant_method = getattr(sublayer, "quant_method", None) @@ -262,7 +337,9 @@ def process_final_after_loading(model, fd_config: FDConfig): unquant_moe_cls = type(unquant_moe_layer) is_unquant_cls = type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls is_offline_quantized_ckpt = not (fd_config.quant_config and fd_config.quant_config.is_checkpoint_bf16) - if is_unquant_cls or is_offline_quantized_ckpt: + if (is_unquant_cls or is_offline_quantized_ckpt) and type( + quant_method + ) not in _get_weight_only_method_cls_list(): if hasattr(quant_method, "process_weights_after_loading"): quant_method.process_weights_after_loading(sublayer) continue @@ -624,3 +701,44 @@ def need_memory_reconstruction(fd_config): return True else: return False + + +@lru_cache(None) +def parse_layer_range(r: str) -> list[tuple[int, int]]: + + return [(int(a), int(b)) for a, b in (part.split("-") for part in r.split(","))] + + +def layer_in_range(layer_idx: int, layer_range: str) -> bool: + for lo, hi in parse_layer_range(layer_range): + if lo <= layer_idx <= hi: + return True + return False + + +def resolve_quant_type(layer_idx: int, prefix: str, modules_to_quant: dict) -> str | None: + rules = modules_to_quant + + for quant_type, rule in rules.items(): + if not prefix_match(prefix, rule["prefix_module"]): + continue + + if layer_in_range(layer_idx, rule["layer_range"]): + return quant_type + + return None + + +def prefix_match(prefix: str, patterns: list[str]) -> bool: + return any(fnmatch.fnmatch(prefix, 
p) or fnmatch.fnmatch(prefix, p + ".*") for p in patterns) + + +def get_special_quant_config(layer: nn.Layer, modules_to_quant: dict, ori_quant_type: str): + """ + only Moe and offline quant Now + """ + qtype = resolve_quant_type(layer.layer_idx, layer.prefix, modules_to_quant) + if qtype is None: + return get_quantization_config(ori_quant_type) + else: + return get_quantization_config(qtype) diff --git a/tests/layers/test_fusedmoe.py b/tests/layers/test_fusedmoe.py index d97363fe758..19168a765e6 100644 --- a/tests/layers/test_fusedmoe.py +++ b/tests/layers/test_fusedmoe.py @@ -519,6 +519,7 @@ def __init__( topk_group=4, n_group=8, gate_correction_bias=paddle.zeros([self.fd_config.model_config.moe_num_experts], paddle.float32), + prefix=f"{prefix}.experts", # gate_correction_bias = gate_correction_bias_real_data ) moe_layer = self.fused_moe diff --git a/tests/layers/test_w4a8_moe.py b/tests/layers/test_w4a8_moe.py index 9584702223a..f62d97b9fd9 100644 --- a/tests/layers/test_w4a8_moe.py +++ b/tests/layers/test_w4a8_moe.py @@ -120,6 +120,7 @@ def __init__( topk_group=4, n_group=8, gate_correction_bias=paddle.zeros([self.fd_config.model_config.moe_num_experts], paddle.float32), + prefix=f"{prefix}.experts", # gate_correction_bias = gate_correction_bias_real_data ) self.pack_num = 2 diff --git a/tests/layers/test_w4afp8_moe.py b/tests/layers/test_w4afp8_moe.py index f21834354c9..af7eb5fac0d 100644 --- a/tests/layers/test_w4afp8_moe.py +++ b/tests/layers/test_w4afp8_moe.py @@ -106,6 +106,7 @@ def __init__( topk_group=4, n_group=8, gate_correction_bias=paddle.zeros([self.fd_config.model_config.moe_num_experts], paddle.float32), + prefix=f"{prefix}.experts", # gate_correction_bias = gate_correction_bias_real_data ) self.pack_num = 1 diff --git a/tests/model_loader/test_model/config.json b/tests/model_loader/test_model/config.json new file mode 100644 index 00000000000..f5816b160e9 --- /dev/null +++ b/tests/model_loader/test_model/config.json @@ -0,0 +1,48 @@ +{ + 
"architectures": [ + "Ernie4_5_MoeForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2560, + "intermediate_size": 12288, + "max_position_embeddings": 131072, + "model_type": "ernie4_5_moe", + "num_attention_heads": 20, + "num_key_value_heads": 4, + "num_hidden_layers": 28, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "use_cache": false, + "vocab_size": 103424, + "rope_theta": 500000, + "use_rmsnorm": true, + "tie_word_embeddings": true, + "use_bias": false, + "moe_num_experts": 64, + "moe_num_shared_experts": 2, + "moe_layer_start_index": 1, + "moe_intermediate_size": 1536, + "moe_capacity": [64,64,64], + "moe_gate": "topk", + "moe_k": 6, + "moe_layer_interval": 1, + "moe_use_aux_free": true, + "dtype": "bfloat16", + "num_nextn_predict_layers": 1, + "multi_token_pred_lambda": 0.3, + "quantization_config":{ + "dense_quant_type": "wint8", + "moe_quant_type": "w4a8", + "quantization": "mix_quant", + "kv_cache_quant_type": "int8", + "modules_to_quant":{ + "wint8":{ + "layer_range":"3-7,10-15,20-24", + "prefix_module":[ + "ernie.layers.*.mlp.experts*" + ]} + } + } +} diff --git a/tests/model_loader/test_skip_layer_mixed_quantization.py b/tests/model_loader/test_skip_layer_mixed_quantization.py new file mode 100644 index 00000000000..5182bd17553 --- /dev/null +++ b/tests/model_loader/test_skip_layer_mixed_quantization.py @@ -0,0 +1,116 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from types import SimpleNamespace + +import paddle + +architectures = "Ernie4_5_MoeForCausalLM" +import sys +from pathlib import Path + +from fastdeploy.config import ErnieArchitectures, ModelConfig +from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( + CutlassWeightOnlyMoEMethod, +) +from fastdeploy.model_executor.layers.quantization import parse_quant_config +from fastdeploy.model_executor.models.model_base import ModelRegistry +from fastdeploy.worker.worker_process import init_distributed_environment, parse_args + +TEST_DIR = Path(__file__).resolve().parent +TEST_MODEL_DIR = TEST_DIR / "test_model" + + +def make_fd_config( + *, + model_format="paddle", + tensor_parallel_size=1, + tensor_parallel_rank=0, + splitwise_role="prefill", + use_sequence_parallel_moe=False, + load_choices="default_v1", + model=str(TEST_MODEL_DIR), +): + argv_backup = sys.argv + try: + sys.argv = ["fastdeploy"] + args = parse_args() + finally: + sys.argv = argv_backup + args.model = model + model_config = ModelConfig(vars(args)) + return SimpleNamespace( + model_config=model_config, + parallel_config=SimpleNamespace( + tensor_parallel_size=tensor_parallel_size, + tensor_parallel_rank=tensor_parallel_rank, + expert_parallel_size=1, + expert_parallel_rank=0, + tp_group=None, + use_sequence_parallel_moe=use_sequence_parallel_moe, + ), + scheduler_config=SimpleNamespace(splitwise_role=splitwise_role, max_num_seqs=1), + load_config=SimpleNamespace(dynamic_load_weight=False, load_choices=load_choices), + quant_config=parse_quant_config( + args, + model_config, + is_ernie=ErnieArchitectures.contains_ernie_arch(model_config.architectures), + is_v1_loader=True, + ), + eplb_config=SimpleNamespace(enable_eplb=False), + plas_attention_config=None, + routing_replay_config=SimpleNamespace(enable_routing_replay=False), + graph_opt_config=SimpleNamespace(graph_opt_level=0, 
use_cudagraph=False), + ) + + +baseline = { + "ernie.layers.24.mlp.experts", + "ernie.layers.23.mlp.experts", + "ernie.layers.11.mlp.experts", + "ernie.layers.5.mlp.experts", + "ernie.layers.22.mlp.experts", + "ernie.layers.15.mlp.experts", + "ernie.layers.3.mlp.experts", + "ernie.layers.21.mlp.experts", + "ernie.layers.4.mlp.experts", + "ernie.layers.13.mlp.experts", + "ernie.layers.6.mlp.experts", + "ernie.layers.7.mlp.experts", + "ernie.layers.14.mlp.experts", + "ernie.layers.12.mlp.experts", + "ernie.layers.10.mlp.experts", + "ernie.layers.20.mlp.experts", +} + + +def collect_cutlass_moe_layers(model) -> set[str]: + matched_keys = set() + + for name, layer in model.named_sublayers(): + quant_method = getattr(layer, "quant_method", None) + if isinstance(quant_method, CutlassWeightOnlyMoEMethod): + matched_keys.add(name) + + return matched_keys + + +def test_skip_layer_mixed_quantization(): + ranks, local_rank = init_distributed_environment() + context = paddle.LazyGuard() + with context: + model_cls = ModelRegistry.get_class(architectures) + model = model_cls(make_fd_config()) + res = collect_cutlass_moe_layers(model) + assert res == baseline From b574e1dd98fcd682a5da5fadabc7d6fb99ccd8c4 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Wed, 25 Mar 2026 11:45:51 +0800 Subject: [PATCH 2/3] update --- fastdeploy/model_executor/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index a6e293e1a4b..27ca2156d8f 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -211,7 +211,6 @@ def _get_weight_only_method_cls_list(): ) from fastdeploy.model_executor.layers.quantization.weight_only import ( GPUWeightOnlyLinearMethod, - GPUWeightOnlyMoEMethod, ) weight_only_cls_list = [ @@ -219,7 +218,6 @@ def _get_weight_only_method_cls_list(): TritonWeightOnlyMoEMethod, MarlinWeightOnlyMoEMethod, GPUWeightOnlyLinearMethod, - GPUWeightOnlyMoEMethod, ] 
return weight_only_cls_list From 5b07e0bce787143d614e0618c5226722abe686f7 Mon Sep 17 00:00:00 2001 From: zccjjj Date: Tue, 7 Apr 2026 15:55:36 +0800 Subject: [PATCH 3/3] support w4a8(Decode)/C8/C8+TP4EP4/PD disaggregation + compatibility fixes Squashed from 6 feature commits + 2 compatibility fix commits: - support w4a8(Decode) - support C8 KV cache quantization - support C8+TP4EP4 - bugfix C8 - bugfix pd+C8 - bugfix pd+mtp - fix: make weight_need_transpose conditional and remove hardcoded layer_id - fix: comprehensive compatibility fixes (Iluvatar platform, moe cast bug, mutable default, hardcoded magic number, unconditional XPU import, etc.) --- .../engine/sched/resource_manager_v1.py | 50 +++++++++-- fastdeploy/envs.py | 2 + .../layers/backends/xpu/attention.py | 8 +- .../layers/backends/xpu/moe/fused_moe.py | 52 ++++++++++++ .../backends/xpu/quantization/kv_cache.py | 84 +++++++++++++++++-- fastdeploy/model_executor/layers/moe/moe.py | 28 ++++++- .../layers/quantization/__init__.py | 10 +++ .../layers/quantization/mix_quant.py | 4 +- .../model_executor/models/ernie4_5_moe.py | 1 + fastdeploy/model_executor/utils.py | 28 ++++++- 10 files changed, 239 insertions(+), 28 deletions(-) diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 80b58d68972..df907e08f47 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -235,9 +235,14 @@ def allocated_slots(self, request: Request): return len(request.block_tables) * self.config.cache_config.block_size def get_new_block_nums(self, request: Request, num_new_tokens: int): + # Account for preallocated blocks that haven't been added to block_tables yet + preallocated_count = len(getattr(request, "preallocated_blocks", [])) block_num = ( - request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1 - ) // self.config.cache_config.block_size - len(request.block_tables) + 
(request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1) + // self.config.cache_config.block_size + - len(request.block_tables) + - preallocated_count + ) if self.config.speculative_config.method is not None: block_num = min(block_num + 1, self.config.cache_config.max_block_num_per_seq) @@ -800,8 +805,14 @@ def get_enough_request(request, scheduled_reqs): self.allocated_slots(request) - request.num_total_tokens <= self.config.cache_config.prealloc_dec_block_slot_num_threshold ): + # First, consume any preallocated blocks before allocating new ones + preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + request.block_tables.extend(preallocated) + request.preallocated_blocks = [] + scheduled_reqs.append(self._prepare_decode_task(request)) # Allocation for next decoding blocks - if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num): + elif self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num): llm_logger.debug( f"schedule decoding task: {request} request.num_total_tokens {request.num_total_tokens} request.num_computed_tokens {request.num_computed_tokens}" ) @@ -911,6 +922,12 @@ def _allocate_decode_and_extend(): request.block_tables.extend( self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id) ) + # Merge preallocated blocks (from PD disaggregation) into block_tables + # so the attention kernel can access all reserved blocks. 
+ preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + request.block_tables.extend(preallocated) + request.preallocated_blocks = [] # Prepare prefill task scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) else: # Not enough blocks to allocate, trigger preemption @@ -920,6 +937,11 @@ def _allocate_decode_and_extend(): request.block_tables.extend( self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id) ) + # Merge preallocated blocks (from PD disaggregation) into block_tables + preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + request.block_tables.extend(preallocated) + request.preallocated_blocks = [] # Prepare prefill task scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) token_budget -= num_new_tokens @@ -1403,9 +1425,10 @@ def preallocate_resource_in_d(self, request: Request): """ assert self.config.scheduler_config.splitwise_role == "decode", "Only D instance can call this method" request.need_prefill_tokens = len(request.prompt_token_ids) - need_prealloc_prefill_blocks = ( + actual_prefill_blocks = ( request.need_prefill_tokens + self.config.cache_config.block_size - 1 - ) // self.config.cache_config.block_size + self.config.cache_config.enc_dec_block_num + ) // self.config.cache_config.block_size + need_prealloc_prefill_blocks = actual_prefill_blocks + self.config.cache_config.enc_dec_block_num with self.lock: if len(self.waiting) > 0: @@ -1416,11 +1439,14 @@ def preallocate_resource_in_d(self, request: Request): if not self.cache_manager.can_allocate_gpu_blocks(total_need_blocks): return False - request.block_tables = self.cache_manager.allocate_gpu_blocks( - need_prealloc_prefill_blocks, request.request_id - ) + all_blocks = self.cache_manager.allocate_gpu_blocks(need_prealloc_prefill_blocks, request.request_id) + # Only put the blocks that will actually contain prefilled KV data into block_tables. 
+ # The extra enc_dec_block_num blocks are pre-reserved for future decode tokens and + # stored separately to avoid the attention kernel reading uninitialized KV cache data. + request.block_tables = all_blocks[:actual_prefill_blocks] + request.preallocated_blocks = all_blocks[actual_prefill_blocks:] request.num_computed_tokens = request.need_prefill_tokens - request.disaggregate_info["block_tables"] = request.block_tables + request.disaggregate_info["block_tables"] = all_blocks allocated_position = self.get_available_position() request.idx = allocated_position self.tasks_list[request.idx] = request @@ -1470,6 +1496,12 @@ def add_prefilled_request(self, request_output: RequestOutput): self.running.append(request) def _free_blocks(self, request: Request): + # Also free any preallocated blocks that haven't been consumed yet + preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + self.cache_manager.recycle_gpu_blocks(preallocated, request.request_id) + request.preallocated_blocks = [] + if self.config.cache_config.enable_prefix_caching and self.config.scheduler_config.splitwise_role != "decode": self.cache_manager.release_block_ids(request) self.cache_manager.recycle_gpu_blocks( diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 433e2441c2f..3dbd599c1ac 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -210,6 +210,8 @@ def _validate_split_kv_size(value: int) -> int: "FD_XPU_MOE_FFN_QUANT_TYPE_MAP": lambda: os.getenv("FD_XPU_MOE_FFN_QUANT_TYPE_MAP", ""), # Whether to enable low latency in mixed scenario "FD_XPU_ENABLE_MIXED_EP_MODE": lambda: bool(int(os.getenv("FD_XPU_ENABLE_MIXED_EP_MODE", "0"))), + # Whether to use yiyan model + "FD_XPU_USE_YIYAN_MODEL": lambda: bool(int(os.getenv("FD_XPU_USE_YIYAN_MODEL", "0"))), # Whether to use phi FP8 quantization,if 1,use paddle default. 
"FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))), # Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc, diff --git a/fastdeploy/model_executor/layers/backends/xpu/attention.py b/fastdeploy/model_executor/layers/backends/xpu/attention.py index 85565d33efb..5388251d869 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/attention.py +++ b/fastdeploy/model_executor/layers/backends/xpu/attention.py @@ -181,8 +181,8 @@ def forward_mixed( cache_v_scale = getattr(layer, "cache_v_scale", None) cache_k_out_scale = getattr(layer, "cache_k_out_scale", None) cache_v_out_scale = getattr(layer, "cache_v_out_scale", None) - cache_k_zp = getattr(self, "cache_k_zp", None) - cache_v_zp = getattr(self, "cache_v_zp", None) + cache_k_zp = getattr(layer, "cache_k_zp", None) + cache_v_zp = getattr(layer, "cache_v_zp", None) if layer.use_qk_norm: q_norm_weight = layer.q_norm_weight @@ -220,8 +220,8 @@ def forward_mixed( cache_v_scale, cache_k_out_scale, cache_v_out_scale, - cache_k_zp, - cache_v_zp, + cache_k_zp.astype("bfloat16") if cache_k_zp is not None else None, # for C8 + cache_v_zp.astype("bfloat16") if cache_v_zp is not None else None, # for C8 None, # shift None, # smooth q_norm_weight, diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index 6b683cc931a..a88ca01bd71 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -268,6 +268,14 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): default_initializer=paddle.nn.initializer.Constant(0), ), ) + set_weight_attrs( + getattr(layer, self.added_scale_attrs[0]), + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) setattr( layer, self.added_scale_attrs[1], @@ -277,6 +285,31 @@ def create_weights(self, 
layer: nn.Layer, **extra_weight_attrs): default_initializer=paddle.nn.initializer.Constant(0), ), ) + set_weight_attrs( + getattr(layer, self.added_scale_attrs[1]), + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) + + set_weight_attrs( + layer.up_gate_proj_weight, + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) + set_weight_attrs( + layer.down_proj_weight, + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) if self.moe_quant_type in ["w8a8", "w4a8"]: for in_scale_name in self.added_in_scale_attrs: @@ -289,6 +322,25 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): default_initializer=paddle.nn.initializer.Constant(0), ), ) + set_weight_attrs( + layer.down_proj_in_scale, + { + "SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None}, + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) + + set_weight_attrs( + layer.up_gate_proj_in_scale, + { + "SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None}, + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) def process_loaded_weights(self, layer: nn.Layer, state_dict): up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py index 25044bc0939..feebab3889d 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py +++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py @@ -19,6 +19,7 @@ import paddle from paddle import nn +from fastdeploy import envs from fastdeploy.model_executor.layers.quantization.kv_cache import ( 
KvCacheQuantzationTypes, ) @@ -42,6 +43,7 @@ def __init__(self, kv_cache_quant_type: str, is_channel_wise: bool, has_zero_poi super().__init__() self.kv_cache_quant_type = kv_cache_quant_type self.is_channel_wise = is_channel_wise + self.has_zero_point = has_zero_point try: self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type) @@ -139,6 +141,62 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): scale_shape = [layer.fd_config.model_config.num_key_value_heads] if self.cache_quant_config.is_channel_wise: scale_shape = [layer.kv_num_heads * layer.head_dim] + # Custom weight_loader for C8+TP: the safetensors scale/zp shape is + # [1, num_kv_heads, 1, head_dim]. We must split along the kv_heads + # dimension (dim=1), not the last dimension. The default_weight_loader + # treats output_dim as boolean and always splits along dim=-1, which + # is incorrect for 4D tensors where we need to split along dim=1. + fd_config = layer.fd_config + total_kv_heads = fd_config.model_config.num_key_value_heads + tp_size = fd_config.parallel_config.tensor_parallel_size + tp_rank = fd_config.parallel_config.tensor_parallel_rank + max_bound = self.cache_quant_config.max_bound + + def _kv_scale_weight_loader( + param, + loaded_weight, + shard_id=None, + _total_kv_heads=total_kv_heads, + _tp_size=tp_size, + _tp_rank=tp_rank, + _max_bound=max_bound, + ): + loaded_weight = get_tensor(loaded_weight).cast("float32") + # TP split along kv_heads dimension + if _tp_size > 1 and not fd_config.load_config.is_pre_sharded: + head_dim = loaded_weight.numel() // _total_kv_heads + loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim]) + assert ( + _total_kv_heads % _tp_size == 0 + ), f"num_kv_heads ({_total_kv_heads}) must be divisible by tp_size ({_tp_size})" + kv_heads_per_rank = _total_kv_heads // _tp_size + start = _tp_rank * kv_heads_per_rank + end = start + kv_heads_per_rank + loaded_weight = loaded_weight[start:end, :] + loaded_weight = paddle.clip(loaded_weight, 
min=1e-8) + loaded_weight = (_max_bound / loaded_weight).reshape(param.shape).cast(param.dtype) + param.copy_(loaded_weight, False) + + def _kv_zp_weight_loader( + param, loaded_weight, shard_id=None, _total_kv_heads=total_kv_heads, _tp_size=tp_size, _tp_rank=tp_rank + ): + loaded_weight = get_tensor(loaded_weight).cast(param.dtype) + # TP split along kv_heads dimension + if _tp_size > 1 and not fd_config.load_config.is_pre_sharded: + head_dim = loaded_weight.numel() // _total_kv_heads + loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim]) + kv_heads_per_rank = _total_kv_heads // _tp_size + start = _tp_rank * kv_heads_per_rank + end = start + kv_heads_per_rank + loaded_weight = loaded_weight[start:end, :] + loaded_weight = loaded_weight.reshape(param.shape) + param.copy_(loaded_weight, False) + + scale_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_scale_weight_loader} + zp_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_zp_weight_loader} + else: + scale_weight_attrs = extra_weight_attrs + zp_weight_attrs = extra_weight_attrs layer.cache_k_scale = layer.create_parameter( shape=scale_shape, @@ -154,13 +212,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): set_weight_attrs( layer.cache_k_scale, { - **extra_weight_attrs, + **scale_weight_attrs, }, ) set_weight_attrs( layer.cache_v_scale, { - **extra_weight_attrs, + **scale_weight_attrs, }, ) @@ -189,13 +247,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): set_weight_attrs( layer.cache_k_zp, { - **extra_weight_attrs, + **zp_weight_attrs, }, ) set_weight_attrs( layer.cache_v_zp, { - **extra_weight_attrs, + **zp_weight_attrs, }, ) @@ -219,10 +277,20 @@ def process_weights_after_loading(self, layer: nn.Layer): use for loader v1 """ # cache_k_out_scale is the reciprocal of cache_k_scale - if layer.cache_k_scale._is_initialized(): - layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) # cache_k_out_scale - if 
layer.cache_v_scale._is_initialized(): - layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale) + if envs.FD_XPU_USE_YIYAN_MODEL: + if layer.cache_k_scale._is_initialized(): + layer.cache_k_out_scale.set_value( + self.cache_quant_config.max_bound / layer.cache_k_scale.cast("float32").reshape_([-1]) + ) + if layer.cache_v_scale._is_initialized(): + layer.cache_v_out_scale.set_value( + self.cache_quant_config.max_bound / layer.cache_v_scale.cast("float32").reshape_([-1]) + ) + else: + if layer.cache_k_scale._is_initialized(): + layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) + if layer.cache_v_scale._is_initialized(): + layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale) def apply(self, layer): """ diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 2b02a576610..2ef99d38aba 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -260,6 +260,19 @@ def __init__( tp_size={self.tp_size}." 
)

+    def _load_in_scale_weight(self, param, expert_id, loaded_weight):
+        # only support ernie now
+        expert_param = param[expert_id - self.expert_id_offset]
+        loaded_weight = get_tensor(loaded_weight)
+        if len(expert_param.shape) != len(loaded_weight.shape):
+            loaded_weight = loaded_weight.reshape(expert_param.shape)
+        assert expert_param.shape == loaded_weight.shape, (
+            f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})"
+        )
+        if expert_param.dtype != loaded_weight.dtype:
+            loaded_weight = loaded_weight.cast(expert_param.dtype)
+        param[expert_id - self.expert_id_offset].copy_(loaded_weight, False)
+
     def weight_loader(
         self,
         param,
@@ -292,9 +305,18 @@ def weight_loader(
         if weight_need_transpose:
             loaded_weight = loaded_weight.transpose([1, 0])

+        if SHARD_ID_TO_SHARDED_DIM["gate"] is None and SHARD_ID_TO_SHARDED_DIM["up"] is None:
+            # in scale
+            self._load_in_scale_weight(param, expert_id, loaded_weight)
+            return
+
         if shard_id is None:
             # 1.gate up fused in disk
-            output_size = param[expert_id - self.expert_id_offset].shape[SHARD_ID_TO_SHARDED_DIM["gate"]]
+
+            shard_param = param[expert_id - self.expert_id_offset]
+            if shard_param.shape == []:
+                shard_param = shard_param.unsqueeze(0)
+            output_size = shard_param.shape[SHARD_ID_TO_SHARDED_DIM["gate"]]
             shard_offsets = [
                 # (shard_id, shard_offset, shard_size)
                 ("gate", 0, output_size // 2 * self.tp_size),
@@ -333,6 +355,8 @@ def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_
             shard_size = (self.tp_rank + 1) * block_size
             loaded_weight = slice_fn(loaded_weight, tp_shard_dim, shard_offset, shard_size)
             expert_param = param[expert_id - self.expert_id_offset]
+            if expert_param.shape == []:
+                expert_param = expert_param.unsqueeze(0)
             dim = -1 if shard_dim else 0
             param_shard_size = expert_param.shape[dim] // 2
             if shard_id == "gate":
@@ -387,6 +411,8 @@ def _load_down_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim
             shard_size = (self.tp_rank + 1)
* block_size loaded_weight = slice_fn(loaded_weight, tp_shard_dim, shard_offset, shard_size) expert_param = param[expert_id - self.expert_id_offset] + if expert_param.shape == []: + expert_param = expert_param.unsqueeze(0) if hasattr(param, "tensor_track"): # for dyn quant param.tensor_track.mark(start=0, batch_id=expert_id - self.expert_id_offset) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 85fb3c4e868..e5cbebfb9e0 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -166,6 +166,8 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: if quantization == "modelopt_fp4": from .nvfp4 import ModelOptNvFp4Config + from fastdeploy.platforms import current_platform + from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -196,4 +198,12 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: if quantization == "modelopt_fp4": method_to_config["modelopt_fp4"] = ModelOptNvFp4Config + # For XPU platform, use XPUKvCacheQuantConfig instead of KvCacheQuantConfig + if quantization == "kvcache" and current_platform.is_xpu(): + from fastdeploy.model_executor.layers.backends.xpu.quantization.kv_cache import ( + XPUKvCacheQuantConfig, + ) + + method_to_config["kvcache"] = XPUKvCacheQuantConfig + return method_to_config[quantization] diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py index b6a849d6192..620203cd677 100644 --- a/fastdeploy/model_executor/layers/quantization/mix_quant.py +++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py @@ -42,7 +42,7 @@ def __init__( hadamard_block_size: int = 128, moe_dynamic_quant: bool = False, is_moe_quantized: bool = False, - modules_to_quant: dict = {}, + modules_to_quant: dict 
= None, ) -> None: super().__init__() self.dense_quant_type = dense_quant_type @@ -63,7 +63,7 @@ def __init__( self.hadamard_block_size = hadamard_block_size self.moe_dynamic_quant = moe_dynamic_quant self.is_moe_quantized = is_moe_quantized - self.modules_to_quant = modules_to_quant + self.modules_to_quant = modules_to_quant if modules_to_quant is not None else {} def name(self) -> str: return "mix_quant" diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 496fad15baa..5fec7d2e18c 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -597,6 +597,7 @@ def load_weights(self, weights_iterator) -> None: ("attn.cache_k_scale", "cachek_matmul.in_scale", None, None), ("attn.cache_v_scale", "cachev_matmul.in_scale", None, None), ("up_gate_proj_in_scale", "up_gate_proj.in_scale", None, None), + ("down_proj_in_scale", "down_proj.in_scale", None, None), ] expert_params_mapping = [] diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 27ca2156d8f..825faddd51b 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -15,6 +15,7 @@ """ import fnmatch +import math import os import re from collections.abc import Mapping @@ -158,6 +159,7 @@ def process_weight_transpose(layer, weight_name): setattr(layer, weight_name, weight_tmp) +@cache def _get_weight_only_method_cls_list(): # 根据平台类型和层类型选择对应的量化方法 weight_only_cls_list = [] @@ -198,6 +200,13 @@ def _get_weight_only_method_cls_list(): MetaxTritonWeightOnlyMoEMethod, GPUWeightOnlyLinearMethod, ] + elif current_platform.is_iluvatar(): + from fastdeploy.model_executor.layers.backends import ( + IluvatarCutlassWeightOnlyMoEMethod, + IluvatarWeightOnlyLinearMethod, + ) + + weight_only_cls_list = [IluvatarCutlassWeightOnlyMoEMethod, IluvatarWeightOnlyLinearMethod] else: # GPU默认平台:MoE层支持cutlass/triton/marlin三种后端 from 
fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( @@ -414,8 +423,8 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None): # mlp.gate.weight is precision-sensitive, so we cast it to float32 for computation loaded_weight = fd_cast(loaded_weight, param) - if param.shape != loaded_weight.shape: - # for e_score_correction_bias + if param.shape != loaded_weight.shape and math.prod(param.shape) == math.prod(loaded_weight.shape): + # for e_score_correction_bias and kv cache scale loaded_weight = loaded_weight.reshape(param.shape) assert param.shape == loaded_weight.shape, ( f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})" @@ -496,7 +505,7 @@ def _get_unsupported_quant(): if current_platform.is_cuda(): return {"w4a8", "wint2"} elif current_platform.is_xpu(): - return {"w4a8", "w8a8"} + return {"w8a8"} return set() def _err_msg(msg: str) -> str: @@ -600,6 +609,10 @@ def rename_offline_ckpt_suffix_to_fd_suffix( ckpt_weight_suffix: "weight", ckpt_act_suffix: "in_scale", } + w4a8_suffix_map = { + ckpt_weight_suffix: "weight", + ckpt_act_suffix: "in_scale", + } moe_quant_type = "" dense_quant_type = "" if fd_config.quant_config is not None: @@ -617,8 +630,15 @@ def fn(loaded_weight_name, is_moe): fd_suffix_map = {} if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"): fd_suffix_map = fp8_suffix_map - if (is_moe and moe_quant_type == "tensor_wise_fp8") or (not is_moe and dense_quant_type == "tensor_wise_fp8"): + elif (is_moe and moe_quant_type == "tensor_wise_fp8") or ( + not is_moe and dense_quant_type == "tensor_wise_fp8" + ): fd_suffix_map = tensor_wise_fp8_suffix_map + elif is_moe and moe_quant_type in ("w4a8", "w4afp8"): + fd_suffix_map = w4a8_suffix_map + else: + fd_suffix_map = {} + for ckpt_suffix, fd_suffix in fd_suffix_map.items(): if re.search(rf"{ckpt_suffix}$", loaded_weight_name): loaded_weight_name = 
loaded_weight_name.replace(ckpt_suffix, fd_suffix)