From 88cda39e89edf98f86c79acfb08f6025888cbf58 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Mon, 23 Mar 2026 20:37:26 +0800 Subject: [PATCH 1/3] add skip_layer_mixed_quantization --- .../moe/fused_moe_cutlass_metax_backend.py | 27 ++-- .../moe/fused_moe_triton_metax_backend.py | 7 +- .../layers/backends/xpu/moe/fused_moe.py | 2 - .../backends/xpu/quantization/weight_only.py | 2 - .../layers/moe/fused_moe_cutlass_backend.py | 27 ++-- .../layers/moe/fused_moe_triton_backend.py | 26 ++-- fastdeploy/model_executor/layers/moe/moe.py | 7 +- .../layers/quantization/mix_quant.py | 8 +- .../layers/quantization/weight_only.py | 37 +++-- .../model_executor/models/deepseek_v3.py | 1 + .../model_executor/models/ernie4_5_moe.py | 1 + .../models/ernie4_5_vl/ernie4_5_vl_moe.py | 1 + fastdeploy/model_executor/models/glm4_moe.py | 1 + fastdeploy/model_executor/models/gpt_oss.py | 1 + fastdeploy/model_executor/models/qwen3moe.py | 1 + fastdeploy/model_executor/utils.py | 130 +++++++++++++++++- tests/layers/test_fusedmoe.py | 1 + tests/layers/test_w4a8_moe.py | 1 + tests/layers/test_w4afp8_moe.py | 1 + tests/model_loader/test_model/config.json | 48 +++++++ .../test_skip_layer_mixed_quantization.py | 116 ++++++++++++++++ 21 files changed, 361 insertions(+), 85 deletions(-) create mode 100644 tests/model_loader/test_model/config.json create mode 100644 tests/model_loader/test_skip_layer_mixed_quantization.py diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py index 742d6e60f8f..252b0b8ead0 100644 --- a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py @@ -240,7 +240,7 @@ class MetaxCutlassWeightOnlyMoEMethod(MetaxCutlassMoEMethod): def __init__(self, quant_config): super().__init__(quant_config) if 
quant_config is None: - self.quant_config = WeightOnlyConfig(algo="weight_only_int8", is_checkpoint_bf16=True) + self.quant_config = WeightOnlyConfig(algo="weight_only_int8") else: self.quant_config = quant_config self.moe_quant_type = self.quant_config.algo @@ -480,21 +480,18 @@ def _process_quantize(weight_idx): getattr(layer, weight_name).copy_(weight.transpose([0, 2, 1]), False) getattr(layer, scale_name).copy_(scale, False) - if self.quant_config.is_checkpoint_bf16: - weight_id_map = {"gate_up": 0, "down": 1} - if weight_fully_copied(layer.up_gate_proj_weight): - weight_type = "gate_up" - else: - weight_type = "down" - - if self.model_format == "torch": - unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( - "quant_weight", "weight" - ) - process_weight_transpose(layer, unquantized_weight_name) - _process_quantize(weight_id_map[weight_type]) + weight_id_map = {"gate_up": 0, "down": 1} + if weight_fully_copied(layer.up_gate_proj_weight): + weight_type = "gate_up" else: - return + weight_type = "down" + + if self.model_format == "torch": + unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( + "quant_weight", "weight" + ) + process_weight_transpose(layer, unquantized_weight_name) + _process_quantize(weight_id_map[weight_type]) def process_loaded_weights(self, layer: nn.Layer, state_dict): """ diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py index e2908b7c6e8..141ca8070e7 100644 --- a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py +++ b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_triton_metax_backend.py @@ -69,8 +69,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): layer.hidden_size, ] # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - is_checkpoint_bf16 = 
self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True - if is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": layer.up_gate_proj_weight = layer.create_parameter( shape=self.up_gate_proj_weight_shape, dtype=layer.weight_dtype, @@ -184,10 +183,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict): @paddle.no_grad() def process_weights_after_loading(self, layer): """ """ - is_checkpoint_bf16 = self.quant_config.is_checkpoint_bf16 if self.quant_config is not None else True - if not is_checkpoint_bf16: - return - if self.quant_config is not None: algo = layer.quant_method.quant_config.name() assert algo == "wint8" diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index 085c202c9a2..6b683cc931a 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -616,8 +616,6 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict): def process_weights_after_loading(self, layer): """ """ - if not self.quant_config.is_checkpoint_bf16: - return weight_id_map = {"gate_up": 0, "down": 1} if ( hasattr(layer.up_gate_proj_weight, "tensor_track") diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py index 7ce51c06b63..26c6496ffa3 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py @@ -118,8 +118,6 @@ def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None layer.weight_scale.set_value(weight_scale) def process_weights_after_loading(self, layer) -> None: - if not self.quant_config.is_checkpoint_bf16: - return quanted_weight, 
weight_scale = self._quantize_weight_in_blocks(layer.weight) free_tensor(layer.weight) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 0c86270c630..862c5526a73 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -1468,7 +1468,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size] self.model_format = extra_weight_attrs.get("model_format") # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": if self.model_format != "torch": up_gate_proj_weight_shape = [ layer.num_local_experts, @@ -1649,21 +1649,18 @@ def _process_quantize(weight_idx): getattr(layer, weight_name).copy_(weight, False) getattr(layer, scale_name).copy_(scale, False) - if self.quant_config.is_checkpoint_bf16: - weight_id_map = {"gate_up": 0, "down": 1} - if weight_fully_copied(layer.up_gate_proj_weight): - weight_type = "gate_up" - else: - weight_type = "down" - - if self.model_format == "torch": - unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( - "quant_weight", "weight" - ) - process_weight_transpose(layer, unquantized_weight_name) - _process_quantize(weight_id_map[weight_type]) + weight_id_map = {"gate_up": 0, "down": 1} + if weight_fully_copied(layer.up_gate_proj_weight): + weight_type = "gate_up" else: - return + weight_type = "down" + + if self.model_format == "torch": + unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( + "quant_weight", "weight" + ) + process_weight_transpose(layer, unquantized_weight_name) + _process_quantize(weight_id_map[weight_type]) 
def process_loaded_weights(self, layer: nn.Layer, state_dict): """ diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index d1db43a3241..eea0dc9a0ca 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -84,7 +84,7 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): ] self.model_format = extra_weight_attrs.get("model_format") # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": if self.model_format != "torch": up_gate_proj_weight_shape = [ layer.num_local_experts, @@ -268,21 +268,17 @@ def _process_quantize(weight_idx): getattr(layer, weight_name).copy_(quanted_weight, False) getattr(layer, scale_name).copy_(quanted_weight_scale, False) - if self.quant_config.is_checkpoint_bf16: - weight_id_map = {"gate_up": 0, "down": 1} - if weight_fully_copied(layer.up_gate_proj_weight): - weight_type = "gate_up" - else: - weight_type = "down" - if self.model_format == "torch": - unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( - "quant_weight", "weight" - ) - process_weight_transpose(layer, unquantized_weight_name) - _process_quantize(weight_id_map[weight_type]) - + weight_id_map = {"gate_up": 0, "down": 1} + if weight_fully_copied(layer.up_gate_proj_weight): + weight_type = "gate_up" else: - return + weight_type = "down" + if self.model_format == "torch": + unquantized_weight_name = self.added_weight_attrs[weight_id_map[weight_type]].replace( + "quant_weight", "weight" + ) + process_weight_transpose(layer, unquantized_weight_name) + _process_quantize(weight_id_map[weight_type]) def apply( self, diff --git a/fastdeploy/model_executor/layers/moe/moe.py 
b/fastdeploy/model_executor/layers/moe/moe.py index 4e56c7485f9..2b02a576610 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -30,7 +30,7 @@ from fastdeploy.model_executor.layers.moe.routing_indices_cache import ( save_routing_to_buffer, ) -from fastdeploy.model_executor.layers.utils import get_tensor +from fastdeploy.model_executor.layers.utils import get_tensor, modules_to_convert from fastdeploy.model_executor.utils import h2d_copy, slice_fn from fastdeploy.platforms import current_platform from fastdeploy.worker.experts_manager import RedundantExpertManger @@ -152,6 +152,7 @@ def __init__( with_bias: bool = False, activation="swiglu", model_format: Optional[str] = None, + prefix: str = "", ): """ Initialize the Moe layer with given parameters. @@ -175,7 +176,7 @@ def __init__( if self.ep_size > 1: self.tp_size = 1 self.tp_rank = 0 - + self.prefix = prefix self.attn_tp_size = fd_config.parallel_config.tensor_parallel_size self.attn_tp_rank = fd_config.parallel_config.tensor_parallel_rank @@ -226,7 +227,7 @@ def __init__( moe_quant_config = fd_config.quant_config self.moe_quant_config = moe_quant_config self.moe_quant_type = None - if moe_quant_config and moe_quant_config.get_quant_method(self): + if moe_quant_config and moe_quant_config.get_quant_method(self) and modules_to_convert(prefix, self.fd_config): self.quant_method = moe_quant_config.get_quant_method(self) self.moe_quant_type = moe_quant_config.name() else: diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py index 2956d506306..b6a849d6192 100644 --- a/fastdeploy/model_executor/layers/quantization/mix_quant.py +++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py @@ -18,6 +18,7 @@ from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.moe.moe import FusedMoE +from fastdeploy.model_executor.utils 
import get_special_quant_config from . import get_quantization_config from .quant_base import QuantConfigBase, QuantMethodBase @@ -41,6 +42,7 @@ def __init__( hadamard_block_size: int = 128, moe_dynamic_quant: bool = False, is_moe_quantized: bool = False, + modules_to_quant: dict = {}, ) -> None: super().__init__() self.dense_quant_type = dense_quant_type @@ -61,6 +63,7 @@ def __init__( self.hadamard_block_size = hadamard_block_size self.moe_dynamic_quant = moe_dynamic_quant self.is_moe_quantized = is_moe_quantized + self.modules_to_quant = modules_to_quant def name(self) -> str: return "mix_quant" @@ -79,6 +82,7 @@ def from_config(cls, config: dict) -> "MixQuantConfig": config.get("hadamard_block_size", 128), config.get("moe_dynamic_quant", False), config.get("is_moe_quantized", False), + config.get("modules_to_quant", {}), ) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: @@ -86,7 +90,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if layer.moe_tag == "Image": if self.image_moe_quant_type is not None: return ( - get_quantization_config(self.image_moe_quant_type) + get_special_quant_config(layer, self.modules_to_quant, self.image_moe_quant_type) .from_config( { "is_permuted": self.is_permuted, @@ -101,7 +105,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: else: if self.moe_quant_type is not None: return ( - get_quantization_config(self.moe_quant_type) + get_special_quant_config(layer, self.modules_to_quant, self.moe_quant_type) .from_config( { "is_permuted": self.is_permuted, diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index 24fad6130c8..553f4e8ff61 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -70,7 +70,6 @@ class WeightOnlyConfig(QuantConfigBase): def __init__( self, algo: str, - is_checkpoint_bf16: bool = False, ) -> 
None: super().__init__() self.algo = algo @@ -82,7 +81,7 @@ def __init__( self.quant_max_bound = 0 self.quant_min_bound = 0 self.quant_round_type = 0 - self.is_checkpoint_bf16 = is_checkpoint_bf16 + self.is_checkpoint_bf16 = True # weight only linear support dynamic quantization only self.group_size = -1 def name(self) -> str: @@ -91,11 +90,12 @@ def name(self) -> str: @classmethod def from_config(cls, config: dict) -> "WeightOnlyConfig": algo = config["algo"] - is_checkpoint_bf16 = not config.get("is_quantized", False) - return cls(algo, is_checkpoint_bf16) + return cls(algo) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: + # 根据平台类型和层类型选择对应的量化方法 if current_platform.is_xpu(): + # XPU平台:区分MoE层和普通Linear层 if isinstance(layer, FusedMoE): from fastdeploy.model_executor.layers.backends import ( XPUWeightOnlyMoEMethod, @@ -109,6 +109,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: return XPUWeightOnlyLinearMethod(self) elif current_platform.is_gcu(): + # GCU平台:区分MoE层和普通Linear层 from fastdeploy.model_executor.layers.backends import ( GCUWeightOnlyLinearMethod, GCUWeightOnlyMoEMethod, @@ -119,6 +120,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: else: return GCUWeightOnlyLinearMethod(self) elif current_platform.is_dcu(): + # DCU平台:区分MoE层和普通Linear层 if isinstance(layer, FusedMoE): from fastdeploy.model_executor.layers.backends import ( DCUTritonWeightOnlyMoEMethod, @@ -132,6 +134,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: return DCUWeightOnlyLinearMethod(self) elif current_platform.is_maca(): + # MACA平台:MoE层支持cutlass和triton两种后端 if isinstance(layer, FusedMoE): from fastdeploy.model_executor.layers.backends import ( MetaxCutlassWeightOnlyMoEMethod, @@ -166,6 +169,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: return IluvatarWeightOnlyLinearMethod(self) else: + # GPU默认平台:MoE层支持cutlass/triton/marlin三种后端 if isinstance(layer, FusedMoE): if layer.use_method == "cutlass": from 
fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( @@ -188,6 +192,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: else: raise ValueError(f"Unsupported MOE backend {layer.use_method}") else: + # 普通Linear层:满足条件时使用Machete优化内核,否则使用默认GPU方法 if ( _ENABLE_MACHETE and envs.FD_USE_MACHETE == "1" @@ -206,13 +211,12 @@ class WINT8Config(WeightOnlyConfig): weight only int8 config """ - def __init__(self, is_checkpoint_bf16: bool = False) -> None: - super().__init__("weight_only_int8", is_checkpoint_bf16) + def __init__(self) -> None: + super().__init__("weight_only_int8") @classmethod def from_config(cls, config: dict) -> "WINT8Config": - is_checkpoint_bf16 = not config.get("is_quantized", False) - return cls(is_checkpoint_bf16) + return cls() def name(self) -> str: return "wint8" @@ -225,14 +229,12 @@ class WINT4Config(WeightOnlyConfig): def __init__( self, - is_checkpoint_bf16: bool = False, ) -> None: - super().__init__("weight_only_int4", is_checkpoint_bf16) + super().__init__("weight_only_int4") @classmethod def from_config(cls, config: dict) -> "WINT4Config": - is_checkpoint_bf16 = not config.get("is_quantized", False) - return cls(is_checkpoint_bf16) + return cls() def name(self) -> str: return "wint4" @@ -253,7 +255,7 @@ def __init__( def create_weights(self, layer, **extra_weight_attrs): # TODO(bukejiyu): remove v1 loader check when v0 loader is removed self.model_format = extra_weight_attrs.get("model_format") - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": + if layer.fd_config.load_config.load_choices == "default_v1": weight_shape = layer.weight_shape[::-1] if self.model_format == "torch" else layer.weight_shape layer.weight = layer.create_parameter( shape=weight_shape, @@ -363,12 +365,9 @@ def _process_quantize(): layer.weight.copy_(quanted_weight_tensor, False) layer.weight_scale.copy_(weight_scale_tensor, False) - if self.quant_config.is_checkpoint_bf16: - if 
self.model_format == "torch": - process_weight_transpose(layer, "weight") - _process_quantize() - else: - return + if self.model_format == "torch": + process_weight_transpose(layer, "weight") + _process_quantize() @abstractmethod def process_loaded_weights(self, layer, weights) -> None: diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index f6b7c417089..e439c2740be 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -178,6 +178,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: layer_idx=layer_id, gate_correction_bias=self.gate.e_score_correction_bias, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) self.num_shared_experts = fd_config.model_config.n_shared_experts diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 4cc4306de5f..496fad15baa 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -217,6 +217,7 @@ def __init__( gate_correction_bias=None, redundant_table_manger=redundant_table_manger, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) if fd_config.model_config.moe_use_aux_free: diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index f4d70108e4b..e1d837f2b65 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -157,6 +157,7 @@ def __init__( moe_tag=moe_tag, weight_key_map=weight_key_map, gate_correction_bias=gate_correction_bias, + prefix=f"{prefix}.experts", ) self.gate = ReplicatedLinear( diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index c31928ebe31..4d423f6375b 100644 --- 
a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -180,6 +180,7 @@ def __init__( layer_idx=layer_id, gate_correction_bias=self.gate.e_score_correction_bias, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) if self.n_shared_experts > 0: diff --git a/fastdeploy/model_executor/models/gpt_oss.py b/fastdeploy/model_executor/models/gpt_oss.py index a6cf231ed24..9ea1c7a0582 100644 --- a/fastdeploy/model_executor/models/gpt_oss.py +++ b/fastdeploy/model_executor/models/gpt_oss.py @@ -122,6 +122,7 @@ def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = ""): with_bias=True, activation="swigluoai", model_format="", + prefix=f"{prefix}.experts", ) def forward(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta): diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 6c443d68bcc..987e365b0ce 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -68,6 +68,7 @@ def __init__( top_k=fd_config.model_config.num_experts_per_tok, layer_idx=layer_id, weight_key_map=weight_key_map, + prefix=f"{prefix}.experts", ) self.gate = ReplicatedLinear( diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index e63603047be..a6e293e1a4b 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -14,19 +14,22 @@ # limitations under the License. 
""" +import fnmatch import os import re from collections.abc import Mapping from contextlib import contextmanager from dataclasses import dataclass, field -from functools import cache +from functools import cache, lru_cache from typing import Any, List, Optional, Union import paddle +from paddle import nn from paddleformers.utils.log import logger from fastdeploy import envs from fastdeploy.config import FDConfig +from fastdeploy.model_executor.layers.quantization import get_quantization_config from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.platforms import current_platform @@ -155,6 +158,72 @@ def process_weight_transpose(layer, weight_name): setattr(layer, weight_name, weight_tmp) +def _get_weight_only_method_cls_list(): + # 根据平台类型和层类型选择对应的量化方法 + weight_only_cls_list = [] + if current_platform.is_xpu(): + from fastdeploy.model_executor.layers.backends import ( + XPUWeightOnlyLinearMethod, + XPUWeightOnlyMoEMethod, + ) + + weight_only_cls_list = [XPUWeightOnlyMoEMethod, XPUWeightOnlyLinearMethod] + + elif current_platform.is_gcu(): + from fastdeploy.model_executor.layers.backends import ( + GCUWeightOnlyLinearMethod, + GCUWeightOnlyMoEMethod, + ) + + weight_only_cls_list = [GCUWeightOnlyMoEMethod, GCUWeightOnlyLinearMethod] + elif current_platform.is_dcu(): + from fastdeploy.model_executor.layers.backends import ( + DCUTritonWeightOnlyMoEMethod, + DCUWeightOnlyLinearMethod, + ) + + weight_only_cls_list = [DCUTritonWeightOnlyMoEMethod, DCUWeightOnlyLinearMethod] + + elif current_platform.is_maca(): + from fastdeploy.model_executor.layers.backends import ( + MetaxCutlassWeightOnlyMoEMethod, + MetaxTritonWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.quantization.weight_only import ( + GPUWeightOnlyLinearMethod, + ) + + weight_only_cls_list = [ + MetaxCutlassWeightOnlyMoEMethod, + MetaxTritonWeightOnlyMoEMethod, + GPUWeightOnlyLinearMethod, + ] + else: + # GPU默认平台:MoE层支持cutlass/triton/marlin三种后端 + from 
fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( + CutlassWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import ( + MarlinWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import ( + TritonWeightOnlyMoEMethod, + ) + from fastdeploy.model_executor.layers.quantization.weight_only import ( + GPUWeightOnlyLinearMethod, + GPUWeightOnlyMoEMethod, + ) + + weight_only_cls_list = [ + CutlassWeightOnlyMoEMethod, + TritonWeightOnlyMoEMethod, + MarlinWeightOnlyMoEMethod, + GPUWeightOnlyLinearMethod, + GPUWeightOnlyMoEMethod, + ] + return weight_only_cls_list + + def process_weights_after_loading(sublayers_dict: dict, fd_config: FDConfig): """ process_weights_after_loading: @@ -172,9 +241,9 @@ def fn(model_sublayer_name: str, param=None): model_sublayer = sublayers_dict[model_sublayer_name] if isinstance(model_sublayer, KVBatchLinear): model_sublayer.process_weights_after_loading() - if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16: - # skip for offline quantization - return + # if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16: + # # skip for offline quantization + # return if hasattr(model_sublayer, "quant_method"): quant_method = getattr(model_sublayer, "quant_method", None) unquant_moe_layer = get_moe_method() @@ -185,6 +254,12 @@ def fn(model_sublayer_name: str, param=None): if type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls: # skip unquantized linear return + + if type(quant_method) not in _get_weight_only_method_cls_list(): + if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16: + # Skip offline quantization if quant_method is not "weight_only". 
+ return + if not hasattr(quant_method, "process_weights_after_loading"): return if param is not None and hasattr(param, "tensor_track") and param.tensor_track is None: @@ -250,7 +325,7 @@ def process_final_after_loading(model, fd_config: FDConfig): ) from fastdeploy.model_executor.layers.moe.moe import get_moe_method - for name, sublayer in model.named_sublayers(): + for _, sublayer in model.named_sublayers(): if isinstance(sublayer, KVBatchLinear): continue quant_method = getattr(sublayer, "quant_method", None) @@ -262,7 +337,9 @@ def process_final_after_loading(model, fd_config: FDConfig): unquant_moe_cls = type(unquant_moe_layer) is_unquant_cls = type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls is_offline_quantized_ckpt = not (fd_config.quant_config and fd_config.quant_config.is_checkpoint_bf16) - if is_unquant_cls or is_offline_quantized_ckpt: + if (is_unquant_cls or is_offline_quantized_ckpt) and type( + quant_method + ) not in _get_weight_only_method_cls_list(): if hasattr(quant_method, "process_weights_after_loading"): quant_method.process_weights_after_loading(sublayer) continue @@ -624,3 +701,44 @@ def need_memory_reconstruction(fd_config): return True else: return False + + +@lru_cache(None) +def parse_layer_range(r: str) -> list[tuple[int, int]]: + + return [(int(a), int(b)) for a, b in (part.split("-") for part in r.split(","))] + + +def layer_in_range(layer_idx: int, layer_range: str) -> bool: + for lo, hi in parse_layer_range(layer_range): + if lo <= layer_idx <= hi: + return True + return False + + +def resolve_quant_type(layer_idx: int, prefix: str, modules_to_quant: dict) -> str | None: + rules = modules_to_quant + + for quant_type, rule in rules.items(): + if not prefix_match(prefix, rule["prefix_module"]): + continue + + if layer_in_range(layer_idx, rule["layer_range"]): + return quant_type + + return None + + +def prefix_match(prefix: str, patterns: list[str]) -> bool: + return any(fnmatch.fnmatch(prefix, 
p) or fnmatch.fnmatch(prefix, p + ".*") for p in patterns) + + +def get_special_quant_config(layer: nn.Layer, modules_to_quant: dict, ori_quant_type: str): + """ + only Moe and offline quant Now + """ + qtype = resolve_quant_type(layer.layer_idx, layer.prefix, modules_to_quant) + if qtype is None: + return get_quantization_config(ori_quant_type) + else: + return get_quantization_config(qtype) diff --git a/tests/layers/test_fusedmoe.py b/tests/layers/test_fusedmoe.py index d97363fe758..19168a765e6 100644 --- a/tests/layers/test_fusedmoe.py +++ b/tests/layers/test_fusedmoe.py @@ -519,6 +519,7 @@ def __init__( topk_group=4, n_group=8, gate_correction_bias=paddle.zeros([self.fd_config.model_config.moe_num_experts], paddle.float32), + prefix=f"{prefix}.experts", # gate_correction_bias = gate_correction_bias_real_data ) moe_layer = self.fused_moe diff --git a/tests/layers/test_w4a8_moe.py b/tests/layers/test_w4a8_moe.py index 9584702223a..f62d97b9fd9 100644 --- a/tests/layers/test_w4a8_moe.py +++ b/tests/layers/test_w4a8_moe.py @@ -120,6 +120,7 @@ def __init__( topk_group=4, n_group=8, gate_correction_bias=paddle.zeros([self.fd_config.model_config.moe_num_experts], paddle.float32), + prefix=f"{prefix}.experts", # gate_correction_bias = gate_correction_bias_real_data ) self.pack_num = 2 diff --git a/tests/layers/test_w4afp8_moe.py b/tests/layers/test_w4afp8_moe.py index f21834354c9..af7eb5fac0d 100644 --- a/tests/layers/test_w4afp8_moe.py +++ b/tests/layers/test_w4afp8_moe.py @@ -106,6 +106,7 @@ def __init__( topk_group=4, n_group=8, gate_correction_bias=paddle.zeros([self.fd_config.model_config.moe_num_experts], paddle.float32), + prefix=f"{prefix}.experts", # gate_correction_bias = gate_correction_bias_real_data ) self.pack_num = 1 diff --git a/tests/model_loader/test_model/config.json b/tests/model_loader/test_model/config.json new file mode 100644 index 00000000000..f5816b160e9 --- /dev/null +++ b/tests/model_loader/test_model/config.json @@ -0,0 +1,48 @@ +{ + 
"architectures": [ + "Ernie4_5_MoeForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2560, + "intermediate_size": 12288, + "max_position_embeddings": 131072, + "model_type": "ernie4_5_moe", + "num_attention_heads": 20, + "num_key_value_heads": 4, + "num_hidden_layers": 28, + "pad_token_id": 0, + "rms_norm_eps": 1e-05, + "use_cache": false, + "vocab_size": 103424, + "rope_theta": 500000, + "use_rmsnorm": true, + "tie_word_embeddings": true, + "use_bias": false, + "moe_num_experts": 64, + "moe_num_shared_experts": 2, + "moe_layer_start_index": 1, + "moe_intermediate_size": 1536, + "moe_capacity": [64,64,64], + "moe_gate": "topk", + "moe_k": 6, + "moe_layer_interval": 1, + "moe_use_aux_free": true, + "dtype": "bfloat16", + "num_nextn_predict_layers": 1, + "multi_token_pred_lambda": 0.3, + "quantization_config":{ + "dense_quant_type": "wint8", + "moe_quant_type": "w4a8", + "quantization": "mix_quant", + "kv_cache_quant_type": "int8", + "modules_to_quant":{ + "wint8":{ + "layer_range":"3-7,10-15,20-24", + "prefix_module":[ + "ernie.layers.*.mlp.experts*" + ]} + } + } +} diff --git a/tests/model_loader/test_skip_layer_mixed_quantization.py b/tests/model_loader/test_skip_layer_mixed_quantization.py new file mode 100644 index 00000000000..5182bd17553 --- /dev/null +++ b/tests/model_loader/test_skip_layer_mixed_quantization.py @@ -0,0 +1,116 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from types import SimpleNamespace + +import paddle + +architectures = "Ernie4_5_MoeForCausalLM" +import sys +from pathlib import Path + +from fastdeploy.config import ErnieArchitectures, ModelConfig +from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( + CutlassWeightOnlyMoEMethod, +) +from fastdeploy.model_executor.layers.quantization import parse_quant_config +from fastdeploy.model_executor.models.model_base import ModelRegistry +from fastdeploy.worker.worker_process import init_distributed_environment, parse_args + +TEST_DIR = Path(__file__).resolve().parent +TEST_MODEL_DIR = TEST_DIR / "test_model" + + +def make_fd_config( + *, + model_format="paddle", + tensor_parallel_size=1, + tensor_parallel_rank=0, + splitwise_role="prefill", + use_sequence_parallel_moe=False, + load_choices="default_v1", + model=str(TEST_MODEL_DIR), +): + argv_backup = sys.argv + try: + sys.argv = ["fastdeploy"] + args = parse_args() + finally: + sys.argv = argv_backup + args.model = model + model_config = ModelConfig(vars(args)) + return SimpleNamespace( + model_config=model_config, + parallel_config=SimpleNamespace( + tensor_parallel_size=tensor_parallel_size, + tensor_parallel_rank=tensor_parallel_rank, + expert_parallel_size=1, + expert_parallel_rank=0, + tp_group=None, + use_sequence_parallel_moe=use_sequence_parallel_moe, + ), + scheduler_config=SimpleNamespace(splitwise_role=splitwise_role, max_num_seqs=1), + load_config=SimpleNamespace(dynamic_load_weight=False, load_choices=load_choices), + quant_config=parse_quant_config( + args, + model_config, + is_ernie=ErnieArchitectures.contains_ernie_arch(model_config.architectures), + is_v1_loader=True, + ), + eplb_config=SimpleNamespace(enable_eplb=False), + plas_attention_config=None, + routing_replay_config=SimpleNamespace(enable_routing_replay=False), + graph_opt_config=SimpleNamespace(graph_opt_level=0, 
use_cudagraph=False), + ) + + +baseline = { + "ernie.layers.24.mlp.experts", + "ernie.layers.23.mlp.experts", + "ernie.layers.11.mlp.experts", + "ernie.layers.5.mlp.experts", + "ernie.layers.22.mlp.experts", + "ernie.layers.15.mlp.experts", + "ernie.layers.3.mlp.experts", + "ernie.layers.21.mlp.experts", + "ernie.layers.4.mlp.experts", + "ernie.layers.13.mlp.experts", + "ernie.layers.6.mlp.experts", + "ernie.layers.7.mlp.experts", + "ernie.layers.14.mlp.experts", + "ernie.layers.12.mlp.experts", + "ernie.layers.10.mlp.experts", + "ernie.layers.20.mlp.experts", +} + + +def collect_cutlass_moe_layers(model) -> set[str]: + matched_keys = set() + + for name, layer in model.named_sublayers(): + quant_method = getattr(layer, "quant_method", None) + if isinstance(quant_method, CutlassWeightOnlyMoEMethod): + matched_keys.add(name) + + return matched_keys + + +def test_skip_layer_mixed_quantization(): + ranks, local_rank = init_distributed_environment() + context = paddle.LazyGuard() + with context: + model_cls = ModelRegistry.get_class(architectures) + model = model_cls(make_fd_config()) + res = collect_cutlass_moe_layers(model) + assert res == baseline From b574e1dd98fcd682a5da5fadabc7d6fb99ccd8c4 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Wed, 25 Mar 2026 11:45:51 +0800 Subject: [PATCH 2/3] update --- fastdeploy/model_executor/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index a6e293e1a4b..27ca2156d8f 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -211,7 +211,6 @@ def _get_weight_only_method_cls_list(): ) from fastdeploy.model_executor.layers.quantization.weight_only import ( GPUWeightOnlyLinearMethod, - GPUWeightOnlyMoEMethod, ) weight_only_cls_list = [ @@ -219,7 +218,6 @@ def _get_weight_only_method_cls_list(): TritonWeightOnlyMoEMethod, MarlinWeightOnlyMoEMethod, GPUWeightOnlyLinearMethod, - GPUWeightOnlyMoEMethod, ] 
return weight_only_cls_list From 5b07e0bce787143d614e0618c5226722abe686f7 Mon Sep 17 00:00:00 2001 From: zccjjj Date: Tue, 7 Apr 2026 15:55:36 +0800 Subject: [PATCH 3/3] support w4a8(Decode)/C8/C8+TP4EP4/PD disaggregation + compatibility fixes Squashed from 6 feature commits + 2 compatibility fix commits: - support w4a8(Decode) - support C8 KV cache quantization - support C8+TP4EP4 - bugfix C8 - bugfix pd+C8 - bugfix pd+mtp - fix: make weight_need_transpose conditional and remove hardcoded layer_id - fix: comprehensive compatibility fixes (Iluvatar platform, moe cast bug, mutable default, hardcoded magic number, unconditional XPU import, etc.) --- .../engine/sched/resource_manager_v1.py | 50 +++++++++-- fastdeploy/envs.py | 2 + .../layers/backends/xpu/attention.py | 8 +- .../layers/backends/xpu/moe/fused_moe.py | 52 ++++++++++++ .../backends/xpu/quantization/kv_cache.py | 84 +++++++++++++++++-- fastdeploy/model_executor/layers/moe/moe.py | 28 ++++++- .../layers/quantization/__init__.py | 10 +++ .../layers/quantization/mix_quant.py | 4 +- .../model_executor/models/ernie4_5_moe.py | 1 + fastdeploy/model_executor/utils.py | 28 ++++++- 10 files changed, 239 insertions(+), 28 deletions(-) diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 80b58d68972..df907e08f47 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -235,9 +235,14 @@ def allocated_slots(self, request: Request): return len(request.block_tables) * self.config.cache_config.block_size def get_new_block_nums(self, request: Request, num_new_tokens: int): + # Account for preallocated blocks that haven't been added to block_tables yet + preallocated_count = len(getattr(request, "preallocated_blocks", [])) block_num = ( - request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1 - ) // self.config.cache_config.block_size - len(request.block_tables) + 
(request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1) + // self.config.cache_config.block_size + - len(request.block_tables) + - preallocated_count + ) if self.config.speculative_config.method is not None: block_num = min(block_num + 1, self.config.cache_config.max_block_num_per_seq) @@ -800,8 +805,14 @@ def get_enough_request(request, scheduled_reqs): self.allocated_slots(request) - request.num_total_tokens <= self.config.cache_config.prealloc_dec_block_slot_num_threshold ): + # First, consume any preallocated blocks before allocating new ones + preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + request.block_tables.extend(preallocated) + request.preallocated_blocks = [] + scheduled_reqs.append(self._prepare_decode_task(request)) # Allocation for next decoding blocks - if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num): + elif self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num): llm_logger.debug( f"schedule decoding task: {request} request.num_total_tokens {request.num_total_tokens} request.num_computed_tokens {request.num_computed_tokens}" ) @@ -911,6 +922,12 @@ def _allocate_decode_and_extend(): request.block_tables.extend( self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id) ) + # Merge preallocated blocks (from PD disaggregation) into block_tables + # so the attention kernel can access all reserved blocks. 
+ preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + request.block_tables.extend(preallocated) + request.preallocated_blocks = [] # Prepare prefill task scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) else: # Not enough blocks to allocate, trigger preemption @@ -920,6 +937,11 @@ def _allocate_decode_and_extend(): request.block_tables.extend( self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id) ) + # Merge preallocated blocks (from PD disaggregation) into block_tables + preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + request.block_tables.extend(preallocated) + request.preallocated_blocks = [] # Prepare prefill task scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) token_budget -= num_new_tokens @@ -1403,9 +1425,10 @@ def preallocate_resource_in_d(self, request: Request): """ assert self.config.scheduler_config.splitwise_role == "decode", "Only D instance can call this method" request.need_prefill_tokens = len(request.prompt_token_ids) - need_prealloc_prefill_blocks = ( + actual_prefill_blocks = ( request.need_prefill_tokens + self.config.cache_config.block_size - 1 - ) // self.config.cache_config.block_size + self.config.cache_config.enc_dec_block_num + ) // self.config.cache_config.block_size + need_prealloc_prefill_blocks = actual_prefill_blocks + self.config.cache_config.enc_dec_block_num with self.lock: if len(self.waiting) > 0: @@ -1416,11 +1439,14 @@ def preallocate_resource_in_d(self, request: Request): if not self.cache_manager.can_allocate_gpu_blocks(total_need_blocks): return False - request.block_tables = self.cache_manager.allocate_gpu_blocks( - need_prealloc_prefill_blocks, request.request_id - ) + all_blocks = self.cache_manager.allocate_gpu_blocks(need_prealloc_prefill_blocks, request.request_id) + # Only put the blocks that will actually contain prefilled KV data into block_tables. 
+ # The extra enc_dec_block_num blocks are pre-reserved for future decode tokens and + # stored separately to avoid the attention kernel reading uninitialized KV cache data. + request.block_tables = all_blocks[:actual_prefill_blocks] + request.preallocated_blocks = all_blocks[actual_prefill_blocks:] request.num_computed_tokens = request.need_prefill_tokens - request.disaggregate_info["block_tables"] = request.block_tables + request.disaggregate_info["block_tables"] = all_blocks allocated_position = self.get_available_position() request.idx = allocated_position self.tasks_list[request.idx] = request @@ -1470,6 +1496,12 @@ def add_prefilled_request(self, request_output: RequestOutput): self.running.append(request) def _free_blocks(self, request: Request): + # Also free any preallocated blocks that haven't been consumed yet + preallocated = getattr(request, "preallocated_blocks", []) + if preallocated: + self.cache_manager.recycle_gpu_blocks(preallocated, request.request_id) + request.preallocated_blocks = [] + if self.config.cache_config.enable_prefix_caching and self.config.scheduler_config.splitwise_role != "decode": self.cache_manager.release_block_ids(request) self.cache_manager.recycle_gpu_blocks( diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 433e2441c2f..3dbd599c1ac 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -210,6 +210,8 @@ def _validate_split_kv_size(value: int) -> int: "FD_XPU_MOE_FFN_QUANT_TYPE_MAP": lambda: os.getenv("FD_XPU_MOE_FFN_QUANT_TYPE_MAP", ""), # Whether to enable low latency in mixed scenario "FD_XPU_ENABLE_MIXED_EP_MODE": lambda: bool(int(os.getenv("FD_XPU_ENABLE_MIXED_EP_MODE", "0"))), + # Whether to use yiyan model + "FD_XPU_USE_YIYAN_MODEL": lambda: bool(int(os.getenv("FD_XPU_USE_YIYAN_MODEL", "0"))), # Whether to use phi FP8 quantization,if 1,use paddle default. 
"FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))), # Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc, diff --git a/fastdeploy/model_executor/layers/backends/xpu/attention.py b/fastdeploy/model_executor/layers/backends/xpu/attention.py index 85565d33efb..5388251d869 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/attention.py +++ b/fastdeploy/model_executor/layers/backends/xpu/attention.py @@ -181,8 +181,8 @@ def forward_mixed( cache_v_scale = getattr(layer, "cache_v_scale", None) cache_k_out_scale = getattr(layer, "cache_k_out_scale", None) cache_v_out_scale = getattr(layer, "cache_v_out_scale", None) - cache_k_zp = getattr(self, "cache_k_zp", None) - cache_v_zp = getattr(self, "cache_v_zp", None) + cache_k_zp = getattr(layer, "cache_k_zp", None) + cache_v_zp = getattr(layer, "cache_v_zp", None) if layer.use_qk_norm: q_norm_weight = layer.q_norm_weight @@ -220,8 +220,8 @@ def forward_mixed( cache_v_scale, cache_k_out_scale, cache_v_out_scale, - cache_k_zp, - cache_v_zp, + cache_k_zp.astype("bfloat16") if cache_k_zp is not None else None, # for C8 + cache_v_zp.astype("bfloat16") if cache_v_zp is not None else None, # for C8 None, # shift None, # smooth q_norm_weight, diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index 6b683cc931a..a88ca01bd71 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -268,6 +268,14 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): default_initializer=paddle.nn.initializer.Constant(0), ), ) + set_weight_attrs( + getattr(layer, self.added_scale_attrs[0]), + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) setattr( layer, self.added_scale_attrs[1], @@ -277,6 +285,31 @@ def create_weights(self, 
layer: nn.Layer, **extra_weight_attrs): default_initializer=paddle.nn.initializer.Constant(0), ), ) + set_weight_attrs( + getattr(layer, self.added_scale_attrs[1]), + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) + + set_weight_attrs( + layer.up_gate_proj_weight, + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) + set_weight_attrs( + layer.down_proj_weight, + { + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) if self.moe_quant_type in ["w8a8", "w4a8"]: for in_scale_name in self.added_in_scale_attrs: @@ -289,6 +322,25 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): default_initializer=paddle.nn.initializer.Constant(0), ), ) + set_weight_attrs( + layer.down_proj_in_scale, + { + "SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None}, + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) + + set_weight_attrs( + layer.up_gate_proj_in_scale, + { + "SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None}, + "weight_loader": extra_weight_attrs.get( + "weight_loader", default_weight_loader(layer.fd_config) + ), + }, + ) def process_loaded_weights(self, layer: nn.Layer, state_dict): up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict) diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py index 25044bc0939..feebab3889d 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py +++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py @@ -19,6 +19,7 @@ import paddle from paddle import nn +from fastdeploy import envs from fastdeploy.model_executor.layers.quantization.kv_cache import ( 
KvCacheQuantzationTypes, ) @@ -42,6 +43,7 @@ def __init__(self, kv_cache_quant_type: str, is_channel_wise: bool, has_zero_poi super().__init__() self.kv_cache_quant_type = kv_cache_quant_type self.is_channel_wise = is_channel_wise + self.has_zero_point = has_zero_point try: self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type) @@ -139,6 +141,62 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): scale_shape = [layer.fd_config.model_config.num_key_value_heads] if self.cache_quant_config.is_channel_wise: scale_shape = [layer.kv_num_heads * layer.head_dim] + # Custom weight_loader for C8+TP: the safetensors scale/zp shape is + # [1, num_kv_heads, 1, head_dim]. We must split along the kv_heads + # dimension (dim=1), not the last dimension. The default_weight_loader + # treats output_dim as boolean and always splits along dim=-1, which + # is incorrect for 4D tensors where we need to split along dim=1. + fd_config = layer.fd_config + total_kv_heads = fd_config.model_config.num_key_value_heads + tp_size = fd_config.parallel_config.tensor_parallel_size + tp_rank = fd_config.parallel_config.tensor_parallel_rank + max_bound = self.cache_quant_config.max_bound + + def _kv_scale_weight_loader( + param, + loaded_weight, + shard_id=None, + _total_kv_heads=total_kv_heads, + _tp_size=tp_size, + _tp_rank=tp_rank, + _max_bound=max_bound, + ): + loaded_weight = get_tensor(loaded_weight).cast("float32") + # TP split along kv_heads dimension + if _tp_size > 1 and not fd_config.load_config.is_pre_sharded: + head_dim = loaded_weight.numel() // _total_kv_heads + loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim]) + assert ( + _total_kv_heads % _tp_size == 0 + ), f"num_kv_heads ({_total_kv_heads}) must be divisible by tp_size ({_tp_size})" + kv_heads_per_rank = _total_kv_heads // _tp_size + start = _tp_rank * kv_heads_per_rank + end = start + kv_heads_per_rank + loaded_weight = loaded_weight[start:end, :] + loaded_weight = paddle.clip(loaded_weight, 
min=1e-8) + loaded_weight = (_max_bound / loaded_weight).reshape(param.shape).cast(param.dtype) + param.copy_(loaded_weight, False) + + def _kv_zp_weight_loader( + param, loaded_weight, shard_id=None, _total_kv_heads=total_kv_heads, _tp_size=tp_size, _tp_rank=tp_rank + ): + loaded_weight = get_tensor(loaded_weight).cast(param.dtype) + # TP split along kv_heads dimension + if _tp_size > 1 and not fd_config.load_config.is_pre_sharded: + head_dim = loaded_weight.numel() // _total_kv_heads + loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim]) + kv_heads_per_rank = _total_kv_heads // _tp_size + start = _tp_rank * kv_heads_per_rank + end = start + kv_heads_per_rank + loaded_weight = loaded_weight[start:end, :] + loaded_weight = loaded_weight.reshape(param.shape) + param.copy_(loaded_weight, False) + + scale_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_scale_weight_loader} + zp_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_zp_weight_loader} + else: + scale_weight_attrs = extra_weight_attrs + zp_weight_attrs = extra_weight_attrs layer.cache_k_scale = layer.create_parameter( shape=scale_shape, @@ -154,13 +212,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): set_weight_attrs( layer.cache_k_scale, { - **extra_weight_attrs, + **scale_weight_attrs, }, ) set_weight_attrs( layer.cache_v_scale, { - **extra_weight_attrs, + **scale_weight_attrs, }, ) @@ -189,13 +247,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): set_weight_attrs( layer.cache_k_zp, { - **extra_weight_attrs, + **zp_weight_attrs, }, ) set_weight_attrs( layer.cache_v_zp, { - **extra_weight_attrs, + **zp_weight_attrs, }, ) @@ -219,10 +277,20 @@ def process_weights_after_loading(self, layer: nn.Layer): use for loader v1 """ # cache_k_out_scale is the reciprocal of cache_k_scale - if layer.cache_k_scale._is_initialized(): - layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) # cache_k_out_scale - if 
layer.cache_v_scale._is_initialized(): - layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale) + if envs.FD_XPU_USE_YIYAN_MODEL: + if layer.cache_k_scale._is_initialized(): + layer.cache_k_out_scale.set_value( + self.cache_quant_config.max_bound / layer.cache_k_scale.cast("float32").reshape_([-1]) + ) + if layer.cache_v_scale._is_initialized(): + layer.cache_v_out_scale.set_value( + self.cache_quant_config.max_bound / layer.cache_v_scale.cast("float32").reshape_([-1]) + ) + else: + if layer.cache_k_scale._is_initialized(): + layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) + if layer.cache_v_scale._is_initialized(): + layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale) def apply(self, layer): """ diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 2b02a576610..2ef99d38aba 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -260,6 +260,19 @@ def __init__( tp_size={self.tp_size}." 
)

+    def _load_in_scale_weight(self, param, expert_id, loaded_weight):
+        # only support ernie now
+        expert_param = param[expert_id - self.expert_id_offset]
+        loaded_weight = get_tensor(loaded_weight)
+        if len(expert_param.shape) != len(loaded_weight.shape):
+            loaded_weight = loaded_weight.reshape(expert_param.shape)
+        assert expert_param.shape == loaded_weight.shape, (
+            f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})"
+        )
+        if expert_param.dtype != loaded_weight.dtype:
+            loaded_weight = loaded_weight.cast(expert_param.dtype)
+        param[expert_id - self.expert_id_offset].copy_(loaded_weight, False)
+
     def weight_loader(
         self,
         param,
@@ -292,9 +305,18 @@ def weight_loader(
         if weight_need_transpose:
             loaded_weight = loaded_weight.transpose([1, 0])

+        if SHARD_ID_TO_SHARDED_DIM["gate"] is None and SHARD_ID_TO_SHARDED_DIM["up"] is None:
+            # in scale
+            self._load_in_scale_weight(param, expert_id, loaded_weight)
+            return
+
         if shard_id is None:
             # 1.gate up fused in disk
-            output_size = param[expert_id - self.expert_id_offset].shape[SHARD_ID_TO_SHARDED_DIM["gate"]]
+
+            shard_param = param[expert_id - self.expert_id_offset]
+            if shard_param.shape == []:
+                shard_param = shard_param.unsqueeze(0)
+            output_size = shard_param.shape[SHARD_ID_TO_SHARDED_DIM["gate"]]
             shard_offsets = [
                 # (shard_id, shard_offset, shard_size)
                 ("gate", 0, output_size // 2 * self.tp_size),
@@ -333,6 +355,8 @@ def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_
             shard_size = (self.tp_rank + 1) * block_size
             loaded_weight = slice_fn(loaded_weight, tp_shard_dim, shard_offset, shard_size)
             expert_param = param[expert_id - self.expert_id_offset]
+            if expert_param.shape == []:
+                expert_param = expert_param.unsqueeze(0)
             dim = -1 if shard_dim else 0
             param_shard_size = expert_param.shape[dim] // 2
             if shard_id == "gate":
@@ -387,6 +411,8 @@ def _load_down_weight(self, param, expert_id, loaded_weight, shard_id, shard_dim
             shard_size = (self.tp_rank + 1)
* block_size loaded_weight = slice_fn(loaded_weight, tp_shard_dim, shard_offset, shard_size) expert_param = param[expert_id - self.expert_id_offset] + if expert_param.shape == []: + expert_param = expert_param.unsqueeze(0) if hasattr(param, "tensor_track"): # for dyn quant param.tensor_track.mark(start=0, batch_id=expert_id - self.expert_id_offset) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 85fb3c4e868..e5cbebfb9e0 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -166,6 +166,8 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: if quantization == "modelopt_fp4": from .nvfp4 import ModelOptNvFp4Config + from fastdeploy.platforms import current_platform + from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -196,4 +198,12 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: if quantization == "modelopt_fp4": method_to_config["modelopt_fp4"] = ModelOptNvFp4Config + # For XPU platform, use XPUKvCacheQuantConfig instead of KvCacheQuantConfig + if quantization == "kvcache" and current_platform.is_xpu(): + from fastdeploy.model_executor.layers.backends.xpu.quantization.kv_cache import ( + XPUKvCacheQuantConfig, + ) + + method_to_config["kvcache"] = XPUKvCacheQuantConfig + return method_to_config[quantization] diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py index b6a849d6192..620203cd677 100644 --- a/fastdeploy/model_executor/layers/quantization/mix_quant.py +++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py @@ -42,7 +42,7 @@ def __init__( hadamard_block_size: int = 128, moe_dynamic_quant: bool = False, is_moe_quantized: bool = False, - modules_to_quant: dict = {}, + modules_to_quant: dict 
= None, ) -> None: super().__init__() self.dense_quant_type = dense_quant_type @@ -63,7 +63,7 @@ def __init__( self.hadamard_block_size = hadamard_block_size self.moe_dynamic_quant = moe_dynamic_quant self.is_moe_quantized = is_moe_quantized - self.modules_to_quant = modules_to_quant + self.modules_to_quant = modules_to_quant if modules_to_quant is not None else {} def name(self) -> str: return "mix_quant" diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 496fad15baa..5fec7d2e18c 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -597,6 +597,7 @@ def load_weights(self, weights_iterator) -> None: ("attn.cache_k_scale", "cachek_matmul.in_scale", None, None), ("attn.cache_v_scale", "cachev_matmul.in_scale", None, None), ("up_gate_proj_in_scale", "up_gate_proj.in_scale", None, None), + ("down_proj_in_scale", "down_proj.in_scale", None, None), ] expert_params_mapping = [] diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 27ca2156d8f..825faddd51b 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -15,6 +15,7 @@ """ import fnmatch +import math import os import re from collections.abc import Mapping @@ -158,6 +159,7 @@ def process_weight_transpose(layer, weight_name): setattr(layer, weight_name, weight_tmp) +@cache def _get_weight_only_method_cls_list(): # 根据平台类型和层类型选择对应的量化方法 weight_only_cls_list = [] @@ -198,6 +200,13 @@ def _get_weight_only_method_cls_list(): MetaxTritonWeightOnlyMoEMethod, GPUWeightOnlyLinearMethod, ] + elif current_platform.is_iluvatar(): + from fastdeploy.model_executor.layers.backends import ( + IluvatarCutlassWeightOnlyMoEMethod, + IluvatarWeightOnlyLinearMethod, + ) + + weight_only_cls_list = [IluvatarCutlassWeightOnlyMoEMethod, IluvatarWeightOnlyLinearMethod] else: # GPU默认平台:MoE层支持cutlass/triton/marlin三种后端 from 
fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( @@ -414,8 +423,8 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None): # mlp.gate.weight is precision-sensitive, so we cast it to float32 for computation loaded_weight = fd_cast(loaded_weight, param) - if param.shape != loaded_weight.shape: - # for e_score_correction_bias + if param.shape != loaded_weight.shape and math.prod(param.shape) == math.prod(loaded_weight.shape): + # for e_score_correction_bias and kv cache scale loaded_weight = loaded_weight.reshape(param.shape) assert param.shape == loaded_weight.shape, ( f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})" @@ -496,7 +505,7 @@ def _get_unsupported_quant(): if current_platform.is_cuda(): return {"w4a8", "wint2"} elif current_platform.is_xpu(): - return {"w4a8", "w8a8"} + return {"w8a8"} return set() def _err_msg(msg: str) -> str: @@ -600,6 +609,10 @@ def rename_offline_ckpt_suffix_to_fd_suffix( ckpt_weight_suffix: "weight", ckpt_act_suffix: "in_scale", } + w4a8_suffix_map = { + ckpt_weight_suffix: "weight", + ckpt_act_suffix: "in_scale", + } moe_quant_type = "" dense_quant_type = "" if fd_config.quant_config is not None: @@ -617,8 +630,15 @@ def fn(loaded_weight_name, is_moe): fd_suffix_map = {} if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"): fd_suffix_map = fp8_suffix_map - if (is_moe and moe_quant_type == "tensor_wise_fp8") or (not is_moe and dense_quant_type == "tensor_wise_fp8"): + elif (is_moe and moe_quant_type == "tensor_wise_fp8") or ( + not is_moe and dense_quant_type == "tensor_wise_fp8" + ): fd_suffix_map = tensor_wise_fp8_suffix_map + elif is_moe and moe_quant_type in ("w4a8", "w4afp8"): + fd_suffix_map = w4a8_suffix_map + else: + fd_suffix_map = {} + for ckpt_suffix, fd_suffix in fd_suffix_map.items(): if re.search(rf"{ckpt_suffix}$", loaded_weight_name): loaded_weight_name = 
loaded_weight_name.replace(ckpt_suffix, fd_suffix)