Skip to content

Commit e5022ce

Browse files
committed
support w4a8(Decode)/C8/C8+TP4EP4/PD disaggregation + compatibility fixes
Squashed from 6 feature commits + 2 compatibility fix commits: - support w4a8(Decode) - support C8 KV cache quantization - support C8+TP4EP4 - bugfix C8 - bugfix pd+C8 - bugfix pd+mtp - fix: make weight_need_transpose conditional and remove hardcoded layer_id - fix: comprehensive compatibility fixes (Iluvatar platform, moe cast bug, mutable default, hardcoded magic number, unconditional XPU import, etc.)
1 parent f02b138 commit e5022ce

10 files changed

Lines changed: 239 additions & 28 deletions

File tree

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,14 @@ def allocated_slots(self, request: Request):
235235
return len(request.block_tables) * self.config.cache_config.block_size
236236

237237
def get_new_block_nums(self, request: Request, num_new_tokens: int):
238+
# Account for preallocated blocks that haven't been added to block_tables yet
239+
preallocated_count = len(getattr(request, "preallocated_blocks", []))
238240
block_num = (
239-
request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1
240-
) // self.config.cache_config.block_size - len(request.block_tables)
241+
(request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1)
242+
// self.config.cache_config.block_size
243+
- len(request.block_tables)
244+
- preallocated_count
245+
)
241246

242247
if self.config.speculative_config.method is not None:
243248
block_num = min(block_num + 1, self.config.cache_config.max_block_num_per_seq)
@@ -800,8 +805,14 @@ def get_enough_request(request, scheduled_reqs):
800805
self.allocated_slots(request) - request.num_total_tokens
801806
<= self.config.cache_config.prealloc_dec_block_slot_num_threshold
802807
):
808+
# First, consume any preallocated blocks before allocating new ones
809+
preallocated = getattr(request, "preallocated_blocks", [])
810+
if preallocated:
811+
request.block_tables.extend(preallocated)
812+
request.preallocated_blocks = []
813+
scheduled_reqs.append(self._prepare_decode_task(request))
803814
# Allocation for next decoding blocks
804-
if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num):
815+
elif self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num):
805816
llm_logger.debug(
806817
f"schedule decoding task: {request} request.num_total_tokens {request.num_total_tokens} request.num_computed_tokens {request.num_computed_tokens}"
807818
)
@@ -911,6 +922,12 @@ def _allocate_decode_and_extend():
911922
request.block_tables.extend(
912923
self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id)
913924
)
925+
# Merge preallocated blocks (from PD disaggregation) into block_tables
926+
# so the attention kernel can access all reserved blocks.
927+
preallocated = getattr(request, "preallocated_blocks", [])
928+
if preallocated:
929+
request.block_tables.extend(preallocated)
930+
request.preallocated_blocks = []
914931
# Prepare prefill task
915932
scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
916933
else: # Not enough blocks to allocate, trigger preemption
@@ -920,6 +937,11 @@ def _allocate_decode_and_extend():
920937
request.block_tables.extend(
921938
self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id)
922939
)
940+
# Merge preallocated blocks (from PD disaggregation) into block_tables
941+
preallocated = getattr(request, "preallocated_blocks", [])
942+
if preallocated:
943+
request.block_tables.extend(preallocated)
944+
request.preallocated_blocks = []
923945
# Prepare prefill task
924946
scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
925947
token_budget -= num_new_tokens
@@ -1403,9 +1425,10 @@ def preallocate_resource_in_d(self, request: Request):
14031425
"""
14041426
assert self.config.scheduler_config.splitwise_role == "decode", "Only D instance can call this method"
14051427
request.need_prefill_tokens = len(request.prompt_token_ids)
1406-
need_prealloc_prefill_blocks = (
1428+
actual_prefill_blocks = (
14071429
request.need_prefill_tokens + self.config.cache_config.block_size - 1
1408-
) // self.config.cache_config.block_size + self.config.cache_config.enc_dec_block_num
1430+
) // self.config.cache_config.block_size
1431+
need_prealloc_prefill_blocks = actual_prefill_blocks + self.config.cache_config.enc_dec_block_num
14091432

14101433
with self.lock:
14111434
if len(self.waiting) > 0:
@@ -1416,11 +1439,14 @@ def preallocate_resource_in_d(self, request: Request):
14161439
if not self.cache_manager.can_allocate_gpu_blocks(total_need_blocks):
14171440
return False
14181441

1419-
request.block_tables = self.cache_manager.allocate_gpu_blocks(
1420-
need_prealloc_prefill_blocks, request.request_id
1421-
)
1442+
all_blocks = self.cache_manager.allocate_gpu_blocks(need_prealloc_prefill_blocks, request.request_id)
1443+
# Only put the blocks that will actually contain prefilled KV data into block_tables.
1444+
# The extra enc_dec_block_num blocks are pre-reserved for future decode tokens and
1445+
# stored separately to avoid the attention kernel reading uninitialized KV cache data.
1446+
request.block_tables = all_blocks[:actual_prefill_blocks]
1447+
request.preallocated_blocks = all_blocks[actual_prefill_blocks:]
14221448
request.num_computed_tokens = request.need_prefill_tokens
1423-
request.disaggregate_info["block_tables"] = request.block_tables
1449+
request.disaggregate_info["block_tables"] = all_blocks
14241450
allocated_position = self.get_available_position()
14251451
request.idx = allocated_position
14261452
self.tasks_list[request.idx] = request
@@ -1470,6 +1496,12 @@ def add_prefilled_request(self, request_output: RequestOutput):
14701496
self.running.append(request)
14711497

14721498
def _free_blocks(self, request: Request):
1499+
# Also free any preallocated blocks that haven't been consumed yet
1500+
preallocated = getattr(request, "preallocated_blocks", [])
1501+
if preallocated:
1502+
self.cache_manager.recycle_gpu_blocks(preallocated, request.request_id)
1503+
request.preallocated_blocks = []
1504+
14731505
if self.config.cache_config.enable_prefix_caching and self.config.scheduler_config.splitwise_role != "decode":
14741506
self.cache_manager.release_block_ids(request)
14751507
self.cache_manager.recycle_gpu_blocks(

fastdeploy/envs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ def _validate_split_kv_size(value: int) -> int:
210210
"FD_XPU_MOE_FFN_QUANT_TYPE_MAP": lambda: os.getenv("FD_XPU_MOE_FFN_QUANT_TYPE_MAP", ""),
211211
# Whether to enable low latency in mixed scenario
212212
"FD_XPU_ENABLE_MIXED_EP_MODE": lambda: bool(int(os.getenv("FD_XPU_ENABLE_MIXED_EP_MODE", "0"))),
213+
# Whether to use the Yiyan model
214+
"FD_XPU_USE_YIYAN_MODEL": lambda: bool(int(os.getenv("FD_XPU_USE_YIYAN_MODEL", "0"))),
213215
# Whether to use phi FP8 quantization; if 1, use Paddle default.
214216
"FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))),
215217
# Enables the Paddle/phi combined TopK operator only when topk_method == noaux_tc,

fastdeploy/model_executor/layers/backends/xpu/attention.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ def forward_mixed(
181181
cache_v_scale = getattr(layer, "cache_v_scale", None)
182182
cache_k_out_scale = getattr(layer, "cache_k_out_scale", None)
183183
cache_v_out_scale = getattr(layer, "cache_v_out_scale", None)
184-
cache_k_zp = getattr(self, "cache_k_zp", None)
185-
cache_v_zp = getattr(self, "cache_v_zp", None)
184+
cache_k_zp = getattr(layer, "cache_k_zp", None)
185+
cache_v_zp = getattr(layer, "cache_v_zp", None)
186186

187187
if layer.use_qk_norm:
188188
q_norm_weight = layer.q_norm_weight
@@ -220,8 +220,8 @@ def forward_mixed(
220220
cache_v_scale,
221221
cache_k_out_scale,
222222
cache_v_out_scale,
223-
cache_k_zp,
224-
cache_v_zp,
223+
cache_k_zp.astype("bfloat16") if cache_k_zp is not None else None, # for C8
224+
cache_v_zp.astype("bfloat16") if cache_v_zp is not None else None, # for C8
225225
None, # shift
226226
None, # smooth
227227
q_norm_weight,

fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,14 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
268268
default_initializer=paddle.nn.initializer.Constant(0),
269269
),
270270
)
271+
set_weight_attrs(
272+
getattr(layer, self.added_scale_attrs[0]),
273+
{
274+
"weight_loader": extra_weight_attrs.get(
275+
"weight_loader", default_weight_loader(layer.fd_config)
276+
),
277+
},
278+
)
271279
setattr(
272280
layer,
273281
self.added_scale_attrs[1],
@@ -277,6 +285,31 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
277285
default_initializer=paddle.nn.initializer.Constant(0),
278286
),
279287
)
288+
set_weight_attrs(
289+
getattr(layer, self.added_scale_attrs[1]),
290+
{
291+
"weight_loader": extra_weight_attrs.get(
292+
"weight_loader", default_weight_loader(layer.fd_config)
293+
),
294+
},
295+
)
296+
297+
set_weight_attrs(
298+
layer.up_gate_proj_weight,
299+
{
300+
"weight_loader": extra_weight_attrs.get(
301+
"weight_loader", default_weight_loader(layer.fd_config)
302+
),
303+
},
304+
)
305+
set_weight_attrs(
306+
layer.down_proj_weight,
307+
{
308+
"weight_loader": extra_weight_attrs.get(
309+
"weight_loader", default_weight_loader(layer.fd_config)
310+
),
311+
},
312+
)
280313

281314
if self.moe_quant_type in ["w8a8", "w4a8"]:
282315
for in_scale_name in self.added_in_scale_attrs:
@@ -289,6 +322,25 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
289322
default_initializer=paddle.nn.initializer.Constant(0),
290323
),
291324
)
325+
set_weight_attrs(
326+
layer.down_proj_in_scale,
327+
{
328+
"SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None},
329+
"weight_loader": extra_weight_attrs.get(
330+
"weight_loader", default_weight_loader(layer.fd_config)
331+
),
332+
},
333+
)
334+
335+
set_weight_attrs(
336+
layer.up_gate_proj_in_scale,
337+
{
338+
"SHARD_ID_TO_SHARDED_DIM": {"gate": None, "up": None, "down": None},
339+
"weight_loader": extra_weight_attrs.get(
340+
"weight_loader", default_weight_loader(layer.fd_config)
341+
),
342+
},
343+
)
292344

293345
def process_loaded_weights(self, layer: nn.Layer, state_dict):
294346
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)

fastdeploy/model_executor/layers/backends/xpu/quantization/kv_cache.py

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import paddle
2020
from paddle import nn
2121

22+
from fastdeploy import envs
2223
from fastdeploy.model_executor.layers.quantization.kv_cache import (
2324
KvCacheQuantzationTypes,
2425
)
@@ -42,6 +43,7 @@ def __init__(self, kv_cache_quant_type: str, is_channel_wise: bool, has_zero_poi
4243
super().__init__()
4344
self.kv_cache_quant_type = kv_cache_quant_type
4445
self.is_channel_wise = is_channel_wise
46+
self.has_zero_point = has_zero_point
4547

4648
try:
4749
self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type)
@@ -139,6 +141,62 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
139141
scale_shape = [layer.fd_config.model_config.num_key_value_heads]
140142
if self.cache_quant_config.is_channel_wise:
141143
scale_shape = [layer.kv_num_heads * layer.head_dim]
144+
# Custom weight_loader for C8+TP: the safetensors scale/zp shape is
145+
# [1, num_kv_heads, 1, head_dim]. We must split along the kv_heads
146+
# dimension (dim=1), not the last dimension. The default_weight_loader
147+
# treats output_dim as boolean and always splits along dim=-1, which
148+
# is incorrect for 4D tensors where we need to split along dim=1.
149+
fd_config = layer.fd_config
150+
total_kv_heads = fd_config.model_config.num_key_value_heads
151+
tp_size = fd_config.parallel_config.tensor_parallel_size
152+
tp_rank = fd_config.parallel_config.tensor_parallel_rank
153+
max_bound = self.cache_quant_config.max_bound
154+
155+
def _kv_scale_weight_loader(
156+
param,
157+
loaded_weight,
158+
shard_id=None,
159+
_total_kv_heads=total_kv_heads,
160+
_tp_size=tp_size,
161+
_tp_rank=tp_rank,
162+
_max_bound=max_bound,
163+
):
164+
loaded_weight = get_tensor(loaded_weight).cast("float32")
165+
# TP split along kv_heads dimension
166+
if _tp_size > 1 and not fd_config.load_config.is_pre_sharded:
167+
head_dim = loaded_weight.numel() // _total_kv_heads
168+
loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim])
169+
assert (
170+
_total_kv_heads % _tp_size == 0
171+
), f"num_kv_heads ({_total_kv_heads}) must be divisible by tp_size ({_tp_size})"
172+
kv_heads_per_rank = _total_kv_heads // _tp_size
173+
start = _tp_rank * kv_heads_per_rank
174+
end = start + kv_heads_per_rank
175+
loaded_weight = loaded_weight[start:end, :]
176+
loaded_weight = paddle.clip(loaded_weight, min=1e-8)
177+
loaded_weight = (_max_bound / loaded_weight).reshape(param.shape).cast(param.dtype)
178+
param.copy_(loaded_weight, False)
179+
180+
def _kv_zp_weight_loader(
181+
param, loaded_weight, shard_id=None, _total_kv_heads=total_kv_heads, _tp_size=tp_size, _tp_rank=tp_rank
182+
):
183+
loaded_weight = get_tensor(loaded_weight).cast(param.dtype)
184+
# TP split along kv_heads dimension
185+
if _tp_size > 1 and not fd_config.load_config.is_pre_sharded:
186+
head_dim = loaded_weight.numel() // _total_kv_heads
187+
loaded_weight = loaded_weight.reshape([_total_kv_heads, head_dim])
188+
kv_heads_per_rank = _total_kv_heads // _tp_size
189+
start = _tp_rank * kv_heads_per_rank
190+
end = start + kv_heads_per_rank
191+
loaded_weight = loaded_weight[start:end, :]
192+
loaded_weight = loaded_weight.reshape(param.shape)
193+
param.copy_(loaded_weight, False)
194+
195+
scale_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_scale_weight_loader}
196+
zp_weight_attrs = {**extra_weight_attrs, "weight_loader": _kv_zp_weight_loader}
197+
else:
198+
scale_weight_attrs = extra_weight_attrs
199+
zp_weight_attrs = extra_weight_attrs
142200

143201
layer.cache_k_scale = layer.create_parameter(
144202
shape=scale_shape,
@@ -154,13 +212,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
154212
set_weight_attrs(
155213
layer.cache_k_scale,
156214
{
157-
**extra_weight_attrs,
215+
**scale_weight_attrs,
158216
},
159217
)
160218
set_weight_attrs(
161219
layer.cache_v_scale,
162220
{
163-
**extra_weight_attrs,
221+
**scale_weight_attrs,
164222
},
165223
)
166224

@@ -189,13 +247,13 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
189247
set_weight_attrs(
190248
layer.cache_k_zp,
191249
{
192-
**extra_weight_attrs,
250+
**zp_weight_attrs,
193251
},
194252
)
195253
set_weight_attrs(
196254
layer.cache_v_zp,
197255
{
198-
**extra_weight_attrs,
256+
**zp_weight_attrs,
199257
},
200258
)
201259

@@ -219,10 +277,20 @@ def process_weights_after_loading(self, layer: nn.Layer):
219277
use for loader v1
220278
"""
221279
# cache_k_out_scale is the reciprocal of cache_k_scale
222-
if layer.cache_k_scale._is_initialized():
223-
layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale) # cache_k_out_scale
224-
if layer.cache_v_scale._is_initialized():
225-
layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale)
280+
if envs.FD_XPU_USE_YIYAN_MODEL:
281+
if layer.cache_k_scale._is_initialized():
282+
layer.cache_k_out_scale.set_value(
283+
self.cache_quant_config.max_bound / layer.cache_k_scale.cast("float32").reshape_([-1])
284+
)
285+
if layer.cache_v_scale._is_initialized():
286+
layer.cache_v_out_scale.set_value(
287+
self.cache_quant_config.max_bound / layer.cache_v_scale.cast("float32").reshape_([-1])
288+
)
289+
else:
290+
if layer.cache_k_scale._is_initialized():
291+
layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale)
292+
if layer.cache_v_scale._is_initialized():
293+
layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale)
226294

227295
def apply(self, layer):
228296
"""

0 commit comments

Comments
 (0)