DeepLink-org · jinminxi104 · Jan 26, 2026 · Feb 4, 2026 · Feb 5, 2026 · Copilot
@@ -12,7 +12,7 @@ on:
 env:
   CI_PATH: "${HOME}/GitHub/${{ github.repository }}/${GITHUB_RUN_NUMBER}"
   LMDEPLOY_PATH: "${HOME}/GitHub/lmdeploy"
-  LMDEPLOY_COMMIT_OR_BRANCH: 'main'
+  LMDEPLOY_COMMIT_OR_BRANCH: 'pa-tnd'
   REPORT_DIR: "${HOME}/GitHub/ci_log/test_reports"
   TEST_LMDEPLOY_E2E_LOG_PATH: "${HOME}/Github/ci_log/logs"
   TEST_LMDEPLOY_E2E_MODEL_PATH: "${HOME}/Github/model"
@@ -74,7 +74,7 @@ jobs:
       - name: Clone lmdeploy
         run: |
           set -ex
-          git clone https://github.com/InternLM/lmdeploy.git ${{ env.LMDEPLOY_PATH }}
+          git clone https://github.com/jinminxi104/lmdeploy.git ${{ env.LMDEPLOY_PATH }}
           cd ${{ env.LMDEPLOY_PATH }} && git checkout ${{ env.LMDEPLOY_COMMIT_OR_BRANCH }}
           # git apply ${{env.CI_PATH }}/.github/ci/fix-exit-multi-npu.patch
 

@@ -57,8 +57,10 @@ def AscendCudaGraphMixin_make_buffers_cudagraph(
         (max_batches, num_blocks), dtype=torch.int32, device=device
     )
 
-    input_buffers["q_seqlens"] = torch.ones(
-        max_batches, dtype=torch.int32, device=device
+    # q_seqlens corresponds to cu_attn's actual_seq_lengths argument.
+    # This buffer is only used during graph capture in the decoding phase.
+    input_buffers["q_seqlens"] = torch.arange(
+        1, max_batches + 1, dtype=torch.int32
     )
 
     input_buffers["kv_seqlens"] = torch.ones(max_batches, dtype=torch.int32)
-    input_buffers["q_seqlens"] = torch.arange(
-        1, max_batches + 1, dtype=torch.int32
-    )
-
-    input_buffers["kv_seqlens"] = torch.ones(max_batches, dtype=torch.int32)
+    input_buffers["q_seqlens"] = torch.ones(
+        max_batches, dtype=torch.int32, device=device
+    )
+
+    input_buffers["kv_seqlens"] = torch.ones(
+        max_batches, dtype=torch.int32, device=device
+    )
-    input_buffers["q_seqlens"] = torch.arange(
-        1, max_batches + 1, dtype=torch.int32
-    )
-
-    input_buffers["kv_seqlens"] = torch.ones(max_batches, dtype=torch.int32)
+    input_buffers["q_seqlens"] = torch.ones(
+        max_batches, dtype=torch.int32, device=device
+    )
+
+    input_buffers["kv_seqlens"] = torch.ones(
+        max_batches, dtype=torch.int32, device=device
+    )
@@ -86,6 +88,7 @@ def AscendCudaGraphMixin_fill_buffers_cudagraph(
     """fill cudagraph buffers from forward inputs."""
     block_offsets: Tensor = attn_metadata.block_offsets
     kv_seqlens: Tensor = attn_metadata.kv_seqlens
+    q_seqlens: Tensor = attn_metadata.q_seqlens
     kv_start_indices: Tensor = attn_metadata.kv_start_indices
 
     input_buffers: BuffType = graph_meta.input_buffers
@@ -114,6 +117,7 @@ def AscendCudaGraphMixin_fill_buffers_cudagraph(
 
     attn_metadata.block_offsets = input_buffers["block_offsets"][:new_batch_size]
     attn_metadata.kv_seqlens = input_buffers["kv_seqlens"][:new_batch_size]
+    attn_metadata.q_seqlens = input_buffers["q_seqlens"][:new_batch_size]
     attn_metadata.kv_start_indices = input_buffers["kv_start_indices"][:new_batch_size]
 
     new_inputs = dict(

@@ -260,6 +260,7 @@ def paged_decode_attention(
     value_cache: Tensor,
     block_table: Tensor,
     block_size: int,
+    q_seqlens: Tensor,
     kv_seq_len: Tensor,
     max_kv_seq_len: int,
     num_q_heads: int,
@@ -304,6 +305,7 @@ def paged_decode_attention(
         value_cache,
         block_table,
         block_size,
+        q_seqlens,
         kv_seq_len,
         max_kv_seq_len,
         num_q_heads,

@@ -332,6 +332,7 @@ def paged_decode_attention(
     value_cache: Tensor,
     block_table: Optional[Tensor],
     block_size: int,
+    q_seqlens: Tensor,
     kv_seq_len: Tensor,
     max_kv_seq_len: int,
     num_q_heads: int,
@@ -386,42 +387,31 @@ def paged_decode_attention(
         )
         handle = torch.npu.graph_task_group_end(stream)
         graph_params.handles[num_tokens].append(handle)
-    elif AscendGraphRunner.capturing:
+    else:
         bs, _, dim = query.shape
         block_num = key_cache.size(0)
         query = query.contiguous()
         attn_output = attn_output.contiguous()
-        query = query.view(bs, 1, num_q_heads * dim)
         key_cache = key_cache.view(block_num, block_size, -1)
         value_cache = value_cache.view(block_num, block_size, -1)
         scale_value = softmax_scale if softmax_scale else 1.0 / math.sqrt(dim)
+        softmax_lse = torch.empty(1, dtype=query.dtype, device=query.device)
 
-        attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
+        torch.ops.npu.npu_fused_infer_attention_score.out(
             query=query,
             key=key_cache,
             value=value_cache,
             atten_mask=None,
             block_table=block_table,
-            input_layout="BSH",
+            input_layout="TND",
             block_size=block_size,
-            actual_seq_lengths=None,
+            actual_seq_lengths=q_seqlens,
             actual_seq_lengths_kv=kv_seq_len,
             num_key_value_heads=num_kv_heads,
             num_heads=num_q_heads,
             scale=scale_value,
             sparse_mode=0,
-        )
-    else:
-        torch.ops.atb._npu_paged_attention(
-            query=query,
-            key_cache=key_cache,
-            value_cache=value_cache,
-            num_kv_heads=num_kv_heads,
-            num_heads=num_q_heads,
-            scale_value=scale_value,
-            block_table=block_table,
-            context_lens=kv_seq_len,
-            out=attn_output,
+            out = [attn_output, softmax_lse]
         )
     return attn_output
 

@@ -1,7 +1,7 @@
 # Supported torch versions: 2.3.1, 2.5.1, 2.6.0, 2.7.1, 2.8.0
 # Please install one of the supported versions manually
-torch>=2.3.1,<2.9.0
-torch-npu>=2.3.1,<2.9.0
-torchvision>=0.18.1,<0.24.0
+torch>=2.3.1,<2.10.0
+torch-npu>=2.3.1,<2.10.0
+torchvision>=0.18.1,<0.25.0
 importlib-metadata
 pyyaml