Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
env:
CI_PATH: "${HOME}/GitHub/${{ github.repository }}/${GITHUB_RUN_NUMBER}"
LMDEPLOY_PATH: "${HOME}/GitHub/lmdeploy"
LMDEPLOY_COMMIT_OR_BRANCH: 'main'
LMDEPLOY_COMMIT_OR_BRANCH: 'pa-tnd'
REPORT_DIR: "${HOME}/GitHub/ci_log/test_reports"
TEST_LMDEPLOY_E2E_LOG_PATH: "${HOME}/Github/ci_log/logs"
TEST_LMDEPLOY_E2E_MODEL_PATH: "${HOME}/Github/model"
Expand Down Expand Up @@ -74,7 +74,7 @@ jobs:
- name: Clone lmdeploy
run: |
set -ex
git clone https://github.com/InternLM/lmdeploy.git ${{ env.LMDEPLOY_PATH }}
git clone https://github.com/jinminxi104/lmdeploy.git ${{ env.LMDEPLOY_PATH }}
cd ${{ env.LMDEPLOY_PATH }} && git checkout ${{ env.LMDEPLOY_COMMIT_OR_BRANCH }}
# git apply ${{env.CI_PATH }}/.github/ci/fix-exit-multi-npu.patch

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ def AscendCudaGraphMixin_make_buffers_cudagraph(
(max_batches, num_blocks), dtype=torch.int32, device=device
)

input_buffers["q_seqlens"] = torch.ones(
max_batches, dtype=torch.int32, device=device
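# q_seqlens is passed to the fused attention op as actual_seq_lengths; with the
# TND layout these are presumably cumulative query lengths, hence 1, 2, ...,
# max_batches for one token per request (confirm against the op's documentation).
# This buffer is only used during graph capture in the decoding phase.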
input_buffers["q_seqlens"] = torch.arange(
1, max_batches + 1, dtype=torch.int32
Comment thread
jinminxi104 marked this conversation as resolved.
)

input_buffers["kv_seqlens"] = torch.ones(max_batches, dtype=torch.int32)
Comment on lines +62 to 66
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potentially incorrect initialization values: The q_seqlens buffer is initialized with torch.arange(1, max_batches + 1), creating values [1, 2, 3, ..., max_batches]. However, according to the comment, this represents "cu_attn actual_seq_lengths" for the decoding phase. In decoding, each request typically processes exactly 1 token, so all values should be 1 (similar to how camb_cudagraph.py initializes it with torch.ones). The incrementing sequence suggests this might be confusing q_seqlens with cumulative sequence lengths (which is what q_start_loc represents). Verify that this initialization is correct for the TND layout and the actual_seq_lengths parameter of npu_fused_infer_attention_score.

Suggested change
input_buffers["q_seqlens"] = torch.arange(
1, max_batches + 1, dtype=torch.int32
)
input_buffers["kv_seqlens"] = torch.ones(max_batches, dtype=torch.int32)
input_buffers["q_seqlens"] = torch.ones(
max_batches, dtype=torch.int32, device=device
)
input_buffers["kv_seqlens"] = torch.ones(
max_batches, dtype=torch.int32, device=device
)

Copilot uses AI. Check for mistakes.
Expand Down Expand Up @@ -86,6 +88,7 @@ def AscendCudaGraphMixin_fill_buffers_cudagraph(
"""fill cudagraph buffers from forward inputs."""
block_offsets: Tensor = attn_metadata.block_offsets
kv_seqlens: Tensor = attn_metadata.kv_seqlens
q_seqlens: Tensor = attn_metadata.q_seqlens
kv_start_indices: Tensor = attn_metadata.kv_start_indices

input_buffers: BuffType = graph_meta.input_buffers
Expand Down Expand Up @@ -114,6 +117,7 @@ def AscendCudaGraphMixin_fill_buffers_cudagraph(

attn_metadata.block_offsets = input_buffers["block_offsets"][:new_batch_size]
attn_metadata.kv_seqlens = input_buffers["kv_seqlens"][:new_batch_size]
attn_metadata.q_seqlens = input_buffers["q_seqlens"][:new_batch_size]
attn_metadata.kv_start_indices = input_buffers["kv_start_indices"][:new_batch_size]

new_inputs = dict(
Expand Down
2 changes: 2 additions & 0 deletions dlinfer/ops/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ def paged_decode_attention(
value_cache: Tensor,
block_table: Tensor,
block_size: int,
q_seqlens: Tensor,
kv_seq_len: Tensor,
max_kv_seq_len: int,
num_q_heads: int,
Expand Down Expand Up @@ -304,6 +305,7 @@ def paged_decode_attention(
value_cache,
block_table,
block_size,
q_seqlens,
kv_seq_len,
max_kv_seq_len,
num_q_heads,
Expand Down
24 changes: 7 additions & 17 deletions dlinfer/vendor/ascend/torch_npu_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ def paged_decode_attention(
value_cache: Tensor,
block_table: Optional[Tensor],
block_size: int,
q_seqlens: Tensor,
kv_seq_len: Tensor,
max_kv_seq_len: int,
num_q_heads: int,
Expand Down Expand Up @@ -386,42 +387,31 @@ def paged_decode_attention(
)
handle = torch.npu.graph_task_group_end(stream)
graph_params.handles[num_tokens].append(handle)
elif AscendGraphRunner.capturing:
else:
bs, _, dim = query.shape
block_num = key_cache.size(0)
query = query.contiguous()
attn_output = attn_output.contiguous()
query = query.view(bs, 1, num_q_heads * dim)
key_cache = key_cache.view(block_num, block_size, -1)
value_cache = value_cache.view(block_num, block_size, -1)
scale_value = softmax_scale if softmax_scale else 1.0 / math.sqrt(dim)
softmax_lse = torch.empty(1, dtype=query.dtype, device=query.device)

attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score(
torch.ops.npu.npu_fused_infer_attention_score.out(
query=query,
key=key_cache,
value=value_cache,
atten_mask=None,
block_table=block_table,
input_layout="BSH",
input_layout="TND",
block_size=block_size,
actual_seq_lengths=None,
actual_seq_lengths=q_seqlens,
actual_seq_lengths_kv=kv_seq_len,
num_key_value_heads=num_kv_heads,
num_heads=num_q_heads,
scale=scale_value,
sparse_mode=0,
)
else:
torch.ops.atb._npu_paged_attention(
query=query,
key_cache=key_cache,
value_cache=value_cache,
num_kv_heads=num_kv_heads,
num_heads=num_q_heads,
scale_value=scale_value,
block_table=block_table,
context_lens=kv_seq_len,
out=attn_output,
out = [attn_output, softmax_lse]
)
return attn_output

Expand Down
6 changes: 3 additions & 3 deletions requirements/ascend/torch.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Supported torch versions: 2.3.1, 2.5.1, 2.6.0, 2.7.1, 2.8.0
# Please install one of the supported versions manually
torch>=2.3.1,<2.9.0
torch-npu>=2.3.1,<2.9.0
torchvision>=0.18.1,<0.24.0
torch>=2.3.1,<2.10.0
torch-npu>=2.3.1,<2.10.0
torchvision>=0.18.1,<0.25.0
importlib-metadata
pyyaml
Loading