[CP&SP] Integrate FIA operator in mla_cp._forward_decode (#5641)
### What this PR does / why we need it?
Replace npu_multi_head_latent_attention with the FIA operator
(npu_fused_infer_attention_score) in mla_cp.py _forward_decode.
Adjust update_mla_attn_dcp_pcp_params in acl_graph.py accordingly.
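
For reference, a minimal sketch of the new decode-side call shape, following the argument names in the diff below (shapes, metadata plumbing, and workspace management are omitted; it is assumed here that the compressed latent KV cache serves as both key and value, and the wrapper function itself is illustrative, not part of the change):

```python
# Sketch only: how the FIA operator is driven at decode time after this change.
# Argument names follow the diff below; the wrapper function is illustrative.
import torch_npu


def fia_decode_sketch(q_nope, q_pe, k_nope, k_pe, block_table, block_size,
                      num_heads, num_kv_heads, scale, input_layout,
                      spec_attn_mask, sparse_mode,
                      actual_seq_lengths, actual_seq_lengths_kv,
                      attn_output, softmax_lse, workspace=None):
    # q_nope / k_nope carry the latent (non-RoPE) parts, q_pe / k_pe the RoPE parts.
    # The compressed latent cache (k_nope) is assumed to be passed as both key and value.
    torch_npu.npu_fused_infer_attention_score.out(
        q_nope,
        k_nope,
        k_nope,
        query_rope=q_pe,
        key_rope=k_pe,
        num_heads=num_heads,
        num_key_value_heads=num_kv_heads,
        input_layout=input_layout,
        atten_mask=spec_attn_mask,
        sparse_mode=sparse_mode,
        scale=scale,
        antiquant_mode=0,
        antiquant_scale=None,
        softmax_lse_flag=True,
        block_table=block_table,
        block_size=block_size,
        actual_seq_lengths_kv=actual_seq_lengths_kv,
        actual_seq_lengths=actual_seq_lengths,
        workspace=workspace,
        out=[attn_output, softmax_lse],  # preallocated output and LSE buffers
    )
    return attn_output, softmax_lse
```

At graph replay time, the update hook in acl_graph.py (see the diff below) re-issues this call between graph_task_update_begin/end so the captured kernel picks up fresh block tables and sequence lengths.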
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef
---------
Signed-off-by: 白永斌 <baiyongbin3@h-partners.com>
Signed-off-by: Bai Yongbin <845473182@qq.com>
Signed-off-by: tongyuzhou <t00886357@china.huawei.com>
Co-authored-by: 白永斌 <baiyongbin3@h-partners.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: tongyuzhou <t00886357@china.huawei.com>
@@ -468,45 +468,56 @@ def update_mla_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape
         ):
             (
                 q_nope,
-                q_pe,
                 k_nope,
+                q_pe,
                 k_pe,
-                block_table,
-                seq_len,
                 num_heads,
-                scale,
                 num_kv_heads,
+                input_layout,
+                spec_attn_mask,
+                sparse_mode,
+                scale,
+                block_table,
+                block_size,
+                actual_seq_lengths,
+                actual_seq_lengths_kv,
                 attn_output,
                 softmax_lse,
             ) = param

             decode_meta = forward_context.attn_metadata[key].decode
             seq_len = decode_meta.cp_seq_len
+            if isinstance(seq_len, torch.Tensor):
+                seq_len = seq_len.tolist()
+            actual_seq_lengths_kv = seq_len

             # For pcp + spec decode, we flatten seq_lens
             # to avoid irregular attn_mask shape,
             # so there's no need to divide runtime_shape by spec_multiple
-            pad_length = runtime_shape - len(seq_len)
-            pad_tensor = torch.zeros(pad_length, dtype=seq_len.dtype, device=seq_len.device)
-            seq_len = torch.cat([seq_len, pad_tensor], dim=0)
+            pad_length = runtime_shape - len(actual_seq_lengths_kv)
+            if pad_length > 0:
+                actual_seq_lengths_kv = actual_seq_lengths_kv + [0] * (runtime_shape - len(actual_seq_lengths_kv))

             torch.npu.graph_task_update_begin(update_stream, handle)

-            torch_npu.atb.npu_multi_head_latent_attention(
+            torch_npu.npu_fused_infer_attention_score.out(
                 q_nope,
-                q_pe,
                 k_nope,
-                k_pe,
-                block_table,
-                seq_len,
-                num_heads,
-                scale,
-                num_kv_heads,
-                return_lse=True,
-                calc_type="calc_type_ring",
+                k_nope,
+                query_rope=q_pe,
+                key_rope=k_pe,
+                num_heads=num_heads,
+                num_key_value_heads=num_kv_heads,
+                input_layout=input_layout,
+                atten_mask=spec_attn_mask,
+                sparse_mode=sparse_mode,
+                scale=scale,
+                antiquant_mode=0,
+                antiquant_scale=None,
+                softmax_lse_flag=True,
+                block_table=block_table,
+                block_size=block_size,
+                actual_seq_lengths_kv=actual_seq_lengths_kv,
+                actual_seq_lengths=actual_seq_lengths,
                 workspace=graph_params.workspaces.get(runtime_shape),
-                output=attn_output,
-                lse=softmax_lse,
+                out=[attn_output, softmax_lse],
             )
             torch.npu.graph_task_update_end(update_stream)
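
A note on the padded sequence lengths in the hunk above: the ACL graph is captured for a fixed runtime_shape, so the per-request KV lengths must be padded to that slot count on every update. A rough standalone illustration of the list-based padding used there (the helper name is illustrative only):

```python
# Illustrative only: pad a per-request KV length list to the captured graph's
# slot count (runtime_shape), mirroring the padding logic in the diff above.
from typing import List

import torch


def pad_seq_lengths(cp_seq_len, runtime_shape: int) -> List[int]:
    # cp_seq_len may arrive as a tensor from the decode metadata; convert to a list.
    if isinstance(cp_seq_len, torch.Tensor):
        cp_seq_len = cp_seq_len.tolist()
    pad_length = runtime_shape - len(cp_seq_len)
    if pad_length > 0:
        # Unused graph slots are padded with a KV length of 0.
        cp_seq_len = cp_seq_len + [0] * pad_length
    return cp_seq_len


# Example: a graph captured for 8 decode slots, with only 3 live requests.
assert pad_seq_lengths([17, 5, 42], 8) == [17, 5, 42, 0, 0, 0, 0, 0]
```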