[CP&SP] Integrate FIA operator in mla_cp._forward_decode (#5641)
### What this PR does / why we need it?
Replace npu_multi_head_latent_attention with the FIA operator
(npu_fused_infer_attention_score) in mla_cp.py _forward_decode.
Adjust update_mla_attn_dcp_pcp_params in acl_graph.py accordingly.
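
For reference, a minimal sketch of the new decode-side call shape, following the argument names in the diff below (shapes, metadata plumbing, and workspace management are omitted; it is assumed here that the compressed latent KV cache serves as both key and value, and the wrapper function itself is illustrative, not part of the change):

```python
# Sketch only: how the FIA operator is driven at decode time after this change.
# Argument names follow the diff below; the wrapper function is illustrative.
import torch_npu


def fia_decode_sketch(q_nope, q_pe, k_nope, k_pe, block_table, block_size,
                      num_heads, num_kv_heads, scale, input_layout,
                      spec_attn_mask, sparse_mode,
                      actual_seq_lengths, actual_seq_lengths_kv,
                      attn_output, softmax_lse, workspace=None):
    # q_nope / k_nope carry the latent (non-RoPE) parts, q_pe / k_pe the RoPE parts.
    # The compressed latent cache (k_nope) is assumed to be passed as both key and value.
    torch_npu.npu_fused_infer_attention_score.out(
        q_nope,
        k_nope,
        k_nope,
        query_rope=q_pe,
        key_rope=k_pe,
        num_heads=num_heads,
        num_key_value_heads=num_kv_heads,
        input_layout=input_layout,
        atten_mask=spec_attn_mask,
        sparse_mode=sparse_mode,
        scale=scale,
        antiquant_mode=0,
        antiquant_scale=None,
        softmax_lse_flag=True,
        block_table=block_table,
        block_size=block_size,
        actual_seq_lengths_kv=actual_seq_lengths_kv,
        actual_seq_lengths=actual_seq_lengths,
        workspace=workspace,
        out=[attn_output, softmax_lse],  # preallocated output and LSE buffers
    )
    return attn_output, softmax_lse
```

At graph replay time, the update hook in acl_graph.py (see the diff below) re-issues this call between graph_task_update_begin/end so the captured kernel picks up fresh block tables and sequence lengths.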
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef
---------
Signed-off-by: 白永斌 <baiyongbin3@h-partners.com>
Signed-off-by: Bai Yongbin <845473182@qq.com>
Signed-off-by: tongyuzhou <t00886357@china.huawei.com>
Co-authored-by: 白永斌 <baiyongbin3@h-partners.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: tongyuzhou <t00886357@china.huawei.com>
@@ -468,45 +468,56 @@ def update_mla_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape
         ):
             (
                 q_nope,
-                q_pe,
                 k_nope,
+                q_pe,
                 k_pe,
-                block_table,
-                seq_len,
                 num_heads,
-                scale,
                 num_kv_heads,
+                input_layout,
+                spec_attn_mask,
+                sparse_mode,
+                scale,
+                block_table,
+                block_size,
+                actual_seq_lengths,
+                actual_seq_lengths_kv,
                 attn_output,
                 softmax_lse,
             ) = param

             decode_meta = forward_context.attn_metadata[key].decode
             seq_len = decode_meta.cp_seq_len
+            if isinstance(seq_len, torch.Tensor):
+                seq_len = seq_len.tolist()
+            actual_seq_lengths_kv = seq_len

             # For pcp + spec decode, we flatten seq_lens
             # to avoid irregular attn_mask shape,
             # so there's no need to divide runtime_shape by spec_multiple
-            pad_length = runtime_shape - len(seq_len)
-            pad_tensor = torch.zeros(pad_length, dtype=seq_len.dtype, device=seq_len.device)
-            seq_len = torch.cat([seq_len, pad_tensor], dim=0)
+            pad_length = runtime_shape - len(actual_seq_lengths_kv)
+            if pad_length > 0:
+                actual_seq_lengths_kv = actual_seq_lengths_kv + [0] * (runtime_shape - len(actual_seq_lengths_kv))

             torch.npu.graph_task_update_begin(update_stream, handle)

-            torch_npu.atb.npu_multi_head_latent_attention(
+            torch_npu.npu_fused_infer_attention_score.out(
                 q_nope,
-                q_pe,
                 k_nope,
-                k_pe,
-                block_table,
-                seq_len,
-                num_heads,
-                scale,
-                num_kv_heads,
-                return_lse=True,
-                calc_type="calc_type_ring",
+                k_nope,
+                query_rope=q_pe,
+                key_rope=k_pe,
+                num_heads=num_heads,
+                num_key_value_heads=num_kv_heads,
+                input_layout=input_layout,
+                atten_mask=spec_attn_mask,
+                sparse_mode=sparse_mode,
+                scale=scale,
+                antiquant_mode=0,
+                antiquant_scale=None,
+                softmax_lse_flag=True,
+                block_table=block_table,
+                block_size=block_size,
+                actual_seq_lengths_kv=actual_seq_lengths_kv,
+                actual_seq_lengths=actual_seq_lengths,
                 workspace=graph_params.workspaces.get(runtime_shape),
-                output=attn_output,
-                lse=softmax_lse,
+                out=[attn_output, softmax_lse],
             )
             torch.npu.graph_task_update_end(update_stream)
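
A note on the padded sequence lengths in the hunk above: the ACL graph is captured for a fixed runtime_shape, so the per-request KV lengths must be padded to that slot count on every update. A rough standalone illustration of the list-based padding used there (the helper name is illustrative only):

```python
# Illustrative only: pad a per-request KV length list to the captured graph's
# slot count (runtime_shape), mirroring the padding logic in the diff above.
from typing import List

import torch


def pad_seq_lengths(cp_seq_len, runtime_shape: int) -> List[int]:
    # cp_seq_len may arrive as a tensor from the decode metadata; convert to a list.
    if isinstance(cp_seq_len, torch.Tensor):
        cp_seq_len = cp_seq_len.tolist()
    pad_length = runtime_shape - len(cp_seq_len)
    if pad_length > 0:
        # Unused graph slots are padded with a KV length of 0.
        cp_seq_len = cp_seq_len + [0] * pad_length
    return cp_seq_len


# Example: a graph captured for 8 decode slots, with only 3 live requests.
assert pad_seq_lengths([17, 5, 42], 8) == [17, 5, 42, 0, 0, 0, 0, 0]
```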