feat(attention_cp): support chunked prefill for Qwen3Next with PCP&DCP (#6900)
### What this PR does / why we need it?
Support chunked prefill for Qwen3Next with PCP (prefill context parallel) and DCP (decode context parallel).
- vLLM version: v0.16.0
- vLLM main: 15d76f74e2
---------
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
```diff
@@ -113,6 +113,10 @@ class AscendPrefillContextParallelMetadata:
     # when entering from linear-attention to attention
     pcp_enter_fa_restore_idx: torch.Tensor = None

     # scatter the full sequence across all pcp ranks
     # when exiting from attention to linear-attention
     pcp_exit_fa_scatter_idx: torch.Tensor = None
+
+    # the number of tokens padded in linear-attn per rank
+    pcp_padded_tokens_fla: int = 0
+
```
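For context, a minimal PyTorch sketch of how index metadata like this is typically consumed at the boundary between the linear-attention and full-attention paths. The helper names, tensor shapes, and call pattern below are illustrative assumptions, not code from this PR; only the metadata field names come from the diff above.

```python
import torch

# Minimal sketch, assuming hidden states of shape [num_tokens, hidden_size]
# and index tensors that hold row permutations. Not the PR's implementation.
def enter_full_attention(hidden: torch.Tensor,
                         restore_idx: torch.Tensor,
                         padded_tokens_fla: int) -> torch.Tensor:
    # Drop the tokens padded per rank for linear-attn, then gather rows
    # back into original sequence order (pcp_enter_fa_restore_idx).
    if padded_tokens_fla > 0:
        hidden = hidden[:-padded_tokens_fla]
    return hidden.index_select(0, restore_idx)

def exit_full_attention(hidden: torch.Tensor,
                        scatter_idx: torch.Tensor) -> torch.Tensor:
    # Scatter the full sequence across all pcp ranks: row i of the input
    # goes to position scatter_idx[i] (pcp_exit_fa_scatter_idx).
    out = torch.empty_like(hidden)
    out[scatter_idx] = hidden
    return out
```

If the index tensors are true permutations, the gather on entry and the scatter on exit are inverses of each other up to the per-rank padding.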