[Feat] support basic pcp&dcp for qwen3next (#6091)

### What this PR does / why we need it? This PR implements Context Parallelism (CP) support for the Qwen3-Next model, including PCP (Prefill Context Parallelism) and DCP (Decode Context Parallelism). - vLLM version: v0.15.0 - vLLM main: f176443446 --------- Signed-off-by: SunnyLee219 <3294305115@qq.com> Signed-off-by: Jingchun Gao <gaojingchun1@huawei.com> Signed-off-by: 白永斌 <baiyongbin3@h-partners.com> Signed-off-by: Bai Yongbin <845473182@qq.com> Co-authored-by: SunnyLee219 <3294305115@qq.com> Co-authored-by: Jingchun Gao <gaojingchun1@huawei.com> Co-authored-by: 白永斌 <baiyongbin3@h-partners.com> Co-authored-by: Mengqing Cao <cmq0113@163.com>
2026-02-28 21:44:08 +08:00
parent 64fba51275
commit 9d09488b4a
16 changed files with 906 additions and 81 deletions
--- a/vllm_ascend/attention/utils.py
+++ b/vllm_ascend/attention/utils.py
@@ -101,6 +101,21 @@ class AscendPrefillContextParallelMetadata:
    # original max_query_len before pcp split
    max_query_len_pcp_full: int = 0

+    # the following attributes are specifically used in hybrid-attn models.
+    pcp_use_hybrid_attn: bool = False
+
+    pcp_unpad_mask: torch.Tensor = None
+
+    # to get the right order of query in prefill per rank
+    pcp_fa_query_idx: torch.Tensor = None
+
+    # restore the full sequence across all pcp ranks
+    # when entering from linear-attention to attention
+    pcp_enter_fa_restore_idx: torch.Tensor = None
+
+    # the number of tokens padded in linear-attn per rank
+    pcp_padded_tokens_fla: int = 0
+

@dataclass
 class AscendCommonAttentionMetadata(CommonAttentionMetadata):