[long_seq_Feat] support chunk prefill (#4158)

### What this PR does / why we need it?
1. Optimize the Qwen GQA `attention_v1` path.
2. Refactor DeepSeek MLA: all-gather q -> all-gather kv (see the sketch below).
3. Refactor the model runner for chunked prefill and remove unused code.
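For item 2, a minimal sketch of the communication change under context parallelism. This is illustrative only: `cp_group`, `q_local`, and `kv_latent_local` are assumed names, not the actual vLLM-Ascend APIs, and equal shard sizes across ranks are assumed.

```python
import torch
import torch.distributed as dist

def gather_along_seq(x: torch.Tensor, group) -> torch.Tensor:
    """All-gather sequence shards from every context-parallel rank.

    Assumes every rank holds an equally sized shard of the sequence.
    """
    world = dist.get_world_size(group)
    parts = [torch.empty_like(x) for _ in range(world)]
    dist.all_gather(parts, x, group=group)
    return torch.cat(parts, dim=0)  # concatenate on the sequence dim

# Before: each rank gathered the full query set and attended it against
# its local KV shard, so partial attention outputs had to be merged.
# q_full = gather_along_seq(q_local, cp_group)

# After: each rank gathers the KV and attends only its local queries
# against it; for MLA the latent KV is compressed, so this moves less
# data and avoids merging partial results.
# kv_full = gather_along_seq(kv_latent_local, cp_group)
```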

- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: LookAround <lixushi@huawei.com>
Signed-off-by: Delphine-Nic <tanwenqin@huawei.com>
Co-authored-by: Delphine-Nic <tanwenqin@huawei.com>
Author: LookAround0301
Date: 2025-11-14 08:43:37 +08:00
Committed by: GitHub
Parent: 7294f89e43
Commit: 5ec96fd46c

6 changed files with 419 additions and 941 deletions


@@ -20,13 +20,6 @@ class AscendPrefillContextParallelMetadata:
    num_computed_tokens_of_pcp_dcp: Optional[list[list[list[int]]]] = None
    local_chunked_kv_lens: Optional[
        list[Optional[list[Optional[list[Optional[list[int]]]]]]]] = None
    mask_for_non_zero_chunk: Optional[List[bool]] = None
    max_chunk_num: int = 0
    q_head_idx_tensor: torch.Tensor = None
    q_tail_idx_tensor: torch.Tensor = None
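Judging from the accessor in the next hunk, `local_chunked_kv_lens` appears to be nested as `[req][chunk][pcp_rank]` -> per-DCP-rank KV lengths. An illustrative value (the sizes are assumptions, not taken from the code):

```python
# Illustrative only: 1 request, 2 chunks, pcp_size=2, dcp_size=2.
local_chunked_kv_lens = [
    [  # request 0
        [[128, 128], [96, 96]],  # chunk 0: pcp rank 0 / pcp rank 1
        [[64, 64], [32, 32]],    # chunk 1: pcp rank 0 / pcp rank 1
    ],
]
```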
@@ -115,23 +108,6 @@ class AscendCommonAttentionMetadata:
        AscendPrefillContextParallelMetadata] = None
def extract_req_dcp_by_chunk_pcp(lst,
                                 chunk_idx,
                                 dcp_size,
                                 pcp_rank,
                                 fill_value=0):
    """Extract one chunk's per-DCP-rank values for every request.

    `lst` is indexed as [req][chunk][pcp_rank] -> list of dcp_size ints.
    Requests that have no chunk at `chunk_idx` are padded with `fill_value`.
    """
    num_reqs = len(lst)
    results: List[List[int]] = []
    for i in range(num_reqs):
        if len(lst[i]) == 0 or chunk_idx >= len(lst[i]):
            # Empty request or this request has no such chunk: fill with 0.
            results.append([fill_value] * dcp_size)
            continue
        dcp_values = lst[i][chunk_idx][pcp_rank]
        results.append(dcp_values)
    return results
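A minimal usage sketch of the helper above, reusing the illustrative nesting from earlier plus an empty request to show the padding path (values are made up):

```python
local_chunked_kv_lens = [
    [[[128, 128], [96, 96]]],  # request 0: one chunk
    [],                        # request 1: no chunks yet
]
out = extract_req_dcp_by_chunk_pcp(local_chunked_kv_lens,
                                   chunk_idx=0,
                                   dcp_size=2,
                                   pcp_rank=1)
assert out == [[96, 96], [0, 0]]  # request 1 padded with fill_value
```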
def filter_chunked_req_indices(
        seq_len: torch.Tensor,
        mask_for_non_zero_chunk: Optional[List[bool]],