support cp&dcp (#3260)
### What this PR does / why we need it?
This PR adds the Prefill Context Parallelism (PCP) feature, the prefill-stage counterpart of Decode Context Parallelism (DCP). For implementation details, see the RFC: https://github.com/vllm-project/vllm/issues/25749.

TL;DR: PCP improves long-sequence inference by partitioning the sequence dimension across devices during the prefill stage (sketched below).

### Does this PR introduce _any_ user-facing change?
The current implementation primarily includes the following changes:
- Modified ModelRunner.py to add the CP partitioning logic for tokens.
- Modified attention_v1.py and mla_v1.py to adapt the GQA/MLA attention backends to PCP.
- Modified block_tables.py to extend KV-cache storage for DCP & PCP.
- Added the command-line arguments needed to control the PCP parallelism degree.

### How was this patch tested?
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: LookAround <lixushi@huawei.com>
Signed-off-by: chenjie <chenjie137@huawei.com>
Signed-off-by: Delphine-Nic <tanwenqin@huawei.com>
Signed-off-by: zhangsicheng5 <zhangsicheng5@huawei.com>
Signed-off-by: Feng Liu <liufeng248@huawei.com>
Signed-off-by: gaojc <1055866782@qq.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Signed-off-by: z50049692 <zhangmingwei11@huawei.com>

Co-authored-by: chenjie <chenjie137@huawei.com>
Co-authored-by: Delphine-Nic <tanwenqin@huawei.com>
Co-authored-by: zhangsicheng5 <zhangsicheng5@huawei.com>
Co-authored-by: Feng Liu <liufeng248@huawei.com>
Co-authored-by: gaojc <1055866782@qq.com>
Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
Co-authored-by: z50049692 <zhangmingwei11@huawei.com>
Co-authored-by: w00896881 <wangzixuan40@huawei.com>
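As a rough illustration of the sequence partitioning mentioned above, here is a minimal sketch (not code from this patch; the helper name and layout are hypothetical) of the head/tail split that the `q_head_idx_tensor`/`q_tail_idx_tensor` fields in the diff suggest: each PCP rank takes one chunk from the front of the sequence and one from the back, so that causal-attention work stays balanced across ranks.

```python
import torch


def split_sequence_for_pcp(num_tokens: int,
                           pcp_size: int) -> list[torch.Tensor]:
    """Hypothetical head/tail split of one prefill sequence.

    The sequence is cut into 2 * pcp_size chunks; rank r gets chunk r
    (head) plus chunk 2 * pcp_size - 1 - r (tail).
    """
    chunks = torch.arange(num_tokens).chunk(2 * pcp_size)
    return [
        torch.cat([chunks[r], chunks[2 * pcp_size - 1 - r]])
        for r in range(pcp_size)
    ]


# 16 tokens over 2 PCP ranks:
#   rank 0 -> tokens [0..3] and [12..15]
#   rank 1 -> tokens [4..7] and [8..11]
for rank, idx in enumerate(split_sequence_for_pcp(16, 2)):
    print(f"rank {rank}: {idx.tolist()}")
```

Pairing chunk `r` with chunk `2 * pcp_size - 1 - r` is the usual load-balancing trick for causal attention: without it, the rank holding the last contiguous shard would attend to nearly the full KV while rank 0 attends to almost none.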
```diff
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, List
+from typing import Any, List, Optional

 import torch
 import torch.nn.functional as F
@@ -9,6 +9,39 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
 from vllm.forward_context import ForwardContext, get_forward_context


+@dataclass
+# class AscendCommonLongSequenceMetadata:
+class AscendPrefillContextParallelMetadata:
+    pcp_allgather_restore_idx: torch.Tensor = None
+
+    num_actual_tokens_pcp_padded: Optional[int] = None
+
+    num_computed_tokens_of_pcp_dcp: Optional[list[Optional[list[Optional[
+        list[int]]]]]] = None
+
+    q_head_idx_tensor: torch.Tensor = None
+
+    q_tail_idx_tensor: torch.Tensor = None
+
+    kv_with_q_head_nomask_idx_tensor: torch.Tensor = None
+
+    kv_with_q_head_mask_idx_tensor: torch.Tensor = None
+
+    kv_with_q_tail_nomask_idx_tensor: torch.Tensor = None
+
+    kv_with_q_tail_mask_idx_tensor: torch.Tensor = None
+
+    attn_mask_seqlens: torch.Tensor = None
+
+    head_attn_nomask_seqlens: torch.Tensor = None
+
+    tail_attn_nomask_seqlens: torch.Tensor = None
+
+    q_full_idx: torch.Tensor = None
+
+    pcp_prefill_mask: torch.Tensor = None
+
+
 @dataclass
 class AscendCommonAttentionMetadata:
     """
```
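The `kv_with_q_head/tail_*_idx_tensor` fields above suggest how each rank's attention is assembled under that head/tail split: a query chunk attends to KV from strictly earlier chunks with no mask, and to KV from its own chunk with a causal mask. A sketch of deriving such index sets under that assumption (hypothetical helper, not this patch's code):

```python
import torch


def kv_index_sets(num_tokens: int, pcp_size: int, rank: int):
    """Hypothetical nomask/mask KV index sets for one rank's query chunks.

    Head chunk = chunk `rank`; tail chunk = chunk `2 * pcp_size - 1 - rank`.
    KV from strictly earlier chunks is fully visible ("nomask"); KV from the
    query chunk itself needs a causal mask ("mask").
    """
    chunks = torch.arange(num_tokens).chunk(2 * pcp_size)
    head, tail = rank, 2 * pcp_size - 1 - rank
    empty = torch.empty(0, dtype=torch.long)
    kv_with_q_head_nomask = torch.cat(chunks[:head]) if head else empty
    kv_with_q_head_mask = chunks[head]
    kv_with_q_tail_nomask = torch.cat(chunks[:tail])  # tail >= pcp_size >= 1
    kv_with_q_tail_mask = chunks[tail]
    return (kv_with_q_head_nomask, kv_with_q_head_mask,
            kv_with_q_tail_nomask, kv_with_q_tail_mask)
```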
```diff
@@ -72,6 +105,9 @@ class AscendCommonAttentionMetadata:
     cos: torch.Tensor = None
     sin: torch.Tensor = None

+    prefill_context_parallel_metadata: Optional[
+        AscendPrefillContextParallelMetadata] = None
+

 def split_decodes_and_prefills(
     common_attn_metadata: AscendCommonAttentionMetadata,
```
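The hunks above only declare the metadata. As a guess at how a field like `pcp_allgather_restore_idx` would be consumed (hypothetical helper, not code from this patch): after per-rank prefill attention, the shard outputs are all-gathered in rank order, which permutes token positions, and a precomputed restore index puts them back into the original sequence order.

```python
import torch


def restore_after_allgather(gathered: torch.Tensor,
                            restore_idx: torch.Tensor) -> torch.Tensor:
    """Reorder all-gathered shard outputs back to the original token order.

    `gathered` concatenates the per-rank shards in rank order; restore_idx[i]
    is assumed to be the row of `gathered` that belongs at position i.
    """
    return gathered.index_select(0, restore_idx)


# Toy check with the 16-token / 2-rank split sketched earlier: after the
# all-gather, rows arrive in the order [0..3, 12..15, 4..7, 8..11].
perm = torch.tensor([0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11])
restore_idx = torch.argsort(perm)
hidden = torch.arange(16.0).unsqueeze(-1)  # token i carries the value i
assert torch.equal(restore_after_allgather(hidden[perm], restore_idx), hidden)
```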