[long_seq_Feat] support chunk prefill (#4158)
### What this PR does / why we need it?
1. Qwen GQA attention_v1 optimization
2. DeepSeek MLA refactor: all-gather q -> all-gather kv
3. Model-runner refactor for chunked prefill; removed some unused code
- vLLM version: v0.11.0
- vLLM main:
2918c1b49c
---------
Signed-off-by: LookAround <lixushi@huawei.com>
Signed-off-by: Delphine-Nic <tanwenqin@huawei.com>
Co-authored-by: Delphine-Nic <tanwenqin@huawei.com>
This commit is contained in:
@@ -20,13 +20,6 @@ class AscendPrefillContextParallelMetadata:
|
||||
|
||||
num_computed_tokens_of_pcp_dcp: Optional[list[list[list[int]]]] = None
|
||||
|
||||
local_chunked_kv_lens: Optional[list[Optional[list[Optional[list[Optional[
|
||||
list[int]]]]]]]] = None
|
||||
|
||||
mask_for_non_zero_chunk: Optional[List[bool]] = None
|
||||
|
||||
max_chunk_num: int = 0
|
||||
|
||||
q_head_idx_tensor: torch.Tensor = None
|
||||
|
||||
q_tail_idx_tensor: torch.Tensor = None
|
||||
@@ -115,23 +108,6 @@ class AscendCommonAttentionMetadata:
|
||||
AscendPrefillContextParallelMetadata] = None
|
||||
|
||||
|
||||
def extract_req_dcp_by_chunk_pcp(lst,
                                 chunk_idx,
                                 dcp_size,
                                 pcp_rank,
                                 fill_value=0):
    """Extract, for every request, one chunk's per-DCP-rank value list.

    Args:
        lst: Per-request nested structure indexed as
            ``lst[req][chunk][pcp_rank]`` -> list of per-DCP-rank ints.
        chunk_idx: Index of the chunk to extract from each request.
        dcp_size: Number of DCP ranks; sizes the filler row for requests
            that have no such chunk.
        pcp_rank: PCP rank whose value list is selected within the chunk.
        fill_value: Padding value for requests without the requested chunk.

    Returns:
        One entry per request: either ``lst[req][chunk_idx][pcp_rank]``,
        or a filler row of ``dcp_size`` copies of ``fill_value`` when the
        request is empty or has no chunk at ``chunk_idx``.
    """
    results: List[List[int]] = []
    # Iterate the requests directly rather than indexing via range(len(lst)).
    for req_chunks in lst:
        # Keep both guards: the explicit emptiness check also shields a
        # negative chunk_idx from indexing into an empty request.
        if not req_chunks or chunk_idx >= len(req_chunks):
            # Empty request, or no corresponding chunk: pad so every row
            # still carries dcp_size entries.
            results.append([fill_value] * dcp_size)
        else:
            results.append(req_chunks[chunk_idx][pcp_rank])
    return results
|
||||
|
||||
|
||||
def filter_chunked_req_indices(
|
||||
seq_len: torch.Tensor,
|
||||
mask_for_non_zero_chunk: Optional[List[bool]],
|
||||
|
||||
Reference in New Issue
Block a user