[v0.18.0][Bugfix] fix ds3.2 dcp mtp (#7681)
### What this PR does / why we need it?
Fixes an issue in the ds3.2 scenario where DCP overlapping with MTP was broken.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Cherry-picked from: https://github.com/vllm-project/vllm-ascend/pull/7617

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
```diff
@@ -232,6 +232,7 @@ class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder):
             block_tables=block_table,
             query_start_loc=query_start_loc,
             seq_lens=seq_lens,
             seq_lens_cpu=seq_lens,
             seq_lens_list=seq_lens.tolist(),
             max_query_len=common_attn_metadata.max_query_len,
+            actual_seq_lengths_q=query_start_loc_cpu[1:].tolist(),
```
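For context, `actual_seq_lengths_q` is derived from `query_start_loc_cpu` by dropping its leading zero, which yields the cumulative end offset of each request's query tokens. A minimal sketch with hypothetical values:

```python
import torch

# Hypothetical values: three requests with query lengths 3, 4, and 5.
# query_start_loc_cpu holds cumulative start offsets, with a leading 0.
query_start_loc_cpu = torch.tensor([0, 3, 7, 12])

# Dropping the leading 0 gives each request's cumulative query end offset,
# which is what the metadata builder passes along here.
actual_seq_lengths_q = query_start_loc_cpu[1:].tolist()
print(actual_seq_lengths_q)  # [3, 7, 12]
```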
```diff
@@ -257,8 +257,8 @@ class AscendSFACPImpl(AscendSFAImpl):
         return self._align_to_graph_bucket_tokens(attn_output, attn_metadata)

     def _align_to_graph_bucket_tokens(self, attn_output: torch.Tensor | None, attn_metadata: M) -> torch.Tensor | None:
-        if attn_output is None:
-            return None
+        if attn_output is None or self.pcp_size == 1:
+            return attn_output
         # In graph/piecewise mode, output buffer uses graph bucket token size
         # (forward_context.num_tokens), while PCP path may compute only valid
         # tokens. Align to the larger one to avoid later write-back mismatch.
```
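The comment describes a shape-alignment step: in graph/piecewise mode the output buffer is sized to the graph bucket (`forward_context.num_tokens`), while the PCP path may have computed only the valid tokens. A minimal sketch of that padding step, assuming a `[num_tokens, hidden_size]` output; the helper name and `bucket_tokens` argument are hypothetical:

```python
import torch
import torch.nn.functional as F

def pad_to_bucket(attn_output: torch.Tensor, bucket_tokens: int) -> torch.Tensor:
    """Hypothetical sketch: pad dim 0 of attn_output up to the graph bucket size.

    In the actual code the bucket size would come from
    forward_context.num_tokens; here it is a plain argument.
    """
    valid_tokens = attn_output.shape[0]
    if valid_tokens >= bucket_tokens:
        return attn_output
    # Pad only the token dimension; the padded rows are zeros, so the later
    # write-back of the valid token range sees matching buffer shapes.
    return F.pad(attn_output, (0, 0, 0, bucket_tokens - valid_tokens))

# Usage with hypothetical shapes: 12 valid tokens, graph bucket of 16.
out = pad_to_bucket(torch.randn(12, 128), 16)
assert out.shape == (16, 128)
```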