revert TND modify when dcp pcp (#3948)

### What this PR does / why we need it?
1. Revert the TND modification for the dcp/pcp case, which was introduced by f57bdb09fc
2. Handle the aclgraph padding border issue (a minimal sketch follows below)
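
To make the padding issue concrete, here is a minimal, self-contained sketch (not the vLLM-Ascend implementation; all names, shapes, and values are illustrative) of why each rank's hidden states must be truncated to its share of `num_actual_tokens_pcp_padded` before the PCP all-gather when aclgraph pads the batch:

```python
# Illustrative sketch only: shows the shape bookkeeping, not the real runner.
import torch

pcp_size = 2                      # number of prefill-context-parallel ranks
num_actual_tokens_pcp_padded = 6  # total scheduled tokens across pcp ranks
graph_pad_size = 8                # aclgraph pads each rank up to a captured size

per_rank_actual = num_actual_tokens_pcp_padded // pcp_size   # 3 real tokens
hidden_states = torch.randn(graph_pad_size, 16)               # 8 rows, 5 are pad

# Truncate to the real tokens before gathering; otherwise the gathered
# tensor would interleave pad rows and the restore index would be wrong.
to_gather = hidden_states[:per_rank_actual]

# Stand-in for get_pcp_group().all_gather(to_gather, 0): concatenate what
# every rank would contribute along dim 0.
gathered = torch.cat([to_gather, to_gather], dim=0)           # (6, 16)

# Restore original token order with a precomputed index, as the runner does.
restore_idx = torch.arange(gathered.shape[0])
restored = torch.index_select(gathered, 0, restore_idx)
assert restored.shape[0] == num_actual_tokens_pcp_padded
```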

- vLLM version: v0.11.0
- vLLM main: 83f478bb19

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Author: weiguihua2
Date: 2025-11-03 22:22:17 +08:00
Committed by: GitHub
Parent: cc2cd42ad3
Commit: 5453033a41
3 changed files with 27 additions and 17 deletions


@@ -476,6 +476,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         self.pcp_padded_slot_mapping = torch.zeros(self.max_num_tokens,
                                                    dtype=torch.int32,
                                                    device=self.device)
+        self.num_actual_tokens_pcp_padded = 0
         if self.speculative_config and self.pcp_size > 1:
             self.input_ids_pcp_full = torch.zeros(self.max_num_tokens,
                                                   dtype=torch.int32,
@@ -1915,7 +1916,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
             hidden_states = hidden_states[:-pad_size, :]
         if self.pcp_size > 1:
-            hidden_states = get_pcp_group().all_gather(hidden_states, 0)
+            hidden_states = get_pcp_group().all_gather(
+                hidden_states[:self.num_actual_tokens_pcp_padded //
+                              self.pcp_size], 0)
             hidden_states = torch.index_select(
                 hidden_states, 0,
                 self.pcp_allgather_restore_idx[:hidden_states.shape[0]])
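
For context, a hedged sketch of how a restore index can reorder the gathered, rank-major hidden states back into the original token order; the `restore_idx` values and the token-to-rank assignment below are purely illustrative, not the actual contents of `pcp_allgather_restore_idx`:

```python
# Assumed semantics only: reorder a rank-major all-gather result.
import torch

pcp_size = 2
tokens_per_rank = 3
# Suppose rank 0 held tokens [0, 2, 4] and rank 1 held tokens [1, 3, 5].
gathered = torch.tensor([[0.], [2.], [4.], [1.], [3.], [5.]])  # rank-major order

# A restore index built so that row i of the output is token i of the
# original sequence.
restore_idx = torch.tensor([0, 3, 1, 4, 2, 5])
restored = torch.index_select(gathered, 0, restore_idx)
print(restored.squeeze(1).tolist())  # [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
```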
@@ -4304,6 +4307,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         num_decodes = sum(self.input_batch.num_computed_tokens_cpu[:num_reqs]
                           >= self.input_batch.num_prompt_tokens[:num_reqs])
         num_actual_tokens_pcp_padded = total_num_scheduled_tokens * self.pcp_size
+        self.num_actual_tokens_pcp_padded = num_actual_tokens_pcp_padded
         long_seq_metadata = None
         if self.pcp_size * self.dcp_size > 1:
             long_seq_metadata = AscendPrefillContextParallelMetadata(
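
A compact, hypothetical standalone sketch of the bookkeeping these hunks add: the padded token count is initialized in `__init__`, recorded when the batch is prepared, and consumed when slicing each rank's share for the all-gather. The class and method names below are illustrative, not the runner's actual API:

```python
# Illustrative standalone version of the attribute lifecycle in this change.
class PaddedTokenBookkeeping:
    def __init__(self, pcp_size: int):
        self.pcp_size = pcp_size
        # First hunk: default so the attribute exists before the first batch.
        self.num_actual_tokens_pcp_padded = 0

    def prepare_inputs(self, total_num_scheduled_tokens: int) -> None:
        # Third hunk: record the full padded token count for this step.
        self.num_actual_tokens_pcp_padded = (
            total_num_scheduled_tokens * self.pcp_size)

    def tokens_to_gather_per_rank(self) -> int:
        # Second hunk: each rank contributes only its share to the all-gather.
        return self.num_actual_tokens_pcp_padded // self.pcp_size


bk = PaddedTokenBookkeeping(pcp_size=2)
bk.prepare_inputs(total_num_scheduled_tokens=5)
assert bk.tokens_to_gather_per_rank() == 5
```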