[P/D][Bugfix] Fix MTP metadata-cleanup error with layerwise stacking. (#7036)

### What this PR does / why we need it?
Upstream vLLM added a cleanup mechanism that clears the KV-connector
metadata after the main model finishes running. The MTP draft layer still
needs that metadata, so this change gates the cleanup on a new condition
(metadata is cleared only when no speculative config is set), preventing
it from being cleared before the MTP layer runs.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
By CI.

- vLLM version: v0.16.0
- vLLM main:
4034c3d32e

---------

Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
wangxiaoteng888
2026-03-09 10:55:43 +08:00
committed by GitHub
parent 675387f1fd
commit a3f4f6b10b

View File

@@ -1287,25 +1287,49 @@ class NPUModelRunner(GPUModelRunner):
has_encoder_input = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
# Run forward pass
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
clear_kv_metadata = self.speculative_config is None
if vllm_version_is("0.16.0"):
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
else:
with (
record_function_or_nullcontext("forward"),
set_ascend_forward_context(
attn_metadata,
self.vllm_config,
num_tokens=num_tokens_padded,
num_tokens_across_dp=num_tokens_across_dp,
aclgraph_runtime_mode=cudagraph_mode,
batch_descriptor=batch_desc,
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
model_instance=self.model,
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
skip_compiled=has_encoder_input,
),
self.maybe_get_kv_connector_output(
scheduler_output, clear_metadata=clear_kv_metadata
) as kv_connector_output,
):
hidden_states = self._model_forward(
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
)
with record_function_or_nullcontext("post process"):
aux_hidden_states = None
if self.use_aux_hidden_state_outputs: