[P/D][Bugfix] Layerwise stacking MTP error. (#7036)

### What this PR does / why we need it? The community has added a mechanism that clears the KV connector metadata after the main model finishes running. When MTP is used, the metadata should not be cleared at that point, so this PR adds a condition (`clear_kv_metadata = self.speculative_config is None`) that is passed to `maybe_get_kv_connector_output` and skips the cleanup whenever a speculative config is set. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By CI. - vLLM version: v0.16.0 - vLLM main: 4034c3d32e --------- Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
2026-03-09 10:55:43 +08:00
parent 675387f1fd
commit a3f4f6b10b
1 changed files with 43 additions and 19 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1287,6 +1287,8 @@ class NPUModelRunner(GPUModelRunner):
        has_encoder_input = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
        # Run forward pass
        clear_kv_metadata = self.speculative_config is None
        if vllm_version_is("0.16.0"):
            with (
                record_function_or_nullcontext("forward"),
                set_ascend_forward_context(
@@ -1306,6 +1308,28 @@ class NPUModelRunner(GPUModelRunner):
                hidden_states = self._model_forward(
                    num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
                )
        else:
            with (
                record_function_or_nullcontext("forward"),
                set_ascend_forward_context(
                    attn_metadata,
                    self.vllm_config,
                    num_tokens=num_tokens_padded,
                    num_tokens_across_dp=num_tokens_across_dp,
                    aclgraph_runtime_mode=cudagraph_mode,
                    batch_descriptor=batch_desc,
                    num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
                    model_instance=self.model,
                    max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
                    skip_compiled=has_encoder_input,
                ),
                self.maybe_get_kv_connector_output(
                    scheduler_output, clear_metadata=clear_kv_metadata
                ) as kv_connector_output,
            ):
                hidden_states = self._model_forward(
                    num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
                )
        with record_function_or_nullcontext("post process"):
            aux_hidden_states = None
            if self.use_aux_hidden_state_outputs: