[P/D][Bugfix] Layerwise stacking MTP error. (#7036)
### What this PR does / why we need it?
Upstream vLLM added a mechanism that clears the KV-connector metadata after the
main model finishes running. The MTP (speculative) layer must not clear this
metadata, so a new condition (skip clearing whenever a speculative config is
set) has been added to avoid clearing it prematurely.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Verified by CI.
- vLLM version: v0.16.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
@@ -1287,25 +1287,49 @@ class NPUModelRunner(GPUModelRunner):
|
||||
has_encoder_input = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
||||
|
||||
# Run forward pass
|
||||
with (
|
||||
record_function_or_nullcontext("forward"),
|
||||
set_ascend_forward_context(
|
||||
attn_metadata,
|
||||
self.vllm_config,
|
||||
num_tokens=num_tokens_padded,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
aclgraph_runtime_mode=cudagraph_mode,
|
||||
batch_descriptor=batch_desc,
|
||||
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
|
||||
model_instance=self.model,
|
||||
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
|
||||
skip_compiled=has_encoder_input,
|
||||
),
|
||||
self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
|
||||
):
|
||||
hidden_states = self._model_forward(
|
||||
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
|
||||
)
|
||||
clear_kv_metadata = self.speculative_config is None
|
||||
if vllm_version_is("0.16.0"):
|
||||
with (
|
||||
record_function_or_nullcontext("forward"),
|
||||
set_ascend_forward_context(
|
||||
attn_metadata,
|
||||
self.vllm_config,
|
||||
num_tokens=num_tokens_padded,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
aclgraph_runtime_mode=cudagraph_mode,
|
||||
batch_descriptor=batch_desc,
|
||||
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
|
||||
model_instance=self.model,
|
||||
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
|
||||
skip_compiled=has_encoder_input,
|
||||
),
|
||||
self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output,
|
||||
):
|
||||
hidden_states = self._model_forward(
|
||||
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
|
||||
)
|
||||
else:
|
||||
with (
|
||||
record_function_or_nullcontext("forward"),
|
||||
set_ascend_forward_context(
|
||||
attn_metadata,
|
||||
self.vllm_config,
|
||||
num_tokens=num_tokens_padded,
|
||||
num_tokens_across_dp=num_tokens_across_dp,
|
||||
aclgraph_runtime_mode=cudagraph_mode,
|
||||
batch_descriptor=batch_desc,
|
||||
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
|
||||
model_instance=self.model,
|
||||
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
|
||||
skip_compiled=has_encoder_input,
|
||||
),
|
||||
self.maybe_get_kv_connector_output(
|
||||
scheduler_output, clear_metadata=clear_kv_metadata
|
||||
) as kv_connector_output,
|
||||
):
|
||||
hidden_states = self._model_forward(
|
||||
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
|
||||
)
|
||||
with record_function_or_nullcontext("post process"):
|
||||
aux_hidden_states = None
|
||||
if self.use_aux_hidden_state_outputs:
|
||||
|
||||
Reference in New Issue
Block a user