[P/D][Bugfix] Layerwise stacking MTP error. (#7036)
### What this PR does / why we need it?
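The community has added a mechanism that clears the KV connector metadata
after the main model finishes running. When an MTP layer is in use, the
metadata should not be cleared at that point, so a new condition has been
added: the metadata is cleared only when no speculative config is set
(`clear_kv_metadata = self.speculative_config is None`).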
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By CI.
- vLLM version: v0.16.0
- vLLM main: 4034c3d32e
---------
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
This commit is contained in:
@@ -1287,6 +1287,8 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
has_encoder_input = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
has_encoder_input = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
||||||
|
|
||||||
# Run forward pass
|
# Run forward pass
|
||||||
|
clear_kv_metadata = self.speculative_config is None
|
||||||
|
if vllm_version_is("0.16.0"):
|
||||||
with (
|
with (
|
||||||
record_function_or_nullcontext("forward"),
|
record_function_or_nullcontext("forward"),
|
||||||
set_ascend_forward_context(
|
set_ascend_forward_context(
|
||||||
@@ -1306,6 +1308,28 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
hidden_states = self._model_forward(
|
hidden_states = self._model_forward(
|
||||||
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
|
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
with (
|
||||||
|
record_function_or_nullcontext("forward"),
|
||||||
|
set_ascend_forward_context(
|
||||||
|
attn_metadata,
|
||||||
|
self.vllm_config,
|
||||||
|
num_tokens=num_tokens_padded,
|
||||||
|
num_tokens_across_dp=num_tokens_across_dp,
|
||||||
|
aclgraph_runtime_mode=cudagraph_mode,
|
||||||
|
batch_descriptor=batch_desc,
|
||||||
|
num_actual_tokens=scheduler_output.total_num_scheduled_tokens,
|
||||||
|
model_instance=self.model,
|
||||||
|
max_tokens_across_pcp=0 if self.pcp_size == 1 else self.pcp_manager.max_num_tokens_across_pcp,
|
||||||
|
skip_compiled=has_encoder_input,
|
||||||
|
),
|
||||||
|
self.maybe_get_kv_connector_output(
|
||||||
|
scheduler_output, clear_metadata=clear_kv_metadata
|
||||||
|
) as kv_connector_output,
|
||||||
|
):
|
||||||
|
hidden_states = self._model_forward(
|
||||||
|
num_tokens_padded, input_ids, positions, intermediate_tensors, inputs_embeds, **model_kwargs
|
||||||
|
)
|
||||||
with record_function_or_nullcontext("post process"):
|
with record_function_or_nullcontext("post process"):
|
||||||
aux_hidden_states = None
|
aux_hidden_states = None
|
||||||
if self.use_aux_hidden_state_outputs:
|
if self.use_aux_hidden_state_outputs:
|
||||||
|
|||||||
Reference in New Issue
Block a user