[BugFix] Fix multimodal model support fullgraph error (#3425)
### What this PR does / why we need it? Because the update_attn_params function requires passing the num_tokens parameter, and num_tokens is obtained via positions.shape[0]. However, the multimodal model uses M-RoPE (Multimodal Rotary Position Embedding), which results in positions having a leading dimension of size 2 rather than num_tokens. Consequently, positions.shape[0] retrieves an incorrect value. We resolve this issue by replacing positions.shape[0] with maybe_padded_num_tokens. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: fan2956 <zhoufan53@huawei.com>
This commit is contained in:
@@ -1520,13 +1520,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
|
|
||||||
forward_context = get_forward_context()
|
forward_context = get_forward_context()
|
||||||
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
|
if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL:
|
||||||
|
# TODO: maybe_padded_num_tokens will be removed, use num_input_tokens instead
|
||||||
if self.vllm_config.model_config.use_mla:
|
if self.vllm_config.model_config.use_mla:
|
||||||
# FIXME: Try using `auto_dispatch_capture=True`
|
# FIXME: Try using `auto_dispatch_capture=True`
|
||||||
update_mla_attn_params(self.update_stream, forward_context,
|
update_mla_attn_params(self.update_stream, forward_context,
|
||||||
positions.shape[0])
|
maybe_padded_num_tokens)
|
||||||
else:
|
else:
|
||||||
update_attn_params(self.update_stream, forward_context,
|
update_attn_params(self.update_stream, forward_context,
|
||||||
positions.shape[0])
|
maybe_padded_num_tokens)
|
||||||
|
|
||||||
if get_forward_context().sp_enabled:
|
if get_forward_context().sp_enabled:
|
||||||
hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
|
hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
|
||||||
|
|||||||
Reference in New Issue
Block a user