From 4381d296e5a71edad7e72add11169f0dfeb668ec Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Thu, 23 Oct 2025 09:35:18 +0800 Subject: [PATCH] [Fix] Fix attention metadata handling for profiling and MLA (#3636) ### What this PR does / why we need it? Move the creation of dummy attention metadata to occur after the ACL graph runtime mode is determined. This ensures the metadata is initialized with the correct configuration during a profile run. Additionally, remove the `attn_metadata` existence check before updating MLA attention parameters. This change prevents the update from being skipped when metadata is not yet available, ensuring parameters are set correctly. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? None. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 Signed-off-by: Yizhou Liu --- vllm_ascend/worker/model_runner_v1.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c7f74d0a..0bfe0f84 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2331,7 +2331,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): forward_context = get_forward_context() assert forward_context is not None if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \ - not forward_context.capturing and forward_context.attn_metadata is not None: + not forward_context.capturing: if self.vllm_config.model_config.use_mla: # FIXME: Try using `auto_dispatch_capture=True` update_mla_attn_params(self.update_stream, forward_context, @@ -2419,17 +2419,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): if self.is_kv_producer and not self.is_kv_consumer: with_prefill = True - # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup - # and not supported in ASCEND now. We could remove it in the future. - attn_metadata = self._build_dummy_attn_metadata( - False, - num_reqs=num_reqs, - num_tokens=num_tokens, - max_query_len=max_query_len, - aclgraph_runtime_mode=aclgraph_runtime_mode, - force_attention=force_attention, - ) - if not self.in_profile_run and self.dynamic_eplb: self.eplb_updator.forward_before() @@ -2476,6 +2465,17 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: aclgraph_runtime_mode = _ag_mode + # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup + # and not supported in ASCEND now. We could remove it in the future. + attn_metadata = self._build_dummy_attn_metadata( + False, + num_reqs=num_reqs, + num_tokens=num_tokens, + max_query_len=max_query_len, + aclgraph_runtime_mode=aclgraph_runtime_mode, + force_attention=force_attention, + ) + need_dummy_logits = (not self.in_profile_run and lmhead_tp_enable())