From 54bd531db8c21954a87d53d96a90d75ecddebb8c Mon Sep 17 00:00:00 2001 From: Yizhou <136800916+yiz-liu@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:29:30 +0800 Subject: [PATCH] [v0.11.0][Fix] Fix attention metadata handling for profiling and MLA (#3636) (#3643) ### What this PR does / why we need it? This is a port of PR #3636. Move the creation of dummy attention metadata to occur after the ACL graph runtime mode is determined. This ensures the metadata is initialized with the correct configuration during a profile run. Additionally, remove the `attn_metadata` existence check before updating MLA attention parameters. This change prevents the update from being skipped when metadata is not yet available, ensuring parameters are set correctly. ### Does this PR introduce _any_ user-facing change? None. ### How was this patch tested? None. Signed-off-by: Yizhou Liu --- vllm_ascend/worker/model_runner_v1.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c7f74d0..0bfe0f8 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2331,7 +2331,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): forward_context = get_forward_context() assert forward_context is not None if forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL and \ - not forward_context.capturing and forward_context.attn_metadata is not None: + not forward_context.capturing: if self.vllm_config.model_config.use_mla: # FIXME: Try using `auto_dispatch_capture=True` update_mla_attn_params(self.update_stream, forward_context, @@ -2419,17 +2419,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): if self.is_kv_producer and not self.is_kv_consumer: with_prefill = True - # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup - # and not supported in ASCEND now. We could remove it in the future. 
- attn_metadata = self._build_dummy_attn_metadata( - False, - num_reqs=num_reqs, - num_tokens=num_tokens, - max_query_len=max_query_len, - aclgraph_runtime_mode=aclgraph_runtime_mode, - force_attention=force_attention, - ) - if not self.in_profile_run and self.dynamic_eplb: self.eplb_updator.forward_before() @@ -2476,6 +2465,17 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: aclgraph_runtime_mode = _ag_mode + # TODO(Mengqing): Set create_mixed_batch to False since it's only used in FI warmup + # and not supported in ASCEND now. We could remove it in the future. + attn_metadata = self._build_dummy_attn_metadata( + False, + num_reqs=num_reqs, + num_tokens=num_tokens, + max_query_len=max_query_len, + aclgraph_runtime_mode=aclgraph_runtime_mode, + force_attention=force_attention, + ) + need_dummy_logits = (not self.in_profile_run and lmhead_tp_enable())